crawler_detect 0.1.12 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CrawlerDetect
4
+ # Configuration of CrawlerDetect
5
+ #
6
+ # @see settings
7
+ # @since 1.0.0
8
+ class Config < ::Qonfig::DataSet
9
+ CUR_PATH = File.dirname(File.expand_path(__FILE__)).freeze
10
+ RAW_PATH = File.join(CUR_PATH, "library/raw").freeze
11
+
12
+ RAW_CRAWLERS_PATH = File.join(RAW_PATH, "Crawlers.json").freeze
13
+ RAW_EXCLUSIONS_PATH = File.join(RAW_PATH, "Exclusions.json").freeze
14
+ RAW_HEADERS_PATH = File.join(RAW_PATH, "Headers.json").freeze
15
+
16
+ # @return [String] path to crawlers raw JSON file
17
+ setting :raw_crawlers_path, RAW_CRAWLERS_PATH
18
+
19
+ # @return [String] path to exclusions raw JSON file
20
+ setting :raw_exclusions_path, RAW_EXCLUSIONS_PATH
21
+
22
+ # @return [String] path to headers raw JSON file
23
+ setting :raw_headers_path, RAW_HEADERS_PATH
24
+
25
+ validate :raw_crawlers_path, :string, strict: true
26
+ validate :raw_exclusions_path, :string, strict: true
27
+ validate :raw_headers_path, :string, strict: true
28
+ end
29
+ end
@@ -1,17 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  class Detector
6
+ # @param user_agent [String] User-agent string to detect
7
+ # @return [CrawlerDetect::Detector] instance of detector class
5
8
  def initialize(user_agent)
6
9
  @user_agent = user_agent.to_s.dup
7
10
  end
8
11
 
12
+ # @return [true, false] Is User-agent a crawler?
9
13
  def is_crawler?
10
14
  @is_crawler ||= begin
11
15
  !completely_exclusion? && matches_crawler_list?
12
16
  end
13
17
  end
14
18
 
19
+ # @return [String, nil] The detected crawler name from RAW data, or nil when the User-agent is not a crawler
15
20
  def crawler_name
16
21
  return unless is_crawler?
17
22
  @crawler_name
@@ -19,22 +24,30 @@ module CrawlerDetect
19
24
 
20
25
  private
21
26
 
22
- def completely_exclusion?
23
- @user_agent.gsub!(exclusions_matcher, "")
24
- @user_agent.strip.length == 0
25
- end
27
+ # @private
28
+ # @return [true, false] Is the User-agent empty once all white-listed fragments are removed?
29
+ def completely_exclusion?
30
+ @user_agent.gsub!(exclusions_matcher, "")
31
+ @user_agent.strip.length.zero?
32
+ end
26
33
 
27
- def matches_crawler_list?
28
- @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
29
- !@crawler_name.empty?
30
- end
34
+ # @private
35
+ # @return [true, false] Is User-agent in black-list?
36
+ def matches_crawler_list?
37
+ @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
38
+ !@crawler_name.empty?
39
+ end
31
40
 
32
- def exclusions_matcher
33
- CrawlerDetect::Library.get_regexp("exclusions")
34
- end
41
+ # @private
42
+ # @return [Regexp] White-list of User-agents
43
+ def exclusions_matcher
44
+ CrawlerDetect::Library.get_regexp("exclusions")
45
+ end
35
46
 
36
- def crawlers_matcher
37
- CrawlerDetect::Library.get_regexp("crawlers")
38
- end
47
+ # @private
48
+ # @return [Regexp] Black-list of User-agents
49
+ def crawlers_matcher
50
+ CrawlerDetect::Library.get_regexp("crawlers")
51
+ end
39
52
  end
40
53
  end
@@ -1,16 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  module Library
6
+ DATA_CLASSES = [Library::Headers, Library::Exclusions, Library::Crawlers].freeze
7
+
5
8
  class << self
9
+ # @param param [String] Name of raw data
10
+ # @return [Regexp]
6
11
  def get_regexp(param)
7
12
  data = get_array(param)
8
- %r[#{data.join('|')}]i
13
+ %r{#{data.join('|')}}i
9
14
  end
10
15
 
16
+ # @param param [String] Name of raw data
17
+ # @return [Array]
11
18
  def get_array(param)
12
- const_name = "CrawlerDetect::Library::#{param.capitalize}::#{param.upcase}"
13
- const_get(const_name)
19
+ const_get("CrawlerDetect::Library::#{param.capitalize}").send(:data)
14
20
  end
15
21
  end
16
22
  end
@@ -1,1285 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Layout/TrailingWhitespace
4
3
  module CrawlerDetect
5
4
  module Library
5
+ # @since 0.1.0
6
6
  module Crawlers
7
- CRAWLERS = %q[
8
- .*Java.*outbrain
9
- YLT
10
- ^b0t$
11
- ^bluefish
12
- ^Calypso v\/
13
- ^COMODO DCV
14
- ^DangDang
15
- ^DavClnt
16
- ^FDM
17
- ^git\/
18
- ^Goose\/
19
- ^Grabber
20
- ^HTTPClient\/
21
- ^Java\/
22
- ^Jeode\/
23
- ^Jetty\/
24
- ^Mail\/
25
- ^Mget
26
- ^Microsoft URL Control
27
- ^NG\/[0-9\.]
28
- ^NING\/
29
- ^PHP\/[0-9]
30
- ^RMA\/
31
- ^Ruby|Ruby\/[0-9]
32
- ^VSE\/[0-9]
33
- ^WordPress\.com
34
- ^XRL\/[0-9]
35
- ^ZmEu
36
- 008\/
37
- 13TABS
38
- 192\.comAgent
39
- 2ip\.ru
40
- 404enemy
41
- 7Siters
42
- 80legs
43
- a\.pr-cy\.ru
44
- a3logics\.in
45
- A6-Indexer
46
- Abonti
47
- Aboundex
48
- aboutthedomain
49
- Accoona-AI-Agent
50
- acoon
51
- acrylicapps\.com\/pulp
52
- Acunetix
53
- AdAuth\/
54
- adbeat
55
- AddThis
56
- ADmantX
57
- AdminLabs
58
- adressendeutschland
59
- adreview\/
60
- adscanner
61
- Adstxtaggregator
62
- adstxt-worker
63
- adstxt\.com
64
- agentslug
65
- AHC
66
- aihit
67
- aiohttp\/
68
- Airmail
69
- akka-http\/
70
- akula\/
71
- alertra
72
- alexa site audit
73
- Alibaba\.Security\.Heimdall
74
- Alligator
75
- allloadin
76
- AllSubmitter
77
- alyze\.info
78
- amagit
79
- ^Amazon Simple Notification Service Agent$
80
- Anarchie
81
- AndroidDownloadManager
82
- Anemone
83
- AngleSharp
84
- annotate_google
85
- Ant\.com
86
- Anturis Agent
87
- AnyEvent-HTTP\/
88
- Apache Droid
89
- Apache OpenOffice
90
- Apache-HttpAsyncClient
91
- Apache-HttpClient
92
- ApacheBench
93
- Apexoo
94
- APIs-Google
95
- AportWorm\/
96
- AppBeat\/
97
- AppEngine-Google
98
- AppleSyndication
99
- Aprc\/[0-9]
100
- Arachmo
101
- arachnode
102
- Arachnophilia
103
- aria2
104
- Arukereso
105
- asafaweb
106
- AskQuickly
107
- Ask Jeeves
108
- ASPSeek
109
- Asterias
110
- Astute
111
- asynchttp
112
- Attach
113
- attohttpc
114
- autocite
115
- AutomaticWPTester
116
- Autonomy
117
- axios\/
118
- AWS Security Scanner
119
- B-l-i-t-z-B-O-T
120
- Backlink-Ceck
121
- backlink-check
122
- BacklinkHttpStatus
123
- BackStreet
124
- BackWeb
125
- Bad-Neighborhood
126
- Badass
127
- baidu\.com
128
- Bandit
129
- basicstate
130
- BatchFTP
131
- Battlezta Bazinga
132
- baypup\/
133
- BazQux
134
- BBBike
135
- BCKLINKS
136
- BDFetch
137
- BegunAdvertising
138
- Bewica-security-scan
139
- Bidtellect
140
- BigBozz
141
- Bigfoot
142
- biglotron
143
- BingLocalSearch
144
- BingPreview
145
- binlar
146
- biNu image cacher
147
- Bitacle
148
- biz_Directory
149
- Black Hole
150
- Blackboard Safeassign
151
- BlackWidow
152
- BlockNote\.Net
153
- BlogBridge
154
- Bloglines
155
- Bloglovin
156
- BlogPulseLive
157
- BlogSearch
158
- Blogtrottr
159
- BlowFish
160
- boitho\.com-dc
161
- Boost\.Beast
162
- BPImageWalker
163
- Braintree-Webhooks
164
- Branch Metrics API
165
- Branch-Passthrough
166
- Brandprotect
167
- BrandVerity
168
- Brandwatch
169
- Brodie\/
170
- Browsershots
171
- BUbiNG
172
- Buck\/
173
- Buddy
174
- BuiltWith
175
- Bullseye
176
- BunnySlippers
177
- Burf Search
178
- Butterfly\/
179
- BuzzSumo
180
- CAAM\/[0-9]
181
- CakePHP
182
- Calculon
183
- Canary%20Mail
184
- CaretNail
185
- catexplorador
186
- CC Metadata Scaper
187
- Cegbfeieh
188
- censys
189
- Cerberian Drtrs
190
- CERT\.at-Statistics-Survey
191
- cg-eye
192
- changedetection
193
- ChangesMeter
194
- Charlotte
195
- CheckHost
196
- checkprivacy
197
- CherryPicker
198
- ChinaClaw
199
- Chirp\/
200
- chkme\.com
201
- Chlooe
202
- Chromaxa
203
- CirrusExplorer
204
- CISPA Vulnerability Notification
205
- Citoid
206
- CJNetworkQuality
207
- Clarsentia
208
- clips\.ua\.ac\.be
209
- Cloud mapping
210
- CloudEndure
211
- CloudFlare-AlwaysOnline
212
- Cloudflare-Healthchecks
213
- Cloudinary
214
- cmcm\.com
215
- coccoc
216
- cognitiveseo
217
- colly -
218
- CommaFeed
219
- Commons-HttpClient
220
- commonscan
221
- contactbigdatafr
222
- contentkingapp
223
- convera
224
- CookieReports
225
- copyright sheriff
226
- CopyRightCheck
227
- Copyscape
228
- cortex\/
229
- Cosmos4j\.feedback
230
- Covario-IDS
231
- Craw\/
232
- Crescent
233
- Crowsnest
234
- Criteo
235
- CSHttp
236
- CSSCheck
237
- curb
238
- Curious George
239
- curl
240
- cuwhois\/
241
- cybo\.com
242
- DAP\/NetHTTP
243
- DareBoost
244
- DatabaseDriverMysqli
245
- DataCha0s
246
- Datafeedwatch
247
- Datanyze
248
- DataparkSearch
249
- dataprovider
250
- DataXu
251
- Daum(oa)?[ \/][0-9]
252
- dBpoweramp
253
- ddline
254
- deeris
255
- delve\.ai
256
- Demon
257
- DeuSu
258
- developers\.google\.com\/\+\/web\/snippet\/
259
- Devil
260
- Digg
261
- Digincore
262
- DigitalPebble
263
- Dirbuster
264
- Discourse Forum Onebox
265
- Disqus\/
266
- Dispatch\/
267
- DittoSpyder
268
- dlvr
269
- DMBrowser
270
- DNSPod-reporting
271
- docoloc
272
- Dolphin http client
273
- DomainAppender
274
- DomainLabz
275
- Donuts Content Explorer
276
- dotMailer content retrieval
277
- dotSemantic
278
- downforeveryoneorjustme
279
- Download Wonder
280
- downnotifier
281
- DowntimeDetector
282
- Drip
283
- drupact
284
- Drupal \(\+http:\/\/drupal\.org\/\)
285
- DTS Agent
286
- dubaiindex
287
- DuplexWeb-Google
288
- DynatraceSynthetic
289
- EARTHCOM
290
- Easy-Thumb
291
- EasyDL
292
- Ebingbong
293
- ec2linkfinder
294
- eCairn-Grabber
295
- eCatch
296
- ECCP
297
- eContext\/
298
- Ecxi
299
- EirGrabber
300
- ElectricMonk
301
- elefent
302
- EMail Exractor
303
- EMail Wolf
304
- EmailWolf
305
- Embarcadero
306
- Embed PHP Library
307
- Embedly
308
- endo\/
309
- europarchive\.org
310
- evc-batch
311
- EventMachine HttpClient
312
- Everwall Link Expander
313
- Evidon
314
- Evrinid
315
- ExactSearch
316
- ExaleadCloudview
317
- Excel\/
318
- exif
319
- ExoRank
320
- Exploratodo
321
- Express WebPictures
322
- Extreme Picture Finder
323
- EyeNetIE
324
- ezooms
325
- facebookexternalhit
326
- facebookexternalua
327
- facebookplatform
328
- fairshare
329
- Faraday v
330
- fasthttp
331
- Faveeo
332
- Favicon downloader
333
- faviconkit
334
- faviconarchive
335
- FavOrg
336
- Feed Wrangler
337
- Feedable\/
338
- Feedbin
339
- FeedBooster
340
- FeedBucket
341
- FeedBunch\/
342
- FeedBurner
343
- feeder
344
- Feedly
345
- FeedshowOnline
346
- Feedspot
347
- Feedwind\/
348
- FeedZcollector
349
- feeltiptop
350
- Fetch API
351
- Fetch\/[0-9]
352
- Fever\/[0-9]
353
- FHscan
354
- Filestack
355
- Fimap
356
- findlink
357
- findthatfile
358
- FlashGet
359
- FlipboardBrowserProxy
360
- FlipboardProxy
361
- FlipboardRSS
362
- Flock\/
363
- fluffy
364
- Flunky
365
- flynxapp
366
- forensiq
367
- FoundSeoTool
368
- http:\/\/www.neomo.de\/
369
- free thumbnails
370
- Freeuploader
371
- Funnelback
372
- Fuzz Faster U Fool
373
- G-i-g-a-b-o-t
374
- g00g1e\.net
375
- ganarvisitas
376
- geek-tools
377
- Genieo
378
- GentleSource
379
- GetCode
380
- Getintent
381
- GetLinkInfo
382
- getprismatic
383
- GetRight
384
- getroot
385
- GetURLInfo\/
386
- GetWeb
387
- Geziyor
388
- Ghost Inspector
389
- GigablastOpenSource
390
- GIS-LABS
391
- github-camo
392
- github\.com
393
- Goldfire Server
394
- Go [\d\.]* package http
395
- Go http package
396
- Go-Ahead-Got-It
397
- Go-http-client
398
- Go!Zilla
399
- gobyus
400
- gofetch
401
- GomezAgent
402
- gooblog
403
- Goodzer\/
404
- Google AppsViewer
405
- Google Desktop
406
- Google favicon
407
- Google Keyword Suggestion
408
- Google Keyword Tool
409
- Google Page Speed Insights
410
- Google PP Default
411
- Google Search Console
412
- Google Web Preview
413
- Google-Ads-Overview
414
- Google-Adwords
415
- Google-Apps-Script
416
- Google-Calendar-Importer
417
- Google-HotelAdsVerifier
418
- Google-HTTP-Java-Client
419
- Google-Publisher-Plugin
420
- Google-Read-Aloud
421
- Google-SearchByImage
422
- Google-Site-Verification
423
- Google-speakr
424
- Google-Structured-Data-Testing-Tool
425
- Google-Youtube-Links
426
- google-xrawler
427
- GoogleDocs
428
- GoogleHC\/
429
- GoogleProducer
430
- GoogleSites
431
- Google-Transparency-Report
432
- Gookey
433
- GoSpotCheck
434
- gosquared-thumbnailer
435
- Gotit
436
- GoZilla
437
- grabify
438
- GrabNet
439
- Grafula
440
- Grammarly
441
- GrapeFX
442
- GreatNews
443
- Gregarius
444
- GRequests
445
- grokkit
446
- grouphigh
447
- grub-client
448
- gSOAP\/
449
- GT::WWW
450
- GTmetrix
451
- GuzzleHttp
452
- gvfs\/
453
- HAA(A)?RTLAND http client
454
- Haansoft
455
- hackney\/
456
- Hadi Agent
457
- HappyApps-WebCheck
458
- Hatena
459
- Havij
460
- HaxerMen
461
- HeadlessChrome
462
- HEADMasterSEO
463
- HeartRails_Capture
464
- help@dataminr\.com
465
- heritrix
466
- Hexometer
467
- historious
468
- hkedcity
469
- hledejLevne\.cz
470
- Hloader
471
- HMView
472
- Holmes
473
- HonesoSearchEngine
474
- HootSuite Image proxy
475
- Hootsuite-WebFeed
476
- hosterstats
477
- HostTracker
478
- ht:\/\/check
479
- htdig
480
- HTMLparser
481
- htmlyse
482
- HTTP Banner Detection
483
- HTTP_Compression_Test
484
- http_request2
485
- http_requester
486
- http-get
487
- HTTP-Header-Abfrage
488
- http-kit
489
- http-request\/
490
- HTTP-Tiny
491
- HTTP::Lite
492
- http\.rb\/
493
- http_get
494
- HttpComponents
495
- httphr
496
- HTTPMon
497
- HTTPie
498
- httpRequest
499
- httpscheck
500
- httpssites_power
501
- httpunit
502
- HttpUrlConnection
503
- httrack
504
- huaweisymantec
505
- HubSpot
506
- Humanlinks
507
- i2kconnect\/
508
- Iblog
509
- ichiro
510
- Id-search
511
- IdeelaborPlagiaat
512
- IDG Twitter Links Resolver
513
- IDwhois\/
514
- Iframely
515
- igdeSpyder
516
- IlTrovatore
517
- Image Fetch
518
- Image Sucker
519
- ImageEngine\/
520
- ImageVisu\/
521
- Imagga
522
- imagineeasy
523
- imgsizer
524
- InAGist
525
- inbound\.li parser
526
- InDesign%20CC
527
- Indy Library
528
- InetURL
529
- infegy
530
- infohelfer
531
- InfoTekies
532
- InfoWizards Reciprocal Link
533
- inpwrd\.com
534
- instabid
535
- Instapaper
536
- Integrity
537
- integromedb
538
- Intelliseek
539
- InterGET
540
- internet_archive
541
- Internet Ninja
542
- InternetSeer
543
- internetVista monitor
544
- internetwache
545
- intraVnews
546
- IODC
547
- IOI
548
- iplabel
549
- ips-agent
550
- IPS\/[0-9]
551
- IPWorks HTTP\/S Component
552
- iqdb\/
553
- Iria
554
- Irokez
555
- isitup\.org
556
- iskanie
557
- isUp\.li
558
- iThemes Sync\/
559
- IZaBEE
560
- iZSearch
561
- JAHHO
562
- janforman
563
- Jaunt\/
564
- Jbrofuzz
565
- Jersey\/
566
- JetCar
567
- Jigsaw
568
- Jobboerse
569
- JobFeed discovery
570
- Jobg8 URL Monitor
571
- jobo
572
- Jobrapido
573
- Jobsearch1\.5
574
- JoinVision Generic
575
- JolokiaPwn
576
- Joomla
577
- Jorgee
578
- JS-Kit
579
- JustView
580
- Kaspersky Lab CFR link resolver
581
- Kelny\/
582
- Kerrigan\/
583
- KeyCDN
584
- Keyword Density
585
- Keywords Research
586
- khttp\/
587
- KickFire
588
- KimonoLabs\/
589
- Kml-Google
590
- knows\.is
591
- KOCMOHABT
592
- kouio
593
- kubectl
594
- kube-probe
595
- kulturarw3
596
- KumKie
597
- L\.webis
598
- Larbin
599
- Lavf\/
600
- LeechFTP
601
- LeechGet
602
- letsencrypt
603
- Lftp
604
- LibVLC
605
- LibWeb
606
- Libwhisker
607
- libwww
608
- Licorne
609
- Liferea\/
610
- Lightspeedsystems
611
- Lighthouse
612
- Likse
613
- limber\.io
614
- Link Valet
615
- link_thumbnailer
616
- LinkAlarm\/
617
- linkCheck
618
- linkdex
619
- LinkExaminer
620
- linkfluence
621
- linkpeek
622
- LinkPreviewGenerator
623
- LinkScan
624
- LinksManager
625
- LinkTiger
626
- LinkWalker
627
- Lipperhey
628
- Litemage_walker
629
- livedoor ScreenShot
630
- LoadImpactRload
631
- localsearch-web
632
- LongURL API
633
- longurl-r-package
634
- looid\.com
635
- looksystems\.net
636
- ltx71
637
- lua-resty-http
638
- lwp-request
639
- lwp-trivial
640
- LWP::Simple
641
- lycos
642
- LYT\.SR
643
- mabontland
644
- Mag-Net
645
- MagpieRSS
646
- Mail\.Ru
647
- MailChimp
648
- Majestic12
649
- makecontact\/
650
- Mandrill
651
- MapperCmd
652
- marketinggrader
653
- MarkMonitor
654
- MarkWatch
655
- Mass Downloader
656
- masscan\/
657
- Mata Hari
658
- Mediametric
659
- Mediapartners-Google
660
- mediawords
661
- MegaIndex\.ru
662
- MeltwaterNews
663
- Melvil Rawi
664
- MemGator
665
- Metaspinner
666
- MetaURI
667
- MFC_Tear_Sample
668
- MicroMessenger\/
669
- Microsearch
670
- Microsoft Office
671
- Microsoft Outlook
672
- Microsoft Windows Network Diagnostics
673
- Microsoft-WebDAV-MiniRedir
674
- Microsoft Data Access
675
- MIDown tool
676
- MIIxpc
677
- Mindjet
678
- Miniature\.io
679
- Miniflux
680
- Mister PiX
681
- mixdata dot com
682
- mixed-content-scan
683
- Mixmax-LinkPreview
684
- mixnode
685
- Mnogosearch
686
- mogimogi
687
- Mojeek
688
- Mojolicious \(Perl\)
689
- Monit\/
690
- monitis
691
- Monitority\/
692
- montastic
693
- MonTools
694
- Moreover
695
- Morfeus Fucking Scanner
696
- Morning Paper
697
- MovableType
698
- mowser
699
- Mr\.4x3 Powered
700
- Mrcgiguy
701
- MS Web Services Client Protocol
702
- MSFrontPage
703
- mShots
704
- MuckRack\/
705
- muhstik-scan
706
- MVAClient
707
- MxToolbox\/
708
- nagios
709
- Najdi\.si
710
- Name Intelligence
711
- Nameprotect
712
- Navroad
713
- NearSite
714
- Needle
715
- Nessus
716
- Net Vampire
717
- NetAnts
718
- NETCRAFT
719
- NetLyzer
720
- NetMechanic
721
- NetNewsWire
722
- Netpursual
723
- netresearch
724
- NetShelter ContentScan
725
- Netsparker
726
- NetTrack
727
- Netvibes
728
- NetZIP
729
- Neustar WPM
730
- NeutrinoAPI
731
- NewRelicPinger
732
- NewsBlur .*Finder
733
- NewsGator
734
- newsme
735
- newspaper\/
736
- NetSystemsResearch
737
- Nexgate Ruby Client
738
- NG-Search
739
- Nibbler
740
- NICErsPRO
741
- Nikto
742
- nineconnections
743
- NLNZ_IAHarvester
744
- Nmap Scripting Engine
745
- node-superagent
746
- node-urllib
747
- node\.io
748
- Nodemeter
749
- NodePing
750
- nominet\.org\.uk
751
- nominet\.uk
752
- Norton-Safeweb
753
- Notifixious
754
- notifyninja
755
- NotionEmbedder
756
- nuhk
757
- nutch
758
- Nuzzel
759
- nWormFeedFinder
760
- nyawc\/
761
- Nymesis
762
- NYU
763
- Ocelli\/
764
- Octopus
765
- oegp
766
- Offline Explorer
767
- Offline Navigator
768
- OgScrper
769
- okhttp
770
- omgili
771
- OMSC
772
- Online Domain Tools
773
- OpenCalaisSemanticProxy
774
- Openfind
775
- OpenLinkProfiler
776
- Openstat\/
777
- OpenVAS
778
- OPPO A33
779
- Optimizer
780
- Orbiter
781
- OrgProbe\/
782
- orion-semantics
783
- Outlook-Express
784
- Outlook-iOS
785
- ow\.ly
786
- Owler
787
- ownCloud News
788
- OxfordCloudService
789
- Page Valet
790
- page_verifier
791
- page scorer
792
- page2rss
793
- PageFreezer
794
- PageGrabber
795
- PagePeeker
796
- PageScorer
797
- Pagespeed\/
798
- Panopta
799
- panscient
800
- Papa Foto
801
- parsijoo
802
- Pavuk
803
- PayPal IPN
804
- pcBrowser
805
- Pcore-HTTP
806
- Pearltrees
807
- PECL::HTTP
808
- peerindex
809
- Peew
810
- PeoplePal
811
- Perlu -
812
- PhantomJS Screenshoter
813
- PhantomJS\/
814
- Photon\/
815
- phpservermon
816
- Pi-Monster
817
- Picscout
818
- Picsearch
819
- PictureFinder
820
- Pimonster
821
- ping\.blo\.gs
822
- Pingability
823
- PingAdmin\.Ru
824
- Pingdom
825
- Pingoscope
826
- PingSpot
827
- pinterest\.com
828
- Pixray
829
- Pizilla
830
- Plagger\/
831
- Ploetz \+ Zeller
832
- Plukkie
833
- plumanalytics
834
- PocketImageCache
835
- PocketParser
836
- Pockey
837
- POE-Component-Client-HTTP
838
- Polymail\/
839
- Pompos
840
- Porkbun
841
- Port Monitor
842
- postano
843
- PostmanRuntime
844
- PostPost
845
- postrank
846
- PowerPoint\/
847
- Prebid
848
- Priceonomics Analysis Engine
849
- PrintFriendly
850
- PritTorrent
851
- Prlog
852
- probethenet
853
- Project 25499
854
- prospectb2b
855
- Protopage
856
- ProWebWalker
857
- proximic
858
- PRTG Network Monitor
859
- pshtt, https scanning
860
- PTST
861
- PTST\/[0-9]+
862
- Pump
863
- python-httpx
864
- Python-httplib2
865
- python-requests
866
- Python-urllib
867
- Qirina Hurdler
868
- QQDownload
869
- QrafterPro
870
- Qseero
871
- Qualidator
872
- QueryN Metasearch
873
- queuedriver
874
- Quora Link Preview
875
- Qwantify
876
- Radian6
877
- RankActive
878
- RankFlex
879
- RankSonicSiteAuditor
880
- Re-re Studio
881
- ReactorNetty
882
- Readability
883
- RealDownload
884
- RealPlayer%20Downloader
885
- RebelMouse
886
- Recorder
887
- RecurPost\/
888
- redback\/
889
- ReederForMac
890
- Reeder\/
891
- ReGet
892
- RepoMonkey
893
- request\.js
894
- reqwest\/
895
- ResponseCodeTest
896
- RestSharp
897
- Riddler
898
- Rival IQ
899
- Robosourcer
900
- Robozilla
901
- ROI Hunter
902
- RPT-HTTPClient
903
- RSSOwl
904
- RyowlEngine
905
- safe-agent-scanner
906
- SalesIntelligent
907
- Saleslift
908
- Sendsay\.Ru
909
- SauceNAO
910
- SBIder
911
- sc-downloader
912
- scalaj-http
913
- Scamadviser-Frontend
914
- scan\.lol
915
- ScanAlert
916
- Scoop
917
- scooter
918
- ScoutJet
919
- ScoutURLMonitor
920
- ScrapeBox Page Scanner
921
- Scrapy
922
- Screaming
923
- ScreenShotService
924
- Scrubby
925
- Scrutiny\/
926
- search\.thunderstone
927
- Search37
928
- searchenginepromotionhelp
929
- Searchestate
930
- SearchExpress
931
- SearchSight
932
- Seeker
933
- semanticdiscovery
934
- semanticjuice
935
- Semiocast HTTP client
936
- Semrush
937
- sentry\/
938
- SEO Browser
939
- Seo Servis
940
- seo-nastroj\.cz
941
- seo4ajax
942
- Seobility
943
- SEOCentro
944
- SeoCheck
945
- SEOkicks
946
- SEOlizer
947
- Seomoz
948
- SEOprofiler
949
- SEOsearch
950
- seoscanners
951
- seositecheckup
952
- SEOstats
953
- servernfo
954
- sexsearcher
955
- Seznam
956
- Shelob
957
- Shodan
958
- Shoppimon
959
- ShopWiki
960
- shortURL lengthener
961
- ShortLinkTranslate
962
- shrinktheweb
963
- Sideqik
964
- SimplePie
965
- SimplyFast
966
- Siphon
967
- SISTRIX
968
- Site-Shot\/
969
- Site Sucker
970
- Site24x7
971
- SiteBar
972
- Sitebeam
973
- Sitebulb\/
974
- SiteCondor
975
- SiteExplorer
976
- SiteGuardian
977
- Siteimprove
978
- SiteIndexed
979
- Sitemap(s)? Generator
980
- SitemapGenerator
981
- SiteMonitor
982
- Siteshooter B0t
983
- SiteSnagger
984
- SiteSucker
985
- SiteTruth
986
- Sitevigil
987
- sitexy\.com
988
- SkypeUriPreview
989
- Slack\/
990
- slider\.com
991
- slurp
992
- SlySearch
993
- SmartDownload
994
- SMRF URL Expander
995
- SMUrlExpander
996
- Snake
997
- Snappy
998
- SnapSearch
999
- Snarfer\/
1000
- SniffRSS
1001
- sniptracker
1002
- Snoopy
1003
- SnowHaze Search
1004
- sogou web
1005
- SortSite
1006
- Sottopop
1007
- sovereign\.ai
1008
- SpaceBison
1009
- SpamExperts
1010
- Spammen
1011
- Spanner
1012
- spaziodati
1013
- SPDYCheck
1014
- Specificfeeds
1015
- speedy
1016
- SPEng
1017
- Spinn3r
1018
- spray-can
1019
- Sprinklr
1020
- spyonweb
1021
- sqlmap
1022
- Sqlworm
1023
- Sqworm
1024
- SSL Labs
1025
- ssl-tools
1026
- StackRambler
1027
- Statastico\/
1028
- StatusCake
1029
- Steeler
1030
- Stratagems Kumo
1031
- Stroke\.cz
1032
- StudioFACA
1033
- StumbleUpon
1034
- suchen
1035
- Sucuri
1036
- summify
1037
- SuperHTTP
1038
- Surphace Scout
1039
- Suzuran
1040
- Symfony BrowserKit
1041
- Symfony2 BrowserKit
1042
- SynHttpClient-Built
1043
- Sysomos
1044
- sysscan
1045
- Szukacz
1046
- T0PHackTeam
1047
- tAkeOut
1048
- Tarantula\/
1049
- Taringa UGC
1050
- TarmotGezgin
1051
- Teleport
1052
- Telesoft
1053
- Telesphoreo
1054
- Telesphorep
1055
- Tenon\.io
1056
- teoma
1057
- terrainformatica
1058
- Test Certificate Info
1059
- testuri
1060
- Tetrahedron
1061
- TextRazor Downloader
1062
- The Drop Reaper
1063
- The Expert HTML Source Viewer
1064
- The Knowledge AI
1065
- The Intraformant
1066
- theinternetrules
1067
- TheNomad
1068
- Thinklab
1069
- Thumbshots
1070
- ThumbSniper
1071
- Thumbor
1072
- timewe\.net
1073
- TinEye
1074
- Tiny Tiny RSS
1075
- TLSProbe\/
1076
- Toata
1077
- topster
1078
- touche\.com
1079
- Traackr\.com
1080
- tracemyfile
1081
- Trackuity
1082
- TrapitAgent
1083
- Trendiction
1084
- Trendsmap
1085
- trendspottr
1086
- truwoGPS
1087
- TryJsoup
1088
- TulipChain
1089
- Turingos
1090
- Turnitin
1091
- tweetedtimes
1092
- Tweetminster
1093
- Tweezler\/
1094
- twibble
1095
- Twice
1096
- Twikle
1097
- Twingly
1098
- Twisted PageGetter
1099
- Typhoeus
1100
- ubermetrics-technologies
1101
- uclassify
1102
- UdmSearch
1103
- unchaos
1104
- unirest-java
1105
- UniversalFeedParser
1106
- Unshorten\.It
1107
- Untiny
1108
- UnwindFetchor
1109
- updated
1110
- updown\.io daemon
1111
- Upflow
1112
- Uptimia
1113
- Urlcheckr
1114
- URL Verifier
1115
- URLitor
1116
- urlresolver
1117
- Urlstat
1118
- URLTester
1119
- UrlTrends Ranking Updater
1120
- URLy Warning
1121
- URLy\.Warning
1122
- Vacuum
1123
- Vagabondo
1124
- VB Project
1125
- vBSEO
1126
- VCI
1127
- via ggpht\.com GoogleImageProxy
1128
- Virusdie
1129
- visionutils
1130
- vkShare
1131
- VoidEYE
1132
- Voil
1133
- voltron
1134
- voyager\/
1135
- VSAgent\/
1136
- VSB-TUO\/
1137
- Vulnbusters Meter
1138
- VYU2
1139
- w3af\.org
1140
- W3C_Unicorn
1141
- W3C-checklink
1142
- W3C-mobileOK
1143
- WAC-OFU
1144
- Wallpapers\/[0-9]+
1145
- WallpapersHD
1146
- wangling
1147
- Wappalyzer
1148
- WatchMouse
1149
- WbSrch\/
1150
- WDT\.io
1151
- web-capture\.net
1152
- Web-sniffer
1153
- Web Auto
1154
- Web Collage
1155
- Web Enhancer
1156
- Web Fetch
1157
- Web Fuck
1158
- Web Pix
1159
- Web Sauger
1160
- Web spyder
1161
- Web Sucker
1162
- Webalta
1163
- Webauskunft
1164
- WebAuto
1165
- WebCapture
1166
- WebClient\/
1167
- webcollage
1168
- WebCookies
1169
- WebCopier
1170
- WebCorp
1171
- WebDataStats
1172
- WebDoc
1173
- WebEnhancer
1174
- WebFetch
1175
- WebFuck
1176
- WebGazer
1177
- WebGo IS
1178
- WebImageCollector
1179
- WebImages
1180
- WebIndex
1181
- webkit2png
1182
- WebLeacher
1183
- webmastercoffee
1184
- webmon\s
1185
- WebPix
1186
- WebReaper
1187
- WebSauger
1188
- webscreenie
1189
- Webshag
1190
- Webshot
1191
- Website Quester
1192
- websitepulse agent
1193
- WebsiteQuester
1194
- Websnapr
1195
- WebSniffer
1196
- Webster
1197
- WebStripper
1198
- WebSucker
1199
- Webthumb\/
1200
- WebThumbnail
1201
- WebWhacker
1202
- WebZIP
1203
- WeLikeLinks
1204
- WEPA
1205
- WeSEE
1206
- wf84
1207
- Wfuzz\/
1208
- wget
1209
- WhatsApp
1210
- WhatsMyIP
1211
- WhatWeb
1212
- WhereGoes\?
1213
- Whibse
1214
- WhoRunsCoinHive
1215
- Whynder Magnet
1216
- WinHttp-Autoproxy-Service
1217
- Windows-RSS-Platform
1218
- WinPodder
1219
- wkhtmlto
1220
- wmtips
1221
- Woko
1222
- Wolfram HTTPClient
1223
- woorankreview
1224
- Word\/
1225
- WordPress\/
1226
- worldping-api
1227
- WordupinfoSearch
1228
- wotbox
1229
- WP Engine Install Performance API
1230
- wpif
1231
- wprecon\.com survey
1232
- WPScan
1233
- wscheck
1234
- Wtrace
1235
- WWW-Collector-E
1236
- WWW-Mechanize
1237
- WWW::Document
1238
- WWW::Mechanize
1239
- www\.monitor\.us
1240
- WWWOFFLE
1241
- x09Mozilla
1242
- x22Mozilla
1243
- XaxisSemanticsClassifier
1244
- Xenu Link Sleuth
1245
- XING-contenttabreceiver
1246
- xpymep([0-9]?)\.exe
1247
- Y!J-(ASR|BSC)
1248
- Y\!J-BRW
1249
- Yaanb
1250
- yacy
1251
- Yahoo Link Preview
1252
- YahooCacheSystem
1253
- YahooYSMcm
1254
- YandeG
1255
- Yandex(?!Search)
1256
- yanga
1257
- yeti
1258
- Yo-yo
1259
- Yoleo Consumer
1260
- yoogliFetchAgent
1261
- YottaaMonitor
1262
- Your-Website-Sucks
1263
- yourls\.org
1264
- YoYs\.net
1265
- YP\.PL
1266
- Zabbix
1267
- Zade
1268
- Zao
1269
- Zauba
1270
- Zemanta Aggregator
1271
- Zend_Http_Client
1272
- Zend\\\Http\\\Client
1273
- Zermelo
1274
- Zeus
1275
- zgrab
1276
- ZnajdzFoto
1277
- ZnHTTP
1278
- Zombie\.js
1279
- Zoom\.Mac
1280
- ZyBorg
1281
- [a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer|scraper)
1282
- ].strip.split(/\n+/).freeze
7
+ extend Loader
8
+
9
+ def self.data
10
+ @data ||= load_raw(CrawlerDetect.config.settings.raw_crawlers_path).freeze
11
+ end
1283
12
  end
1284
13
  end
1285
14
  end