crawler_detect 0.1.11 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "crawler_detect/detector"
4
- require "crawler_detect/library"
5
- require "crawler_detect/library/crawlers"
6
- require "crawler_detect/library/exclusions"
7
- require "crawler_detect/library/headers"
8
- require "crawler_detect/version"
3
+ require "json"
4
+ require "qonfig"
9
5
 
10
- require "rack/crawler_detect"
6
+ require_relative "crawler_detect/config"
7
+ require_relative "crawler_detect/detector"
8
+ require_relative "crawler_detect/library/loader"
9
+ require_relative "crawler_detect/library/crawlers"
10
+ require_relative "crawler_detect/library/exclusions"
11
+ require_relative "crawler_detect/library/headers"
12
+ require_relative "crawler_detect/library"
13
+ require_relative "crawler_detect/version"
14
+ require_relative "rack/crawler_detect"
11
15
 
16
+ # @since 0.1.0
12
17
  module CrawlerDetect
13
18
  class << self
19
+ # @param user_agent [String] User-agent string to detect
20
+ # @return [CrawlerDetect::Detector] Instance of detector class
14
21
  def new(user_agent)
15
22
  detector(user_agent)
16
23
  end
17
24
 
25
+ # @param user_agent [String] User-agent string to detect
26
+ # @return [true, false] Is User-agent a crawler?
18
27
  def is_crawler?(user_agent)
19
28
  detector(user_agent).is_crawler?
20
29
  end
21
30
 
31
+ # @since 1.0.0
32
+ # @param config [Proc]
33
+ def setup!(&config)
34
+ @config = CrawlerDetect::Config.new(&config)
35
+ Library::DATA_CLASSES.each(&:reload_data)
36
+ end
37
+
38
+ # @since 1.0.0
39
+ # @return [CrawlerDetect::Config] Instance of configuration class
40
+ def config
41
+ @config ||= CrawlerDetect::Config.new
42
+ end
43
+
22
44
  private
23
45
 
24
46
  def detector(user_agent)
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CrawlerDetect
4
+ # Configuration of CrawlerDetect
5
+ #
6
+ # @see settings
7
+ # @since 1.0.0
8
+ class Config < ::Qonfig::DataSet
9
+ CUR_PATH = File.dirname(File.expand_path(__FILE__)).freeze
10
+ RAW_PATH = File.join(CUR_PATH, "library/raw").freeze
11
+
12
+ RAW_CRAWLERS_PATH = File.join(RAW_PATH, "Crawlers.json").freeze
13
+ RAW_EXCLUSIONS_PATH = File.join(RAW_PATH, "Exclusions.json").freeze
14
+ RAW_HEADERS_PATH = File.join(RAW_PATH, "Headers.json").freeze
15
+
16
+ # @return [String] path to crawlers raw JSON file
17
+ setting :raw_crawlers_path, RAW_CRAWLERS_PATH
18
+
19
+ # @return [String] path to exclusions raw JSON file
20
+ setting :raw_exclusions_path, RAW_EXCLUSIONS_PATH
21
+
22
+ # @return [String] path to headers raw JSON file
23
+ setting :raw_headers_path, RAW_HEADERS_PATH
24
+
25
+ validate :raw_crawlers_path, :string, strict: true
26
+ validate :raw_exclusions_path, :string, strict: true
27
+ validate :raw_headers_path, :string, strict: true
28
+ end
29
+ end
@@ -1,17 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  class Detector
6
+ # @param user_agent [String] User-agent string to detect
7
+ # @return [CrawlerDetect::Detector] instance of detector class
5
8
  def initialize(user_agent)
6
9
  @user_agent = user_agent.to_s.dup
7
10
  end
8
11
 
12
+ # @return [true, false] Is User-agent a crawler?
9
13
  def is_crawler?
10
14
  @is_crawler ||= begin
11
15
  !completely_exclusion? && matches_crawler_list?
12
16
  end
13
17
  end
14
18
 
19
+ # @return [String, nil] The detected crawler name from RAW data, or nil when the User-agent is not a crawler
15
20
  def crawler_name
16
21
  return unless is_crawler?
17
22
  @crawler_name
@@ -19,22 +24,30 @@ module CrawlerDetect
19
24
 
20
25
  private
21
26
 
22
- def completely_exclusion?
23
- @user_agent.gsub!(exclusions_matcher, "")
24
- @user_agent.strip.length == 0
25
- end
27
+ # @private
28
+ # @return [true, false] Is User-agent in white-list?
29
+ def completely_exclusion?
30
+ @user_agent.gsub!(exclusions_matcher, "")
31
+ @user_agent.strip.length.zero?
32
+ end
26
33
 
27
- def matches_crawler_list?
28
- @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
29
- !@crawler_name.empty?
30
- end
34
+ # @private
35
+ # @return [true, false] Is User-agent in black-list?
36
+ def matches_crawler_list?
37
+ @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
38
+ !@crawler_name.empty?
39
+ end
31
40
 
32
- def exclusions_matcher
33
- CrawlerDetect::Library.get_regexp("exclusions")
34
- end
41
+ # @private
42
+ # @return [Regexp] White-list of User-agents
43
+ def exclusions_matcher
44
+ CrawlerDetect::Library.get_regexp("exclusions")
45
+ end
35
46
 
36
- def crawlers_matcher
37
- CrawlerDetect::Library.get_regexp("crawlers")
38
- end
47
+ # @private
48
+ # @return [Regexp] Black-list of User-agents
49
+ def crawlers_matcher
50
+ CrawlerDetect::Library.get_regexp("crawlers")
51
+ end
39
52
  end
40
53
  end
@@ -1,16 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  module Library
6
+ DATA_CLASSES = [Library::Headers, Library::Exclusions, Library::Crawlers].freeze
7
+
5
8
  class << self
9
+ # @param param [String] Name of raw data
10
+ # @return [Regexp]
6
11
  def get_regexp(param)
7
12
  data = get_array(param)
8
- %r[#{data.join('|')}]i
13
+ %r{#{data.join('|')}}i
9
14
  end
10
15
 
16
+ # @param param [String] Name of raw data
17
+ # @return [Array]
11
18
  def get_array(param)
12
- const_name = "CrawlerDetect::Library::#{param.capitalize}::#{param.upcase}"
13
- const_get(const_name)
19
+ const_get("CrawlerDetect::Library::#{param.capitalize}").send(:data)
14
20
  end
15
21
  end
16
22
  end
@@ -1,1261 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Layout/TrailingWhitespace
4
3
  module CrawlerDetect
5
4
  module Library
5
+ # @since 0.1.0
6
6
  module Crawlers
7
- CRAWLERS = %q[
8
- .*Java.*outbrain
9
- YLT
10
- ^b0t$
11
- ^bluefish
12
- ^Calypso v\/
13
- ^COMODO DCV
14
- ^DangDang
15
- ^DavClnt
16
- ^FDM
17
- ^git\/
18
- ^Goose\/
19
- ^Grabber
20
- ^HTTPClient\/
21
- ^Java\/
22
- ^Jeode\/
23
- ^Jetty\/
24
- ^Mail\/
25
- ^Mget
26
- ^Microsoft URL Control
27
- ^NG\/[0-9\.]
28
- ^NING\/
29
- ^PHP\/[0-9]
30
- ^RMA\/
31
- ^Ruby|Ruby\/[0-9]
32
- ^VSE\/[0-9]
33
- ^WordPress\.com
34
- ^XRL\/[0-9]
35
- ^ZmEu
36
- 008\/
37
- 13TABS
38
- 192\.comAgent
39
- 2ip\.ru
40
- 404enemy
41
- 7Siters
42
- 80legs
43
- a\.pr-cy\.ru
44
- a3logics\.in
45
- A6-Indexer
46
- Abonti
47
- Aboundex
48
- aboutthedomain
49
- Accoona-AI-Agent
50
- acoon
51
- acrylicapps\.com\/pulp
52
- Acunetix
53
- AdAuth\/
54
- adbeat
55
- AddThis
56
- ADmantX
57
- AdminLabs
58
- adressendeutschland
59
- adscanner
60
- Adstxtaggregator
61
- adstxt-worker
62
- adstxt\.com
63
- agentslug
64
- AHC
65
- aihit
66
- aiohttp\/
67
- Airmail
68
- akka-http\/
69
- akula\/
70
- alertra
71
- alexa site audit
72
- Alibaba\.Security\.Heimdall
73
- Alligator
74
- allloadin
75
- AllSubmitter
76
- alyze\.info
77
- amagit
78
- ^Amazon Simple Notification Service Agent$
79
- Anarchie
80
- AndroidDownloadManager
81
- Anemone
82
- AngleSharp
83
- annotate_google
84
- Ant\.com
85
- Anturis Agent
86
- AnyEvent-HTTP\/
87
- Apache Droid
88
- Apache OpenOffice
89
- Apache-HttpAsyncClient
90
- Apache-HttpClient
91
- ApacheBench
92
- Apexoo
93
- APIs-Google
94
- AportWorm\/
95
- AppBeat\/
96
- AppEngine-Google
97
- AppleSyndication
98
- AppStoreScraperZ
99
- Aprc\/[0-9]
100
- Arachmo
101
- arachnode
102
- Arachnophilia
103
- aria2
104
- Arukereso
105
- asafaweb
106
- AskQuickly
107
- Ask Jeeves
108
- ASPSeek
109
- Asterias
110
- Astute
111
- asynchttp
112
- Attach
113
- autocite
114
- AutomaticWPTester
115
- Autonomy
116
- axios\/
117
- AWS Security Scanner
118
- B-l-i-t-z-B-O-T
119
- Backlink-Ceck
120
- backlink-check
121
- BacklinkHttpStatus
122
- BackStreet
123
- BackWeb
124
- Bad-Neighborhood
125
- Badass
126
- baidu\.com
127
- Bandit
128
- basicstate
129
- BatchFTP
130
- Battlezta Bazinga
131
- baypup\/
132
- BazQux
133
- BBBike
134
- BCKLINKS
135
- BDFetch
136
- BegunAdvertising
137
- Bidtellect
138
- BigBozz
139
- Bigfoot
140
- biglotron
141
- BingLocalSearch
142
- BingPreview
143
- binlar
144
- biNu image cacher
145
- Bitacle
146
- biz_Directory
147
- Black Hole
148
- Blackboard Safeassign
149
- BlackWidow
150
- BlockNote\.Net
151
- Bloglines
152
- Bloglovin
153
- BlogPulseLive
154
- BlogSearch
155
- Blogtrottr
156
- BlowFish
157
- boitho\.com-dc
158
- BPImageWalker
159
- Braintree-Webhooks
160
- Branch Metrics API
161
- Branch-Passthrough
162
- Brandprotect
163
- BrandVerity
164
- Brandwatch
165
- Brodie\/
166
- Browsershots
167
- BUbiNG
168
- Buck\/
169
- Buddy
170
- BuiltWith
171
- Bullseye
172
- BunnySlippers
173
- Burf Search
174
- Butterfly\/
175
- BuzzSumo
176
- CAAM\/[0-9]
177
- CakePHP
178
- Calculon
179
- Canary%20Mail
180
- CaretNail
181
- catexplorador
182
- CC Metadata Scaper
183
- Cegbfeieh
184
- censys
185
- Cerberian Drtrs
186
- CERT\.at-Statistics-Survey
187
- cg-eye
188
- changedetection
189
- ChangesMeter
190
- Charlotte
191
- CheckHost
192
- checkprivacy
193
- CherryPicker
194
- ChinaClaw
195
- Chirp\/
196
- chkme\.com
197
- Chlooe
198
- Chromaxa
199
- CirrusExplorer
200
- CISPA Vulnerability Notification
201
- Citoid
202
- CJNetworkQuality
203
- Clarsentia
204
- clips\.ua\.ac\.be
205
- Cloud mapping
206
- CloudEndure
207
- CloudFlare-AlwaysOnline
208
- Cloudinary
209
- cmcm\.com
210
- coccoc
211
- cognitiveseo
212
- colly -
213
- CommaFeed
214
- Commons-HttpClient
215
- commonscan
216
- contactbigdatafr
217
- contentkingapp
218
- convera
219
- CookieReports
220
- copyright sheriff
221
- CopyRightCheck
222
- Copyscape
223
- cortex\/
224
- Cosmos4j\.feedback
225
- Covario-IDS
226
- Craw\/
227
- Crescent
228
- Crowsnest
229
- Criteo
230
- CSHttp
231
- CSSCheck
232
- curb
233
- Curious George
234
- curl
235
- cuwhois\/
236
- cybo\.com
237
- DAP\/NetHTTP
238
- DareBoost
239
- DatabaseDriverMysqli
240
- DataCha0s
241
- Datafeedwatch
242
- Datanyze
243
- DataparkSearch
244
- dataprovider
245
- DataXu
246
- Daum(oa)?[ \/][0-9]
247
- dBpoweramp
248
- ddline
249
- deeris
250
- Demon
251
- DeuSu
252
- developers\.google\.com\/\+\/web\/snippet\/
253
- Devil
254
- Digg
255
- Digincore
256
- DigitalPebble
257
- Dirbuster
258
- Discourse Forum Onebox
259
- Disqus\/
260
- Dispatch\/
261
- DittoSpyder
262
- dlvr
263
- DMBrowser
264
- DNSPod-reporting
265
- docoloc
266
- Dolphin http client
267
- DomainAppender
268
- Donuts Content Explorer
269
- dotMailer content retrieval
270
- dotSemantic
271
- downforeveryoneorjustme
272
- Download Wonder
273
- downnotifier
274
- DowntimeDetector
275
- Drip
276
- drupact
277
- Drupal \(\+http:\/\/drupal\.org\/\)
278
- DTS Agent
279
- dubaiindex
280
- DuplexWeb-Google
281
- EARTHCOM
282
- Easy-Thumb
283
- EasyDL
284
- Ebingbong
285
- ec2linkfinder
286
- eCairn-Grabber
287
- eCatch
288
- ECCP
289
- eContext\/
290
- Ecxi
291
- EirGrabber
292
- ElectricMonk
293
- elefent
294
- EMail Exractor
295
- EMail Wolf
296
- EmailWolf
297
- Embarcadero
298
- Embed PHP Library
299
- Embedly
300
- endo\/
301
- europarchive\.org
302
- evc-batch
303
- EventMachine HttpClient
304
- Everwall Link Expander
305
- Evidon
306
- Evrinid
307
- ExactSearch
308
- ExaleadCloudview
309
- Excel\/
310
- exif
311
- Exploratodo
312
- Express WebPictures
313
- Extreme Picture Finder
314
- EyeNetIE
315
- ezooms
316
- facebookexternalhit
317
- facebookexternalua
318
- facebookplatform
319
- fairshare
320
- Faraday v
321
- fasthttp
322
- Faveeo
323
- Favicon downloader
324
- faviconkit
325
- faviconarchive
326
- FavOrg
327
- Feed Wrangler
328
- Feedable\/
329
- Feedbin
330
- FeedBooster
331
- FeedBucket
332
- FeedBunch\/
333
- FeedBurner
334
- feeder
335
- Feedly
336
- FeedshowOnline
337
- Feedspot
338
- Feedwind\/
339
- FeedZcollector
340
- feeltiptop
341
- Fetch API
342
- Fetch\/[0-9]
343
- Fever\/[0-9]
344
- FHscan
345
- Fimap
346
- findlink
347
- findthatfile
348
- FlashGet
349
- FlipboardBrowserProxy
350
- FlipboardProxy
351
- FlipboardRSS
352
- Flock\/
353
- fluffy
354
- Flunky
355
- flynxapp
356
- forensiq
357
- FoundSeoTool
358
- http:\/\/www.neomo.de\/
359
- free thumbnails
360
- Freeuploader
361
- Funnelback
362
- G-i-g-a-b-o-t
363
- g00g1e\.net
364
- ganarvisitas
365
- geek-tools
366
- Genieo
367
- GentleSource
368
- GetCode
369
- Getintent
370
- GetLinkInfo
371
- getprismatic
372
- GetRight
373
- getroot
374
- GetURLInfo\/
375
- GetWeb
376
- Geziyor
377
- Ghost Inspector
378
- GigablastOpenSource
379
- GIS-LABS
380
- github-camo
381
- github\.com
382
- Go [\d\.]* package http
383
- Go http package
384
- Go-Ahead-Got-It
385
- Go-http-client
386
- Go!Zilla
387
- gobyus
388
- gofetch
389
- GomezAgent
390
- gooblog
391
- Goodzer\/
392
- Google AppsViewer
393
- Google Desktop
394
- Google favicon
395
- Google Keyword Suggestion
396
- Google Keyword Tool
397
- Google Page Speed Insights
398
- Google PP Default
399
- Google Search Console
400
- Google Web Preview
401
- Google-Adwords
402
- Google-Apps-Script
403
- Google-Calendar-Importer
404
- Google-HotelAdsVerifier
405
- Google-HTTP-Java-Client
406
- Google-Publisher-Plugin
407
- Google-Read-Aloud
408
- Google-SearchByImage
409
- Google-Site-Verification
410
- Google-Structured-Data-Testing-Tool
411
- Google-Youtube-Links
412
- google-xrawler
413
- GoogleDocs
414
- GoogleHC\/
415
- GoogleProducer
416
- GoogleSites
417
- Google-Transparency-Report
418
- Gookey
419
- GoScraper
420
- GoSpotCheck
421
- gosquared-thumbnailer
422
- Gotit
423
- GoZilla
424
- grabify
425
- GrabNet
426
- Grafula
427
- Grammarly
428
- GrapeFX
429
- GreatNews
430
- Gregarius
431
- GRequests
432
- grokkit
433
- grouphigh
434
- grub-client
435
- gSOAP\/
436
- GT::WWW
437
- GTmetrix
438
- GuzzleHttp
439
- gvfs\/
440
- HAA(A)?RTLAND http client
441
- Haansoft
442
- hackney\/
443
- Hadi Agent
444
- HappyApps-WebCheck
445
- Hatena
446
- Havij
447
- HaxerMen
448
- HeadlessChrome
449
- HEADMasterSEO
450
- HeartRails_Capture
451
- help@dataminr\.com
452
- heritrix
453
- historious
454
- hkedcity
455
- hledejLevne\.cz
456
- Hloader
457
- HMView
458
- Holmes
459
- HonesoSearchEngine
460
- HootSuite Image proxy
461
- Hootsuite-WebFeed
462
- hosterstats
463
- HostTracker
464
- ht:\/\/check
465
- htdig
466
- HTMLparser
467
- htmlyse
468
- HTTP Banner Detection
469
- HTTP_Compression_Test
470
- http_request2
471
- http_requester
472
- http-get
473
- HTTP-Header-Abfrage
474
- http-kit
475
- http-request\/
476
- HTTP-Tiny
477
- HTTP::Lite
478
- http\.rb\/
479
- http_get
480
- HttpComponents
481
- httphr
482
- HTTPMon
483
- HTTPie
484
- httpRequest
485
- httpscheck
486
- httpssites_power
487
- httpunit
488
- HttpUrlConnection
489
- httrack
490
- huaweisymantec
491
- HubSpot
492
- Humanlinks
493
- i2kconnect\/
494
- Iblog
495
- ichiro
496
- Id-search
497
- IdeelaborPlagiaat
498
- IDG Twitter Links Resolver
499
- IDwhois\/
500
- Iframely
501
- igdeSpyder
502
- IlTrovatore
503
- Image Fetch
504
- Image Sucker
505
- ImageEngine\/
506
- ImageVisu\/
507
- Imagga
508
- imagineeasy
509
- imgsizer
510
- InAGist
511
- inbound\.li parser
512
- InDesign%20CC
513
- Indy Library
514
- InetURL
515
- infegy
516
- infohelfer
517
- InfoTekies
518
- InfoWizards Reciprocal Link
519
- inpwrd\.com
520
- instabid
521
- Instapaper
522
- Integrity
523
- integromedb
524
- Intelliseek
525
- InterGET
526
- internet_archive
527
- Internet Ninja
528
- InternetSeer
529
- internetVista monitor
530
- internetwache
531
- intraVnews
532
- IODC
533
- IOI
534
- iplabel
535
- ips-agent
536
- IPS\/[0-9]
537
- IPWorks HTTP\/S Component
538
- iqdb\/
539
- Iria
540
- Irokez
541
- isitup\.org
542
- iskanie
543
- isUp\.li
544
- iThemes Sync\/
545
- IZaBEE
546
- iZSearch
547
- JAHHO
548
- janforman
549
- Jaunt\/
550
- Jbrofuzz
551
- Jersey\/
552
- JetCar
553
- Jigsaw
554
- Jobboerse
555
- JobFeed discovery
556
- Jobg8 URL Monitor
557
- jobo
558
- Jobrapido
559
- Jobsearch1\.5
560
- JoinVision Generic
561
- JolokiaPwn
562
- Joomla
563
- Jorgee
564
- JS-Kit
565
- JustView
566
- Kaspersky Lab CFR link resolver
567
- Kelny\/
568
- Kerrigan\/
569
- KeyCDN
570
- Keyword Density
571
- Keywords Research
572
- khttp\/
573
- KickFire
574
- KimonoLabs\/
575
- Kml-Google
576
- knows\.is
577
- KOCMOHABT
578
- kouio
579
- kubectl
580
- kube-probe
581
- kulturarw3
582
- KumKie
583
- L\.webis
584
- Larbin
585
- Lavf\/
586
- LeechFTP
587
- LeechGet
588
- letsencrypt
589
- Lftp
590
- LibVLC
591
- LibWeb
592
- Libwhisker
593
- libwww
594
- Licorne
595
- Liferea\/
596
- Lightspeedsystems
597
- Lighthouse
598
- Likse
599
- Link Valet
600
- link_thumbnailer
601
- LinkAlarm\/
602
- linkCheck
603
- linkdex
604
- LinkExaminer
605
- linkfluence
606
- linkpeek
607
- LinkPreviewGenerator
608
- LinkScan
609
- LinksManager
610
- LinkTiger
611
- LinkWalker
612
- Lipperhey
613
- Litemage_walker
614
- livedoor ScreenShot
615
- LoadImpactRload
616
- localsearch-web
617
- LongURL API
618
- looid\.com
619
- looksystems\.net
620
- ltx71
621
- lua-resty-http
622
- lwp-request
623
- lwp-trivial
624
- LWP::Simple
625
- lycos
626
- LYT\.SR
627
- mabontland
628
- Mag-Net
629
- MagpieRSS
630
- Mail\.Ru
631
- MailChimp
632
- Majestic12
633
- makecontact\/
634
- Mandrill
635
- MapperCmd
636
- marketinggrader
637
- MarkMonitor
638
- MarkWatch
639
- Mass Downloader
640
- masscan\/
641
- Mata Hari
642
- Mediametric
643
- Mediapartners-Google
644
- mediawords
645
- MegaIndex\.ru
646
- MeltwaterNews
647
- Melvil Rawi
648
- MemGator
649
- Metaspinner
650
- MetaURI
651
- MFC_Tear_Sample
652
- Microsearch
653
- Microsoft Office
654
- Microsoft Outlook
655
- Microsoft Windows Network Diagnostics
656
- Microsoft-WebDAV-MiniRedir
657
- Microsoft Data Access
658
- MIDown tool
659
- MIIxpc
660
- Mindjet
661
- Miniature\.io
662
- Miniflux
663
- Mister PiX
664
- mixdata dot com
665
- mixed-content-scan
666
- Mixmax-LinkPreview
667
- mixnode
668
- Mnogosearch
669
- mogimogi
670
- Mojeek
671
- Mojolicious \(Perl\)
672
- Monit\/
673
- monitis
674
- Monitority\/
675
- montastic
676
- MonTools
677
- Moreover
678
- Morfeus Fucking Scanner
679
- Morning Paper
680
- MovableType
681
- mowser
682
- Mrcgiguy
683
- MS Web Services Client Protocol
684
- MSFrontPage
685
- mShots
686
- MuckRack\/
687
- muhstik-scan
688
- MVAClient
689
- MxToolbox\/
690
- nagios
691
- Najdi\.si
692
- Name Intelligence
693
- Nameprotect
694
- Navroad
695
- NearSite
696
- Needle
697
- Nessus
698
- Net Vampire
699
- NetAnts
700
- NETCRAFT
701
- NetLyzer
702
- NetMechanic
703
- NetNewsWire
704
- Netpursual
705
- netresearch
706
- NetShelter ContentScan
707
- Netsparker
708
- NetTrack
709
- Netvibes
710
- NetZIP
711
- Neustar WPM
712
- NeutrinoAPI
713
- NewRelicPinger
714
- NewsBlur .*Finder
715
- NewsGator
716
- newsme
717
- newspaper\/
718
- NetSystemsResearch
719
- Nexgate Ruby Client
720
- NG-Search
721
- Nibbler
722
- NICErsPRO
723
- Nikto
724
- nineconnections
725
- NLNZ_IAHarvester
726
- Nmap Scripting Engine
727
- node-superagent
728
- node-urllib
729
- node\.io
730
- Nodemeter
731
- NodePing
732
- nominet\.org\.uk
733
- nominet\.uk
734
- Norton-Safeweb
735
- Notifixious
736
- notifyninja
737
- NotionEmbedder
738
- nuhk
739
- nutch
740
- Nuzzel
741
- nWormFeedFinder
742
- nyawc\/
743
- Nymesis
744
- NYU
745
- Ocelli\/
746
- Octopus
747
- oegp
748
- Offline Explorer
749
- Offline Navigator
750
- OgScrper
751
- og-scraper
752
- okhttp
753
- omgili
754
- OMSC
755
- Online Domain Tools
756
- OpenCalaisSemanticProxy
757
- Openfind
758
- OpenLinkProfiler
759
- Openstat\/
760
- OpenVAS
761
- Optimizer
762
- Orbiter
763
- OrgProbe\/
764
- orion-semantics
765
- Outlook-Express
766
- Outlook-iOS
767
- ow\.ly
768
- Owler
769
- ownCloud News
770
- OxfordCloudService
771
- Page Valet
772
- page_verifier
773
- page scorer
774
- page2rss
775
- PageGrabber
776
- PagePeeker
777
- PageScorer
778
- Pagespeed\/
779
- Panopta
780
- panscient
781
- Papa Foto
782
- parsijoo
783
- Pavuk
784
- PayPal IPN
785
- pcBrowser
786
- Pcore-HTTP
787
- Pearltrees
788
- PECL::HTTP
789
- peerindex
790
- Peew
791
- PeoplePal
792
- Perlu -
793
- PhantomJS Screenshoter
794
- PhantomJS\/
795
- Photon\/
796
- phpservermon
797
- Pi-Monster
798
- Picscout
799
- Picsearch
800
- PictureFinder
801
- Pimonster
802
- ping\.blo\.gs
803
- Pingability
804
- PingAdmin\.Ru
805
- Pingdom
806
- Pingoscope
807
- PingSpot
808
- pinterest\.com
809
- Pixray
810
- Pizilla
811
- Plagger\/
812
- Ploetz \+ Zeller
813
- Plukkie
814
- plumanalytics
815
- PocketImageCache
816
- PocketParser
817
- Pockey
818
- POE-Component-Client-HTTP
819
- Polymail\/
820
- Pompos
821
- Porkbun
822
- Port Monitor
823
- postano
824
- PostmanRuntime
825
- PostPost
826
- postrank
827
- PowerPoint\/
828
- Priceonomics Analysis Engine
829
- PrintFriendly
830
- PritTorrent
831
- Prlog
832
- probethenet
833
- Project 25499
834
- prospectb2b
835
- Protopage
836
- ProWebWalker
837
- proximic
838
- PRTG Network Monitor
839
- pshtt, https scanning
840
- PTST
841
- PTST\/[0-9]+
842
- Pulsepoint XT3 web scraper
843
- Pump
844
- Python-httplib2
845
- python-requests
846
- Python-urllib
847
- Qirina Hurdler
848
- QQDownload
849
- QrafterPro
850
- Qseero
851
- Qualidator
852
- QueryN Metasearch
853
- queuedriver
854
- Quora Link Preview
855
- Qwantify
856
- Radian6
857
- RankActive
858
- RankFlex
859
- RankSonicSiteAuditor
860
- Re-re Studio
861
- ReactorNetty
862
- Readability
863
- RealDownload
864
- RealPlayer%20Downloader
865
- RebelMouse
866
- Recorder
867
- RecurPost\/
868
- redback\/
869
- ReederForMac
870
- Reeder\/
871
- ReGet
872
- RepoMonkey
873
- request\.js
874
- reqwest\/
875
- ResponseCodeTest
876
- RestSharp
877
- Riddler
878
- Rival IQ
879
- Robosourcer
880
- Robozilla
881
- ROI Hunter
882
- RPT-HTTPClient
883
- RSSOwl
884
- safe-agent-scanner
885
- SalesIntelligent
886
- Saleslift
887
- Sendsay\.Ru
888
- SauceNAO
889
- SBIder
890
- scalaj-http
891
- scan\.lol
892
- ScanAlert
893
- Scoop
894
- scooter
895
- ScoutJet
896
- ScoutURLMonitor
897
- ScrapeBox Page Scanner
898
- SimpleScraper
899
- Scrapy
900
- Screaming
901
- ScreenShotService
902
- Scrubby
903
- Scrutiny\/
904
- search\.thunderstone
905
- Search37
906
- searchenginepromotionhelp
907
- Searchestate
908
- SearchExpress
909
- SearchSight
910
- Seeker
911
- semanticdiscovery
912
- semanticjuice
913
- Semiocast HTTP client
914
- Semrush
915
- sentry\/
916
- SEO Browser
917
- Seo Servis
918
- seo-nastroj\.cz
919
- seo4ajax
920
- Seobility
921
- SEOCentro
922
- SeoCheck
923
- SEOkicks
924
- Seomoz
925
- SEOprofiler
926
- SEOsearch
927
- seoscanners
928
- seositecheckup
929
- SEOstats
930
- servernfo
931
- sexsearcher
932
- Seznam
933
- Shelob
934
- Shodan
935
- Shoppimon
936
- ShopWiki
937
- ShortLinkTranslate
938
- shrinktheweb
939
- Sideqik
940
- SimplePie
941
- SimplyFast
942
- Siphon
943
- SISTRIX
944
- Site-Shot\/
945
- Site Sucker
946
- Site24x7
947
- SiteBar
948
- Sitebeam
949
- Sitebulb\/
950
- SiteCondor
951
- SiteExplorer
952
- SiteGuardian
953
- Siteimprove
954
- SiteIndexed
955
- Sitemap(s)? Generator
956
- SitemapGenerator
957
- SiteMonitor
958
- Siteshooter B0t
959
- SiteSnagger
960
- SiteSucker
961
- SiteTruth
962
- Sitevigil
963
- sitexy\.com
964
- SkypeUriPreview
965
- Slack\/
966
- slider\.com
967
- slurp
968
- SlySearch
969
- SmartDownload
970
- SMRF URL Expander
971
- SMUrlExpander
972
- Snake
973
- Snappy
974
- SnapSearch
975
- Snarfer\/
976
- SniffRSS
977
- sniptracker
978
- Snoopy
979
- SnowHaze Search
980
- sogou web
981
- SortSite
982
- Sottopop
983
- sovereign\.ai
984
- SpaceBison
985
- SpamExperts
986
- Spammen
987
- Spanner
988
- spaziodati
989
- SPDYCheck
990
- Specificfeeds
991
- speedy
992
- SPEng
993
- Spinn3r
994
- spray-can
995
- Sprinklr
996
- spyonweb
997
- sqlmap
998
- Sqlworm
999
- Sqworm
1000
- SSL Labs
1001
- ssl-tools
1002
- StackRambler
1003
- Statastico\/
1004
- StatusCake
1005
- Steeler
1006
- Stratagems Kumo
1007
- Stroke\.cz
1008
- StudioFACA
1009
- StumbleUpon
1010
- suchen
1011
- Sucuri
1012
- summify
1013
- SuperHTTP
1014
- Surphace Scout
1015
- Suzuran
1016
- SwiteScraper
1017
- Symfony BrowserKit
1018
- Symfony2 BrowserKit
1019
- SynHttpClient-Built
1020
- Sysomos
1021
- sysscan
1022
- Szukacz
1023
- T0PHackTeam
1024
- tAkeOut
1025
- Tarantula\/
1026
- Taringa UGC
1027
- TarmotGezgin
1028
- Teleport
1029
- Telesoft
1030
- Telesphoreo
1031
- Telesphorep
1032
- Tenon\.io
1033
- teoma
1034
- terrainformatica
1035
- Test Certificate Info
1036
- testuri
1037
- Tetrahedron
1038
- TextRazor Downloader
1039
- The Drop Reaper
1040
- The Expert HTML Source Viewer
1041
- The Knowledge AI
1042
- The Intraformant
1043
- theinternetrules
1044
- TheNomad
1045
- Thinklab
1046
- Thumbshots
1047
- ThumbSniper
1048
- Thumbor
1049
- timewe\.net
1050
- TinEye
1051
- Tiny Tiny RSS
1052
- TLSProbe\/
1053
- Toata
1054
- topster
1055
- touche\.com
1056
- Traackr\.com
1057
- tracemyfile
1058
- Trackuity
1059
- TrapitAgent
1060
- Trendiction
1061
- Trendsmap
1062
- trendspottr
1063
- truwoGPS
1064
- TryJsoup
1065
- TulipChain
1066
- Turingos
1067
- Turnitin
1068
- tweetedtimes
1069
- Tweetminster
1070
- Tweezler\/
1071
- twibble
1072
- Twice
1073
- Twikle
1074
- Twingly
1075
- Twisted PageGetter
1076
- Typhoeus
1077
- ubermetrics-technologies
1078
- uclassify
1079
- UdmSearch
1080
- unchaos
1081
- unirest-java
1082
- UniversalFeedParser
1083
- Unshorten\.It
1084
- Untiny
1085
- UnwindFetchor
1086
- updated
1087
- updown\.io daemon
1088
- Upflow
1089
- Uptimia
1090
- Urlcheckr
1091
- URL Verifier
1092
- URLitor
1093
- urlresolver
1094
- Urlstat
1095
- URLTester
1096
- UrlTrends Ranking Updater
1097
- URLy Warning
1098
- URLy\.Warning
1099
- Vacuum
1100
- Vagabondo
1101
- VB Project
1102
- vBSEO
1103
- VCI
1104
- via ggpht\.com GoogleImageProxy
1105
- VidibleScraper
1106
- Virusdie
1107
- visionutils
1108
- vkShare
1109
- VoidEYE
1110
- Voil
1111
- voltron
1112
- voyager\/
1113
- VSAgent\/
1114
- VSB-TUO\/
1115
- Vulnbusters Meter
1116
- VYU2
1117
- w3af\.org
1118
- W3C_Unicorn
1119
- W3C-checklink
1120
- W3C-mobileOK
1121
- WAC-OFU
1122
- Wallpapers\/[0-9]+
1123
- WallpapersHD
1124
- wangling
1125
- Wappalyzer
1126
- WatchMouse
1127
- WbSrch\/
1128
- WDT\.io
1129
- web-capture\.net
1130
- Web-sniffer
1131
- Web Auto
1132
- Web Collage
1133
- Web Enhancer
1134
- Web Fetch
1135
- Web Fuck
1136
- Web Pix
1137
- Web Sauger
1138
- Web spyder
1139
- Web Sucker
1140
- Webalta
1141
- Webauskunft
1142
- WebAuto
1143
- WebCapture
1144
- WebClient\/
1145
- webcollage
1146
- WebCookies
1147
- WebCopier
1148
- WebCorp
1149
- WebDataStats
1150
- WebDoc
1151
- WebEnhancer
1152
- WebFetch
1153
- WebFuck
1154
- WebGazer
1155
- WebGo IS
1156
- WebImageCollector
1157
- WebImages
1158
- WebIndex
1159
- webkit2png
1160
- WebLeacher
1161
- webmastercoffee
1162
- webmon\s
1163
- WebPix
1164
- WebReaper
1165
- WebSauger
1166
- webscreenie
1167
- Webshag
1168
- Webshot
1169
- Website Quester
1170
- websitepulse agent
1171
- WebsiteQuester
1172
- Websnapr
1173
- WebSniffer
1174
- Webster
1175
- WebStripper
1176
- WebSucker
1177
- Webthumb\/
1178
- WebThumbnail
1179
- WebWhacker
1180
- WebZIP
1181
- WeLikeLinks
1182
- WEPA
1183
- WeSEE
1184
- wf84
1185
- Wfuzz\/
1186
- wget
1187
- WhatsApp
1188
- WhatsMyIP
1189
- WhatWeb
1190
- WhereGoes\?
1191
- Whibse
1192
- WhoRunsCoinHive
1193
- Whynder Magnet
1194
- Windows-RSS-Platform
1195
- WinPodder
1196
- wkhtmlto
1197
- wmtips
1198
- Woko
1199
- woorankreview
1200
- Word\/
1201
- WordPress\/
1202
- worldping-api
1203
- WordupinfoSearch
1204
- wotbox
1205
- WP Engine Install Performance API
1206
- wpif
1207
- wprecon\.com survey
1208
- WPScan
1209
- wscheck
1210
- Wtrace
1211
- WWW-Collector-E
1212
- WWW-Mechanize
1213
- WWW::Document
1214
- WWW::Mechanize
1215
- www\.monitor\.us
1216
- WWWOFFLE
1217
- x09Mozilla
1218
- x22Mozilla
1219
- XaxisSemanticsClassifier
1220
- Xenu Link Sleuth
1221
- XING-contenttabreceiver
1222
- xpymep([0-9]?)\.exe
1223
- Y!J-(ASR|BSC)
1224
- Y\!J-BRW
1225
- Yaanb
1226
- yacy
1227
- Yahoo Link Preview
1228
- YahooCacheSystem
1229
- YahooYSMcm
1230
- YandeG
1231
- Yandex(?!Search)
1232
- yanga
1233
- yeti
1234
- Yo-yo
1235
- Yoleo Consumer
1236
- yoogliFetchAgent
1237
- YottaaMonitor
1238
- Your-Website-Sucks
1239
- yourls\.org
1240
- YoYs\.net
1241
- YP\.PL
1242
- Zabbix
1243
- Zade
1244
- Zao
1245
- Zauba
1246
- Zemanta Aggregator
1247
- Zend_Http_Client
1248
- Zend\\\Http\\\Client
1249
- Zermelo
1250
- Zeus
1251
- zgrab
1252
- ZnajdzFoto
1253
- ZnHTTP
1254
- Zombie\.js
1255
- Zoom\.Mac
1256
- ZyBorg
1257
- [a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)
1258
- ].strip.split(/\n+/).freeze
7
+ extend Loader
8
+
9
+ def self.data
10
+ @data ||= load_raw(CrawlerDetect.config.settings.raw_crawlers_path).freeze
11
+ end
1259
12
  end
1260
13
  end
1261
14
  end