crawler_detect 0.1.9 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,24 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "crawler_detect/detector"
4
- require "crawler_detect/library"
5
- require "crawler_detect/library/crawlers"
6
- require "crawler_detect/library/exclusions"
7
- require "crawler_detect/library/headers"
8
- require "crawler_detect/version"
3
+ require "oj"
4
+ require "qonfig"
9
5
 
10
- require "rack/crawler_detect"
6
+ require_relative "crawler_detect/config"
7
+ require_relative "crawler_detect/detector"
8
+ require_relative "crawler_detect/library/loader"
9
+ require_relative "crawler_detect/library/crawlers"
10
+ require_relative "crawler_detect/library/exclusions"
11
+ require_relative "crawler_detect/library/headers"
12
+ require_relative "crawler_detect/library"
13
+ require_relative "crawler_detect/version"
14
+ require_relative "rack/crawler_detect"
11
15
 
16
+ # @since 0.1.0
12
17
  module CrawlerDetect
13
18
  class << self
19
+ # @param user_agent [String] User-agent string to detect
20
+ # @return [CrawlerDetect::Detector] Instance of detector class
14
21
  def new(user_agent)
15
22
  detector(user_agent)
16
23
  end
17
24
 
25
+ # @param user_agent [String] User-agent string to detect
26
+ # @return [true, false] Is User-agent a crawler?
18
27
  def is_crawler?(user_agent)
19
28
  detector(user_agent).is_crawler?
20
29
  end
21
30
 
31
+ # @since 1.0.0
32
+ # @param config [Proc]
33
+ def setup!(&config)
34
+ @config = CrawlerDetect::Config.new(&config)
35
+ Library::DATA_CLASSES.each(&:reload_data)
36
+ end
37
+
38
+ # @since 1.0.0
39
+ # @return [CrawlerDetect::Config] Instance of configuration class
40
+ def config
41
+ @config ||= CrawlerDetect::Config.new
42
+ end
43
+
22
44
  private
23
45
 
24
46
  def detector(user_agent)
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CrawlerDetect
4
+ # Configuration of CrawlerDetect
5
+ #
6
+ # @see settings
7
+ # @since 1.0.0
8
+ class Config < ::Qonfig::DataSet
9
+ CUR_PATH = File.dirname(File.expand_path(__FILE__)).freeze
10
+ RAW_PATH = File.join(CUR_PATH, "library/raw").freeze
11
+
12
+ RAW_CRAWLERS_PATH = File.join(RAW_PATH, "Crawlers.json").freeze
13
+ RAW_EXCLUSIONS_PATH = File.join(RAW_PATH, "Exclusions.json").freeze
14
+ RAW_HEADERS_PATH = File.join(RAW_PATH, "Headers.json").freeze
15
+
16
+ # @return [String] path to crawlers raw JSON file
17
+ setting :raw_crawlers_path, RAW_CRAWLERS_PATH
18
+
19
+ # @return [String] path to exclusions raw JSON file
20
+ setting :raw_exclusions_path, RAW_EXCLUSIONS_PATH
21
+
22
+ # @return [String] path to headers raw JSON file
23
+ setting :raw_headers_path, RAW_HEADERS_PATH
24
+
25
+ validate :raw_crawlers_path, :string, strict: true
26
+ validate :raw_exclusions_path, :string, strict: true
27
+ validate :raw_headers_path, :string, strict: true
28
+ end
29
+ end
@@ -1,17 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # since 0.1.0
4
5
  class Detector
6
+ # @param user_agent [String] User-agent string to detect
7
+ # @return [CrawlerDetect::Detector] instance of detector class
5
8
  def initialize(user_agent)
6
9
  @user_agent = user_agent.to_s.dup
7
10
  end
8
11
 
12
+ # @return [true, false] Is User-agent a crawler?
9
13
  def is_crawler?
10
14
  @is_crawler ||= begin
11
15
  !completely_exclusion? && matches_crawler_list?
12
16
  end
13
17
  end
14
18
 
19
+ # @return [String] The detected crawler name from RAW data
15
20
  def crawler_name
16
21
  return unless is_crawler?
17
22
  @crawler_name
@@ -19,22 +24,30 @@ module CrawlerDetect
19
24
 
20
25
  private
21
26
 
22
- def completely_exclusion?
23
- @user_agent.gsub!(exclusions_matcher, "")
24
- @user_agent.strip.length == 0
25
- end
27
+ # @private
28
+ # @return [true, false] Is User-agent in white-list?
29
+ def completely_exclusion?
30
+ @user_agent.gsub!(exclusions_matcher, "")
31
+ @user_agent.strip.length.zero?
32
+ end
26
33
 
27
- def matches_crawler_list?
28
- @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
29
- !@crawler_name.empty?
30
- end
34
+ # @private
35
+ # @return [true, false] Is User-agent in black-list?
36
+ def matches_crawler_list?
37
+ @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
38
+ !@crawler_name.empty?
39
+ end
31
40
 
32
- def exclusions_matcher
33
- CrawlerDetect::Library.get_regexp("exclusions")
34
- end
41
+ # @private
42
+ # @return [Regexp] White-list of User-agents
43
+ def exclusions_matcher
44
+ CrawlerDetect::Library.get_regexp("exclusions")
45
+ end
35
46
 
36
- def crawlers_matcher
37
- CrawlerDetect::Library.get_regexp("crawlers")
38
- end
47
+ # @private
48
+ # @return [Regexp] Black-list of User-agents
49
+ def crawlers_matcher
50
+ CrawlerDetect::Library.get_regexp("crawlers")
51
+ end
39
52
  end
40
53
  end
@@ -1,16 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  module Library
6
+ DATA_CLASSES = [Library::Headers, Library::Exclusions, Library::Crawlers].freeze
7
+
5
8
  class << self
9
+ # @param param [String] Name of raw data
10
+ # @return [Regexp]
6
11
  def get_regexp(param)
7
12
  data = get_array(param)
8
- %r[#{data.join('|')}]i
13
+ %r{#{data.join('|')}}i
9
14
  end
10
15
 
16
+ # @param param [String] Name of raw data
17
+ # @return [Array]
11
18
  def get_array(param)
12
- const_name = "CrawlerDetect::Library::#{param.capitalize}::#{param.upcase}"
13
- const_get(const_name)
19
+ const_get("CrawlerDetect::Library::#{param.capitalize}").send(:data)
14
20
  end
15
21
  end
16
22
  end
@@ -1,1257 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Layout/TrailingWhitespace
4
3
  module CrawlerDetect
5
4
  module Library
5
+ # @since 0.1.0
6
6
  module Crawlers
7
- CRAWLERS = %q[
8
- .*Java.*outbrain
9
- YLT
10
- ^b0t$
11
- ^bluefish
12
- ^Calypso v\/
13
- ^COMODO DCV
14
- ^DangDang
15
- ^DavClnt
16
- ^FDM
17
- ^git\/
18
- ^Goose\/
19
- ^Grabber
20
- ^HTTPClient\/
21
- ^Java\/
22
- ^Jeode\/
23
- ^Jetty\/
24
- ^Mail\/
25
- ^Mget
26
- ^Microsoft URL Control
27
- ^NG\/[0-9\.]
28
- ^NING\/
29
- ^PHP\/[0-9]
30
- ^RMA\/
31
- ^Ruby|Ruby\/[0-9]
32
- ^VSE\/[0-9]
33
- ^WordPress\.com
34
- ^XRL\/[0-9]
35
- ^ZmEu
36
- 008\/
37
- 13TABS
38
- 192\.comAgent
39
- 2ip\.ru
40
- 404enemy
41
- 7Siters
42
- 80legs
43
- a\.pr-cy\.ru
44
- a3logics\.in
45
- A6-Indexer
46
- Abonti
47
- Aboundex
48
- aboutthedomain
49
- Accoona-AI-Agent
50
- acoon
51
- acrylicapps\.com\/pulp
52
- Acunetix
53
- AdAuth\/
54
- adbeat
55
- AddThis
56
- ADmantX
57
- AdminLabs
58
- adressendeutschland
59
- adscanner
60
- Adstxtaggregator
61
- adstxt-worker
62
- adstxt\.com
63
- agentslug
64
- AHC
65
- aihit
66
- aiohttp\/
67
- Airmail
68
- akka-http\/
69
- akula\/
70
- alertra
71
- alexa site audit
72
- Alibaba\.Security\.Heimdall
73
- Alligator
74
- allloadin
75
- AllSubmitter
76
- alyze\.info
77
- amagit
78
- ^Amazon Simple Notification Service Agent$
79
- Anarchie
80
- AndroidDownloadManager
81
- Anemone
82
- AngleSharp
83
- annotate_google
84
- Ant\.com
85
- Anturis Agent
86
- AnyEvent-HTTP\/
87
- Apache Droid
88
- Apache OpenOffice
89
- Apache-HttpAsyncClient
90
- Apache-HttpClient
91
- ApacheBench
92
- Apexoo
93
- APIs-Google
94
- AportWorm\/
95
- AppBeat\/
96
- AppEngine-Google
97
- AppleSyndication
98
- AppStoreScraperZ
99
- Aprc\/[0-9]
100
- Arachmo
101
- arachnode
102
- Arachnophilia
103
- aria2
104
- Arukereso
105
- asafaweb
106
- AskQuickly
107
- Ask Jeeves
108
- ASPSeek
109
- Asterias
110
- Astute
111
- asynchttp
112
- Attach
113
- autocite
114
- AutomaticWPTester
115
- Autonomy
116
- axios\/
117
- B-l-i-t-z-B-O-T
118
- Backlink-Ceck
119
- backlink-check
120
- BacklinkHttpStatus
121
- BackStreet
122
- BackWeb
123
- Bad-Neighborhood
124
- Badass
125
- baidu\.com
126
- Bandit
127
- basicstate
128
- BatchFTP
129
- Battlezta Bazinga
130
- baypup\/
131
- BazQux
132
- BBBike
133
- BCKLINKS
134
- BDFetch
135
- BegunAdvertising
136
- Bidtellect
137
- BigBozz
138
- Bigfoot
139
- biglotron
140
- BingLocalSearch
141
- BingPreview
142
- binlar
143
- biNu image cacher
144
- Bitacle
145
- biz_Directory
146
- Black Hole
147
- Blackboard Safeassign
148
- BlackWidow
149
- BlockNote\.Net
150
- Bloglines
151
- Bloglovin
152
- BlogPulseLive
153
- BlogSearch
154
- Blogtrottr
155
- BlowFish
156
- boitho\.com-dc
157
- BPImageWalker
158
- Braintree-Webhooks
159
- Branch Metrics API
160
- Branch-Passthrough
161
- Brandprotect
162
- BrandVerity
163
- Brandwatch
164
- Brodie\/
165
- Browsershots
166
- BUbiNG
167
- Buck\/
168
- Buddy
169
- BuiltWith
170
- Bullseye
171
- BunnySlippers
172
- Burf Search
173
- Butterfly\/
174
- BuzzSumo
175
- CAAM\/[0-9]
176
- CakePHP
177
- Calculon
178
- Canary%20Mail
179
- CaretNail
180
- catexplorador
181
- CC Metadata Scaper
182
- Cegbfeieh
183
- censys
184
- Cerberian Drtrs
185
- CERT\.at-Statistics-Survey
186
- cg-eye
187
- changedetection
188
- ChangesMeter
189
- Charlotte
190
- CheckHost
191
- checkprivacy
192
- CherryPicker
193
- ChinaClaw
194
- Chirp\/
195
- chkme\.com
196
- Chlooe
197
- Chromaxa
198
- CirrusExplorer
199
- CISPA Vulnerability Notification
200
- Citoid
201
- CJNetworkQuality
202
- Clarsentia
203
- clips\.ua\.ac\.be
204
- Cloud mapping
205
- CloudEndure
206
- CloudFlare-AlwaysOnline
207
- Cloudinary
208
- cmcm\.com
209
- coccoc
210
- cognitiveseo
211
- colly -
212
- CommaFeed
213
- Commons-HttpClient
214
- commonscan
215
- contactbigdatafr
216
- contentkingapp
217
- convera
218
- CookieReports
219
- copyright sheriff
220
- CopyRightCheck
221
- Copyscape
222
- cortex\/
223
- Cosmos4j\.feedback
224
- Covario-IDS
225
- Craw\/
226
- Crescent
227
- Crowsnest
228
- Criteo
229
- CSHttp
230
- CSSCheck
231
- curb
232
- Curious George
233
- curl
234
- cuwhois\/
235
- cybo\.com
236
- DAP\/NetHTTP
237
- DareBoost
238
- DatabaseDriverMysqli
239
- DataCha0s
240
- Datafeedwatch
241
- Datanyze
242
- DataparkSearch
243
- dataprovider
244
- DataXu
245
- Daum(oa)?[ \/][0-9]
246
- dBpoweramp
247
- ddline
248
- deeris
249
- Demon
250
- DeuSu
251
- developers\.google\.com\/\+\/web\/snippet\/
252
- Devil
253
- Digg
254
- Digincore
255
- DigitalPebble
256
- Dirbuster
257
- Discourse Forum Onebox
258
- Disqus\/
259
- Dispatch\/
260
- DittoSpyder
261
- dlvr
262
- DMBrowser
263
- DNSPod-reporting
264
- docoloc
265
- Dolphin http client
266
- DomainAppender
267
- Donuts Content Explorer
268
- dotMailer content retrieval
269
- dotSemantic
270
- downforeveryoneorjustme
271
- Download Wonder
272
- downnotifier
273
- DowntimeDetector
274
- Drip
275
- drupact
276
- Drupal \(\+http:\/\/drupal\.org\/\)
277
- DTS Agent
278
- dubaiindex
279
- DuplexWeb-Google
280
- EARTHCOM
281
- Easy-Thumb
282
- EasyDL
283
- Ebingbong
284
- ec2linkfinder
285
- eCairn-Grabber
286
- eCatch
287
- ECCP
288
- eContext\/
289
- Ecxi
290
- EirGrabber
291
- ElectricMonk
292
- elefent
293
- EMail Exractor
294
- EMail Wolf
295
- EmailWolf
296
- Embarcadero
297
- Embed PHP Library
298
- Embedly
299
- endo\/
300
- europarchive\.org
301
- evc-batch
302
- EventMachine HttpClient
303
- Everwall Link Expander
304
- Evidon
305
- Evrinid
306
- ExactSearch
307
- ExaleadCloudview
308
- Excel\/
309
- exif
310
- Exploratodo
311
- Express WebPictures
312
- Extreme Picture Finder
313
- EyeNetIE
314
- ezooms
315
- facebookexternalhit
316
- facebookexternalua
317
- facebookplatform
318
- fairshare
319
- Faraday v
320
- fasthttp
321
- Faveeo
322
- Favicon downloader
323
- faviconkit
324
- faviconarchive
325
- FavOrg
326
- Feed Wrangler
327
- Feedable\/
328
- Feedbin
329
- FeedBooster
330
- FeedBucket
331
- FeedBunch\/
332
- FeedBurner
333
- feeder
334
- Feedly
335
- FeedshowOnline
336
- Feedspot
337
- Feedwind\/
338
- FeedZcollector
339
- feeltiptop
340
- Fetch API
341
- Fetch\/[0-9]
342
- Fever\/[0-9]
343
- FHscan
344
- Fimap
345
- findlink
346
- findthatfile
347
- FlashGet
348
- FlipboardBrowserProxy
349
- FlipboardProxy
350
- FlipboardRSS
351
- Flock\/
352
- fluffy
353
- Flunky
354
- flynxapp
355
- forensiq
356
- FoundSeoTool
357
- http:\/\/www.neomo.de\/
358
- free thumbnails
359
- Freeuploader
360
- Funnelback
361
- G-i-g-a-b-o-t
362
- g00g1e\.net
363
- ganarvisitas
364
- geek-tools
365
- Genieo
366
- GentleSource
367
- GetCode
368
- Getintent
369
- GetLinkInfo
370
- getprismatic
371
- GetRight
372
- getroot
373
- GetURLInfo\/
374
- GetWeb
375
- Geziyor
376
- Ghost Inspector
377
- GigablastOpenSource
378
- GIS-LABS
379
- github-camo
380
- github\.com
381
- Go [\d\.]* package http
382
- Go http package
383
- Go-Ahead-Got-It
384
- Go-http-client
385
- Go!Zilla
386
- gobyus
387
- gofetch
388
- GomezAgent
389
- gooblog
390
- Goodzer\/
391
- Google AppsViewer
392
- Google Desktop
393
- Google favicon
394
- Google Keyword Suggestion
395
- Google Keyword Tool
396
- Google Page Speed Insights
397
- Google PP Default
398
- Google Search Console
399
- Google Web Preview
400
- Google-Adwords
401
- Google-Apps-Script
402
- Google-Calendar-Importer
403
- Google-HotelAdsVerifier
404
- Google-HTTP-Java-Client
405
- Google-Publisher-Plugin
406
- Google-Read-Aloud
407
- Google-SearchByImage
408
- Google-Site-Verification
409
- Google-Structured-Data-Testing-Tool
410
- Google-Youtube-Links
411
- google-xrawler
412
- GoogleDocs
413
- GoogleHC\/
414
- GoogleProducer
415
- GoogleSites
416
- Google-Transparency-Report
417
- Gookey
418
- GoScraper
419
- GoSpotCheck
420
- gosquared-thumbnailer
421
- Gotit
422
- GoZilla
423
- grabify
424
- GrabNet
425
- Grafula
426
- Grammarly
427
- GrapeFX
428
- GreatNews
429
- Gregarius
430
- GRequests
431
- grokkit
432
- grouphigh
433
- grub-client
434
- gSOAP\/
435
- GT::WWW
436
- GTmetrix
437
- GuzzleHttp
438
- gvfs\/
439
- HAA(A)?RTLAND http client
440
- Haansoft
441
- hackney\/
442
- Hadi Agent
443
- HappyApps-WebCheck
444
- Hatena
445
- Havij
446
- HeadlessChrome
447
- HEADMasterSEO
448
- HeartRails_Capture
449
- help@dataminr\.com
450
- heritrix
451
- historious
452
- hkedcity
453
- hledejLevne\.cz
454
- Hloader
455
- HMView
456
- Holmes
457
- HonesoSearchEngine
458
- HootSuite Image proxy
459
- Hootsuite-WebFeed
460
- hosterstats
461
- HostTracker
462
- ht:\/\/check
463
- htdig
464
- HTMLparser
465
- htmlyse
466
- HTTP Banner Detection
467
- HTTP_Compression_Test
468
- http_request2
469
- http_requester
470
- http-get
471
- HTTP-Header-Abfrage
472
- http-kit
473
- http-request\/
474
- HTTP-Tiny
475
- HTTP::Lite
476
- http\.rb\/
477
- http_get
478
- HttpComponents
479
- httphr
480
- HTTPMon
481
- HTTPie
482
- httpRequest
483
- httpscheck
484
- httpssites_power
485
- httpunit
486
- HttpUrlConnection
487
- httrack
488
- huaweisymantec
489
- HubSpot
490
- Humanlinks
491
- i2kconnect\/
492
- Iblog
493
- ichiro
494
- Id-search
495
- IdeelaborPlagiaat
496
- IDG Twitter Links Resolver
497
- IDwhois\/
498
- Iframely
499
- igdeSpyder
500
- IlTrovatore
501
- Image Fetch
502
- Image Sucker
503
- ImageEngine\/
504
- ImageVisu\/
505
- Imagga
506
- imagineeasy
507
- imgsizer
508
- InAGist
509
- inbound\.li parser
510
- InDesign%20CC
511
- Indy Library
512
- InetURL
513
- infegy
514
- infohelfer
515
- InfoTekies
516
- InfoWizards Reciprocal Link
517
- inpwrd\.com
518
- instabid
519
- Instapaper
520
- Integrity
521
- integromedb
522
- Intelliseek
523
- InterGET
524
- internet_archive
525
- Internet Ninja
526
- InternetSeer
527
- internetVista monitor
528
- internetwache
529
- intraVnews
530
- IODC
531
- IOI
532
- iplabel
533
- ips-agent
534
- IPS\/[0-9]
535
- IPWorks HTTP\/S Component
536
- iqdb\/
537
- Iria
538
- Irokez
539
- isitup\.org
540
- iskanie
541
- isUp\.li
542
- iThemes Sync\/
543
- IZaBEE
544
- iZSearch
545
- JAHHO
546
- janforman
547
- Jaunt\/
548
- Jbrofuzz
549
- Jersey\/
550
- JetCar
551
- Jigsaw
552
- Jobboerse
553
- JobFeed discovery
554
- Jobg8 URL Monitor
555
- jobo
556
- Jobrapido
557
- Jobsearch1\.5
558
- JoinVision Generic
559
- JolokiaPwn
560
- Joomla
561
- Jorgee
562
- JS-Kit
563
- JustView
564
- Kaspersky Lab CFR link resolver
565
- Kelny\/
566
- Kerrigan\/
567
- KeyCDN
568
- Keyword Density
569
- Keywords Research
570
- khttp\/
571
- KickFire
572
- KimonoLabs\/
573
- Kml-Google
574
- knows\.is
575
- KOCMOHABT
576
- kouio
577
- kube-probe
578
- kulturarw3
579
- KumKie
580
- L\.webis
581
- Larbin
582
- Lavf\/
583
- LeechFTP
584
- LeechGet
585
- letsencrypt
586
- Lftp
587
- LibVLC
588
- LibWeb
589
- Libwhisker
590
- libwww
591
- Licorne
592
- Liferea\/
593
- Lightspeedsystems
594
- Lighthouse
595
- Likse
596
- Link Valet
597
- link_thumbnailer
598
- LinkAlarm\/
599
- linkCheck
600
- linkdex
601
- LinkExaminer
602
- linkfluence
603
- linkpeek
604
- LinkPreviewGenerator
605
- LinkScan
606
- LinksManager
607
- LinkTiger
608
- LinkWalker
609
- Lipperhey
610
- Litemage_walker
611
- livedoor ScreenShot
612
- LoadImpactRload
613
- localsearch-web
614
- LongURL API
615
- looid\.com
616
- looksystems\.net
617
- ltx71
618
- lua-resty-http
619
- lwp-request
620
- lwp-trivial
621
- LWP::Simple
622
- lycos
623
- LYT\.SR
624
- mabontland
625
- Mag-Net
626
- MagpieRSS
627
- Mail\.Ru
628
- MailChimp
629
- Majestic12
630
- makecontact\/
631
- Mandrill
632
- MapperCmd
633
- marketinggrader
634
- MarkMonitor
635
- MarkWatch
636
- Mass Downloader
637
- masscan\/
638
- Mata Hari
639
- Mediametric
640
- Mediapartners-Google
641
- mediawords
642
- MegaIndex\.ru
643
- MeltwaterNews
644
- Melvil Rawi
645
- MemGator
646
- Metaspinner
647
- MetaURI
648
- MFC_Tear_Sample
649
- Microsearch
650
- Microsoft Office
651
- Microsoft Outlook
652
- Microsoft Windows Network Diagnostics
653
- Microsoft-WebDAV-MiniRedir
654
- Microsoft Data Access
655
- MIDown tool
656
- MIIxpc
657
- Mindjet
658
- Miniature\.io
659
- Miniflux
660
- Mister PiX
661
- mixdata dot com
662
- mixed-content-scan
663
- Mixmax-LinkPreview
664
- mixnode
665
- Mnogosearch
666
- mogimogi
667
- Mojeek
668
- Mojolicious \(Perl\)
669
- Monit\/
670
- monitis
671
- Monitority\/
672
- montastic
673
- MonTools
674
- Moreover
675
- Morfeus Fucking Scanner
676
- Morning Paper
677
- MovableType
678
- mowser
679
- Mrcgiguy
680
- MS Web Services Client Protocol
681
- MSFrontPage
682
- mShots
683
- MuckRack\/
684
- muhstik-scan
685
- MVAClient
686
- MxToolbox\/
687
- nagios
688
- Najdi\.si
689
- Name Intelligence
690
- Nameprotect
691
- Navroad
692
- NearSite
693
- Needle
694
- Nessus
695
- Net Vampire
696
- NetAnts
697
- NETCRAFT
698
- NetLyzer
699
- NetMechanic
700
- NetNewsWire
701
- Netpursual
702
- netresearch
703
- NetShelter ContentScan
704
- Netsparker
705
- NetTrack
706
- Netvibes
707
- NetZIP
708
- Neustar WPM
709
- NeutrinoAPI
710
- NewRelicPinger
711
- NewsBlur .*Finder
712
- NewsGator
713
- newsme
714
- newspaper\/
715
- Nexgate Ruby Client
716
- NG-Search
717
- Nibbler
718
- NICErsPRO
719
- Nikto
720
- nineconnections
721
- NLNZ_IAHarvester
722
- Nmap Scripting Engine
723
- node-superagent
724
- node-urllib
725
- node\.io
726
- Nodemeter
727
- NodePing
728
- nominet\.org\.uk
729
- nominet\.uk
730
- Norton-Safeweb
731
- Notifixious
732
- notifyninja
733
- NotionEmbedder
734
- nuhk
735
- nutch
736
- Nuzzel
737
- nWormFeedFinder
738
- nyawc\/
739
- Nymesis
740
- NYU
741
- Ocelli\/
742
- Octopus
743
- oegp
744
- Offline Explorer
745
- Offline Navigator
746
- OgScrper
747
- og-scraper
748
- okhttp
749
- omgili
750
- OMSC
751
- Online Domain Tools
752
- OpenCalaisSemanticProxy
753
- Openfind
754
- OpenLinkProfiler
755
- Openstat\/
756
- OpenVAS
757
- Optimizer
758
- Orbiter
759
- OrgProbe\/
760
- orion-semantics
761
- Outlook-Express
762
- Outlook-iOS
763
- ow\.ly
764
- Owler
765
- ownCloud News
766
- OxfordCloudService
767
- Page Valet
768
- page_verifier
769
- page scorer
770
- page2rss
771
- PageGrabber
772
- PagePeeker
773
- PageScorer
774
- Pagespeed\/
775
- Panopta
776
- panscient
777
- Papa Foto
778
- parsijoo
779
- Pavuk
780
- PayPal IPN
781
- pcBrowser
782
- Pcore-HTTP
783
- Pearltrees
784
- PECL::HTTP
785
- peerindex
786
- Peew
787
- PeoplePal
788
- Perlu -
789
- PhantomJS Screenshoter
790
- PhantomJS\/
791
- Photon\/
792
- phpservermon
793
- Pi-Monster
794
- Picscout
795
- Picsearch
796
- PictureFinder
797
- Pimonster
798
- ping\.blo\.gs
799
- Pingability
800
- PingAdmin\.Ru
801
- Pingdom
802
- Pingoscope
803
- PingSpot
804
- pinterest\.com
805
- Pixray
806
- Pizilla
807
- Plagger\/
808
- Ploetz \+ Zeller
809
- Plukkie
810
- plumanalytics
811
- PocketImageCache
812
- PocketParser
813
- Pockey
814
- POE-Component-Client-HTTP
815
- Polymail\/
816
- Pompos
817
- Porkbun
818
- Port Monitor
819
- postano
820
- PostmanRuntime
821
- PostPost
822
- postrank
823
- PowerPoint\/
824
- Priceonomics Analysis Engine
825
- PrintFriendly
826
- PritTorrent
827
- Prlog
828
- probethenet
829
- Project 25499
830
- prospectb2b
831
- Protopage
832
- ProWebWalker
833
- proximic
834
- PRTG Network Monitor
835
- pshtt, https scanning
836
- PTST
837
- PTST\/[0-9]+
838
- Pulsepoint XT3 web scraper
839
- Pump
840
- Python-httplib2
841
- python-requests
842
- Python-urllib
843
- Qirina Hurdler
844
- QQDownload
845
- QrafterPro
846
- Qseero
847
- Qualidator
848
- QueryN Metasearch
849
- queuedriver
850
- Quora Link Preview
851
- Qwantify
852
- Radian6
853
- RankActive
854
- RankFlex
855
- RankSonicSiteAuditor
856
- Re-re Studio
857
- ReactorNetty
858
- Readability
859
- RealDownload
860
- RealPlayer%20Downloader
861
- RebelMouse
862
- Recorder
863
- RecurPost\/
864
- redback\/
865
- ReederForMac
866
- Reeder\/
867
- ReGet
868
- RepoMonkey
869
- request\.js
870
- reqwest\/
871
- ResponseCodeTest
872
- RestSharp
873
- Riddler
874
- Rival IQ
875
- Robosourcer
876
- Robozilla
877
- ROI Hunter
878
- RPT-HTTPClient
879
- RSSOwl
880
- safe-agent-scanner
881
- SalesIntelligent
882
- Saleslift
883
- Sendsay\.Ru
884
- SauceNAO
885
- SBIder
886
- scalaj-http
887
- scan\.lol
888
- ScanAlert
889
- Scoop
890
- scooter
891
- ScoutJet
892
- ScoutURLMonitor
893
- ScrapeBox Page Scanner
894
- SimpleScraper
895
- Scrapy
896
- Screaming
897
- ScreenShotService
898
- Scrubby
899
- Scrutiny\/
900
- search\.thunderstone
901
- Search37
902
- searchenginepromotionhelp
903
- Searchestate
904
- SearchExpress
905
- SearchSight
906
- Seeker
907
- semanticdiscovery
908
- semanticjuice
909
- Semiocast HTTP client
910
- Semrush
911
- sentry\/
912
- SEO Browser
913
- Seo Servis
914
- seo-nastroj\.cz
915
- seo4ajax
916
- Seobility
917
- SEOCentro
918
- SeoCheck
919
- SEOkicks
920
- Seomoz
921
- SEOprofiler
922
- SEOsearch
923
- seoscanners
924
- seositecheckup
925
- SEOstats
926
- servernfo
927
- sexsearcher
928
- Seznam
929
- Shelob
930
- Shodan
931
- Shoppimon
932
- ShopWiki
933
- ShortLinkTranslate
934
- shrinktheweb
935
- Sideqik
936
- SimplePie
937
- SimplyFast
938
- Siphon
939
- SISTRIX
940
- Site-Shot\/
941
- Site Sucker
942
- Site24x7
943
- SiteBar
944
- Sitebeam
945
- Sitebulb\/
946
- SiteCondor
947
- SiteExplorer
948
- SiteGuardian
949
- Siteimprove
950
- SiteIndexed
951
- Sitemap(s)? Generator
952
- SitemapGenerator
953
- SiteMonitor
954
- Siteshooter B0t
955
- SiteSnagger
956
- SiteSucker
957
- SiteTruth
958
- Sitevigil
959
- sitexy\.com
960
- SkypeUriPreview
961
- Slack\/
962
- slider\.com
963
- slurp
964
- SlySearch
965
- SmartDownload
966
- SMRF URL Expander
967
- SMUrlExpander
968
- Snake
969
- Snappy
970
- SnapSearch
971
- Snarfer\/
972
- SniffRSS
973
- sniptracker
974
- Snoopy
975
- SnowHaze Search
976
- sogou web
977
- SortSite
978
- Sottopop
979
- sovereign\.ai
980
- SpaceBison
981
- SpamExperts
982
- Spammen
983
- Spanner
984
- spaziodati
985
- SPDYCheck
986
- Specificfeeds
987
- speedy
988
- SPEng
989
- Spinn3r
990
- spray-can
991
- Sprinklr
992
- spyonweb
993
- sqlmap
994
- Sqlworm
995
- Sqworm
996
- SSL Labs
997
- ssl-tools
998
- StackRambler
999
- Statastico\/
1000
- StatusCake
1001
- Steeler
1002
- Stratagems Kumo
1003
- Stroke\.cz
1004
- StudioFACA
1005
- StumbleUpon
1006
- suchen
1007
- Sucuri
1008
- summify
1009
- SuperHTTP
1010
- Surphace Scout
1011
- Suzuran
1012
- SwiteScraper
1013
- Symfony BrowserKit
1014
- Symfony2 BrowserKit
1015
- SynHttpClient-Built
1016
- Sysomos
1017
- sysscan
1018
- Szukacz
1019
- T0PHackTeam
1020
- tAkeOut
1021
- Tarantula\/
1022
- Taringa UGC
1023
- TarmotGezgin
1024
- Teleport
1025
- Telesoft
1026
- Telesphoreo
1027
- Telesphorep
1028
- Tenon\.io
1029
- teoma
1030
- terrainformatica
1031
- Test Certificate Info
1032
- testuri
1033
- Tetrahedron
1034
- TextRazor Downloader
1035
- The Drop Reaper
1036
- The Expert HTML Source Viewer
1037
- The Knowledge AI
1038
- The Intraformant
1039
- theinternetrules
1040
- TheNomad
1041
- Thinklab
1042
- Thumbshots
1043
- ThumbSniper
1044
- Thumbor
1045
- timewe\.net
1046
- TinEye
1047
- Tiny Tiny RSS
1048
- TLSProbe\/
1049
- Toata
1050
- topster
1051
- touche\.com
1052
- Traackr\.com
1053
- tracemyfile
1054
- Trackuity
1055
- TrapitAgent
1056
- Trendiction
1057
- Trendsmap
1058
- trendspottr
1059
- truwoGPS
1060
- TryJsoup
1061
- TulipChain
1062
- Turingos
1063
- Turnitin
1064
- tweetedtimes
1065
- Tweetminster
1066
- Tweezler\/
1067
- twibble
1068
- Twice
1069
- Twikle
1070
- Twingly
1071
- Twisted PageGetter
1072
- Typhoeus
1073
- ubermetrics-technologies
1074
- uclassify
1075
- UdmSearch
1076
- unchaos
1077
- unirest-java
1078
- UniversalFeedParser
1079
- Unshorten\.It
1080
- Untiny
1081
- UnwindFetchor
1082
- updated
1083
- updown\.io daemon
1084
- Upflow
1085
- Uptimia
1086
- Urlcheckr
1087
- URL Verifier
1088
- URLitor
1089
- urlresolver
1090
- Urlstat
1091
- URLTester
1092
- UrlTrends Ranking Updater
1093
- URLy Warning
1094
- URLy\.Warning
1095
- Vacuum
1096
- Vagabondo
1097
- VB Project
1098
- vBSEO
1099
- VCI
1100
- via ggpht\.com GoogleImageProxy
1101
- VidibleScraper
1102
- Virusdie
1103
- visionutils
1104
- vkShare
1105
- VoidEYE
1106
- Voil
1107
- voltron
1108
- voyager\/
1109
- VSAgent\/
1110
- VSB-TUO\/
1111
- Vulnbusters Meter
1112
- VYU2
1113
- w3af\.org
1114
- W3C_Unicorn
1115
- W3C-checklink
1116
- W3C-mobileOK
1117
- WAC-OFU
1118
- Wallpapers\/[0-9]+
1119
- WallpapersHD
1120
- wangling
1121
- Wappalyzer
1122
- WatchMouse
1123
- WbSrch\/
1124
- WDT\.io
1125
- web-capture\.net
1126
- Web-sniffer
1127
- Web Auto
1128
- Web Collage
1129
- Web Enhancer
1130
- Web Fetch
1131
- Web Fuck
1132
- Web Pix
1133
- Web Sauger
1134
- Web spyder
1135
- Web Sucker
1136
- Webalta
1137
- Webauskunft
1138
- WebAuto
1139
- WebCapture
1140
- WebClient\/
1141
- webcollage
1142
- WebCookies
1143
- WebCopier
1144
- WebCorp
1145
- WebDataStats
1146
- WebDoc
1147
- WebEnhancer
1148
- WebFetch
1149
- WebFuck
1150
- WebGazer
1151
- WebGo IS
1152
- WebImageCollector
1153
- WebImages
1154
- WebIndex
1155
- webkit2png
1156
- WebLeacher
1157
- webmastercoffee
1158
- webmon\s
1159
- WebPix
1160
- WebReaper
1161
- WebSauger
1162
- webscreenie
1163
- Webshag
1164
- Webshot
1165
- Website Quester
1166
- websitepulse agent
1167
- WebsiteQuester
1168
- Websnapr
1169
- WebSniffer
1170
- Webster
1171
- WebStripper
1172
- WebSucker
1173
- Webthumb\/
1174
- WebThumbnail
1175
- WebWhacker
1176
- WebZIP
1177
- WeLikeLinks
1178
- WEPA
1179
- WeSEE
1180
- wf84
1181
- Wfuzz\/
1182
- wget
1183
- WhatsApp
1184
- WhatsMyIP
1185
- WhatWeb
1186
- WhereGoes\?
1187
- Whibse
1188
- WhoRunsCoinHive
1189
- Whynder Magnet
1190
- Windows-RSS-Platform
1191
- WinPodder
1192
- wkhtmlto
1193
- wmtips
1194
- Woko
1195
- woorankreview
1196
- Word\/
1197
- WordPress\/
1198
- worldping-api
1199
- WordupinfoSearch
1200
- wotbox
1201
- WP Engine Install Performance API
1202
- wpif
1203
- wprecon\.com survey
1204
- WPScan
1205
- wscheck
1206
- Wtrace
1207
- WWW-Collector-E
1208
- WWW-Mechanize
1209
- WWW::Document
1210
- WWW::Mechanize
1211
- www\.monitor\.us
1212
- WWWOFFLE
1213
- x09Mozilla
1214
- x22Mozilla
1215
- XaxisSemanticsClassifier
1216
- Xenu Link Sleuth
1217
- XING-contenttabreceiver
1218
- xpymep([0-9]?)\.exe
1219
- Y!J-(ASR|BSC)
1220
- Y\!J-BRW
1221
- Yaanb
1222
- yacy
1223
- Yahoo Link Preview
1224
- YahooCacheSystem
1225
- YahooYSMcm
1226
- YandeG
1227
- Yandex(?!Search)
1228
- yanga
1229
- yeti
1230
- Yo-yo
1231
- Yoleo Consumer
1232
- yoogliFetchAgent
1233
- YottaaMonitor
1234
- Your-Website-Sucks
1235
- yourls\.org
1236
- YoYs\.net
1237
- YP\.PL
1238
- Zabbix
1239
- Zade
1240
- Zao
1241
- Zauba
1242
- Zemanta Aggregator
1243
- Zend_Http_Client
1244
- Zend\\\Http\\\Client
1245
- Zermelo
1246
- Zeus
1247
- zgrab
1248
- ZnajdzFoto
1249
- ZnHTTP
1250
- Zombie\.js
1251
- Zoom\.Mac
1252
- ZyBorg
1253
- [a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)
1254
- ].strip.split(/\n+/).freeze
7
+ extend Loader
8
+
9
+ def self.data
10
+ @data ||= load_raw(CrawlerDetect.config.settings.raw_crawlers_path).freeze
11
+ end
1255
12
  end
1256
13
  end
1257
14
  end