crawler_detect 0.1.11 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,24 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "crawler_detect/detector"
4
- require "crawler_detect/library"
5
- require "crawler_detect/library/crawlers"
6
- require "crawler_detect/library/exclusions"
7
- require "crawler_detect/library/headers"
8
- require "crawler_detect/version"
3
+ require "json"
4
+ require "qonfig"
9
5
 
10
- require "rack/crawler_detect"
6
+ require_relative "crawler_detect/config"
7
+ require_relative "crawler_detect/detector"
8
+ require_relative "crawler_detect/library/loader"
9
+ require_relative "crawler_detect/library/crawlers"
10
+ require_relative "crawler_detect/library/exclusions"
11
+ require_relative "crawler_detect/library/headers"
12
+ require_relative "crawler_detect/library"
13
+ require_relative "crawler_detect/version"
14
+ require_relative "rack/crawler_detect"
11
15
 
16
+ # @since 0.1.0
12
17
  module CrawlerDetect
13
18
  class << self
19
+ # @param user_agent [String] User-agent string to detect
20
+ # @return [CrawlerDetect::Detector] Instance of detector class
14
21
  def new(user_agent)
15
22
  detector(user_agent)
16
23
  end
17
24
 
25
+ # @param user_agent [String] User-agent string to detect
26
+ # @return [true, false] Is User-agent a crawler?
18
27
  def is_crawler?(user_agent)
19
28
  detector(user_agent).is_crawler?
20
29
  end
21
30
 
31
+ # @since 1.0.0
32
+ # @param config [Proc] Configuration block forwarded to CrawlerDetect::Config.new
33
+ def setup!(&config)
34
+ @config = CrawlerDetect::Config.new(&config)
35
+ Library::DATA_CLASSES.each(&:reload_data)
36
+ end
37
+
38
+ # @since 1.0.0
39
+ # @return [CrawlerDetect::Config] Instance of configuration class
40
+ def config
41
+ @config ||= CrawlerDetect::Config.new
42
+ end
43
+
22
44
  private
23
45
 
24
46
  def detector(user_agent)
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CrawlerDetect
4
+ # Configuration of CrawlerDetect
5
+ #
6
+ # @see settings
7
+ # @since 1.0.0
8
+ class Config < ::Qonfig::DataSet
9
+ CUR_PATH = File.dirname(File.expand_path(__FILE__)).freeze
10
+ RAW_PATH = File.join(CUR_PATH, "library/raw").freeze
11
+
12
+ RAW_CRAWLERS_PATH = File.join(RAW_PATH, "Crawlers.json").freeze
13
+ RAW_EXCLUSIONS_PATH = File.join(RAW_PATH, "Exclusions.json").freeze
14
+ RAW_HEADERS_PATH = File.join(RAW_PATH, "Headers.json").freeze
15
+
16
+ # @return [String] path to crawlers raw JSON file
17
+ setting :raw_crawlers_path, RAW_CRAWLERS_PATH
18
+
19
+ # @return [String] path to exclusions raw JSON file
20
+ setting :raw_exclusions_path, RAW_EXCLUSIONS_PATH
21
+
22
+ # @return [String] path to headers raw JSON file
23
+ setting :raw_headers_path, RAW_HEADERS_PATH
24
+
25
+ validate :raw_crawlers_path, :string, strict: true
26
+ validate :raw_exclusions_path, :string, strict: true
27
+ validate :raw_headers_path, :string, strict: true
28
+ end
29
+ end
@@ -1,17 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  class Detector
6
+ # @param user_agent [String] User-agent string to detect
7
+ # @return [CrawlerDetect::Detector] instance of detector class
5
8
  def initialize(user_agent)
6
9
  @user_agent = user_agent.to_s.dup
7
10
  end
8
11
 
12
+ # @return [true, false] Is User-agent a crawler?
9
13
  def is_crawler?
10
14
  @is_crawler ||= begin
11
15
  !completely_exclusion? && matches_crawler_list?
12
16
  end
13
17
  end
14
18
 
19
+ # @return [String] The detected crawler name from RAW data
15
20
  def crawler_name
16
21
  return unless is_crawler?
17
22
  @crawler_name
@@ -19,22 +24,30 @@ module CrawlerDetect
19
24
 
20
25
  private
21
26
 
22
- def completely_exclusion?
23
- @user_agent.gsub!(exclusions_matcher, "")
24
- @user_agent.strip.length == 0
25
- end
27
+ # @private
28
+ # @return [true, false] Is User-agent in white-list?
29
+ def completely_exclusion?
30
+ @user_agent.gsub!(exclusions_matcher, "")
31
+ @user_agent.strip.length.zero?
32
+ end
26
33
 
27
- def matches_crawler_list?
28
- @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
29
- !@crawler_name.empty?
30
- end
34
+ # @private
35
+ # @return [true, false] Is User-agent in black-list?
36
+ def matches_crawler_list?
37
+ @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
38
+ !@crawler_name.empty?
39
+ end
31
40
 
32
- def exclusions_matcher
33
- CrawlerDetect::Library.get_regexp("exclusions")
34
- end
41
+ # @private
42
+ # @return [Regexp] White-list of User-agents
43
+ def exclusions_matcher
44
+ CrawlerDetect::Library.get_regexp("exclusions")
45
+ end
35
46
 
36
- def crawlers_matcher
37
- CrawlerDetect::Library.get_regexp("crawlers")
38
- end
47
+ # @private
48
+ # @return [Regexp] Black-list of User-agents
49
+ def crawlers_matcher
50
+ CrawlerDetect::Library.get_regexp("crawlers")
51
+ end
39
52
  end
40
53
  end
@@ -1,16 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  module Library
6
+ DATA_CLASSES = [Library::Headers, Library::Exclusions, Library::Crawlers].freeze
7
+
5
8
  class << self
9
+ # @param param [String] Name of raw data
10
+ # @return [Regexp]
6
11
  def get_regexp(param)
7
12
  data = get_array(param)
8
- %r[#{data.join('|')}]i
13
+ %r{#{data.join('|')}}i
9
14
  end
10
15
 
16
+ # @param param [String] Name of raw data
17
+ # @return [Array]
11
18
  def get_array(param)
12
- const_name = "CrawlerDetect::Library::#{param.capitalize}::#{param.upcase}"
13
- const_get(const_name)
19
+ const_get("CrawlerDetect::Library::#{param.capitalize}").send(:data)
14
20
  end
15
21
  end
16
22
  end
@@ -1,1261 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Layout/TrailingWhitespace
4
3
  module CrawlerDetect
5
4
  module Library
5
+ # @since 0.1.0
6
6
  module Crawlers
7
- CRAWLERS = %q[
8
- .*Java.*outbrain
9
- YLT
10
- ^b0t$
11
- ^bluefish
12
- ^Calypso v\/
13
- ^COMODO DCV
14
- ^DangDang
15
- ^DavClnt
16
- ^FDM
17
- ^git\/
18
- ^Goose\/
19
- ^Grabber
20
- ^HTTPClient\/
21
- ^Java\/
22
- ^Jeode\/
23
- ^Jetty\/
24
- ^Mail\/
25
- ^Mget
26
- ^Microsoft URL Control
27
- ^NG\/[0-9\.]
28
- ^NING\/
29
- ^PHP\/[0-9]
30
- ^RMA\/
31
- ^Ruby|Ruby\/[0-9]
32
- ^VSE\/[0-9]
33
- ^WordPress\.com
34
- ^XRL\/[0-9]
35
- ^ZmEu
36
- 008\/
37
- 13TABS
38
- 192\.comAgent
39
- 2ip\.ru
40
- 404enemy
41
- 7Siters
42
- 80legs
43
- a\.pr-cy\.ru
44
- a3logics\.in
45
- A6-Indexer
46
- Abonti
47
- Aboundex
48
- aboutthedomain
49
- Accoona-AI-Agent
50
- acoon
51
- acrylicapps\.com\/pulp
52
- Acunetix
53
- AdAuth\/
54
- adbeat
55
- AddThis
56
- ADmantX
57
- AdminLabs
58
- adressendeutschland
59
- adscanner
60
- Adstxtaggregator
61
- adstxt-worker
62
- adstxt\.com
63
- agentslug
64
- AHC
65
- aihit
66
- aiohttp\/
67
- Airmail
68
- akka-http\/
69
- akula\/
70
- alertra
71
- alexa site audit
72
- Alibaba\.Security\.Heimdall
73
- Alligator
74
- allloadin
75
- AllSubmitter
76
- alyze\.info
77
- amagit
78
- ^Amazon Simple Notification Service Agent$
79
- Anarchie
80
- AndroidDownloadManager
81
- Anemone
82
- AngleSharp
83
- annotate_google
84
- Ant\.com
85
- Anturis Agent
86
- AnyEvent-HTTP\/
87
- Apache Droid
88
- Apache OpenOffice
89
- Apache-HttpAsyncClient
90
- Apache-HttpClient
91
- ApacheBench
92
- Apexoo
93
- APIs-Google
94
- AportWorm\/
95
- AppBeat\/
96
- AppEngine-Google
97
- AppleSyndication
98
- AppStoreScraperZ
99
- Aprc\/[0-9]
100
- Arachmo
101
- arachnode
102
- Arachnophilia
103
- aria2
104
- Arukereso
105
- asafaweb
106
- AskQuickly
107
- Ask Jeeves
108
- ASPSeek
109
- Asterias
110
- Astute
111
- asynchttp
112
- Attach
113
- autocite
114
- AutomaticWPTester
115
- Autonomy
116
- axios\/
117
- AWS Security Scanner
118
- B-l-i-t-z-B-O-T
119
- Backlink-Ceck
120
- backlink-check
121
- BacklinkHttpStatus
122
- BackStreet
123
- BackWeb
124
- Bad-Neighborhood
125
- Badass
126
- baidu\.com
127
- Bandit
128
- basicstate
129
- BatchFTP
130
- Battlezta Bazinga
131
- baypup\/
132
- BazQux
133
- BBBike
134
- BCKLINKS
135
- BDFetch
136
- BegunAdvertising
137
- Bidtellect
138
- BigBozz
139
- Bigfoot
140
- biglotron
141
- BingLocalSearch
142
- BingPreview
143
- binlar
144
- biNu image cacher
145
- Bitacle
146
- biz_Directory
147
- Black Hole
148
- Blackboard Safeassign
149
- BlackWidow
150
- BlockNote\.Net
151
- Bloglines
152
- Bloglovin
153
- BlogPulseLive
154
- BlogSearch
155
- Blogtrottr
156
- BlowFish
157
- boitho\.com-dc
158
- BPImageWalker
159
- Braintree-Webhooks
160
- Branch Metrics API
161
- Branch-Passthrough
162
- Brandprotect
163
- BrandVerity
164
- Brandwatch
165
- Brodie\/
166
- Browsershots
167
- BUbiNG
168
- Buck\/
169
- Buddy
170
- BuiltWith
171
- Bullseye
172
- BunnySlippers
173
- Burf Search
174
- Butterfly\/
175
- BuzzSumo
176
- CAAM\/[0-9]
177
- CakePHP
178
- Calculon
179
- Canary%20Mail
180
- CaretNail
181
- catexplorador
182
- CC Metadata Scaper
183
- Cegbfeieh
184
- censys
185
- Cerberian Drtrs
186
- CERT\.at-Statistics-Survey
187
- cg-eye
188
- changedetection
189
- ChangesMeter
190
- Charlotte
191
- CheckHost
192
- checkprivacy
193
- CherryPicker
194
- ChinaClaw
195
- Chirp\/
196
- chkme\.com
197
- Chlooe
198
- Chromaxa
199
- CirrusExplorer
200
- CISPA Vulnerability Notification
201
- Citoid
202
- CJNetworkQuality
203
- Clarsentia
204
- clips\.ua\.ac\.be
205
- Cloud mapping
206
- CloudEndure
207
- CloudFlare-AlwaysOnline
208
- Cloudinary
209
- cmcm\.com
210
- coccoc
211
- cognitiveseo
212
- colly -
213
- CommaFeed
214
- Commons-HttpClient
215
- commonscan
216
- contactbigdatafr
217
- contentkingapp
218
- convera
219
- CookieReports
220
- copyright sheriff
221
- CopyRightCheck
222
- Copyscape
223
- cortex\/
224
- Cosmos4j\.feedback
225
- Covario-IDS
226
- Craw\/
227
- Crescent
228
- Crowsnest
229
- Criteo
230
- CSHttp
231
- CSSCheck
232
- curb
233
- Curious George
234
- curl
235
- cuwhois\/
236
- cybo\.com
237
- DAP\/NetHTTP
238
- DareBoost
239
- DatabaseDriverMysqli
240
- DataCha0s
241
- Datafeedwatch
242
- Datanyze
243
- DataparkSearch
244
- dataprovider
245
- DataXu
246
- Daum(oa)?[ \/][0-9]
247
- dBpoweramp
248
- ddline
249
- deeris
250
- Demon
251
- DeuSu
252
- developers\.google\.com\/\+\/web\/snippet\/
253
- Devil
254
- Digg
255
- Digincore
256
- DigitalPebble
257
- Dirbuster
258
- Discourse Forum Onebox
259
- Disqus\/
260
- Dispatch\/
261
- DittoSpyder
262
- dlvr
263
- DMBrowser
264
- DNSPod-reporting
265
- docoloc
266
- Dolphin http client
267
- DomainAppender
268
- Donuts Content Explorer
269
- dotMailer content retrieval
270
- dotSemantic
271
- downforeveryoneorjustme
272
- Download Wonder
273
- downnotifier
274
- DowntimeDetector
275
- Drip
276
- drupact
277
- Drupal \(\+http:\/\/drupal\.org\/\)
278
- DTS Agent
279
- dubaiindex
280
- DuplexWeb-Google
281
- EARTHCOM
282
- Easy-Thumb
283
- EasyDL
284
- Ebingbong
285
- ec2linkfinder
286
- eCairn-Grabber
287
- eCatch
288
- ECCP
289
- eContext\/
290
- Ecxi
291
- EirGrabber
292
- ElectricMonk
293
- elefent
294
- EMail Exractor
295
- EMail Wolf
296
- EmailWolf
297
- Embarcadero
298
- Embed PHP Library
299
- Embedly
300
- endo\/
301
- europarchive\.org
302
- evc-batch
303
- EventMachine HttpClient
304
- Everwall Link Expander
305
- Evidon
306
- Evrinid
307
- ExactSearch
308
- ExaleadCloudview
309
- Excel\/
310
- exif
311
- Exploratodo
312
- Express WebPictures
313
- Extreme Picture Finder
314
- EyeNetIE
315
- ezooms
316
- facebookexternalhit
317
- facebookexternalua
318
- facebookplatform
319
- fairshare
320
- Faraday v
321
- fasthttp
322
- Faveeo
323
- Favicon downloader
324
- faviconkit
325
- faviconarchive
326
- FavOrg
327
- Feed Wrangler
328
- Feedable\/
329
- Feedbin
330
- FeedBooster
331
- FeedBucket
332
- FeedBunch\/
333
- FeedBurner
334
- feeder
335
- Feedly
336
- FeedshowOnline
337
- Feedspot
338
- Feedwind\/
339
- FeedZcollector
340
- feeltiptop
341
- Fetch API
342
- Fetch\/[0-9]
343
- Fever\/[0-9]
344
- FHscan
345
- Fimap
346
- findlink
347
- findthatfile
348
- FlashGet
349
- FlipboardBrowserProxy
350
- FlipboardProxy
351
- FlipboardRSS
352
- Flock\/
353
- fluffy
354
- Flunky
355
- flynxapp
356
- forensiq
357
- FoundSeoTool
358
- http:\/\/www.neomo.de\/
359
- free thumbnails
360
- Freeuploader
361
- Funnelback
362
- G-i-g-a-b-o-t
363
- g00g1e\.net
364
- ganarvisitas
365
- geek-tools
366
- Genieo
367
- GentleSource
368
- GetCode
369
- Getintent
370
- GetLinkInfo
371
- getprismatic
372
- GetRight
373
- getroot
374
- GetURLInfo\/
375
- GetWeb
376
- Geziyor
377
- Ghost Inspector
378
- GigablastOpenSource
379
- GIS-LABS
380
- github-camo
381
- github\.com
382
- Go [\d\.]* package http
383
- Go http package
384
- Go-Ahead-Got-It
385
- Go-http-client
386
- Go!Zilla
387
- gobyus
388
- gofetch
389
- GomezAgent
390
- gooblog
391
- Goodzer\/
392
- Google AppsViewer
393
- Google Desktop
394
- Google favicon
395
- Google Keyword Suggestion
396
- Google Keyword Tool
397
- Google Page Speed Insights
398
- Google PP Default
399
- Google Search Console
400
- Google Web Preview
401
- Google-Adwords
402
- Google-Apps-Script
403
- Google-Calendar-Importer
404
- Google-HotelAdsVerifier
405
- Google-HTTP-Java-Client
406
- Google-Publisher-Plugin
407
- Google-Read-Aloud
408
- Google-SearchByImage
409
- Google-Site-Verification
410
- Google-Structured-Data-Testing-Tool
411
- Google-Youtube-Links
412
- google-xrawler
413
- GoogleDocs
414
- GoogleHC\/
415
- GoogleProducer
416
- GoogleSites
417
- Google-Transparency-Report
418
- Gookey
419
- GoScraper
420
- GoSpotCheck
421
- gosquared-thumbnailer
422
- Gotit
423
- GoZilla
424
- grabify
425
- GrabNet
426
- Grafula
427
- Grammarly
428
- GrapeFX
429
- GreatNews
430
- Gregarius
431
- GRequests
432
- grokkit
433
- grouphigh
434
- grub-client
435
- gSOAP\/
436
- GT::WWW
437
- GTmetrix
438
- GuzzleHttp
439
- gvfs\/
440
- HAA(A)?RTLAND http client
441
- Haansoft
442
- hackney\/
443
- Hadi Agent
444
- HappyApps-WebCheck
445
- Hatena
446
- Havij
447
- HaxerMen
448
- HeadlessChrome
449
- HEADMasterSEO
450
- HeartRails_Capture
451
- help@dataminr\.com
452
- heritrix
453
- historious
454
- hkedcity
455
- hledejLevne\.cz
456
- Hloader
457
- HMView
458
- Holmes
459
- HonesoSearchEngine
460
- HootSuite Image proxy
461
- Hootsuite-WebFeed
462
- hosterstats
463
- HostTracker
464
- ht:\/\/check
465
- htdig
466
- HTMLparser
467
- htmlyse
468
- HTTP Banner Detection
469
- HTTP_Compression_Test
470
- http_request2
471
- http_requester
472
- http-get
473
- HTTP-Header-Abfrage
474
- http-kit
475
- http-request\/
476
- HTTP-Tiny
477
- HTTP::Lite
478
- http\.rb\/
479
- http_get
480
- HttpComponents
481
- httphr
482
- HTTPMon
483
- HTTPie
484
- httpRequest
485
- httpscheck
486
- httpssites_power
487
- httpunit
488
- HttpUrlConnection
489
- httrack
490
- huaweisymantec
491
- HubSpot
492
- Humanlinks
493
- i2kconnect\/
494
- Iblog
495
- ichiro
496
- Id-search
497
- IdeelaborPlagiaat
498
- IDG Twitter Links Resolver
499
- IDwhois\/
500
- Iframely
501
- igdeSpyder
502
- IlTrovatore
503
- Image Fetch
504
- Image Sucker
505
- ImageEngine\/
506
- ImageVisu\/
507
- Imagga
508
- imagineeasy
509
- imgsizer
510
- InAGist
511
- inbound\.li parser
512
- InDesign%20CC
513
- Indy Library
514
- InetURL
515
- infegy
516
- infohelfer
517
- InfoTekies
518
- InfoWizards Reciprocal Link
519
- inpwrd\.com
520
- instabid
521
- Instapaper
522
- Integrity
523
- integromedb
524
- Intelliseek
525
- InterGET
526
- internet_archive
527
- Internet Ninja
528
- InternetSeer
529
- internetVista monitor
530
- internetwache
531
- intraVnews
532
- IODC
533
- IOI
534
- iplabel
535
- ips-agent
536
- IPS\/[0-9]
537
- IPWorks HTTP\/S Component
538
- iqdb\/
539
- Iria
540
- Irokez
541
- isitup\.org
542
- iskanie
543
- isUp\.li
544
- iThemes Sync\/
545
- IZaBEE
546
- iZSearch
547
- JAHHO
548
- janforman
549
- Jaunt\/
550
- Jbrofuzz
551
- Jersey\/
552
- JetCar
553
- Jigsaw
554
- Jobboerse
555
- JobFeed discovery
556
- Jobg8 URL Monitor
557
- jobo
558
- Jobrapido
559
- Jobsearch1\.5
560
- JoinVision Generic
561
- JolokiaPwn
562
- Joomla
563
- Jorgee
564
- JS-Kit
565
- JustView
566
- Kaspersky Lab CFR link resolver
567
- Kelny\/
568
- Kerrigan\/
569
- KeyCDN
570
- Keyword Density
571
- Keywords Research
572
- khttp\/
573
- KickFire
574
- KimonoLabs\/
575
- Kml-Google
576
- knows\.is
577
- KOCMOHABT
578
- kouio
579
- kubectl
580
- kube-probe
581
- kulturarw3
582
- KumKie
583
- L\.webis
584
- Larbin
585
- Lavf\/
586
- LeechFTP
587
- LeechGet
588
- letsencrypt
589
- Lftp
590
- LibVLC
591
- LibWeb
592
- Libwhisker
593
- libwww
594
- Licorne
595
- Liferea\/
596
- Lightspeedsystems
597
- Lighthouse
598
- Likse
599
- Link Valet
600
- link_thumbnailer
601
- LinkAlarm\/
602
- linkCheck
603
- linkdex
604
- LinkExaminer
605
- linkfluence
606
- linkpeek
607
- LinkPreviewGenerator
608
- LinkScan
609
- LinksManager
610
- LinkTiger
611
- LinkWalker
612
- Lipperhey
613
- Litemage_walker
614
- livedoor ScreenShot
615
- LoadImpactRload
616
- localsearch-web
617
- LongURL API
618
- looid\.com
619
- looksystems\.net
620
- ltx71
621
- lua-resty-http
622
- lwp-request
623
- lwp-trivial
624
- LWP::Simple
625
- lycos
626
- LYT\.SR
627
- mabontland
628
- Mag-Net
629
- MagpieRSS
630
- Mail\.Ru
631
- MailChimp
632
- Majestic12
633
- makecontact\/
634
- Mandrill
635
- MapperCmd
636
- marketinggrader
637
- MarkMonitor
638
- MarkWatch
639
- Mass Downloader
640
- masscan\/
641
- Mata Hari
642
- Mediametric
643
- Mediapartners-Google
644
- mediawords
645
- MegaIndex\.ru
646
- MeltwaterNews
647
- Melvil Rawi
648
- MemGator
649
- Metaspinner
650
- MetaURI
651
- MFC_Tear_Sample
652
- Microsearch
653
- Microsoft Office
654
- Microsoft Outlook
655
- Microsoft Windows Network Diagnostics
656
- Microsoft-WebDAV-MiniRedir
657
- Microsoft Data Access
658
- MIDown tool
659
- MIIxpc
660
- Mindjet
661
- Miniature\.io
662
- Miniflux
663
- Mister PiX
664
- mixdata dot com
665
- mixed-content-scan
666
- Mixmax-LinkPreview
667
- mixnode
668
- Mnogosearch
669
- mogimogi
670
- Mojeek
671
- Mojolicious \(Perl\)
672
- Monit\/
673
- monitis
674
- Monitority\/
675
- montastic
676
- MonTools
677
- Moreover
678
- Morfeus Fucking Scanner
679
- Morning Paper
680
- MovableType
681
- mowser
682
- Mrcgiguy
683
- MS Web Services Client Protocol
684
- MSFrontPage
685
- mShots
686
- MuckRack\/
687
- muhstik-scan
688
- MVAClient
689
- MxToolbox\/
690
- nagios
691
- Najdi\.si
692
- Name Intelligence
693
- Nameprotect
694
- Navroad
695
- NearSite
696
- Needle
697
- Nessus
698
- Net Vampire
699
- NetAnts
700
- NETCRAFT
701
- NetLyzer
702
- NetMechanic
703
- NetNewsWire
704
- Netpursual
705
- netresearch
706
- NetShelter ContentScan
707
- Netsparker
708
- NetTrack
709
- Netvibes
710
- NetZIP
711
- Neustar WPM
712
- NeutrinoAPI
713
- NewRelicPinger
714
- NewsBlur .*Finder
715
- NewsGator
716
- newsme
717
- newspaper\/
718
- NetSystemsResearch
719
- Nexgate Ruby Client
720
- NG-Search
721
- Nibbler
722
- NICErsPRO
723
- Nikto
724
- nineconnections
725
- NLNZ_IAHarvester
726
- Nmap Scripting Engine
727
- node-superagent
728
- node-urllib
729
- node\.io
730
- Nodemeter
731
- NodePing
732
- nominet\.org\.uk
733
- nominet\.uk
734
- Norton-Safeweb
735
- Notifixious
736
- notifyninja
737
- NotionEmbedder
738
- nuhk
739
- nutch
740
- Nuzzel
741
- nWormFeedFinder
742
- nyawc\/
743
- Nymesis
744
- NYU
745
- Ocelli\/
746
- Octopus
747
- oegp
748
- Offline Explorer
749
- Offline Navigator
750
- OgScrper
751
- og-scraper
752
- okhttp
753
- omgili
754
- OMSC
755
- Online Domain Tools
756
- OpenCalaisSemanticProxy
757
- Openfind
758
- OpenLinkProfiler
759
- Openstat\/
760
- OpenVAS
761
- Optimizer
762
- Orbiter
763
- OrgProbe\/
764
- orion-semantics
765
- Outlook-Express
766
- Outlook-iOS
767
- ow\.ly
768
- Owler
769
- ownCloud News
770
- OxfordCloudService
771
- Page Valet
772
- page_verifier
773
- page scorer
774
- page2rss
775
- PageGrabber
776
- PagePeeker
777
- PageScorer
778
- Pagespeed\/
779
- Panopta
780
- panscient
781
- Papa Foto
782
- parsijoo
783
- Pavuk
784
- PayPal IPN
785
- pcBrowser
786
- Pcore-HTTP
787
- Pearltrees
788
- PECL::HTTP
789
- peerindex
790
- Peew
791
- PeoplePal
792
- Perlu -
793
- PhantomJS Screenshoter
794
- PhantomJS\/
795
- Photon\/
796
- phpservermon
797
- Pi-Monster
798
- Picscout
799
- Picsearch
800
- PictureFinder
801
- Pimonster
802
- ping\.blo\.gs
803
- Pingability
804
- PingAdmin\.Ru
805
- Pingdom
806
- Pingoscope
807
- PingSpot
808
- pinterest\.com
809
- Pixray
810
- Pizilla
811
- Plagger\/
812
- Ploetz \+ Zeller
813
- Plukkie
814
- plumanalytics
815
- PocketImageCache
816
- PocketParser
817
- Pockey
818
- POE-Component-Client-HTTP
819
- Polymail\/
820
- Pompos
821
- Porkbun
822
- Port Monitor
823
- postano
824
- PostmanRuntime
825
- PostPost
826
- postrank
827
- PowerPoint\/
828
- Priceonomics Analysis Engine
829
- PrintFriendly
830
- PritTorrent
831
- Prlog
832
- probethenet
833
- Project 25499
834
- prospectb2b
835
- Protopage
836
- ProWebWalker
837
- proximic
838
- PRTG Network Monitor
839
- pshtt, https scanning
840
- PTST
841
- PTST\/[0-9]+
842
- Pulsepoint XT3 web scraper
843
- Pump
844
- Python-httplib2
845
- python-requests
846
- Python-urllib
847
- Qirina Hurdler
848
- QQDownload
849
- QrafterPro
850
- Qseero
851
- Qualidator
852
- QueryN Metasearch
853
- queuedriver
854
- Quora Link Preview
855
- Qwantify
856
- Radian6
857
- RankActive
858
- RankFlex
859
- RankSonicSiteAuditor
860
- Re-re Studio
861
- ReactorNetty
862
- Readability
863
- RealDownload
864
- RealPlayer%20Downloader
865
- RebelMouse
866
- Recorder
867
- RecurPost\/
868
- redback\/
869
- ReederForMac
870
- Reeder\/
871
- ReGet
872
- RepoMonkey
873
- request\.js
874
- reqwest\/
875
- ResponseCodeTest
876
- RestSharp
877
- Riddler
878
- Rival IQ
879
- Robosourcer
880
- Robozilla
881
- ROI Hunter
882
- RPT-HTTPClient
883
- RSSOwl
884
- safe-agent-scanner
885
- SalesIntelligent
886
- Saleslift
887
- Sendsay\.Ru
888
- SauceNAO
889
- SBIder
890
- scalaj-http
891
- scan\.lol
892
- ScanAlert
893
- Scoop
894
- scooter
895
- ScoutJet
896
- ScoutURLMonitor
897
- ScrapeBox Page Scanner
898
- SimpleScraper
899
- Scrapy
900
- Screaming
901
- ScreenShotService
902
- Scrubby
903
- Scrutiny\/
904
- search\.thunderstone
905
- Search37
906
- searchenginepromotionhelp
907
- Searchestate
908
- SearchExpress
909
- SearchSight
910
- Seeker
911
- semanticdiscovery
912
- semanticjuice
913
- Semiocast HTTP client
914
- Semrush
915
- sentry\/
916
- SEO Browser
917
- Seo Servis
918
- seo-nastroj\.cz
919
- seo4ajax
920
- Seobility
921
- SEOCentro
922
- SeoCheck
923
- SEOkicks
924
- Seomoz
925
- SEOprofiler
926
- SEOsearch
927
- seoscanners
928
- seositecheckup
929
- SEOstats
930
- servernfo
931
- sexsearcher
932
- Seznam
933
- Shelob
934
- Shodan
935
- Shoppimon
936
- ShopWiki
937
- ShortLinkTranslate
938
- shrinktheweb
939
- Sideqik
940
- SimplePie
941
- SimplyFast
942
- Siphon
943
- SISTRIX
944
- Site-Shot\/
945
- Site Sucker
946
- Site24x7
947
- SiteBar
948
- Sitebeam
949
- Sitebulb\/
950
- SiteCondor
951
- SiteExplorer
952
- SiteGuardian
953
- Siteimprove
954
- SiteIndexed
955
- Sitemap(s)? Generator
956
- SitemapGenerator
957
- SiteMonitor
958
- Siteshooter B0t
959
- SiteSnagger
960
- SiteSucker
961
- SiteTruth
962
- Sitevigil
963
- sitexy\.com
964
- SkypeUriPreview
965
- Slack\/
966
- slider\.com
967
- slurp
968
- SlySearch
969
- SmartDownload
970
- SMRF URL Expander
971
- SMUrlExpander
972
- Snake
973
- Snappy
974
- SnapSearch
975
- Snarfer\/
976
- SniffRSS
977
- sniptracker
978
- Snoopy
979
- SnowHaze Search
980
- sogou web
981
- SortSite
982
- Sottopop
983
- sovereign\.ai
984
- SpaceBison
985
- SpamExperts
986
- Spammen
987
- Spanner
988
- spaziodati
989
- SPDYCheck
990
- Specificfeeds
991
- speedy
992
- SPEng
993
- Spinn3r
994
- spray-can
995
- Sprinklr
996
- spyonweb
997
- sqlmap
998
- Sqlworm
999
- Sqworm
1000
- SSL Labs
1001
- ssl-tools
1002
- StackRambler
1003
- Statastico\/
1004
- StatusCake
1005
- Steeler
1006
- Stratagems Kumo
1007
- Stroke\.cz
1008
- StudioFACA
1009
- StumbleUpon
1010
- suchen
1011
- Sucuri
1012
- summify
1013
- SuperHTTP
1014
- Surphace Scout
1015
- Suzuran
1016
- SwiteScraper
1017
- Symfony BrowserKit
1018
- Symfony2 BrowserKit
1019
- SynHttpClient-Built
1020
- Sysomos
1021
- sysscan
1022
- Szukacz
1023
- T0PHackTeam
1024
- tAkeOut
1025
- Tarantula\/
1026
- Taringa UGC
1027
- TarmotGezgin
1028
- Teleport
1029
- Telesoft
1030
- Telesphoreo
1031
- Telesphorep
1032
- Tenon\.io
1033
- teoma
1034
- terrainformatica
1035
- Test Certificate Info
1036
- testuri
1037
- Tetrahedron
1038
- TextRazor Downloader
1039
- The Drop Reaper
1040
- The Expert HTML Source Viewer
1041
- The Knowledge AI
1042
- The Intraformant
1043
- theinternetrules
1044
- TheNomad
1045
- Thinklab
1046
- Thumbshots
1047
- ThumbSniper
1048
- Thumbor
1049
- timewe\.net
1050
- TinEye
1051
- Tiny Tiny RSS
1052
- TLSProbe\/
1053
- Toata
1054
- topster
1055
- touche\.com
1056
- Traackr\.com
1057
- tracemyfile
1058
- Trackuity
1059
- TrapitAgent
1060
- Trendiction
1061
- Trendsmap
1062
- trendspottr
1063
- truwoGPS
1064
- TryJsoup
1065
- TulipChain
1066
- Turingos
1067
- Turnitin
1068
- tweetedtimes
1069
- Tweetminster
1070
- Tweezler\/
1071
- twibble
1072
- Twice
1073
- Twikle
1074
- Twingly
1075
- Twisted PageGetter
1076
- Typhoeus
1077
- ubermetrics-technologies
1078
- uclassify
1079
- UdmSearch
1080
- unchaos
1081
- unirest-java
1082
- UniversalFeedParser
1083
- Unshorten\.It
1084
- Untiny
1085
- UnwindFetchor
1086
- updated
1087
- updown\.io daemon
1088
- Upflow
1089
- Uptimia
1090
- Urlcheckr
1091
- URL Verifier
1092
- URLitor
1093
- urlresolver
1094
- Urlstat
1095
- URLTester
1096
- UrlTrends Ranking Updater
1097
- URLy Warning
1098
- URLy\.Warning
1099
- Vacuum
1100
- Vagabondo
1101
- VB Project
1102
- vBSEO
1103
- VCI
1104
- via ggpht\.com GoogleImageProxy
1105
- VidibleScraper
1106
- Virusdie
1107
- visionutils
1108
- vkShare
1109
- VoidEYE
1110
- Voil
1111
- voltron
1112
- voyager\/
1113
- VSAgent\/
1114
- VSB-TUO\/
1115
- Vulnbusters Meter
1116
- VYU2
1117
- w3af\.org
1118
- W3C_Unicorn
1119
- W3C-checklink
1120
- W3C-mobileOK
1121
- WAC-OFU
1122
- Wallpapers\/[0-9]+
1123
- WallpapersHD
1124
- wangling
1125
- Wappalyzer
1126
- WatchMouse
1127
- WbSrch\/
1128
- WDT\.io
1129
- web-capture\.net
1130
- Web-sniffer
1131
- Web Auto
1132
- Web Collage
1133
- Web Enhancer
1134
- Web Fetch
1135
- Web Fuck
1136
- Web Pix
1137
- Web Sauger
1138
- Web spyder
1139
- Web Sucker
1140
- Webalta
1141
- Webauskunft
1142
- WebAuto
1143
- WebCapture
1144
- WebClient\/
1145
- webcollage
1146
- WebCookies
1147
- WebCopier
1148
- WebCorp
1149
- WebDataStats
1150
- WebDoc
1151
- WebEnhancer
1152
- WebFetch
1153
- WebFuck
1154
- WebGazer
1155
- WebGo IS
1156
- WebImageCollector
1157
- WebImages
1158
- WebIndex
1159
- webkit2png
1160
- WebLeacher
1161
- webmastercoffee
1162
- webmon\s
1163
- WebPix
1164
- WebReaper
1165
- WebSauger
1166
- webscreenie
1167
- Webshag
1168
- Webshot
1169
- Website Quester
1170
- websitepulse agent
1171
- WebsiteQuester
1172
- Websnapr
1173
- WebSniffer
1174
- Webster
1175
- WebStripper
1176
- WebSucker
1177
- Webthumb\/
1178
- WebThumbnail
1179
- WebWhacker
1180
- WebZIP
1181
- WeLikeLinks
1182
- WEPA
1183
- WeSEE
1184
- wf84
1185
- Wfuzz\/
1186
- wget
1187
- WhatsApp
1188
- WhatsMyIP
1189
- WhatWeb
1190
- WhereGoes\?
1191
- Whibse
1192
- WhoRunsCoinHive
1193
- Whynder Magnet
1194
- Windows-RSS-Platform
1195
- WinPodder
1196
- wkhtmlto
1197
- wmtips
1198
- Woko
1199
- woorankreview
1200
- Word\/
1201
- WordPress\/
1202
- worldping-api
1203
- WordupinfoSearch
1204
- wotbox
1205
- WP Engine Install Performance API
1206
- wpif
1207
- wprecon\.com survey
1208
- WPScan
1209
- wscheck
1210
- Wtrace
1211
- WWW-Collector-E
1212
- WWW-Mechanize
1213
- WWW::Document
1214
- WWW::Mechanize
1215
- www\.monitor\.us
1216
- WWWOFFLE
1217
- x09Mozilla
1218
- x22Mozilla
1219
- XaxisSemanticsClassifier
1220
- Xenu Link Sleuth
1221
- XING-contenttabreceiver
1222
- xpymep([0-9]?)\.exe
1223
- Y!J-(ASR|BSC)
1224
- Y\!J-BRW
1225
- Yaanb
1226
- yacy
1227
- Yahoo Link Preview
1228
- YahooCacheSystem
1229
- YahooYSMcm
1230
- YandeG
1231
- Yandex(?!Search)
1232
- yanga
1233
- yeti
1234
- Yo-yo
1235
- Yoleo Consumer
1236
- yoogliFetchAgent
1237
- YottaaMonitor
1238
- Your-Website-Sucks
1239
- yourls\.org
1240
- YoYs\.net
1241
- YP\.PL
1242
- Zabbix
1243
- Zade
1244
- Zao
1245
- Zauba
1246
- Zemanta Aggregator
1247
- Zend_Http_Client
1248
- Zend\\\Http\\\Client
1249
- Zermelo
1250
- Zeus
1251
- zgrab
1252
- ZnajdzFoto
1253
- ZnHTTP
1254
- Zombie\.js
1255
- Zoom\.Mac
1256
- ZyBorg
1257
- [a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)
1258
- ].strip.split(/\n+/).freeze
7
+ extend Loader
8
+
9
+ def self.data
10
+ @data ||= load_raw(CrawlerDetect.config.settings.raw_crawlers_path).freeze
11
+ end
1259
12
  end
1260
13
  end
1261
14
  end