crawler_detect 0.1.9 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "crawler_detect/detector"
4
- require "crawler_detect/library"
5
- require "crawler_detect/library/crawlers"
6
- require "crawler_detect/library/exclusions"
7
- require "crawler_detect/library/headers"
8
- require "crawler_detect/version"
3
+ require "oj"
4
+ require "qonfig"
9
5
 
10
- require "rack/crawler_detect"
6
+ require_relative "crawler_detect/config"
7
+ require_relative "crawler_detect/detector"
8
+ require_relative "crawler_detect/library/loader"
9
+ require_relative "crawler_detect/library/crawlers"
10
+ require_relative "crawler_detect/library/exclusions"
11
+ require_relative "crawler_detect/library/headers"
12
+ require_relative "crawler_detect/library"
13
+ require_relative "crawler_detect/version"
14
+ require_relative "rack/crawler_detect"
11
15
 
16
+ # @since 0.1.0
12
17
  module CrawlerDetect
13
18
  class << self
19
+ # @param user_agent [String] User-agent string to detect
20
+ # @return [CrawlerDetect::Detector] Instance of detector class
14
21
  def new(user_agent)
15
22
  detector(user_agent)
16
23
  end
17
24
 
25
+ # @param user_agent [String] User-agent string to detect
26
+ # @return [true, false] Is User-agent a crawler?
18
27
  def is_crawler?(user_agent)
19
28
  detector(user_agent).is_crawler?
20
29
  end
21
30
 
31
+ # @since 1.0.0
32
+ # @param config [Proc] Block forwarded to CrawlerDetect::Config.new; afterwards reload_data is called on each class in Library::DATA_CLASSES
33
+ def setup!(&config)
34
+ @config = CrawlerDetect::Config.new(&config)
35
+ Library::DATA_CLASSES.each(&:reload_data)
36
+ end
37
+
38
+ # @since 1.0.0
39
+ # @return [CrawlerDetect::Config] Instance of configuration class
40
+ def config
41
+ @config ||= CrawlerDetect::Config.new
42
+ end
43
+
22
44
  private
23
45
 
24
46
  def detector(user_agent)
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CrawlerDetect
4
+ # Configuration of CrawlerDetect
5
+ #
6
+ # @see settings
7
+ # @since 1.0.0
8
+ class Config < ::Qonfig::DataSet
9
+ CUR_PATH = File.dirname(File.expand_path(__FILE__)).freeze
10
+ RAW_PATH = File.join(CUR_PATH, "library/raw").freeze
11
+
12
+ RAW_CRAWLERS_PATH = File.join(RAW_PATH, "Crawlers.json").freeze
13
+ RAW_EXCLUSIONS_PATH = File.join(RAW_PATH, "Exclusions.json").freeze
14
+ RAW_HEADERS_PATH = File.join(RAW_PATH, "Headers.json").freeze
15
+
16
+ # @return [String] path to crawlers raw JSON file
17
+ setting :raw_crawlers_path, RAW_CRAWLERS_PATH
18
+
19
+ # @return [String] path to exclusions raw JSON file
20
+ setting :raw_exclusions_path, RAW_EXCLUSIONS_PATH
21
+
22
+ # @return [String] path to headers raw JSON file
23
+ setting :raw_headers_path, RAW_HEADERS_PATH
24
+
25
+ validate :raw_crawlers_path, :string, strict: true
26
+ validate :raw_exclusions_path, :string, strict: true
27
+ validate :raw_headers_path, :string, strict: true
28
+ end
29
+ end
@@ -1,17 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  class Detector
6
+ # @param user_agent [String] User-agent string to detect
7
+ # @return [CrawlerDetect::Detector] instance of detector class
5
8
  def initialize(user_agent)
6
9
  @user_agent = user_agent.to_s.dup
7
10
  end
8
11
 
12
+ # @return [true, false] Is User-agent a crawler? NOTE(review): the memoization below only caches a true result; a false result is recomputed on every call
9
13
  def is_crawler?
10
14
  @is_crawler ||= begin
11
15
  !completely_exclusion? && matches_crawler_list?
12
16
  end
13
17
  end
14
18
 
19
+ # @return [String, nil] The detected crawler name from RAW data, or nil when the User-agent is not a crawler
15
20
  def crawler_name
16
21
  return unless is_crawler?
17
22
  @crawler_name
@@ -19,22 +24,30 @@ module CrawlerDetect
19
24
 
20
25
  private
21
26
 
22
- def completely_exclusion?
23
- @user_agent.gsub!(exclusions_matcher, "")
24
- @user_agent.strip.length == 0
25
- end
27
+ # @private
28
+ # @return [true, false] true when only whitespace remains after exclusion (white-list) matches are removed; @user_agent is modified in place
29
+ def completely_exclusion?
30
+ @user_agent.gsub!(exclusions_matcher, "")
31
+ @user_agent.strip.length.zero?
32
+ end
26
33
 
27
- def matches_crawler_list?
28
- @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
29
- !@crawler_name.empty?
30
- end
34
+ # @private
35
+ # @return [true, false] true when the crawlers (black-list) regexp yields a non-empty match; the stripped match text is stored in @crawler_name
36
+ def matches_crawler_list?
37
+ @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
38
+ !@crawler_name.empty?
39
+ end
31
40
 
32
- def exclusions_matcher
33
- CrawlerDetect::Library.get_regexp("exclusions")
34
- end
41
+ # @private
42
+ # @return [Regexp] White-list of User-agents
43
+ def exclusions_matcher
44
+ CrawlerDetect::Library.get_regexp("exclusions")
45
+ end
35
46
 
36
- def crawlers_matcher
37
- CrawlerDetect::Library.get_regexp("crawlers")
38
- end
47
+ # @private
48
+ # @return [Regexp] Black-list of User-agents
49
+ def crawlers_matcher
50
+ CrawlerDetect::Library.get_regexp("crawlers")
51
+ end
39
52
  end
40
53
  end
@@ -1,16 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  module Library
6
+ DATA_CLASSES = [Library::Headers, Library::Exclusions, Library::Crawlers].freeze
7
+
5
8
  class << self
9
+ # @param param [String] Name of raw data, e.g. "crawlers" or "exclusions"
10
+ # @return [Regexp] Case-insensitive regexp joining every entry returned by get_array(param) with "|"
6
11
  def get_regexp(param)
7
12
  data = get_array(param)
8
- %r[#{data.join('|')}]i
13
+ %r{#{data.join('|')}}i
9
14
  end
10
15
 
16
+ # @param param [String] Name of raw data; its capitalized form selects the class under CrawlerDetect::Library
17
+ # @return [Array] Result of calling data on the selected class
11
18
  def get_array(param)
12
- const_name = "CrawlerDetect::Library::#{param.capitalize}::#{param.upcase}"
13
- const_get(const_name)
19
+ const_get("CrawlerDetect::Library::#{param.capitalize}").send(:data)
14
20
  end
15
21
  end
16
22
  end
@@ -1,1257 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Layout/TrailingWhitespace
4
3
  module CrawlerDetect
5
4
  module Library
5
+ # @since 0.1.0
6
6
  module Crawlers
7
- CRAWLERS = %q[
8
- .*Java.*outbrain
9
- YLT
10
- ^b0t$
11
- ^bluefish
12
- ^Calypso v\/
13
- ^COMODO DCV
14
- ^DangDang
15
- ^DavClnt
16
- ^FDM
17
- ^git\/
18
- ^Goose\/
19
- ^Grabber
20
- ^HTTPClient\/
21
- ^Java\/
22
- ^Jeode\/
23
- ^Jetty\/
24
- ^Mail\/
25
- ^Mget
26
- ^Microsoft URL Control
27
- ^NG\/[0-9\.]
28
- ^NING\/
29
- ^PHP\/[0-9]
30
- ^RMA\/
31
- ^Ruby|Ruby\/[0-9]
32
- ^VSE\/[0-9]
33
- ^WordPress\.com
34
- ^XRL\/[0-9]
35
- ^ZmEu
36
- 008\/
37
- 13TABS
38
- 192\.comAgent
39
- 2ip\.ru
40
- 404enemy
41
- 7Siters
42
- 80legs
43
- a\.pr-cy\.ru
44
- a3logics\.in
45
- A6-Indexer
46
- Abonti
47
- Aboundex
48
- aboutthedomain
49
- Accoona-AI-Agent
50
- acoon
51
- acrylicapps\.com\/pulp
52
- Acunetix
53
- AdAuth\/
54
- adbeat
55
- AddThis
56
- ADmantX
57
- AdminLabs
58
- adressendeutschland
59
- adscanner
60
- Adstxtaggregator
61
- adstxt-worker
62
- adstxt\.com
63
- agentslug
64
- AHC
65
- aihit
66
- aiohttp\/
67
- Airmail
68
- akka-http\/
69
- akula\/
70
- alertra
71
- alexa site audit
72
- Alibaba\.Security\.Heimdall
73
- Alligator
74
- allloadin
75
- AllSubmitter
76
- alyze\.info
77
- amagit
78
- ^Amazon Simple Notification Service Agent$
79
- Anarchie
80
- AndroidDownloadManager
81
- Anemone
82
- AngleSharp
83
- annotate_google
84
- Ant\.com
85
- Anturis Agent
86
- AnyEvent-HTTP\/
87
- Apache Droid
88
- Apache OpenOffice
89
- Apache-HttpAsyncClient
90
- Apache-HttpClient
91
- ApacheBench
92
- Apexoo
93
- APIs-Google
94
- AportWorm\/
95
- AppBeat\/
96
- AppEngine-Google
97
- AppleSyndication
98
- AppStoreScraperZ
99
- Aprc\/[0-9]
100
- Arachmo
101
- arachnode
102
- Arachnophilia
103
- aria2
104
- Arukereso
105
- asafaweb
106
- AskQuickly
107
- Ask Jeeves
108
- ASPSeek
109
- Asterias
110
- Astute
111
- asynchttp
112
- Attach
113
- autocite
114
- AutomaticWPTester
115
- Autonomy
116
- axios\/
117
- B-l-i-t-z-B-O-T
118
- Backlink-Ceck
119
- backlink-check
120
- BacklinkHttpStatus
121
- BackStreet
122
- BackWeb
123
- Bad-Neighborhood
124
- Badass
125
- baidu\.com
126
- Bandit
127
- basicstate
128
- BatchFTP
129
- Battlezta Bazinga
130
- baypup\/
131
- BazQux
132
- BBBike
133
- BCKLINKS
134
- BDFetch
135
- BegunAdvertising
136
- Bidtellect
137
- BigBozz
138
- Bigfoot
139
- biglotron
140
- BingLocalSearch
141
- BingPreview
142
- binlar
143
- biNu image cacher
144
- Bitacle
145
- biz_Directory
146
- Black Hole
147
- Blackboard Safeassign
148
- BlackWidow
149
- BlockNote\.Net
150
- Bloglines
151
- Bloglovin
152
- BlogPulseLive
153
- BlogSearch
154
- Blogtrottr
155
- BlowFish
156
- boitho\.com-dc
157
- BPImageWalker
158
- Braintree-Webhooks
159
- Branch Metrics API
160
- Branch-Passthrough
161
- Brandprotect
162
- BrandVerity
163
- Brandwatch
164
- Brodie\/
165
- Browsershots
166
- BUbiNG
167
- Buck\/
168
- Buddy
169
- BuiltWith
170
- Bullseye
171
- BunnySlippers
172
- Burf Search
173
- Butterfly\/
174
- BuzzSumo
175
- CAAM\/[0-9]
176
- CakePHP
177
- Calculon
178
- Canary%20Mail
179
- CaretNail
180
- catexplorador
181
- CC Metadata Scaper
182
- Cegbfeieh
183
- censys
184
- Cerberian Drtrs
185
- CERT\.at-Statistics-Survey
186
- cg-eye
187
- changedetection
188
- ChangesMeter
189
- Charlotte
190
- CheckHost
191
- checkprivacy
192
- CherryPicker
193
- ChinaClaw
194
- Chirp\/
195
- chkme\.com
196
- Chlooe
197
- Chromaxa
198
- CirrusExplorer
199
- CISPA Vulnerability Notification
200
- Citoid
201
- CJNetworkQuality
202
- Clarsentia
203
- clips\.ua\.ac\.be
204
- Cloud mapping
205
- CloudEndure
206
- CloudFlare-AlwaysOnline
207
- Cloudinary
208
- cmcm\.com
209
- coccoc
210
- cognitiveseo
211
- colly -
212
- CommaFeed
213
- Commons-HttpClient
214
- commonscan
215
- contactbigdatafr
216
- contentkingapp
217
- convera
218
- CookieReports
219
- copyright sheriff
220
- CopyRightCheck
221
- Copyscape
222
- cortex\/
223
- Cosmos4j\.feedback
224
- Covario-IDS
225
- Craw\/
226
- Crescent
227
- Crowsnest
228
- Criteo
229
- CSHttp
230
- CSSCheck
231
- curb
232
- Curious George
233
- curl
234
- cuwhois\/
235
- cybo\.com
236
- DAP\/NetHTTP
237
- DareBoost
238
- DatabaseDriverMysqli
239
- DataCha0s
240
- Datafeedwatch
241
- Datanyze
242
- DataparkSearch
243
- dataprovider
244
- DataXu
245
- Daum(oa)?[ \/][0-9]
246
- dBpoweramp
247
- ddline
248
- deeris
249
- Demon
250
- DeuSu
251
- developers\.google\.com\/\+\/web\/snippet\/
252
- Devil
253
- Digg
254
- Digincore
255
- DigitalPebble
256
- Dirbuster
257
- Discourse Forum Onebox
258
- Disqus\/
259
- Dispatch\/
260
- DittoSpyder
261
- dlvr
262
- DMBrowser
263
- DNSPod-reporting
264
- docoloc
265
- Dolphin http client
266
- DomainAppender
267
- Donuts Content Explorer
268
- dotMailer content retrieval
269
- dotSemantic
270
- downforeveryoneorjustme
271
- Download Wonder
272
- downnotifier
273
- DowntimeDetector
274
- Drip
275
- drupact
276
- Drupal \(\+http:\/\/drupal\.org\/\)
277
- DTS Agent
278
- dubaiindex
279
- DuplexWeb-Google
280
- EARTHCOM
281
- Easy-Thumb
282
- EasyDL
283
- Ebingbong
284
- ec2linkfinder
285
- eCairn-Grabber
286
- eCatch
287
- ECCP
288
- eContext\/
289
- Ecxi
290
- EirGrabber
291
- ElectricMonk
292
- elefent
293
- EMail Exractor
294
- EMail Wolf
295
- EmailWolf
296
- Embarcadero
297
- Embed PHP Library
298
- Embedly
299
- endo\/
300
- europarchive\.org
301
- evc-batch
302
- EventMachine HttpClient
303
- Everwall Link Expander
304
- Evidon
305
- Evrinid
306
- ExactSearch
307
- ExaleadCloudview
308
- Excel\/
309
- exif
310
- Exploratodo
311
- Express WebPictures
312
- Extreme Picture Finder
313
- EyeNetIE
314
- ezooms
315
- facebookexternalhit
316
- facebookexternalua
317
- facebookplatform
318
- fairshare
319
- Faraday v
320
- fasthttp
321
- Faveeo
322
- Favicon downloader
323
- faviconkit
324
- faviconarchive
325
- FavOrg
326
- Feed Wrangler
327
- Feedable\/
328
- Feedbin
329
- FeedBooster
330
- FeedBucket
331
- FeedBunch\/
332
- FeedBurner
333
- feeder
334
- Feedly
335
- FeedshowOnline
336
- Feedspot
337
- Feedwind\/
338
- FeedZcollector
339
- feeltiptop
340
- Fetch API
341
- Fetch\/[0-9]
342
- Fever\/[0-9]
343
- FHscan
344
- Fimap
345
- findlink
346
- findthatfile
347
- FlashGet
348
- FlipboardBrowserProxy
349
- FlipboardProxy
350
- FlipboardRSS
351
- Flock\/
352
- fluffy
353
- Flunky
354
- flynxapp
355
- forensiq
356
- FoundSeoTool
357
- http:\/\/www.neomo.de\/
358
- free thumbnails
359
- Freeuploader
360
- Funnelback
361
- G-i-g-a-b-o-t
362
- g00g1e\.net
363
- ganarvisitas
364
- geek-tools
365
- Genieo
366
- GentleSource
367
- GetCode
368
- Getintent
369
- GetLinkInfo
370
- getprismatic
371
- GetRight
372
- getroot
373
- GetURLInfo\/
374
- GetWeb
375
- Geziyor
376
- Ghost Inspector
377
- GigablastOpenSource
378
- GIS-LABS
379
- github-camo
380
- github\.com
381
- Go [\d\.]* package http
382
- Go http package
383
- Go-Ahead-Got-It
384
- Go-http-client
385
- Go!Zilla
386
- gobyus
387
- gofetch
388
- GomezAgent
389
- gooblog
390
- Goodzer\/
391
- Google AppsViewer
392
- Google Desktop
393
- Google favicon
394
- Google Keyword Suggestion
395
- Google Keyword Tool
396
- Google Page Speed Insights
397
- Google PP Default
398
- Google Search Console
399
- Google Web Preview
400
- Google-Adwords
401
- Google-Apps-Script
402
- Google-Calendar-Importer
403
- Google-HotelAdsVerifier
404
- Google-HTTP-Java-Client
405
- Google-Publisher-Plugin
406
- Google-Read-Aloud
407
- Google-SearchByImage
408
- Google-Site-Verification
409
- Google-Structured-Data-Testing-Tool
410
- Google-Youtube-Links
411
- google-xrawler
412
- GoogleDocs
413
- GoogleHC\/
414
- GoogleProducer
415
- GoogleSites
416
- Google-Transparency-Report
417
- Gookey
418
- GoScraper
419
- GoSpotCheck
420
- gosquared-thumbnailer
421
- Gotit
422
- GoZilla
423
- grabify
424
- GrabNet
425
- Grafula
426
- Grammarly
427
- GrapeFX
428
- GreatNews
429
- Gregarius
430
- GRequests
431
- grokkit
432
- grouphigh
433
- grub-client
434
- gSOAP\/
435
- GT::WWW
436
- GTmetrix
437
- GuzzleHttp
438
- gvfs\/
439
- HAA(A)?RTLAND http client
440
- Haansoft
441
- hackney\/
442
- Hadi Agent
443
- HappyApps-WebCheck
444
- Hatena
445
- Havij
446
- HeadlessChrome
447
- HEADMasterSEO
448
- HeartRails_Capture
449
- help@dataminr\.com
450
- heritrix
451
- historious
452
- hkedcity
453
- hledejLevne\.cz
454
- Hloader
455
- HMView
456
- Holmes
457
- HonesoSearchEngine
458
- HootSuite Image proxy
459
- Hootsuite-WebFeed
460
- hosterstats
461
- HostTracker
462
- ht:\/\/check
463
- htdig
464
- HTMLparser
465
- htmlyse
466
- HTTP Banner Detection
467
- HTTP_Compression_Test
468
- http_request2
469
- http_requester
470
- http-get
471
- HTTP-Header-Abfrage
472
- http-kit
473
- http-request\/
474
- HTTP-Tiny
475
- HTTP::Lite
476
- http\.rb\/
477
- http_get
478
- HttpComponents
479
- httphr
480
- HTTPMon
481
- HTTPie
482
- httpRequest
483
- httpscheck
484
- httpssites_power
485
- httpunit
486
- HttpUrlConnection
487
- httrack
488
- huaweisymantec
489
- HubSpot
490
- Humanlinks
491
- i2kconnect\/
492
- Iblog
493
- ichiro
494
- Id-search
495
- IdeelaborPlagiaat
496
- IDG Twitter Links Resolver
497
- IDwhois\/
498
- Iframely
499
- igdeSpyder
500
- IlTrovatore
501
- Image Fetch
502
- Image Sucker
503
- ImageEngine\/
504
- ImageVisu\/
505
- Imagga
506
- imagineeasy
507
- imgsizer
508
- InAGist
509
- inbound\.li parser
510
- InDesign%20CC
511
- Indy Library
512
- InetURL
513
- infegy
514
- infohelfer
515
- InfoTekies
516
- InfoWizards Reciprocal Link
517
- inpwrd\.com
518
- instabid
519
- Instapaper
520
- Integrity
521
- integromedb
522
- Intelliseek
523
- InterGET
524
- internet_archive
525
- Internet Ninja
526
- InternetSeer
527
- internetVista monitor
528
- internetwache
529
- intraVnews
530
- IODC
531
- IOI
532
- iplabel
533
- ips-agent
534
- IPS\/[0-9]
535
- IPWorks HTTP\/S Component
536
- iqdb\/
537
- Iria
538
- Irokez
539
- isitup\.org
540
- iskanie
541
- isUp\.li
542
- iThemes Sync\/
543
- IZaBEE
544
- iZSearch
545
- JAHHO
546
- janforman
547
- Jaunt\/
548
- Jbrofuzz
549
- Jersey\/
550
- JetCar
551
- Jigsaw
552
- Jobboerse
553
- JobFeed discovery
554
- Jobg8 URL Monitor
555
- jobo
556
- Jobrapido
557
- Jobsearch1\.5
558
- JoinVision Generic
559
- JolokiaPwn
560
- Joomla
561
- Jorgee
562
- JS-Kit
563
- JustView
564
- Kaspersky Lab CFR link resolver
565
- Kelny\/
566
- Kerrigan\/
567
- KeyCDN
568
- Keyword Density
569
- Keywords Research
570
- khttp\/
571
- KickFire
572
- KimonoLabs\/
573
- Kml-Google
574
- knows\.is
575
- KOCMOHABT
576
- kouio
577
- kube-probe
578
- kulturarw3
579
- KumKie
580
- L\.webis
581
- Larbin
582
- Lavf\/
583
- LeechFTP
584
- LeechGet
585
- letsencrypt
586
- Lftp
587
- LibVLC
588
- LibWeb
589
- Libwhisker
590
- libwww
591
- Licorne
592
- Liferea\/
593
- Lightspeedsystems
594
- Lighthouse
595
- Likse
596
- Link Valet
597
- link_thumbnailer
598
- LinkAlarm\/
599
- linkCheck
600
- linkdex
601
- LinkExaminer
602
- linkfluence
603
- linkpeek
604
- LinkPreviewGenerator
605
- LinkScan
606
- LinksManager
607
- LinkTiger
608
- LinkWalker
609
- Lipperhey
610
- Litemage_walker
611
- livedoor ScreenShot
612
- LoadImpactRload
613
- localsearch-web
614
- LongURL API
615
- looid\.com
616
- looksystems\.net
617
- ltx71
618
- lua-resty-http
619
- lwp-request
620
- lwp-trivial
621
- LWP::Simple
622
- lycos
623
- LYT\.SR
624
- mabontland
625
- Mag-Net
626
- MagpieRSS
627
- Mail\.Ru
628
- MailChimp
629
- Majestic12
630
- makecontact\/
631
- Mandrill
632
- MapperCmd
633
- marketinggrader
634
- MarkMonitor
635
- MarkWatch
636
- Mass Downloader
637
- masscan\/
638
- Mata Hari
639
- Mediametric
640
- Mediapartners-Google
641
- mediawords
642
- MegaIndex\.ru
643
- MeltwaterNews
644
- Melvil Rawi
645
- MemGator
646
- Metaspinner
647
- MetaURI
648
- MFC_Tear_Sample
649
- Microsearch
650
- Microsoft Office
651
- Microsoft Outlook
652
- Microsoft Windows Network Diagnostics
653
- Microsoft-WebDAV-MiniRedir
654
- Microsoft Data Access
655
- MIDown tool
656
- MIIxpc
657
- Mindjet
658
- Miniature\.io
659
- Miniflux
660
- Mister PiX
661
- mixdata dot com
662
- mixed-content-scan
663
- Mixmax-LinkPreview
664
- mixnode
665
- Mnogosearch
666
- mogimogi
667
- Mojeek
668
- Mojolicious \(Perl\)
669
- Monit\/
670
- monitis
671
- Monitority\/
672
- montastic
673
- MonTools
674
- Moreover
675
- Morfeus Fucking Scanner
676
- Morning Paper
677
- MovableType
678
- mowser
679
- Mrcgiguy
680
- MS Web Services Client Protocol
681
- MSFrontPage
682
- mShots
683
- MuckRack\/
684
- muhstik-scan
685
- MVAClient
686
- MxToolbox\/
687
- nagios
688
- Najdi\.si
689
- Name Intelligence
690
- Nameprotect
691
- Navroad
692
- NearSite
693
- Needle
694
- Nessus
695
- Net Vampire
696
- NetAnts
697
- NETCRAFT
698
- NetLyzer
699
- NetMechanic
700
- NetNewsWire
701
- Netpursual
702
- netresearch
703
- NetShelter ContentScan
704
- Netsparker
705
- NetTrack
706
- Netvibes
707
- NetZIP
708
- Neustar WPM
709
- NeutrinoAPI
710
- NewRelicPinger
711
- NewsBlur .*Finder
712
- NewsGator
713
- newsme
714
- newspaper\/
715
- Nexgate Ruby Client
716
- NG-Search
717
- Nibbler
718
- NICErsPRO
719
- Nikto
720
- nineconnections
721
- NLNZ_IAHarvester
722
- Nmap Scripting Engine
723
- node-superagent
724
- node-urllib
725
- node\.io
726
- Nodemeter
727
- NodePing
728
- nominet\.org\.uk
729
- nominet\.uk
730
- Norton-Safeweb
731
- Notifixious
732
- notifyninja
733
- NotionEmbedder
734
- nuhk
735
- nutch
736
- Nuzzel
737
- nWormFeedFinder
738
- nyawc\/
739
- Nymesis
740
- NYU
741
- Ocelli\/
742
- Octopus
743
- oegp
744
- Offline Explorer
745
- Offline Navigator
746
- OgScrper
747
- og-scraper
748
- okhttp
749
- omgili
750
- OMSC
751
- Online Domain Tools
752
- OpenCalaisSemanticProxy
753
- Openfind
754
- OpenLinkProfiler
755
- Openstat\/
756
- OpenVAS
757
- Optimizer
758
- Orbiter
759
- OrgProbe\/
760
- orion-semantics
761
- Outlook-Express
762
- Outlook-iOS
763
- ow\.ly
764
- Owler
765
- ownCloud News
766
- OxfordCloudService
767
- Page Valet
768
- page_verifier
769
- page scorer
770
- page2rss
771
- PageGrabber
772
- PagePeeker
773
- PageScorer
774
- Pagespeed\/
775
- Panopta
776
- panscient
777
- Papa Foto
778
- parsijoo
779
- Pavuk
780
- PayPal IPN
781
- pcBrowser
782
- Pcore-HTTP
783
- Pearltrees
784
- PECL::HTTP
785
- peerindex
786
- Peew
787
- PeoplePal
788
- Perlu -
789
- PhantomJS Screenshoter
790
- PhantomJS\/
791
- Photon\/
792
- phpservermon
793
- Pi-Monster
794
- Picscout
795
- Picsearch
796
- PictureFinder
797
- Pimonster
798
- ping\.blo\.gs
799
- Pingability
800
- PingAdmin\.Ru
801
- Pingdom
802
- Pingoscope
803
- PingSpot
804
- pinterest\.com
805
- Pixray
806
- Pizilla
807
- Plagger\/
808
- Ploetz \+ Zeller
809
- Plukkie
810
- plumanalytics
811
- PocketImageCache
812
- PocketParser
813
- Pockey
814
- POE-Component-Client-HTTP
815
- Polymail\/
816
- Pompos
817
- Porkbun
818
- Port Monitor
819
- postano
820
- PostmanRuntime
821
- PostPost
822
- postrank
823
- PowerPoint\/
824
- Priceonomics Analysis Engine
825
- PrintFriendly
826
- PritTorrent
827
- Prlog
828
- probethenet
829
- Project 25499
830
- prospectb2b
831
- Protopage
832
- ProWebWalker
833
- proximic
834
- PRTG Network Monitor
835
- pshtt, https scanning
836
- PTST
837
- PTST\/[0-9]+
838
- Pulsepoint XT3 web scraper
839
- Pump
840
- Python-httplib2
841
- python-requests
842
- Python-urllib
843
- Qirina Hurdler
844
- QQDownload
845
- QrafterPro
846
- Qseero
847
- Qualidator
848
- QueryN Metasearch
849
- queuedriver
850
- Quora Link Preview
851
- Qwantify
852
- Radian6
853
- RankActive
854
- RankFlex
855
- RankSonicSiteAuditor
856
- Re-re Studio
857
- ReactorNetty
858
- Readability
859
- RealDownload
860
- RealPlayer%20Downloader
861
- RebelMouse
862
- Recorder
863
- RecurPost\/
864
- redback\/
865
- ReederForMac
866
- Reeder\/
867
- ReGet
868
- RepoMonkey
869
- request\.js
870
- reqwest\/
871
- ResponseCodeTest
872
- RestSharp
873
- Riddler
874
- Rival IQ
875
- Robosourcer
876
- Robozilla
877
- ROI Hunter
878
- RPT-HTTPClient
879
- RSSOwl
880
- safe-agent-scanner
881
- SalesIntelligent
882
- Saleslift
883
- Sendsay\.Ru
884
- SauceNAO
885
- SBIder
886
- scalaj-http
887
- scan\.lol
888
- ScanAlert
889
- Scoop
890
- scooter
891
- ScoutJet
892
- ScoutURLMonitor
893
- ScrapeBox Page Scanner
894
- SimpleScraper
895
- Scrapy
896
- Screaming
897
- ScreenShotService
898
- Scrubby
899
- Scrutiny\/
900
- search\.thunderstone
901
- Search37
902
- searchenginepromotionhelp
903
- Searchestate
904
- SearchExpress
905
- SearchSight
906
- Seeker
907
- semanticdiscovery
908
- semanticjuice
909
- Semiocast HTTP client
910
- Semrush
911
- sentry\/
912
- SEO Browser
913
- Seo Servis
914
- seo-nastroj\.cz
915
- seo4ajax
916
- Seobility
917
- SEOCentro
918
- SeoCheck
919
- SEOkicks
920
- Seomoz
921
- SEOprofiler
922
- SEOsearch
923
- seoscanners
924
- seositecheckup
925
- SEOstats
926
- servernfo
927
- sexsearcher
928
- Seznam
929
- Shelob
930
- Shodan
931
- Shoppimon
932
- ShopWiki
933
- ShortLinkTranslate
934
- shrinktheweb
935
- Sideqik
936
- SimplePie
937
- SimplyFast
938
- Siphon
939
- SISTRIX
940
- Site-Shot\/
941
- Site Sucker
942
- Site24x7
943
- SiteBar
944
- Sitebeam
945
- Sitebulb\/
946
- SiteCondor
947
- SiteExplorer
948
- SiteGuardian
949
- Siteimprove
950
- SiteIndexed
951
- Sitemap(s)? Generator
952
- SitemapGenerator
953
- SiteMonitor
954
- Siteshooter B0t
955
- SiteSnagger
956
- SiteSucker
957
- SiteTruth
958
- Sitevigil
959
- sitexy\.com
960
- SkypeUriPreview
961
- Slack\/
962
- slider\.com
963
- slurp
964
- SlySearch
965
- SmartDownload
966
- SMRF URL Expander
967
- SMUrlExpander
968
- Snake
969
- Snappy
970
- SnapSearch
971
- Snarfer\/
972
- SniffRSS
973
- sniptracker
974
- Snoopy
975
- SnowHaze Search
976
- sogou web
977
- SortSite
978
- Sottopop
979
- sovereign\.ai
980
- SpaceBison
981
- SpamExperts
982
- Spammen
983
- Spanner
984
- spaziodati
985
- SPDYCheck
986
- Specificfeeds
987
- speedy
988
- SPEng
989
- Spinn3r
990
- spray-can
991
- Sprinklr
992
- spyonweb
993
- sqlmap
994
- Sqlworm
995
- Sqworm
996
- SSL Labs
997
- ssl-tools
998
- StackRambler
999
- Statastico\/
1000
- StatusCake
1001
- Steeler
1002
- Stratagems Kumo
1003
- Stroke\.cz
1004
- StudioFACA
1005
- StumbleUpon
1006
- suchen
1007
- Sucuri
1008
- summify
1009
- SuperHTTP
1010
- Surphace Scout
1011
- Suzuran
1012
- SwiteScraper
1013
- Symfony BrowserKit
1014
- Symfony2 BrowserKit
1015
- SynHttpClient-Built
1016
- Sysomos
1017
- sysscan
1018
- Szukacz
1019
- T0PHackTeam
1020
- tAkeOut
1021
- Tarantula\/
1022
- Taringa UGC
1023
- TarmotGezgin
1024
- Teleport
1025
- Telesoft
1026
- Telesphoreo
1027
- Telesphorep
1028
- Tenon\.io
1029
- teoma
1030
- terrainformatica
1031
- Test Certificate Info
1032
- testuri
1033
- Tetrahedron
1034
- TextRazor Downloader
1035
- The Drop Reaper
1036
- The Expert HTML Source Viewer
1037
- The Knowledge AI
1038
- The Intraformant
1039
- theinternetrules
1040
- TheNomad
1041
- Thinklab
1042
- Thumbshots
1043
- ThumbSniper
1044
- Thumbor
1045
- timewe\.net
1046
- TinEye
1047
- Tiny Tiny RSS
1048
- TLSProbe\/
1049
- Toata
1050
- topster
1051
- touche\.com
1052
- Traackr\.com
1053
- tracemyfile
1054
- Trackuity
1055
- TrapitAgent
1056
- Trendiction
1057
- Trendsmap
1058
- trendspottr
1059
- truwoGPS
1060
- TryJsoup
1061
- TulipChain
1062
- Turingos
1063
- Turnitin
1064
- tweetedtimes
1065
- Tweetminster
1066
- Tweezler\/
1067
- twibble
1068
- Twice
1069
- Twikle
1070
- Twingly
1071
- Twisted PageGetter
1072
- Typhoeus
1073
- ubermetrics-technologies
1074
- uclassify
1075
- UdmSearch
1076
- unchaos
1077
- unirest-java
1078
- UniversalFeedParser
1079
- Unshorten\.It
1080
- Untiny
1081
- UnwindFetchor
1082
- updated
1083
- updown\.io daemon
1084
- Upflow
1085
- Uptimia
1086
- Urlcheckr
1087
- URL Verifier
1088
- URLitor
1089
- urlresolver
1090
- Urlstat
1091
- URLTester
1092
- UrlTrends Ranking Updater
1093
- URLy Warning
1094
- URLy\.Warning
1095
- Vacuum
1096
- Vagabondo
1097
- VB Project
1098
- vBSEO
1099
- VCI
1100
- via ggpht\.com GoogleImageProxy
1101
- VidibleScraper
1102
- Virusdie
1103
- visionutils
1104
- vkShare
1105
- VoidEYE
1106
- Voil
1107
- voltron
1108
- voyager\/
1109
- VSAgent\/
1110
- VSB-TUO\/
1111
- Vulnbusters Meter
1112
- VYU2
1113
- w3af\.org
1114
- W3C_Unicorn
1115
- W3C-checklink
1116
- W3C-mobileOK
1117
- WAC-OFU
1118
- Wallpapers\/[0-9]+
1119
- WallpapersHD
1120
- wangling
1121
- Wappalyzer
1122
- WatchMouse
1123
- WbSrch\/
1124
- WDT\.io
1125
- web-capture\.net
1126
- Web-sniffer
1127
- Web Auto
1128
- Web Collage
1129
- Web Enhancer
1130
- Web Fetch
1131
- Web Fuck
1132
- Web Pix
1133
- Web Sauger
1134
- Web spyder
1135
- Web Sucker
1136
- Webalta
1137
- Webauskunft
1138
- WebAuto
1139
- WebCapture
1140
- WebClient\/
1141
- webcollage
1142
- WebCookies
1143
- WebCopier
1144
- WebCorp
1145
- WebDataStats
1146
- WebDoc
1147
- WebEnhancer
1148
- WebFetch
1149
- WebFuck
1150
- WebGazer
1151
- WebGo IS
1152
- WebImageCollector
1153
- WebImages
1154
- WebIndex
1155
- webkit2png
1156
- WebLeacher
1157
- webmastercoffee
1158
- webmon\s
1159
- WebPix
1160
- WebReaper
1161
- WebSauger
1162
- webscreenie
1163
- Webshag
1164
- Webshot
1165
- Website Quester
1166
- websitepulse agent
1167
- WebsiteQuester
1168
- Websnapr
1169
- WebSniffer
1170
- Webster
1171
- WebStripper
1172
- WebSucker
1173
- Webthumb\/
1174
- WebThumbnail
1175
- WebWhacker
1176
- WebZIP
1177
- WeLikeLinks
1178
- WEPA
1179
- WeSEE
1180
- wf84
1181
- Wfuzz\/
1182
- wget
1183
- WhatsApp
1184
- WhatsMyIP
1185
- WhatWeb
1186
- WhereGoes\?
1187
- Whibse
1188
- WhoRunsCoinHive
1189
- Whynder Magnet
1190
- Windows-RSS-Platform
1191
- WinPodder
1192
- wkhtmlto
1193
- wmtips
1194
- Woko
1195
- woorankreview
1196
- Word\/
1197
- WordPress\/
1198
- worldping-api
1199
- WordupinfoSearch
1200
- wotbox
1201
- WP Engine Install Performance API
1202
- wpif
1203
- wprecon\.com survey
1204
- WPScan
1205
- wscheck
1206
- Wtrace
1207
- WWW-Collector-E
1208
- WWW-Mechanize
1209
- WWW::Document
1210
- WWW::Mechanize
1211
- www\.monitor\.us
1212
- WWWOFFLE
1213
- x09Mozilla
1214
- x22Mozilla
1215
- XaxisSemanticsClassifier
1216
- Xenu Link Sleuth
1217
- XING-contenttabreceiver
1218
- xpymep([0-9]?)\.exe
1219
- Y!J-(ASR|BSC)
1220
- Y\!J-BRW
1221
- Yaanb
1222
- yacy
1223
- Yahoo Link Preview
1224
- YahooCacheSystem
1225
- YahooYSMcm
1226
- YandeG
1227
- Yandex(?!Search)
1228
- yanga
1229
- yeti
1230
- Yo-yo
1231
- Yoleo Consumer
1232
- yoogliFetchAgent
1233
- YottaaMonitor
1234
- Your-Website-Sucks
1235
- yourls\.org
1236
- YoYs\.net
1237
- YP\.PL
1238
- Zabbix
1239
- Zade
1240
- Zao
1241
- Zauba
1242
- Zemanta Aggregator
1243
- Zend_Http_Client
1244
- Zend\\\Http\\\Client
1245
- Zermelo
1246
- Zeus
1247
- zgrab
1248
- ZnajdzFoto
1249
- ZnHTTP
1250
- Zombie\.js
1251
- Zoom\.Mac
1252
- ZyBorg
1253
- [a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)
1254
- ].strip.split(/\n+/).freeze
7
+ extend Loader
8
+
9
+ def self.data
10
+ @data ||= load_raw(CrawlerDetect.config.settings.raw_crawlers_path).freeze
11
+ end
1255
12
  end
1256
13
  end
1257
14
  end