crawler_detect 0.1.12 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,10 @@
1
+ #!/bin/bash
2
+
3
+ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
4
+
5
+ wget -O $DIR/../lib/crawler_detect/library/raw/Crawlers.json https://raw.githubusercontent.com/JayBizzle/Crawler-Detect/master/raw/Crawlers.json
6
+ wget -O $DIR/../lib/crawler_detect/library/raw/Exclusions.json https://raw.githubusercontent.com/JayBizzle/Crawler-Detect/master/raw/Exclusions.json
7
+ wget -O $DIR/../lib/crawler_detect/library/raw/Headers.json https://raw.githubusercontent.com/JayBizzle/Crawler-Detect/master/raw/Headers.json
8
+
9
+ wget -O $DIR/../spec/fixtures/crawlers.txt https://raw.githubusercontent.com/JayBizzle/Crawler-Detect/master/tests/crawlers.txt
10
+ wget -O $DIR/../spec/fixtures/devices.txt https://raw.githubusercontent.com/JayBizzle/Crawler-Detect/master/tests/devices.txt
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- lib = File.expand_path("../lib", __FILE__)
3
+ lib = File.expand_path("lib", __dir__)
4
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
5
  require "crawler_detect/version"
6
6
 
@@ -17,14 +17,16 @@ Gem::Specification.new do |spec|
17
17
 
18
18
  # Specify which files should be added to the gem when it is released.
19
19
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
20
- spec.files = Dir.chdir(File.expand_path("..", __FILE__)) do
20
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
21
21
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
22
22
  end
23
23
  spec.bindir = "exe"
24
24
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25
25
  spec.require_paths = ["lib"]
26
26
 
27
- spec.add_development_dependency "activesupport", "~> 5.2.0"
27
+ spec.add_dependency "qonfig", "~> 0.24"
28
+
29
+ spec.add_development_dependency "activesupport", "~> 6.0.3"
28
30
  spec.add_development_dependency "bundler", ">= 1.15"
29
31
  spec.add_development_dependency "fuubar", "~> 2.0"
30
32
  spec.add_development_dependency "parallel_tests", "~> 2.0"
@@ -32,4 +34,5 @@ Gem::Specification.new do |spec|
32
34
  spec.add_development_dependency "rack-test", "~> 1.1"
33
35
  spec.add_development_dependency "rake", ">= 10.0"
34
36
  spec.add_development_dependency "rspec", "~> 3.0"
37
+ spec.add_development_dependency "armitage-rubocop", "0.82"
35
38
  end
@@ -1,24 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "crawler_detect/detector"
4
- require "crawler_detect/library"
5
- require "crawler_detect/library/crawlers"
6
- require "crawler_detect/library/exclusions"
7
- require "crawler_detect/library/headers"
8
- require "crawler_detect/version"
3
+ require "json"
4
+ require "qonfig"
9
5
 
10
- require "rack/crawler_detect"
6
+ require_relative "crawler_detect/config"
7
+ require_relative "crawler_detect/detector"
8
+ require_relative "crawler_detect/library/loader"
9
+ require_relative "crawler_detect/library/crawlers"
10
+ require_relative "crawler_detect/library/exclusions"
11
+ require_relative "crawler_detect/library/headers"
12
+ require_relative "crawler_detect/library"
13
+ require_relative "crawler_detect/version"
14
+ require_relative "rack/crawler_detect"
11
15
 
16
+ # @since 0.1.0
12
17
  module CrawlerDetect
13
18
  class << self
19
+ # @param user_agent [String] User-agent string to detect
20
+ # @return [CrawlerDetect::Detector] Instance of detector class
14
21
  def new(user_agent)
15
22
  detector(user_agent)
16
23
  end
17
24
 
25
+ # @param user_agent [String] User-agent string to detect
26
+ # @return [true, false] Is User-agent a crawler?
18
27
  def is_crawler?(user_agent)
19
28
  detector(user_agent).is_crawler?
20
29
  end
21
30
 
31
+ # @since 1.0.0
32
+ # @param config [Proc]
33
+ def setup!(&config)
34
+ @config = CrawlerDetect::Config.new(&config)
35
+ Library::DATA_CLASSES.each(&:reload_data)
36
+ end
37
+
38
+ # @since 1.0.0
39
+ # @return [CrawlerDetect::Config] Instance of configuration class
40
+ def config
41
+ @config ||= CrawlerDetect::Config.new
42
+ end
43
+
22
44
  private
23
45
 
24
46
  def detector(user_agent)
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CrawlerDetect
4
+ # Configuration of CrawlerDetect
5
+ #
6
+ # @see settings
7
+ # @since 1.0.0
8
+ class Config < ::Qonfig::DataSet
9
+ CUR_PATH = File.dirname(File.expand_path(__FILE__)).freeze
10
+ RAW_PATH = File.join(CUR_PATH, "library/raw").freeze
11
+
12
+ RAW_CRAWLERS_PATH = File.join(RAW_PATH, "Crawlers.json").freeze
13
+ RAW_EXCLUSIONS_PATH = File.join(RAW_PATH, "Exclusions.json").freeze
14
+ RAW_HEADERS_PATH = File.join(RAW_PATH, "Headers.json").freeze
15
+
16
+ # @return [String] path to crawlers raw JSON file
17
+ setting :raw_crawlers_path, RAW_CRAWLERS_PATH
18
+
19
+ # @return [String] path to exclusions raw JSON file
20
+ setting :raw_exclusions_path, RAW_EXCLUSIONS_PATH
21
+
22
+ # @return [String] path to headers raw JSON file
23
+ setting :raw_headers_path, RAW_HEADERS_PATH
24
+
25
+ validate :raw_crawlers_path, :string, strict: true
26
+ validate :raw_exclusions_path, :string, strict: true
27
+ validate :raw_headers_path, :string, strict: true
28
+ end
29
+ end
@@ -1,17 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  class Detector
6
+ # @param user_agent [String] User-agent string to detect
7
+ # @return [CrawlerDetect::Detector] instance of detector class
5
8
  def initialize(user_agent)
6
9
  @user_agent = user_agent.to_s.dup
7
10
  end
8
11
 
12
+ # @return [true, false] Is User-agent a crawler?
9
13
  def is_crawler?
10
14
  @is_crawler ||= begin
11
15
  !completely_exclusion? && matches_crawler_list?
12
16
  end
13
17
  end
14
18
 
19
+ # @return [String, nil] The detected crawler name from RAW data, or nil when the User-agent is not a crawler
15
20
  def crawler_name
16
21
  return unless is_crawler?
17
22
  @crawler_name
@@ -19,22 +24,30 @@ module CrawlerDetect
19
24
 
20
25
  private
21
26
 
22
- def completely_exclusion?
23
- @user_agent.gsub!(exclusions_matcher, "")
24
- @user_agent.strip.length == 0
25
- end
27
+ # @private
28
+ # @return [true, false] Is User-agent in white-list?
29
+ def completely_exclusion?
30
+ @user_agent.gsub!(exclusions_matcher, "")
31
+ @user_agent.strip.length.zero?
32
+ end
26
33
 
27
- def matches_crawler_list?
28
- @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
29
- !@crawler_name.empty?
30
- end
34
+ # @private
35
+ # @return [true, false] Is User-agent in black-list?
36
+ def matches_crawler_list?
37
+ @crawler_name = crawlers_matcher.match(@user_agent).to_s.strip
38
+ !@crawler_name.empty?
39
+ end
31
40
 
32
- def exclusions_matcher
33
- CrawlerDetect::Library.get_regexp("exclusions")
34
- end
41
+ # @private
42
+ # @return [Regexp] White-list of User-agents
43
+ def exclusions_matcher
44
+ CrawlerDetect::Library.get_regexp("exclusions")
45
+ end
35
46
 
36
- def crawlers_matcher
37
- CrawlerDetect::Library.get_regexp("crawlers")
38
- end
47
+ # @private
48
+ # @return [Regexp] Black-list of User-agents
49
+ def crawlers_matcher
50
+ CrawlerDetect::Library.get_regexp("crawlers")
51
+ end
39
52
  end
40
53
  end
@@ -1,16 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CrawlerDetect
4
+ # @since 0.1.0
4
5
  module Library
6
+ DATA_CLASSES = [Library::Headers, Library::Exclusions, Library::Crawlers].freeze
7
+
5
8
  class << self
9
+ # @param param [String] Name of raw data
10
+ # @return [Regexp]
6
11
  def get_regexp(param)
7
12
  data = get_array(param)
8
- %r[#{data.join('|')}]i
13
+ %r{#{data.join('|')}}i
9
14
  end
10
15
 
16
+ # @param param [String] Name of raw data
17
+ # @return [Array]
11
18
  def get_array(param)
12
- const_name = "CrawlerDetect::Library::#{param.capitalize}::#{param.upcase}"
13
- const_get(const_name)
19
+ const_get("CrawlerDetect::Library::#{param.capitalize}").send(:data)
14
20
  end
15
21
  end
16
22
  end
@@ -1,1285 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Layout/TrailingWhitespace
4
3
  module CrawlerDetect
5
4
  module Library
5
+ # @since 0.1.0
6
6
  module Crawlers
7
- CRAWLERS = %q[
8
- .*Java.*outbrain
9
- YLT
10
- ^b0t$
11
- ^bluefish
12
- ^Calypso v\/
13
- ^COMODO DCV
14
- ^DangDang
15
- ^DavClnt
16
- ^FDM
17
- ^git\/
18
- ^Goose\/
19
- ^Grabber
20
- ^HTTPClient\/
21
- ^Java\/
22
- ^Jeode\/
23
- ^Jetty\/
24
- ^Mail\/
25
- ^Mget
26
- ^Microsoft URL Control
27
- ^NG\/[0-9\.]
28
- ^NING\/
29
- ^PHP\/[0-9]
30
- ^RMA\/
31
- ^Ruby|Ruby\/[0-9]
32
- ^VSE\/[0-9]
33
- ^WordPress\.com
34
- ^XRL\/[0-9]
35
- ^ZmEu
36
- 008\/
37
- 13TABS
38
- 192\.comAgent
39
- 2ip\.ru
40
- 404enemy
41
- 7Siters
42
- 80legs
43
- a\.pr-cy\.ru
44
- a3logics\.in
45
- A6-Indexer
46
- Abonti
47
- Aboundex
48
- aboutthedomain
49
- Accoona-AI-Agent
50
- acoon
51
- acrylicapps\.com\/pulp
52
- Acunetix
53
- AdAuth\/
54
- adbeat
55
- AddThis
56
- ADmantX
57
- AdminLabs
58
- adressendeutschland
59
- adreview\/
60
- adscanner
61
- Adstxtaggregator
62
- adstxt-worker
63
- adstxt\.com
64
- agentslug
65
- AHC
66
- aihit
67
- aiohttp\/
68
- Airmail
69
- akka-http\/
70
- akula\/
71
- alertra
72
- alexa site audit
73
- Alibaba\.Security\.Heimdall
74
- Alligator
75
- allloadin
76
- AllSubmitter
77
- alyze\.info
78
- amagit
79
- ^Amazon Simple Notification Service Agent$
80
- Anarchie
81
- AndroidDownloadManager
82
- Anemone
83
- AngleSharp
84
- annotate_google
85
- Ant\.com
86
- Anturis Agent
87
- AnyEvent-HTTP\/
88
- Apache Droid
89
- Apache OpenOffice
90
- Apache-HttpAsyncClient
91
- Apache-HttpClient
92
- ApacheBench
93
- Apexoo
94
- APIs-Google
95
- AportWorm\/
96
- AppBeat\/
97
- AppEngine-Google
98
- AppleSyndication
99
- Aprc\/[0-9]
100
- Arachmo
101
- arachnode
102
- Arachnophilia
103
- aria2
104
- Arukereso
105
- asafaweb
106
- AskQuickly
107
- Ask Jeeves
108
- ASPSeek
109
- Asterias
110
- Astute
111
- asynchttp
112
- Attach
113
- attohttpc
114
- autocite
115
- AutomaticWPTester
116
- Autonomy
117
- axios\/
118
- AWS Security Scanner
119
- B-l-i-t-z-B-O-T
120
- Backlink-Ceck
121
- backlink-check
122
- BacklinkHttpStatus
123
- BackStreet
124
- BackWeb
125
- Bad-Neighborhood
126
- Badass
127
- baidu\.com
128
- Bandit
129
- basicstate
130
- BatchFTP
131
- Battlezta Bazinga
132
- baypup\/
133
- BazQux
134
- BBBike
135
- BCKLINKS
136
- BDFetch
137
- BegunAdvertising
138
- Bewica-security-scan
139
- Bidtellect
140
- BigBozz
141
- Bigfoot
142
- biglotron
143
- BingLocalSearch
144
- BingPreview
145
- binlar
146
- biNu image cacher
147
- Bitacle
148
- biz_Directory
149
- Black Hole
150
- Blackboard Safeassign
151
- BlackWidow
152
- BlockNote\.Net
153
- BlogBridge
154
- Bloglines
155
- Bloglovin
156
- BlogPulseLive
157
- BlogSearch
158
- Blogtrottr
159
- BlowFish
160
- boitho\.com-dc
161
- Boost\.Beast
162
- BPImageWalker
163
- Braintree-Webhooks
164
- Branch Metrics API
165
- Branch-Passthrough
166
- Brandprotect
167
- BrandVerity
168
- Brandwatch
169
- Brodie\/
170
- Browsershots
171
- BUbiNG
172
- Buck\/
173
- Buddy
174
- BuiltWith
175
- Bullseye
176
- BunnySlippers
177
- Burf Search
178
- Butterfly\/
179
- BuzzSumo
180
- CAAM\/[0-9]
181
- CakePHP
182
- Calculon
183
- Canary%20Mail
184
- CaretNail
185
- catexplorador
186
- CC Metadata Scaper
187
- Cegbfeieh
188
- censys
189
- Cerberian Drtrs
190
- CERT\.at-Statistics-Survey
191
- cg-eye
192
- changedetection
193
- ChangesMeter
194
- Charlotte
195
- CheckHost
196
- checkprivacy
197
- CherryPicker
198
- ChinaClaw
199
- Chirp\/
200
- chkme\.com
201
- Chlooe
202
- Chromaxa
203
- CirrusExplorer
204
- CISPA Vulnerability Notification
205
- Citoid
206
- CJNetworkQuality
207
- Clarsentia
208
- clips\.ua\.ac\.be
209
- Cloud mapping
210
- CloudEndure
211
- CloudFlare-AlwaysOnline
212
- Cloudflare-Healthchecks
213
- Cloudinary
214
- cmcm\.com
215
- coccoc
216
- cognitiveseo
217
- colly -
218
- CommaFeed
219
- Commons-HttpClient
220
- commonscan
221
- contactbigdatafr
222
- contentkingapp
223
- convera
224
- CookieReports
225
- copyright sheriff
226
- CopyRightCheck
227
- Copyscape
228
- cortex\/
229
- Cosmos4j\.feedback
230
- Covario-IDS
231
- Craw\/
232
- Crescent
233
- Crowsnest
234
- Criteo
235
- CSHttp
236
- CSSCheck
237
- curb
238
- Curious George
239
- curl
240
- cuwhois\/
241
- cybo\.com
242
- DAP\/NetHTTP
243
- DareBoost
244
- DatabaseDriverMysqli
245
- DataCha0s
246
- Datafeedwatch
247
- Datanyze
248
- DataparkSearch
249
- dataprovider
250
- DataXu
251
- Daum(oa)?[ \/][0-9]
252
- dBpoweramp
253
- ddline
254
- deeris
255
- delve\.ai
256
- Demon
257
- DeuSu
258
- developers\.google\.com\/\+\/web\/snippet\/
259
- Devil
260
- Digg
261
- Digincore
262
- DigitalPebble
263
- Dirbuster
264
- Discourse Forum Onebox
265
- Disqus\/
266
- Dispatch\/
267
- DittoSpyder
268
- dlvr
269
- DMBrowser
270
- DNSPod-reporting
271
- docoloc
272
- Dolphin http client
273
- DomainAppender
274
- DomainLabz
275
- Donuts Content Explorer
276
- dotMailer content retrieval
277
- dotSemantic
278
- downforeveryoneorjustme
279
- Download Wonder
280
- downnotifier
281
- DowntimeDetector
282
- Drip
283
- drupact
284
- Drupal \(\+http:\/\/drupal\.org\/\)
285
- DTS Agent
286
- dubaiindex
287
- DuplexWeb-Google
288
- DynatraceSynthetic
289
- EARTHCOM
290
- Easy-Thumb
291
- EasyDL
292
- Ebingbong
293
- ec2linkfinder
294
- eCairn-Grabber
295
- eCatch
296
- ECCP
297
- eContext\/
298
- Ecxi
299
- EirGrabber
300
- ElectricMonk
301
- elefent
302
- EMail Exractor
303
- EMail Wolf
304
- EmailWolf
305
- Embarcadero
306
- Embed PHP Library
307
- Embedly
308
- endo\/
309
- europarchive\.org
310
- evc-batch
311
- EventMachine HttpClient
312
- Everwall Link Expander
313
- Evidon
314
- Evrinid
315
- ExactSearch
316
- ExaleadCloudview
317
- Excel\/
318
- exif
319
- ExoRank
320
- Exploratodo
321
- Express WebPictures
322
- Extreme Picture Finder
323
- EyeNetIE
324
- ezooms
325
- facebookexternalhit
326
- facebookexternalua
327
- facebookplatform
328
- fairshare
329
- Faraday v
330
- fasthttp
331
- Faveeo
332
- Favicon downloader
333
- faviconkit
334
- faviconarchive
335
- FavOrg
336
- Feed Wrangler
337
- Feedable\/
338
- Feedbin
339
- FeedBooster
340
- FeedBucket
341
- FeedBunch\/
342
- FeedBurner
343
- feeder
344
- Feedly
345
- FeedshowOnline
346
- Feedspot
347
- Feedwind\/
348
- FeedZcollector
349
- feeltiptop
350
- Fetch API
351
- Fetch\/[0-9]
352
- Fever\/[0-9]
353
- FHscan
354
- Filestack
355
- Fimap
356
- findlink
357
- findthatfile
358
- FlashGet
359
- FlipboardBrowserProxy
360
- FlipboardProxy
361
- FlipboardRSS
362
- Flock\/
363
- fluffy
364
- Flunky
365
- flynxapp
366
- forensiq
367
- FoundSeoTool
368
- http:\/\/www.neomo.de\/
369
- free thumbnails
370
- Freeuploader
371
- Funnelback
372
- Fuzz Faster U Fool
373
- G-i-g-a-b-o-t
374
- g00g1e\.net
375
- ganarvisitas
376
- geek-tools
377
- Genieo
378
- GentleSource
379
- GetCode
380
- Getintent
381
- GetLinkInfo
382
- getprismatic
383
- GetRight
384
- getroot
385
- GetURLInfo\/
386
- GetWeb
387
- Geziyor
388
- Ghost Inspector
389
- GigablastOpenSource
390
- GIS-LABS
391
- github-camo
392
- github\.com
393
- Goldfire Server
394
- Go [\d\.]* package http
395
- Go http package
396
- Go-Ahead-Got-It
397
- Go-http-client
398
- Go!Zilla
399
- gobyus
400
- gofetch
401
- GomezAgent
402
- gooblog
403
- Goodzer\/
404
- Google AppsViewer
405
- Google Desktop
406
- Google favicon
407
- Google Keyword Suggestion
408
- Google Keyword Tool
409
- Google Page Speed Insights
410
- Google PP Default
411
- Google Search Console
412
- Google Web Preview
413
- Google-Ads-Overview
414
- Google-Adwords
415
- Google-Apps-Script
416
- Google-Calendar-Importer
417
- Google-HotelAdsVerifier
418
- Google-HTTP-Java-Client
419
- Google-Publisher-Plugin
420
- Google-Read-Aloud
421
- Google-SearchByImage
422
- Google-Site-Verification
423
- Google-speakr
424
- Google-Structured-Data-Testing-Tool
425
- Google-Youtube-Links
426
- google-xrawler
427
- GoogleDocs
428
- GoogleHC\/
429
- GoogleProducer
430
- GoogleSites
431
- Google-Transparency-Report
432
- Gookey
433
- GoSpotCheck
434
- gosquared-thumbnailer
435
- Gotit
436
- GoZilla
437
- grabify
438
- GrabNet
439
- Grafula
440
- Grammarly
441
- GrapeFX
442
- GreatNews
443
- Gregarius
444
- GRequests
445
- grokkit
446
- grouphigh
447
- grub-client
448
- gSOAP\/
449
- GT::WWW
450
- GTmetrix
451
- GuzzleHttp
452
- gvfs\/
453
- HAA(A)?RTLAND http client
454
- Haansoft
455
- hackney\/
456
- Hadi Agent
457
- HappyApps-WebCheck
458
- Hatena
459
- Havij
460
- HaxerMen
461
- HeadlessChrome
462
- HEADMasterSEO
463
- HeartRails_Capture
464
- help@dataminr\.com
465
- heritrix
466
- Hexometer
467
- historious
468
- hkedcity
469
- hledejLevne\.cz
470
- Hloader
471
- HMView
472
- Holmes
473
- HonesoSearchEngine
474
- HootSuite Image proxy
475
- Hootsuite-WebFeed
476
- hosterstats
477
- HostTracker
478
- ht:\/\/check
479
- htdig
480
- HTMLparser
481
- htmlyse
482
- HTTP Banner Detection
483
- HTTP_Compression_Test
484
- http_request2
485
- http_requester
486
- http-get
487
- HTTP-Header-Abfrage
488
- http-kit
489
- http-request\/
490
- HTTP-Tiny
491
- HTTP::Lite
492
- http\.rb\/
493
- http_get
494
- HttpComponents
495
- httphr
496
- HTTPMon
497
- HTTPie
498
- httpRequest
499
- httpscheck
500
- httpssites_power
501
- httpunit
502
- HttpUrlConnection
503
- httrack
504
- huaweisymantec
505
- HubSpot
506
- Humanlinks
507
- i2kconnect\/
508
- Iblog
509
- ichiro
510
- Id-search
511
- IdeelaborPlagiaat
512
- IDG Twitter Links Resolver
513
- IDwhois\/
514
- Iframely
515
- igdeSpyder
516
- IlTrovatore
517
- Image Fetch
518
- Image Sucker
519
- ImageEngine\/
520
- ImageVisu\/
521
- Imagga
522
- imagineeasy
523
- imgsizer
524
- InAGist
525
- inbound\.li parser
526
- InDesign%20CC
527
- Indy Library
528
- InetURL
529
- infegy
530
- infohelfer
531
- InfoTekies
532
- InfoWizards Reciprocal Link
533
- inpwrd\.com
534
- instabid
535
- Instapaper
536
- Integrity
537
- integromedb
538
- Intelliseek
539
- InterGET
540
- internet_archive
541
- Internet Ninja
542
- InternetSeer
543
- internetVista monitor
544
- internetwache
545
- intraVnews
546
- IODC
547
- IOI
548
- iplabel
549
- ips-agent
550
- IPS\/[0-9]
551
- IPWorks HTTP\/S Component
552
- iqdb\/
553
- Iria
554
- Irokez
555
- isitup\.org
556
- iskanie
557
- isUp\.li
558
- iThemes Sync\/
559
- IZaBEE
560
- iZSearch
561
- JAHHO
562
- janforman
563
- Jaunt\/
564
- Jbrofuzz
565
- Jersey\/
566
- JetCar
567
- Jigsaw
568
- Jobboerse
569
- JobFeed discovery
570
- Jobg8 URL Monitor
571
- jobo
572
- Jobrapido
573
- Jobsearch1\.5
574
- JoinVision Generic
575
- JolokiaPwn
576
- Joomla
577
- Jorgee
578
- JS-Kit
579
- JustView
580
- Kaspersky Lab CFR link resolver
581
- Kelny\/
582
- Kerrigan\/
583
- KeyCDN
584
- Keyword Density
585
- Keywords Research
586
- khttp\/
587
- KickFire
588
- KimonoLabs\/
589
- Kml-Google
590
- knows\.is
591
- KOCMOHABT
592
- kouio
593
- kubectl
594
- kube-probe
595
- kulturarw3
596
- KumKie
597
- L\.webis
598
- Larbin
599
- Lavf\/
600
- LeechFTP
601
- LeechGet
602
- letsencrypt
603
- Lftp
604
- LibVLC
605
- LibWeb
606
- Libwhisker
607
- libwww
608
- Licorne
609
- Liferea\/
610
- Lightspeedsystems
611
- Lighthouse
612
- Likse
613
- limber\.io
614
- Link Valet
615
- link_thumbnailer
616
- LinkAlarm\/
617
- linkCheck
618
- linkdex
619
- LinkExaminer
620
- linkfluence
621
- linkpeek
622
- LinkPreviewGenerator
623
- LinkScan
624
- LinksManager
625
- LinkTiger
626
- LinkWalker
627
- Lipperhey
628
- Litemage_walker
629
- livedoor ScreenShot
630
- LoadImpactRload
631
- localsearch-web
632
- LongURL API
633
- longurl-r-package
634
- looid\.com
635
- looksystems\.net
636
- ltx71
637
- lua-resty-http
638
- lwp-request
639
- lwp-trivial
640
- LWP::Simple
641
- lycos
642
- LYT\.SR
643
- mabontland
644
- Mag-Net
645
- MagpieRSS
646
- Mail\.Ru
647
- MailChimp
648
- Majestic12
649
- makecontact\/
650
- Mandrill
651
- MapperCmd
652
- marketinggrader
653
- MarkMonitor
654
- MarkWatch
655
- Mass Downloader
656
- masscan\/
657
- Mata Hari
658
- Mediametric
659
- Mediapartners-Google
660
- mediawords
661
- MegaIndex\.ru
662
- MeltwaterNews
663
- Melvil Rawi
664
- MemGator
665
- Metaspinner
666
- MetaURI
667
- MFC_Tear_Sample
668
- MicroMessenger\/
669
- Microsearch
670
- Microsoft Office
671
- Microsoft Outlook
672
- Microsoft Windows Network Diagnostics
673
- Microsoft-WebDAV-MiniRedir
674
- Microsoft Data Access
675
- MIDown tool
676
- MIIxpc
677
- Mindjet
678
- Miniature\.io
679
- Miniflux
680
- Mister PiX
681
- mixdata dot com
682
- mixed-content-scan
683
- Mixmax-LinkPreview
684
- mixnode
685
- Mnogosearch
686
- mogimogi
687
- Mojeek
688
- Mojolicious \(Perl\)
689
- Monit\/
690
- monitis
691
- Monitority\/
692
- montastic
693
- MonTools
694
- Moreover
695
- Morfeus Fucking Scanner
696
- Morning Paper
697
- MovableType
698
- mowser
699
- Mr\.4x3 Powered
700
- Mrcgiguy
701
- MS Web Services Client Protocol
702
- MSFrontPage
703
- mShots
704
- MuckRack\/
705
- muhstik-scan
706
- MVAClient
707
- MxToolbox\/
708
- nagios
709
- Najdi\.si
710
- Name Intelligence
711
- Nameprotect
712
- Navroad
713
- NearSite
714
- Needle
715
- Nessus
716
- Net Vampire
717
- NetAnts
718
- NETCRAFT
719
- NetLyzer
720
- NetMechanic
721
- NetNewsWire
722
- Netpursual
723
- netresearch
724
- NetShelter ContentScan
725
- Netsparker
726
- NetTrack
727
- Netvibes
728
- NetZIP
729
- Neustar WPM
730
- NeutrinoAPI
731
- NewRelicPinger
732
- NewsBlur .*Finder
733
- NewsGator
734
- newsme
735
- newspaper\/
736
- NetSystemsResearch
737
- Nexgate Ruby Client
738
- NG-Search
739
- Nibbler
740
- NICErsPRO
741
- Nikto
742
- nineconnections
743
- NLNZ_IAHarvester
744
- Nmap Scripting Engine
745
- node-superagent
746
- node-urllib
747
- node\.io
748
- Nodemeter
749
- NodePing
750
- nominet\.org\.uk
751
- nominet\.uk
752
- Norton-Safeweb
753
- Notifixious
754
- notifyninja
755
- NotionEmbedder
756
- nuhk
757
- nutch
758
- Nuzzel
759
- nWormFeedFinder
760
- nyawc\/
761
- Nymesis
762
- NYU
763
- Ocelli\/
764
- Octopus
765
- oegp
766
- Offline Explorer
767
- Offline Navigator
768
- OgScrper
769
- okhttp
770
- omgili
771
- OMSC
772
- Online Domain Tools
773
- OpenCalaisSemanticProxy
774
- Openfind
775
- OpenLinkProfiler
776
- Openstat\/
777
- OpenVAS
778
- OPPO A33
779
- Optimizer
780
- Orbiter
781
- OrgProbe\/
782
- orion-semantics
783
- Outlook-Express
784
- Outlook-iOS
785
- ow\.ly
786
- Owler
787
- ownCloud News
788
- OxfordCloudService
789
- Page Valet
790
- page_verifier
791
- page scorer
792
- page2rss
793
- PageFreezer
794
- PageGrabber
795
- PagePeeker
796
- PageScorer
797
- Pagespeed\/
798
- Panopta
799
- panscient
800
- Papa Foto
801
- parsijoo
802
- Pavuk
803
- PayPal IPN
804
- pcBrowser
805
- Pcore-HTTP
806
- Pearltrees
807
- PECL::HTTP
808
- peerindex
809
- Peew
810
- PeoplePal
811
- Perlu -
812
- PhantomJS Screenshoter
813
- PhantomJS\/
814
- Photon\/
815
- phpservermon
816
- Pi-Monster
817
- Picscout
818
- Picsearch
819
- PictureFinder
820
- Pimonster
821
- ping\.blo\.gs
822
- Pingability
823
- PingAdmin\.Ru
824
- Pingdom
825
- Pingoscope
826
- PingSpot
827
- pinterest\.com
828
- Pixray
829
- Pizilla
830
- Plagger\/
831
- Ploetz \+ Zeller
832
- Plukkie
833
- plumanalytics
834
- PocketImageCache
835
- PocketParser
836
- Pockey
837
- POE-Component-Client-HTTP
838
- Polymail\/
839
- Pompos
840
- Porkbun
841
- Port Monitor
842
- postano
843
- PostmanRuntime
844
- PostPost
845
- postrank
846
- PowerPoint\/
847
- Prebid
848
- Priceonomics Analysis Engine
849
- PrintFriendly
850
- PritTorrent
851
- Prlog
852
- probethenet
853
- Project 25499
854
- prospectb2b
855
- Protopage
856
- ProWebWalker
857
- proximic
858
- PRTG Network Monitor
859
- pshtt, https scanning
860
- PTST
861
- PTST\/[0-9]+
862
- Pump
863
- python-httpx
864
- Python-httplib2
865
- python-requests
866
- Python-urllib
867
- Qirina Hurdler
868
- QQDownload
869
- QrafterPro
870
- Qseero
871
- Qualidator
872
- QueryN Metasearch
873
- queuedriver
874
- Quora Link Preview
875
- Qwantify
876
- Radian6
877
- RankActive
878
- RankFlex
879
- RankSonicSiteAuditor
880
- Re-re Studio
881
- ReactorNetty
882
- Readability
883
- RealDownload
884
- RealPlayer%20Downloader
885
- RebelMouse
886
- Recorder
887
- RecurPost\/
888
- redback\/
889
- ReederForMac
890
- Reeder\/
891
- ReGet
892
- RepoMonkey
893
- request\.js
894
- reqwest\/
895
- ResponseCodeTest
896
- RestSharp
897
- Riddler
898
- Rival IQ
899
- Robosourcer
900
- Robozilla
901
- ROI Hunter
902
- RPT-HTTPClient
903
- RSSOwl
904
- RyowlEngine
905
- safe-agent-scanner
906
- SalesIntelligent
907
- Saleslift
908
- Sendsay\.Ru
909
- SauceNAO
910
- SBIder
911
- sc-downloader
912
- scalaj-http
913
- Scamadviser-Frontend
914
- scan\.lol
915
- ScanAlert
916
- Scoop
917
- scooter
918
- ScoutJet
919
- ScoutURLMonitor
920
- ScrapeBox Page Scanner
921
- Scrapy
922
- Screaming
923
- ScreenShotService
924
- Scrubby
925
- Scrutiny\/
926
- search\.thunderstone
927
- Search37
928
- searchenginepromotionhelp
929
- Searchestate
930
- SearchExpress
931
- SearchSight
932
- Seeker
933
- semanticdiscovery
934
- semanticjuice
935
- Semiocast HTTP client
936
- Semrush
937
- sentry\/
938
- SEO Browser
939
- Seo Servis
940
- seo-nastroj\.cz
941
- seo4ajax
942
- Seobility
943
- SEOCentro
944
- SeoCheck
945
- SEOkicks
946
- SEOlizer
947
- Seomoz
948
- SEOprofiler
949
- SEOsearch
950
- seoscanners
951
- seositecheckup
952
- SEOstats
953
- servernfo
954
- sexsearcher
955
- Seznam
956
- Shelob
957
- Shodan
958
- Shoppimon
959
- ShopWiki
960
- shortURL lengthener
961
- ShortLinkTranslate
962
- shrinktheweb
963
- Sideqik
964
- SimplePie
965
- SimplyFast
966
- Siphon
967
- SISTRIX
968
- Site-Shot\/
969
- Site Sucker
970
- Site24x7
971
- SiteBar
972
- Sitebeam
973
- Sitebulb\/
974
- SiteCondor
975
- SiteExplorer
976
- SiteGuardian
977
- Siteimprove
978
- SiteIndexed
979
- Sitemap(s)? Generator
980
- SitemapGenerator
981
- SiteMonitor
982
- Siteshooter B0t
983
- SiteSnagger
984
- SiteSucker
985
- SiteTruth
986
- Sitevigil
987
- sitexy\.com
988
- SkypeUriPreview
989
- Slack\/
990
- slider\.com
991
- slurp
992
- SlySearch
993
- SmartDownload
994
- SMRF URL Expander
995
- SMUrlExpander
996
- Snake
997
- Snappy
998
- SnapSearch
999
- Snarfer\/
1000
- SniffRSS
1001
- sniptracker
1002
- Snoopy
1003
- SnowHaze Search
1004
- sogou web
1005
- SortSite
1006
- Sottopop
1007
- sovereign\.ai
1008
- SpaceBison
1009
- SpamExperts
1010
- Spammen
1011
- Spanner
1012
- spaziodati
1013
- SPDYCheck
1014
- Specificfeeds
1015
- speedy
1016
- SPEng
1017
- Spinn3r
1018
- spray-can
1019
- Sprinklr
1020
- spyonweb
1021
- sqlmap
1022
- Sqlworm
1023
- Sqworm
1024
- SSL Labs
1025
- ssl-tools
1026
- StackRambler
1027
- Statastico\/
1028
- StatusCake
1029
- Steeler
1030
- Stratagems Kumo
1031
- Stroke\.cz
1032
- StudioFACA
1033
- StumbleUpon
1034
- suchen
1035
- Sucuri
1036
- summify
1037
- SuperHTTP
1038
- Surphace Scout
1039
- Suzuran
1040
- Symfony BrowserKit
1041
- Symfony2 BrowserKit
1042
- SynHttpClient-Built
1043
- Sysomos
1044
- sysscan
1045
- Szukacz
1046
- T0PHackTeam
1047
- tAkeOut
1048
- Tarantula\/
1049
- Taringa UGC
1050
- TarmotGezgin
1051
- Teleport
1052
- Telesoft
1053
- Telesphoreo
1054
- Telesphorep
1055
- Tenon\.io
1056
- teoma
1057
- terrainformatica
1058
- Test Certificate Info
1059
- testuri
1060
- Tetrahedron
1061
- TextRazor Downloader
1062
- The Drop Reaper
1063
- The Expert HTML Source Viewer
1064
- The Knowledge AI
1065
- The Intraformant
1066
- theinternetrules
1067
- TheNomad
1068
- Thinklab
1069
- Thumbshots
1070
- ThumbSniper
1071
- Thumbor
1072
- timewe\.net
1073
- TinEye
1074
- Tiny Tiny RSS
1075
- TLSProbe\/
1076
- Toata
1077
- topster
1078
- touche\.com
1079
- Traackr\.com
1080
- tracemyfile
1081
- Trackuity
1082
- TrapitAgent
1083
- Trendiction
1084
- Trendsmap
1085
- trendspottr
1086
- truwoGPS
1087
- TryJsoup
1088
- TulipChain
1089
- Turingos
1090
- Turnitin
1091
- tweetedtimes
1092
- Tweetminster
1093
- Tweezler\/
1094
- twibble
1095
- Twice
1096
- Twikle
1097
- Twingly
1098
- Twisted PageGetter
1099
- Typhoeus
1100
- ubermetrics-technologies
1101
- uclassify
1102
- UdmSearch
1103
- unchaos
1104
- unirest-java
1105
- UniversalFeedParser
1106
- Unshorten\.It
1107
- Untiny
1108
- UnwindFetchor
1109
- updated
1110
- updown\.io daemon
1111
- Upflow
1112
- Uptimia
1113
- Urlcheckr
1114
- URL Verifier
1115
- URLitor
1116
- urlresolver
1117
- Urlstat
1118
- URLTester
1119
- UrlTrends Ranking Updater
1120
- URLy Warning
1121
- URLy\.Warning
1122
- Vacuum
1123
- Vagabondo
1124
- VB Project
1125
- vBSEO
1126
- VCI
1127
- via ggpht\.com GoogleImageProxy
1128
- Virusdie
1129
- visionutils
1130
- vkShare
1131
- VoidEYE
1132
- Voil
1133
- voltron
1134
- voyager\/
1135
- VSAgent\/
1136
- VSB-TUO\/
1137
- Vulnbusters Meter
1138
- VYU2
1139
- w3af\.org
1140
- W3C_Unicorn
1141
- W3C-checklink
1142
- W3C-mobileOK
1143
- WAC-OFU
1144
- Wallpapers\/[0-9]+
1145
- WallpapersHD
1146
- wangling
1147
- Wappalyzer
1148
- WatchMouse
1149
- WbSrch\/
1150
- WDT\.io
1151
- web-capture\.net
1152
- Web-sniffer
1153
- Web Auto
1154
- Web Collage
1155
- Web Enhancer
1156
- Web Fetch
1157
- Web Fuck
1158
- Web Pix
1159
- Web Sauger
1160
- Web spyder
1161
- Web Sucker
1162
- Webalta
1163
- Webauskunft
1164
- WebAuto
1165
- WebCapture
1166
- WebClient\/
1167
- webcollage
1168
- WebCookies
1169
- WebCopier
1170
- WebCorp
1171
- WebDataStats
1172
- WebDoc
1173
- WebEnhancer
1174
- WebFetch
1175
- WebFuck
1176
- WebGazer
1177
- WebGo IS
1178
- WebImageCollector
1179
- WebImages
1180
- WebIndex
1181
- webkit2png
1182
- WebLeacher
1183
- webmastercoffee
1184
- webmon\s
1185
- WebPix
1186
- WebReaper
1187
- WebSauger
1188
- webscreenie
1189
- Webshag
1190
- Webshot
1191
- Website Quester
1192
- websitepulse agent
1193
- WebsiteQuester
1194
- Websnapr
1195
- WebSniffer
1196
- Webster
1197
- WebStripper
1198
- WebSucker
1199
- Webthumb\/
1200
- WebThumbnail
1201
- WebWhacker
1202
- WebZIP
1203
- WeLikeLinks
1204
- WEPA
1205
- WeSEE
1206
- wf84
1207
- Wfuzz\/
1208
- wget
1209
- WhatsApp
1210
- WhatsMyIP
1211
- WhatWeb
1212
- WhereGoes\?
1213
- Whibse
1214
- WhoRunsCoinHive
1215
- Whynder Magnet
1216
- WinHttp-Autoproxy-Service
1217
- Windows-RSS-Platform
1218
- WinPodder
1219
- wkhtmlto
1220
- wmtips
1221
- Woko
1222
- Wolfram HTTPClient
1223
- woorankreview
1224
- Word\/
1225
- WordPress\/
1226
- worldping-api
1227
- WordupinfoSearch
1228
- wotbox
1229
- WP Engine Install Performance API
1230
- wpif
1231
- wprecon\.com survey
1232
- WPScan
1233
- wscheck
1234
- Wtrace
1235
- WWW-Collector-E
1236
- WWW-Mechanize
1237
- WWW::Document
1238
- WWW::Mechanize
1239
- www\.monitor\.us
1240
- WWWOFFLE
1241
- x09Mozilla
1242
- x22Mozilla
1243
- XaxisSemanticsClassifier
1244
- Xenu Link Sleuth
1245
- XING-contenttabreceiver
1246
- xpymep([0-9]?)\.exe
1247
- Y!J-(ASR|BSC)
1248
- Y\!J-BRW
1249
- Yaanb
1250
- yacy
1251
- Yahoo Link Preview
1252
- YahooCacheSystem
1253
- YahooYSMcm
1254
- YandeG
1255
- Yandex(?!Search)
1256
- yanga
1257
- yeti
1258
- Yo-yo
1259
- Yoleo Consumer
1260
- yoogliFetchAgent
1261
- YottaaMonitor
1262
- Your-Website-Sucks
1263
- yourls\.org
1264
- YoYs\.net
1265
- YP\.PL
1266
- Zabbix
1267
- Zade
1268
- Zao
1269
- Zauba
1270
- Zemanta Aggregator
1271
- Zend_Http_Client
1272
- Zend\\\Http\\\Client
1273
- Zermelo
1274
- Zeus
1275
- zgrab
1276
- ZnajdzFoto
1277
- ZnHTTP
1278
- Zombie\.js
1279
- Zoom\.Mac
1280
- ZyBorg
1281
- [a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer|scraper)
1282
- ].strip.split(/\n+/).freeze
7
+ extend Loader
8
+
9
+ def self.data
10
+ @data ||= load_raw(CrawlerDetect.config.settings.raw_crawlers_path).freeze
11
+ end
1283
12
  end
1284
13
  end
1285
14
  end