voight_kampff 0.2.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -4
  3. data/README.md +75 -0
  4. data/config.ru +7 -0
  5. data/config/crawler-user-agents.json +750 -0
  6. data/lib/tasks/voight_kampff.rake +9 -7
  7. data/lib/voight_kampff.rb +3 -29
  8. data/lib/voight_kampff/engine.rb +0 -5
  9. data/lib/voight_kampff/rack_request.rb +11 -0
  10. data/lib/voight_kampff/test.rb +18 -59
  11. data/lib/voight_kampff/version.rb +1 -1
  12. data/spec/controllers/replicants_controller_spec.rb +31 -0
  13. data/spec/internal/app/controllers/replicants_controller.rb +13 -0
  14. data/spec/internal/config/routes.rb +4 -0
  15. data/spec/internal/log/.gitignore +1 -0
  16. data/{tests/test_app → spec/internal}/public/favicon.ico +0 -0
  17. data/spec/lib/voight_kampff/rack_request_spec.rb +33 -0
  18. data/spec/lib/voight_kampff/test_spec.rb +28 -0
  19. data/spec/lib/voight_kampff_spec.rb +27 -0
  20. data/spec/spec_helper.rb +11 -0
  21. data/spec/support/humans.rb +9 -0
  22. data/spec/support/replicants.rb +5 -0
  23. data/voight_kampff.gemspec +4 -5
  24. metadata +30 -70
  25. data/.autotest +0 -22
  26. data/CHANGELOG.rdoc +0 -16
  27. data/README.markdown +0 -63
  28. data/Rakefile +0 -20
  29. data/config/initializers/extend_action_dispatch_request.rb +0 -46
  30. data/config/user_agents.yml +0 -12438
  31. data/lib/voight_kampff/base.rb +0 -7
  32. data/lib/voight_kampff/user_agents_parser.rb +0 -54
  33. data/tests/spec/minitest_helper.rb +0 -2
  34. data/tests/spec/voight_kampff_spec.rb +0 -10
  35. data/tests/test_app/.gitignore +0 -15
  36. data/tests/test_app/Gemfile +0 -39
  37. data/tests/test_app/Rakefile +0 -7
  38. data/tests/test_app/app/controllers/application_controller.rb +0 -3
  39. data/tests/test_app/app/helpers/application_helper.rb +0 -2
  40. data/tests/test_app/app/mailers/.gitkeep +0 -0
  41. data/tests/test_app/app/models/.gitkeep +0 -0
  42. data/tests/test_app/app/views/layouts/application.html.erb +0 -14
  43. data/tests/test_app/config.ru +0 -4
  44. data/tests/test_app/config/application.rb +0 -62
  45. data/tests/test_app/config/boot.rb +0 -6
  46. data/tests/test_app/config/database.yml +0 -25
  47. data/tests/test_app/config/environment.rb +0 -5
  48. data/tests/test_app/config/environments/development.rb +0 -37
  49. data/tests/test_app/config/environments/production.rb +0 -67
  50. data/tests/test_app/config/environments/test.rb +0 -37
  51. data/tests/test_app/config/initializers/backtrace_silencers.rb +0 -7
  52. data/tests/test_app/config/initializers/inflections.rb +0 -15
  53. data/tests/test_app/config/initializers/mime_types.rb +0 -5
  54. data/tests/test_app/config/initializers/secret_token.rb +0 -7
  55. data/tests/test_app/config/initializers/session_store.rb +0 -8
  56. data/tests/test_app/config/initializers/wrap_parameters.rb +0 -14
  57. data/tests/test_app/config/locales/en.yml +0 -5
  58. data/tests/test_app/config/routes.rb +0 -58
  59. data/tests/test_app/lib/assets/.gitkeep +0 -0
  60. data/tests/test_app/lib/tasks/.gitkeep +0 -0
  61. data/tests/test_app/log/.gitkeep +0 -0
  62. data/tests/test_app/public/404.html +0 -26
  63. data/tests/test_app/public/422.html +0 -26
  64. data/tests/test_app/public/500.html +0 -25
  65. data/tests/test_app/public/index.html +0 -241
  66. data/tests/test_app/public/robots.txt +0 -5
  67. data/tests/test_app/script/rails +0 -6
  68. data/tests/test_app/test/fixtures/.gitkeep +0 -0
  69. data/tests/test_app/test/functional/.gitkeep +0 -0
  70. data/tests/test_app/test/integration/.gitkeep +0 -0
  71. data/tests/test_app/test/performance/browsing_test.rb +0 -12
  72. data/tests/test_app/test/test_helper.rb +0 -13
  73. data/tests/test_app/test/unit/.gitkeep +0 -0
  74. data/tests/test_app/vendor/assets/javascripts/.gitkeep +0 -0
  75. data/tests/test_app/vendor/assets/stylesheets/.gitkeep +0 -0
  76. data/tests/test_app/vendor/plugins/.gitkeep +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 75813f25729f2fd1d22b0c4840cd21c8eab78910
4
- data.tar.gz: 582b4e055d141b46e78972ff157128dc898fea53
3
+ metadata.gz: 2058aa68727ec3597c7c8cad15608eb95e5211e0
4
+ data.tar.gz: 29d2399f8e2957e52e833c3660270a23344335a0
5
5
  SHA512:
6
- metadata.gz: 06387acdcaf28822870e1eb33dd762a1f2643c4c6ebf63f02ed5100f7d8df2f3b19c23afbc2429ee9de690fb3e1de68bfa5fbfab5855056aedc38c01d97f6172
7
- data.tar.gz: d998ed0c398b0657834b9129885c7c6ef3cfb46e458bd97fdffc10ec8baa2c2bb01e1773a46743729a91c34eeab0178b94dc507d05b2cc9e2cc672eb79fc0f9e
6
+ metadata.gz: b35c1be62a8df9122d61c97cb7b0afb20200f7ce606528e998455ee579f6334daad0cfb58ce64d09b3b0df98c2f978df8407302fb0d88373a9ed4a85f0aece29
7
+ data.tar.gz: 3a7e5cbaa0bfb7ecb2d7cc83b36092651595564a6a4eed984b903cdd37961bf07842e80bcbcdae6bb88225e126fdf1866932a72abf84c098b611572236180671
data/Gemfile CHANGED
@@ -1,7 +1,3 @@
1
1
  source 'https://www.rubygems.org'
2
2
 
3
- gem 'rake'
4
-
5
3
  gemspec
6
-
7
-
data/README.md ADDED
@@ -0,0 +1,75 @@
1
+ Voight-Kampff
2
+ =============
3
+ [![Build Status](https://travis-ci.org/biola/Voight-Kampff.png?branch=master)](https://travis-ci.org/biola/Voight-Kampff)
4
+
5
+ Voight-Kampff relies on a [user agent](http://en.wikipedia.org/wiki/User_agent) list for its detection. It can easily tell you if a request is coming from a crawler, spider or bot. This can be especially helpful in analytics such as page hit tracking.
6
+
7
+ Installation
8
+ ------------
9
+ `gem install voight_kampff`
10
+
11
+ Configuration
12
+ -------------
13
+
14
+ A JSON file is used to match [user agent strings](http://simplyfast.info/browser) to a list of known bots.
15
+
16
+ If you'd like to use an [updated list](https://github.com/monperrus/crawler-user-agents) or make your own customizations, run `rake voight_kampff:import_user_agents`. This will download a `crawler-user-agents.json` file into the `./config` directory.
17
+
18
+ __Note:__ The pattern entries in the JSON file are evaluated as [regular expressions](http://en.wikipedia.org/wiki/Regular_expression).
19
+
20
+ Usage
21
+ -----
22
+ There are three ways to use Voight-Kampff:
23
+
24
+ 1. Through Rack::Request such as in your [Ruby on Rails](http://rubyonrails.org) controllers:
25
+ `request.bot?`
26
+
27
+ 2. Through the `VoightKampff` module:
28
+ `VoightKampff.bot? 'your user agent string'`
29
+
30
+ 3. Through a `VoightKampff::Test` instance:
31
+ `VoightKampff::Test.new('your user agent string').bot?`
32
+
33
+ All three of the above interfaces respond to both `human?` and `bot?`. Each of these methods returns `true` or `false`.
34
+
35
+ Upgrading to version 1.0
36
+ ------------------------
37
+
38
+ Version 1.0 uses a new source for a list of bot user agent strings since the old source was no longer maintained. This new source, unfortunately, does not include as much detail. Therefore the following methods have been deprecated:
39
+ - `#browser?`
40
+ - `#checker?`
41
+ - `#downloader?`
42
+ - `#proxy?`
43
+ - `#crawler?`
44
+ - `#spam?`
45
+
46
+ In general the `#bot?` command tends to include all of these and I'm sure it's unlikely that anybody was getting this granular with their bot checking. So I see it as a small price to pay for an open and up to date bot list.
47
+
48
+ Also, the gem no longer extends `ActionDispatch::Request`; instead, it extends `Rack::Request`, which `ActionDispatch::Request` inherits from. This allows the same functionality for Rails while opening the gem up to other Rack-based projects.
49
+
50
+ FAQ
51
+ ---
52
+ __Q:__ __What's with the name?__
53
+ __A:__ It's the [machine in Blade Runner](http://en.wikipedia.org/wiki/Blade_Runner#Voight-Kampff_machine) that is used to test whether someone is a human or a replicant.
54
+
55
+ __Q:__ __I've found a bot that isn't being matched__
56
+ __A:__ The list is being pulled from [github.com/monperrus/crawler-user-agents](https://github.com/monperrus/crawler-user-agents).
57
+ If you'd like to have entries added to the list, please create a pull request with that project. Once that pull request is merged, feel free to create an issue here and I'll release a new gem version with the updated list. In the meantime you can always run `rake voight_kampff:import_user_agents` on your project to get that updated list.
58
+
59
+ __Q:__ __Why don't you use the user agent list from \_\_\_\_\_\_?__
60
+ __A:__ If you know of a better source for a list of bot user agent strings, please create an issue and let me know. I'm open to switching to a better source or supporting multiple sources. There are others out there but I like the openness of monperrus' list.
61
+
62
+ Thanks
63
+ ------
64
+ Thanks to [github.com/monperrus/crawler-user-agents](https://github.com/monperrus/crawler-user-agents) for providing an open and easily updatable list of bot user agents.
65
+
66
+ Contributing
67
+ ------------
68
+ PRs without tests will not get merged. Make sure you write tests for the API and the Rails app.
69
+ Feel free to ask for help if you do not know how to write a particular test.
70
+
71
+ Running Tests?
72
+ --------------
73
+
74
+ - `bundle install`
75
+ - `bundle exec rspec`
data/config.ru ADDED
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+
4
+ Bundler.require :default, :development
5
+
6
+ Combustion.initialize! :action_controller
7
+ run Combustion::Application
data/config/crawler-user-agents.json ADDED
@@ -0,0 +1,750 @@
1
+
2
+ [
3
+ {
4
+ "pattern": "googlebot\\/",
5
+ "url": "http://www.google.com/bot.html"
6
+ },
7
+ {
8
+ "pattern": "Googlebot-Mobile"
9
+ },
10
+ {
11
+ "pattern": "Googlebot-Image"
12
+ },
13
+ {
14
+ "pattern": "Mediapartners-Google",
15
+ "url": "https://support.google.com/webmasters/answer/1061943?hl=en"
16
+ },
17
+ {
18
+ "pattern": "bingbot",
19
+ "url": "http://www.bing.com/bingbot.htm"
20
+ },
21
+ {
22
+ "pattern": "slurp",
23
+ "url": "http://help.yahoo.com/help/us/ysearch/slurp"
24
+ },
25
+ {
26
+ "pattern": "java"
27
+ },
28
+ {
29
+ "pattern": "wget"
30
+ },
31
+ {
32
+ "pattern": "curl"
33
+ },
34
+ {
35
+ "pattern": "Commons-HttpClient"
36
+ },
37
+ {
38
+ "pattern": "Python-urllib"
39
+ },
40
+ {
41
+ "pattern": "libwww"
42
+ },
43
+ {
44
+ "pattern": "httpunit"
45
+ },
46
+ {
47
+ "pattern": "nutch"
48
+ },
49
+ {
50
+ "pattern": "phpcrawl",
51
+ "addition_date": "2012/09/17",
52
+ "url": "http://phpcrawl.cuab.de/"
53
+ },
54
+ {
55
+ "pattern": "msnbot",
56
+ "url": "http://search.msn.com/msnbot.htm"
57
+ },
58
+ {
59
+ "pattern": "jyxobot"
60
+ },
61
+ {
62
+ "pattern": "FAST-WebCrawler"
63
+ },
64
+ {
65
+ "pattern": "FAST Enterprise Crawler"
66
+ },
67
+ {
68
+ "pattern": "biglotron"
69
+ },
70
+ {
71
+ "pattern": "teoma"
72
+ },
73
+ {
74
+ "pattern": "convera"
75
+ },
76
+ {
77
+ "pattern": "seekbot"
78
+ },
79
+ {
80
+ "pattern": "gigablast",
81
+ "instances": ["Gigabot/2.0 (http://www.gigablast.com/spider.html)", "Gigabot/2.0 (http://www.gigablast.com/spider.html)", "GigablastOpenSource/1.0"],
82
+ "url": "https://github.com/gigablast/open-source-search-engine"
83
+ },
84
+ {
85
+ "pattern": "exabot"
86
+ },
87
+ {
88
+ "pattern": "ngbot"
89
+ },
90
+ {
91
+ "pattern": "ia_archiver"
92
+ },
93
+ {
94
+ "pattern": "GingerCrawler"
95
+ },
96
+ {
97
+ "pattern": "webmon "
98
+ },
99
+ {
100
+ "pattern": "httrack"
101
+ },
102
+ {
103
+ "pattern": "webcrawler"
104
+ },
105
+ {
106
+ "pattern": "grub.org"
107
+ },
108
+ {
109
+ "pattern": "UsineNouvelleCrawler"
110
+ },
111
+ {
112
+ "pattern": "antibot"
113
+ },
114
+ {
115
+ "pattern": "netresearchserver"
116
+ },
117
+ {
118
+ "pattern": "speedy"
119
+ },
120
+ {
121
+ "pattern": "fluffy"
122
+ },
123
+ {
124
+ "pattern": "bibnum.bnf"
125
+ },
126
+ {
127
+ "pattern": "findlink"
128
+ },
129
+ {
130
+ "pattern": "msrbot"
131
+ },
132
+ {
133
+ "pattern": "panscient"
134
+ },
135
+ {
136
+ "pattern": "yacybot"
137
+ },
138
+ {
139
+ "pattern": "AISearchBot"
140
+ },
141
+ {
142
+ "pattern": "IOI"
143
+ },
144
+ {
145
+ "pattern": "ips-agent"
146
+ },
147
+ {
148
+ "pattern": "tagoobot"
149
+ },
150
+ {
151
+ "pattern": "MJ12bot"
152
+ },
153
+ {
154
+ "pattern": "dotbot"
155
+ },
156
+ {
157
+ "pattern": "woriobot"
158
+ },
159
+ {
160
+ "pattern": "yanga"
161
+ },
162
+ {
163
+ "pattern": "buzzbot"
164
+ },
165
+ {
166
+ "pattern": "mlbot"
167
+ },
168
+ {
169
+ "pattern": "yandexbot",
170
+ "url": "http://yandex.com/bots",
171
+ "instances": ["Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"],
172
+ "addition_date": "2015/04/14"
173
+ },
174
+ {
175
+ "pattern": "purebot",
176
+ "addition_date": "2010/01/19"
177
+ },
178
+ {
179
+ "pattern": "Linguee Bot",
180
+ "addition_date": "2010/01/26",
181
+ "url": "http://www.linguee.com/bot"
182
+ },
183
+ {
184
+ "pattern": "Voyager",
185
+ "addition_date": "2010/02/01",
186
+ "url": "http://www.kosmix.com/crawler.html"
187
+ },
188
+ {
189
+ "pattern": "CyberPatrol",
190
+ "addition_date": "2010/02/11",
191
+ "url": "http://www.cyberpatrol.com/cyberpatrolcrawler.asp"
192
+ },
193
+ {
194
+ "pattern": "voilabot",
195
+ "addition_date": "2010/05/18"
196
+ },
197
+ {
198
+ "pattern": "baiduspider",
199
+ "addition_date": "2010/07/15",
200
+ "url": "http://www.baidu.jp/spider/"
201
+ },
202
+ {
203
+ "pattern": "citeseerxbot",
204
+ "addition_date": "2010/07/17"
205
+ },
206
+ {
207
+ "pattern": "spbot",
208
+ "addition_date": "2010/07/31",
209
+ "url": "http://www.seoprofiler.com/bot"
210
+ },
211
+ {
212
+ "pattern": "twengabot",
213
+ "addition_date": "2010/08/03",
214
+ "url": "http://www.twenga.com/bot.html"
215
+ },
216
+ {
217
+ "pattern": "postrank",
218
+ "addition_date": "2010/08/03",
219
+ "url": "http://www.postrank.com"
220
+ },
221
+ {
222
+ "pattern": "turnitinbot",
223
+ "addition_date": "2010/09/26",
224
+ "url": "http://www.turnitin.com"
225
+ },
226
+ {
227
+ "pattern": "scribdbot",
228
+ "addition_date": "2010/09/28",
229
+ "url": "http://www.scribd.com"
230
+ },
231
+ {
232
+ "pattern": "page2rss",
233
+ "addition_date": "2010/10/07",
234
+ "url": "http://www.page2rss.com"
235
+ },
236
+ {
237
+ "pattern": "sitebot",
238
+ "addition_date": "2010/12/15",
239
+ "url": "http://www.sitebot.org"
240
+ },
241
+ {
242
+ "pattern": "linkdex",
243
+ "addition_date": "2011/01/06",
244
+ "url": "http://www.linkdex.com"
245
+ },
246
+ {
247
+ "pattern": "Adidxbot",
248
+ "url": "http://onlinehelp.microsoft.com/en-us/bing/hh204496.aspx"
249
+ },
250
+ {
251
+ "pattern": "blekkobot",
252
+ "url": "http://blekko.com/about/blekkobot"
253
+ },
254
+ {
255
+ "pattern": "ezooms",
256
+ "addition_date": "2011/04/27",
257
+ "url": "http://www.phpbb.com/community/viewtopic.php?f=64&t=935605&start=450#p12948289"
258
+ },
259
+ {
260
+ "pattern": "dotbot",
261
+ "addition_date": "2011/04/27"
262
+ },
263
+ {
264
+ "pattern": "Mail.RU_Bot",
265
+ "addition_date": "2011/04/27",
266
+ "instances" : [
267
+ "Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/",
268
+ "Mozilla/5.0 (compatible; Mail.RU_Bot/2.0; +http://go.mail.ru/"
269
+ ]
270
+ },
271
+ {
272
+ "pattern": "discobot",
273
+ "addition_date": "2011/05/03",
274
+ "url": "http://discoveryengine.com/discobot.html"
275
+ },
276
+ {
277
+ "pattern": "heritrix",
278
+ "addition_date": "2011/06/21",
279
+ "url": "http://crawler.archive.org/"
280
+ },
281
+ {
282
+ "pattern": "findthatfile",
283
+ "addition_date": "2011/06/21",
284
+ "url": "http://www.findthatfile.com/"
285
+ },
286
+ {
287
+ "pattern": "europarchive.org",
288
+ "addition_date": "2011/06/21",
289
+ "url": ""
290
+ },
291
+ {
292
+ "pattern": "NerdByNature.Bot",
293
+ "addition_date": "2011/07/12",
294
+ "url": "http://www.nerdbynature.net/bot"
295
+ },
296
+ {
297
+ "pattern": "sistrix crawler",
298
+ "addition_date": "2011/08/02"
299
+ },
300
+ {
301
+ "pattern": "ahrefsbot",
302
+ "addition_date": "2011/08/28"
303
+ },
304
+ {
305
+ "pattern": "Aboundex",
306
+ "addition_date": "2011/09/28",
307
+ "url": "http://www.aboundex.com/crawler/"
308
+ },
309
+ {
310
+ "pattern": "domaincrawler",
311
+ "addition_date": "2011/10/21"
312
+ },
313
+ {
314
+ "pattern": "wbsearchbot",
315
+ "addition_date": "2011/12/21",
316
+ "url": "http://www.warebay.com/bot.html"
317
+ },
318
+ {
319
+ "pattern": "summify",
320
+ "addition_date": "2012/01/04",
321
+ "url": "http://summify.com"
322
+ },
323
+ {
324
+ "pattern": "ccbot",
325
+ "addition_date": "2012/02/05",
326
+ "url": "http://www.commoncrawl.org/bot.html"
327
+ },
328
+ {
329
+ "pattern": "edisterbot",
330
+ "addition_date": "2012/02/25"
331
+ },
332
+ {
333
+ "pattern": "seznambot",
334
+ "addition_date": "2012/03/14"
335
+ },
336
+ {
337
+ "pattern": "ec2linkfinder",
338
+ "addition_date": "2012/03/22"
339
+ },
340
+ {
341
+ "pattern": "gslfbot",
342
+ "addition_date": "2012/04/03"
343
+ },
344
+ {
345
+ "pattern": "aihitbot",
346
+ "addition_date": "2012/04/16"
347
+ },
348
+ {
349
+ "pattern": "intelium_bot",
350
+ "addition_date": "2012/05/07"
351
+ },
352
+ {
353
+ "pattern": "facebookexternalhit",
354
+ "addition_date": "2012/05/07"
355
+ },
356
+ {
357
+ "pattern": "yeti",
358
+ "addition_date": "2012/05/07"
359
+ },
360
+ {
361
+ "pattern": "RetrevoPageAnalyzer",
362
+ "addition_date": "2012/05/07"
363
+ },
364
+ {
365
+ "pattern": "lb-spider",
366
+ "addition_date": "2012/05/07"
367
+ },
368
+ {
369
+ "pattern": "sogou",
370
+ "addition_date": "2012/05/13",
371
+ "url": "http://www.sogou.com/docs/help/webmasters.htm#07"
372
+ },
373
+ {
374
+ "pattern": "lssbot",
375
+ "addition_date": "2012/05/15"
376
+ },
377
+ {
378
+ "pattern": "careerbot",
379
+ "addition_date": "2012/05/23",
380
+ "url": "http://www.career-x.de/bot.html"
381
+ },
382
+ {
383
+ "pattern": "wotbox",
384
+ "addition_date": "2012/06/12",
385
+ "url": "http://www.wotbox.com"
386
+ },
387
+ {
388
+ "pattern": "wocbot",
389
+ "addition_date": "2012/07/25",
390
+ "url": "http://www.wocodi.com/crawler"
391
+ },
392
+ {
393
+ "pattern": "ichiro",
394
+ "addition_date": "2012/08/28",
395
+ "url": "http://help.goo.ne.jp/help/article/1142"
396
+ },
397
+ {
398
+ "pattern": "DuckDuckBot",
399
+ "addition_date": "2012/09/19",
400
+ "url": "http://duckduckgo.com/duckduckbot.html"
401
+ },
402
+ {
403
+ "pattern": "lssrocketcrawler",
404
+ "addition_date": "2012/09/24"
405
+ },
406
+ {
407
+ "pattern": "drupact",
408
+ "addition_date": "2012/09/27",
409
+ "url": "http://www.arocom.de/drupact"
410
+ },
411
+ {
412
+ "pattern": "webcompanycrawler",
413
+ "addition_date": "2012/10/03"
414
+ },
415
+ {
416
+ "pattern": "acoonbot",
417
+ "addition_date": "2012/10/07",
418
+ "url": "http://www.acoon.de/robot.asp"
419
+ },
420
+ {
421
+ "pattern": "openindexspider",
422
+ "addition_date": "2012/10/26",
423
+ "url": "http://www.openindex.io/en/webmasters/spider.html"
424
+ },
425
+ {
426
+ "pattern": "gnam gnam spider",
427
+ "addition_date": "2012/10/31"
428
+ },
429
+ {
430
+ "pattern": "web-archive-net.com.bot"
431
+ },
432
+ {
433
+ "pattern": "backlinkcrawler",
434
+ "addition_date": "2013/01/04"
435
+ },
436
+ {
437
+ "pattern": "coccoc",
438
+ "addition_date": "2013/01/04",
439
+ "url": "http://help.coccoc.vn/"
440
+ },
441
+ {
442
+ "pattern": "integromedb",
443
+ "addition_date": "2013/01/10",
444
+ "url": "http://www.integromedb.org/Crawler"
445
+ },
446
+ {
447
+ "pattern": "content crawler spider",
448
+ "addition_date": "2013/01/11"
449
+ },
450
+ {
451
+ "pattern": "toplistbot",
452
+ "addition_date": "2013/02/05"
453
+ },
454
+ {
455
+ "pattern": "seokicks-robot",
456
+ "addition_date": "2013/02/25"
457
+ },
458
+ {
459
+ "pattern": "it2media-domain-crawler",
460
+ "addition_date": "2013/03/12"
461
+ },
462
+ {
463
+ "pattern": "ip-web-crawler.com",
464
+ "addition_date": "2013/03/22"
465
+ },
466
+ {
467
+ "pattern": "siteexplorer.info",
468
+ "addition_date": "2013/05/01"
469
+ },
470
+ {
471
+ "pattern": "elisabot",
472
+ "addition_date": "2013/06/27"
473
+ },
474
+ {
475
+ "pattern": "proximic",
476
+ "addition_date": "2013/09/12",
477
+ "url": "http://www.proximic.com/info/spider.php"
478
+ },
479
+ {
480
+ "pattern": "changedetection",
481
+ "addition_date": "2013/09/13",
482
+ "url": "http://www.changedetection.com/bot.html"
483
+ },
484
+ {
485
+ "pattern": "blexbot",
486
+ "addition_date": "2013/10/03",
487
+ "url": "http://webmeup-crawler.com/"
488
+ },
489
+ {
490
+ "pattern": "arabot",
491
+ "addition_date": "2013/10/09"
492
+ },
493
+ {
494
+ "pattern": "WeSEE:Search",
495
+ "addition_date": "2013/11/18"
496
+ },
497
+ {
498
+ "pattern": "niki-bot",
499
+ "addition_date": "2014/01/01"
500
+ },
501
+ {
502
+ "pattern": "CrystalSemanticsBot",
503
+ "addition_date": "2014/02/17",
504
+ "url": "http://www.crystalsemantics.com/user-agent/"
505
+ },
506
+ {
507
+ "pattern": "rogerbot",
508
+ "addition_date": "2014/02/28",
509
+ "url": "http://moz.com/help/pro/what-is-rogerbot-"
510
+ },
511
+ {
512
+ "pattern": "360Spider",
513
+ "addition_date": "2014/03/14",
514
+ "url": "http://needs-be.blogspot.co.uk/2013/02/how-to-block-spider360.html"
515
+ },
516
+ {
517
+ "pattern": "psbot",
518
+ "addition_date": "2014/03/31",
519
+ "url": "http://www.picsearch.com/bot.html"
520
+ },
521
+ {
522
+ "pattern": "InterfaxScanBot",
523
+ "addition_date": "2014/03/31",
524
+ "url": "http://scan-interfax.ru"
525
+ },
526
+ {
527
+ "pattern": "Lipperhey SEO Service",
528
+ "addition_date": "2014/04/01",
529
+ "url": "http://www.lipperhey.com/"
530
+ },
531
+ {
532
+ "pattern": "CC Metadata Scaper",
533
+ "addition_date": "2014/04/01",
534
+ "url": "http://wiki.creativecommons.org/Metadata_Scraper"
535
+ },
536
+ {
537
+ "pattern": "g00g1e.net",
538
+ "addition_date": "2014/04/01",
539
+ "url": "http://www.g00g1e.net/"
540
+ },
541
+ {
542
+ "pattern": "GrapeshotCrawler",
543
+ "addition_date": "2014/04/01",
544
+ "url": "http://www.grapeshot.co.uk/crawler.php"
545
+ },
546
+ {
547
+ "pattern": "urlappendbot",
548
+ "addition_date": "2014/05/10",
549
+ "url": "http://www.profound.net/urlappendbot.html"
550
+ },
551
+ {
552
+ "pattern": "brainobot",
553
+ "addition_date": "2014/06/24"
554
+ },
555
+ {
556
+ "pattern": "fr-crawler",
557
+ "addition_date": "2014/07/31",
558
+ "instances": ["Mozilla/5.0 (compatible; fr-crawler/1.1)"]
559
+ },
560
+ {
561
+ "pattern": "binlar",
562
+ "addition_date": "2014/09/12",
563
+ "instances": [
564
+ "binlar_2.6.3 binlar2.6.3@unspecified.mail",
565
+ "binlar_2.6.3 binlar_2.6.3@unspecified.mail",
566
+ "binlar_2.6.3 larbin2.6.3@unspecified.mail",
567
+ "binlar_2.6.3 phanendra_kalapala@McAfee.com",
568
+ "binlar_2.6.3 test@mgmt.mic"
569
+ ]
570
+ },
571
+ {
572
+ "pattern": "SimpleCrawler",
573
+ "addition_date": "2014/09/12",
574
+ "instances": ["SimpleCrawler/0.1" ]
575
+ },
576
+ {
577
+ "pattern": "Livelapbot",
578
+ "addition_date": "2014/09/12",
579
+ "instances": ["Livelapbot/0.1" ]
580
+ },
581
+ {
582
+ "pattern": "Twitterbot",
583
+ "addition_date": "2014/09/12",
584
+ "instances": ["Twitterbot/0.1", "Twitterbot/1.0" ]
585
+ },
586
+ {
587
+ "pattern": "cXensebot",
588
+ "addition_date": "2014/10/05",
589
+ "instances": ["cXensebot/1.1a"],
590
+ "url": "http://www.cxense.com/bot.html"
591
+ },
592
+ {
593
+ "pattern": "smtbot",
594
+ "addition_date": "2014/10/04",
595
+ "instances": ["Mozilla/5.0 (compatible; SMTBot/1.0; +http://www.similartech.com/smtbot)", "SMTBot (similartech.com/smtbot)"],
596
+ "url": "http://www.similartech.com/smtbot"
597
+ },
598
+ {
599
+ "pattern": "bnf.fr_bot",
600
+ "addition_date": "2014/11/18",
601
+ "url": "http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html",
602
+ "instances": ["Mozilla/5.0 (compatible; bnf.fr_bot; +http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html)"]
603
+ },
604
+ {
605
+ "pattern": "A6-Indexer",
606
+ "addition_date": "2014/12/05",
607
+ "url": "http://www.a6corp.com/a6-web-scraping-policy/",
608
+ "instances": ["A6-Indexer"]
609
+ },
610
+ {
611
+ "pattern": "ADmantX",
612
+ "addition_date": "2014/12/05",
613
+ "url": "http://www.admantx.com",
614
+ "instances": ["ADmantX Platform Semantic Analyzer - ADmantX Inc. - www.admantx.com - support@admantx.com"]
615
+ },
616
+ {
617
+ "pattern": "Facebot",
618
+ "url": "https://developers.facebook.com/docs/sharing/best-practices#crawl",
619
+ "addition_date": "2014/12/30"
620
+ },
621
+ {
622
+ "pattern": "Twitterbot",
623
+ "url": "https://dev.twitter.com/cards/getting-started",
624
+ "addition_date": "2014/12/30"
625
+ },
626
+ {
627
+ "pattern": "OrangeBot",
628
+ "instances": ["Mozilla/5.0 (compatible; OrangeBot/2.0; support.orangebot@orange.com"],
629
+ "addition_date": "2015/01/12"
630
+ },
631
+ {
632
+ "pattern": "memorybot",
633
+ "url": "http://mignify.com/bot.html",
634
+ "instances": ["Mozilla/5.0 (compatible; memorybot/1.21.14 +http://mignify.com/bot.html)"],
635
+ "addition_date": "2015/02/01"
636
+ },
637
+ {
638
+ "pattern": "AdvBot",
639
+ "url": "http://advbot.net/bot.html",
640
+ "instances": ["Mozilla/5.0 (compatible; AdvBot/2.0; +http://advbot.net/bot.html)"],
641
+ "addition_date": "2015/02/01"
642
+ },
643
+ {
644
+ "pattern": "MegaIndex",
645
+ "url": "https://www.megaindex.ru/?tab=linkAnalyze",
646
+ "instances": ["Mozilla/5.0 (compatible; MegaIndex.ru/2.0; +https://www.megaindex.ru/?tab=linkAnalyze)"],
647
+ "addition_date": "2015/03/28"
648
+ },
649
+ {
650
+ "pattern": "SemanticScholarBot",
651
+ "url": "http://s2.allenai.org/bot.html",
652
+ "instances": ["SemanticScholarBot/1.0 (+http://s2.allenai.org/bot.html)"],
653
+ "addition_date": "2015/03/28"
654
+ },
655
+ {
656
+ "pattern": "ltx71",
657
+ "url": "http://ltx71.com/",
658
+ "instances": ["ltx71 - (http://ltx71.com/)"],
659
+ "addition_date": "2015/04/04"
660
+ },
661
+ {
662
+ "pattern": "nerdybot",
663
+ "url": "http://nerdybot.com/",
664
+ "instances": ["nerdybot"],
665
+ "addition_date": "2015/04/05"
666
+ },
667
+ {
668
+ "pattern": "xovibot",
669
+ "url": "http://www.xovibot.net/",
670
+ "instances": ["Mozilla/5.0 (compatible; XoviBot/2.0; +http://www.xovibot.net/)"],
671
+ "addition_date": "2015/04/05"
672
+ },
673
+ {
674
+ "pattern": "BUbiNG",
675
+ "url": "http://law.di.unimi.it/BUbiNG.html",
676
+ "instances": ["BUbiNG (+http://law.di.unimi.it/BUbiNG.html)"],
677
+ "addition_date": "2015/04/06"
678
+ },
679
+ {
680
+ "pattern": "Qwantify",
681
+ "url": "https://www.qwant.com/",
682
+ "instances": ["Mozilla/5.0 (compatible; Qwantify/2.0n; +https://www.qwant.com/)/*"],
683
+ "addition_date": "2015/04/06"
684
+ },
685
+ {
686
+ "pattern": "archive.org_bot",
687
+ "url": "http://www.archive.org/details/archive.org_bot",
688
+ "instances": ["Mozilla/5.0 (compatible; archive.org_bot +http://www.archive.org/details/archive.org_bot)"],
689
+ "addition_date": "2015/04/14"
690
+ },
691
+ {
692
+ "pattern": "Applebot",
693
+ "url": "http://www.apple.com/go/applebot",
694
+ "addition_date": "2015/04/15"
695
+ },
696
+ {
697
+ "pattern": "TweetmemeBot",
698
+ "url": "http://datasift.com/bot.html",
699
+ "instances": ["Mozilla/5.0 (TweetmemeBot/4.0; +http://datasift.com/bot.html) Gecko/20100101 Firefox/31.0"],
700
+ "addition_date": "2015/04/15"
701
+ },
702
+ {
703
+ "pattern": "crawler4j",
704
+ "url": "https://github.com/yasserg/crawler4j",
705
+ "instances": ["crawler4j (http://code.google.com/p/crawler4j/)"],
706
+ "addition_date": "2015/05/07"
707
+ },
708
+ {
709
+ "pattern": "findxbot",
710
+ "url": "http://www.findxbot.com",
711
+ "instances": ["Mozilla/5.0 (compatible; Findxbot/1.0; +http://www.findxbot.com)"],
712
+ "addition_date": "2015/05/07"
713
+ },
714
+ {
715
+ "pattern": "SemrushBot",
716
+ "url": "http://www.semrush.com/bot.html",
717
+ "instances": ["Mozilla/5.0 (compatible; SemrushBot/0.98~bl; +http://www.semrush.com/bot.html)"],
718
+ "addition_date": "2015/05/26"
719
+ },
720
+ {
721
+ "pattern": "yoozBot",
722
+ "url": "http://yooz.ir",
723
+ "instances": ["Mozilla/5.0 (compatible; yoozBot-2.2; http://yooz.ir; info@yooz.ir)"],
724
+ "addition_date": "2015/05/26"
725
+ },
726
+ {
727
+ "pattern": "lipperhey",
728
+ "url": "http://www.lipperhey.com/",
729
+ "instances": ["Mozilla/5.0 (compatible; Lipperhey Link Explorer; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey SEO Service; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey Site Explorer; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey-Kaus-Australis/5.0; +https://www.lipperhey.com/en/about/)"],
730
+ "addition_date": "2015/08/26"
731
+ },
732
+ {
733
+ "pattern": "y!j-asr",
734
+ "url": "http://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/",
735
+ "instances": ["Y!J-ASR/0.1 crawler (http://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/)"],
736
+ "addition_date": "2015/05/26"
737
+ },
738
+ {
739
+ "pattern": "Domain Re-Animator Bot",
740
+ "url": "http://domainreanimator.com",
741
+ "instances": ["Domain Re-Animator Bot (http://domainreanimator.com) - support@domainreanimator.com"],
742
+ "addition_date": "2015/04/14"
743
+ },
744
+ {
745
+ "pattern": "AddThis",
746
+ "url": "https://www.addthis.com",
747
+ "instances": ["AddThis.com robot tech.support@clearspring.com"],
748
+ "addition_date": "2015/06/02"
749
+ }
750
+ ]