voight_kampff 0.2.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -4
  3. data/README.md +75 -0
  4. data/config.ru +7 -0
  5. data/config/crawler-user-agents.json +750 -0
  6. data/lib/tasks/voight_kampff.rake +9 -7
  7. data/lib/voight_kampff.rb +3 -29
  8. data/lib/voight_kampff/engine.rb +0 -5
  9. data/lib/voight_kampff/rack_request.rb +11 -0
  10. data/lib/voight_kampff/test.rb +18 -59
  11. data/lib/voight_kampff/version.rb +1 -1
  12. data/spec/controllers/replicants_controller_spec.rb +31 -0
  13. data/spec/internal/app/controllers/replicants_controller.rb +13 -0
  14. data/spec/internal/config/routes.rb +4 -0
  15. data/spec/internal/log/.gitignore +1 -0
  16. data/{tests/test_app → spec/internal}/public/favicon.ico +0 -0
  17. data/spec/lib/voight_kampff/rack_request_spec.rb +33 -0
  18. data/spec/lib/voight_kampff/test_spec.rb +28 -0
  19. data/spec/lib/voight_kampff_spec.rb +27 -0
  20. data/spec/spec_helper.rb +11 -0
  21. data/spec/support/humans.rb +9 -0
  22. data/spec/support/replicants.rb +5 -0
  23. data/voight_kampff.gemspec +4 -5
  24. metadata +30 -70
  25. data/.autotest +0 -22
  26. data/CHANGELOG.rdoc +0 -16
  27. data/README.markdown +0 -63
  28. data/Rakefile +0 -20
  29. data/config/initializers/extend_action_dispatch_request.rb +0 -46
  30. data/config/user_agents.yml +0 -12438
  31. data/lib/voight_kampff/base.rb +0 -7
  32. data/lib/voight_kampff/user_agents_parser.rb +0 -54
  33. data/tests/spec/minitest_helper.rb +0 -2
  34. data/tests/spec/voight_kampff_spec.rb +0 -10
  35. data/tests/test_app/.gitignore +0 -15
  36. data/tests/test_app/Gemfile +0 -39
  37. data/tests/test_app/Rakefile +0 -7
  38. data/tests/test_app/app/controllers/application_controller.rb +0 -3
  39. data/tests/test_app/app/helpers/application_helper.rb +0 -2
  40. data/tests/test_app/app/mailers/.gitkeep +0 -0
  41. data/tests/test_app/app/models/.gitkeep +0 -0
  42. data/tests/test_app/app/views/layouts/application.html.erb +0 -14
  43. data/tests/test_app/config.ru +0 -4
  44. data/tests/test_app/config/application.rb +0 -62
  45. data/tests/test_app/config/boot.rb +0 -6
  46. data/tests/test_app/config/database.yml +0 -25
  47. data/tests/test_app/config/environment.rb +0 -5
  48. data/tests/test_app/config/environments/development.rb +0 -37
  49. data/tests/test_app/config/environments/production.rb +0 -67
  50. data/tests/test_app/config/environments/test.rb +0 -37
  51. data/tests/test_app/config/initializers/backtrace_silencers.rb +0 -7
  52. data/tests/test_app/config/initializers/inflections.rb +0 -15
  53. data/tests/test_app/config/initializers/mime_types.rb +0 -5
  54. data/tests/test_app/config/initializers/secret_token.rb +0 -7
  55. data/tests/test_app/config/initializers/session_store.rb +0 -8
  56. data/tests/test_app/config/initializers/wrap_parameters.rb +0 -14
  57. data/tests/test_app/config/locales/en.yml +0 -5
  58. data/tests/test_app/config/routes.rb +0 -58
  59. data/tests/test_app/lib/assets/.gitkeep +0 -0
  60. data/tests/test_app/lib/tasks/.gitkeep +0 -0
  61. data/tests/test_app/log/.gitkeep +0 -0
  62. data/tests/test_app/public/404.html +0 -26
  63. data/tests/test_app/public/422.html +0 -26
  64. data/tests/test_app/public/500.html +0 -25
  65. data/tests/test_app/public/index.html +0 -241
  66. data/tests/test_app/public/robots.txt +0 -5
  67. data/tests/test_app/script/rails +0 -6
  68. data/tests/test_app/test/fixtures/.gitkeep +0 -0
  69. data/tests/test_app/test/functional/.gitkeep +0 -0
  70. data/tests/test_app/test/integration/.gitkeep +0 -0
  71. data/tests/test_app/test/performance/browsing_test.rb +0 -12
  72. data/tests/test_app/test/test_helper.rb +0 -13
  73. data/tests/test_app/test/unit/.gitkeep +0 -0
  74. data/tests/test_app/vendor/assets/javascripts/.gitkeep +0 -0
  75. data/tests/test_app/vendor/assets/stylesheets/.gitkeep +0 -0
  76. data/tests/test_app/vendor/plugins/.gitkeep +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 75813f25729f2fd1d22b0c4840cd21c8eab78910
4
- data.tar.gz: 582b4e055d141b46e78972ff157128dc898fea53
3
+ metadata.gz: 2058aa68727ec3597c7c8cad15608eb95e5211e0
4
+ data.tar.gz: 29d2399f8e2957e52e833c3660270a23344335a0
5
5
  SHA512:
6
- metadata.gz: 06387acdcaf28822870e1eb33dd762a1f2643c4c6ebf63f02ed5100f7d8df2f3b19c23afbc2429ee9de690fb3e1de68bfa5fbfab5855056aedc38c01d97f6172
7
- data.tar.gz: d998ed0c398b0657834b9129885c7c6ef3cfb46e458bd97fdffc10ec8baa2c2bb01e1773a46743729a91c34eeab0178b94dc507d05b2cc9e2cc672eb79fc0f9e
6
+ metadata.gz: b35c1be62a8df9122d61c97cb7b0afb20200f7ce606528e998455ee579f6334daad0cfb58ce64d09b3b0df98c2f978df8407302fb0d88373a9ed4a85f0aece29
7
+ data.tar.gz: 3a7e5cbaa0bfb7ecb2d7cc83b36092651595564a6a4eed984b903cdd37961bf07842e80bcbcdae6bb88225e126fdf1866932a72abf84c098b611572236180671
data/Gemfile CHANGED
@@ -1,7 +1,3 @@
1
1
  source 'https://www.rubygems.org'
2
2
 
3
- gem 'rake'
4
-
5
3
  gemspec
6
-
7
-
data/README.md ADDED
@@ -0,0 +1,75 @@
1
+ Voight-Kampff
2
+ =============
3
+ [![Build Status](https://travis-ci.org/biola/Voight-Kampff.png?branch=master)](https://travis-ci.org/biola/Voight-Kampff)
4
+
5
+ Voight-Kampff relies on a [user agent](http://en.wikipedia.org/wiki/User_agent) list for its detection. It can easily tell you if a request is coming from a crawler, spider or bot. This can be especially helpful in analytics such as page hit tracking.
6
+
7
+ Installation
8
+ ------------
9
+ `gem install voight_kampff`
10
+
11
+ Configuration
12
+ -------------
13
+
14
+ A JSON file is used to match [user agent strings](http://simplyfast.info/browser) to a list of known bots.
15
+
16
+ If you'd like to use an [updated list](https://github.com/monperrus/crawler-user-agents) or make your own customizations, run `rake voight_kampff:import_user_agents`. This will download a `crawler-user-agents.json` file into the `./config` directory.
17
+
18
+ __Note:__ The pattern entries in the JSON file are evaluated as [regular expressions](http://en.wikipedia.org/wiki/Regular_expression).
19
+
20
+ Usage
21
+ -----
22
+ There are three ways to use Voight-Kampff:
23
+
24
+ 1. Through Rack::Request such as in your [Ruby on Rails](http://rubyonrails.org) controllers:
25
+ `request.bot?`
26
+
27
+ 2. Through the `VoightKampff` module:
28
+ `VoightKampff.bot? 'your user agent string'`
29
+
30
+ 3. Through a `VoightKampff::Test` instance:
31
+ `VoightKampff::Test.new('your user agent string').bot?`
32
+
33
+ All of the above objects respond to both `human?` and `bot?`. Each of these methods returns `true` or `false`.
34
+
35
+ Upgrading to version 1.0
36
+ ------------------------
37
+
38
+ Version 1.0 uses a new source for a list of bot user agent strings since the old source was no longer maintained. This new source, unfortunately, does not include as much detail. Therefore, the following methods have been deprecated:
39
+ - `#browser?`
40
+ - `#checker?`
41
+ - `#downloader?`
42
+ - `#proxy?`
43
+ - `#crawler?`
44
+ - `#spam?`
45
+
46
+ In general, the `#bot?` method tends to cover all of these, and it's unlikely that anybody was getting this granular with their bot checking. So I see it as a small price to pay for an open and up-to-date bot list.
47
+
48
+ Also, the gem no longer extends `ActionDispatch::Request`; instead, it extends `Rack::Request`, which `ActionDispatch::Request` inherits from. This allows the same functionality for Rails while opening the gem up to other Rack-based projects.
49
+
50
+ FAQ
51
+ ---
52
+ __Q:__ __What's with the name?__
53
+ __A:__ It's the [machine in Blade Runner](http://en.wikipedia.org/wiki/Blade_Runner#Voight-Kampff_machine) that is used to test whether someone is a human or a replicant.
54
+
55
+ __Q:__ __I've found a bot that isn't being matched__
56
+ __A:__ The list is being pulled from [github.com/monperrus/crawler-user-agents](https://github.com/monperrus/crawler-user-agents).
57
+ If you'd like to have entries added to the list, please create a pull request with that project. Once that pull request is merged, feel free to create an issue here and I'll release a new gem version with the updated list. In the meantime you can always run `rake voight_kampff:import_user_agents` on your project to get that updated list.
58
+
59
+ __Q:__ __Why don't you use the user agent list from \_\_\_\_\_\_?__
60
+ __A:__ If you know of a better source for a list of bot user agent strings, please create an issue and let me know. I'm open to switching to a better source or supporting multiple sources. There are others out there but I like the openness of monperrus' list.
61
+
62
+ Thanks
63
+ ------
64
+ Thanks to [github.com/monperrus/crawler-user-agents](https://github.com/monperrus/crawler-user-agents) for providing an open and easily updatable list of bot user agents.
65
+
66
+ Contributing
67
+ ------------
68
+ PRs without tests will not get merged. Make sure you write tests for the API and the Rails app.
69
+ Feel free to ask for help if you do not know how to write a particular test.
70
+
71
+ Running Tests?
72
+ --------------
73
+
74
+ - `bundle install`
75
+ - `bundle exec rspec`
data/config.ru ADDED
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+
4
+ Bundler.require :default, :development
5
+
6
+ Combustion.initialize! :action_controller
7
+ run Combustion::Application
data/config/crawler-user-agents.json ADDED
@@ -0,0 +1,750 @@
1
+
2
+ [
3
+ {
4
+ "pattern": "googlebot\\/",
5
+ "url": "http://www.google.com/bot.html"
6
+ },
7
+ {
8
+ "pattern": "Googlebot-Mobile"
9
+ },
10
+ {
11
+ "pattern": "Googlebot-Image"
12
+ },
13
+ {
14
+ "pattern": "Mediapartners-Google",
15
+ "url": "https://support.google.com/webmasters/answer/1061943?hl=en"
16
+ },
17
+ {
18
+ "pattern": "bingbot",
19
+ "url": "http://www.bing.com/bingbot.htm"
20
+ },
21
+ {
22
+ "pattern": "slurp",
23
+ "url": "http://help.yahoo.com/help/us/ysearch/slurp"
24
+ },
25
+ {
26
+ "pattern": "java"
27
+ },
28
+ {
29
+ "pattern": "wget"
30
+ },
31
+ {
32
+ "pattern": "curl"
33
+ },
34
+ {
35
+ "pattern": "Commons-HttpClient"
36
+ },
37
+ {
38
+ "pattern": "Python-urllib"
39
+ },
40
+ {
41
+ "pattern": "libwww"
42
+ },
43
+ {
44
+ "pattern": "httpunit"
45
+ },
46
+ {
47
+ "pattern": "nutch"
48
+ },
49
+ {
50
+ "pattern": "phpcrawl",
51
+ "addition_date": "2012/09/17",
52
+ "url": "http://phpcrawl.cuab.de/"
53
+ },
54
+ {
55
+ "pattern": "msnbot",
56
+ "url": "http://search.msn.com/msnbot.htm"
57
+ },
58
+ {
59
+ "pattern": "jyxobot"
60
+ },
61
+ {
62
+ "pattern": "FAST-WebCrawler"
63
+ },
64
+ {
65
+ "pattern": "FAST Enterprise Crawler"
66
+ },
67
+ {
68
+ "pattern": "biglotron"
69
+ },
70
+ {
71
+ "pattern": "teoma"
72
+ },
73
+ {
74
+ "pattern": "convera"
75
+ },
76
+ {
77
+ "pattern": "seekbot"
78
+ },
79
+ {
80
+ "pattern": "gigablast",
81
+ "instances": ["Gigabot/2.0 (http://www.gigablast.com/spider.html)", "Gigabot/2.0 (http://www.gigablast.com/spider.html)", "GigablastOpenSource/1.0"],
82
+ "url": "https://github.com/gigablast/open-source-search-engine"
83
+ },
84
+ {
85
+ "pattern": "exabot"
86
+ },
87
+ {
88
+ "pattern": "ngbot"
89
+ },
90
+ {
91
+ "pattern": "ia_archiver"
92
+ },
93
+ {
94
+ "pattern": "GingerCrawler"
95
+ },
96
+ {
97
+ "pattern": "webmon "
98
+ },
99
+ {
100
+ "pattern": "httrack"
101
+ },
102
+ {
103
+ "pattern": "webcrawler"
104
+ },
105
+ {
106
+ "pattern": "grub.org"
107
+ },
108
+ {
109
+ "pattern": "UsineNouvelleCrawler"
110
+ },
111
+ {
112
+ "pattern": "antibot"
113
+ },
114
+ {
115
+ "pattern": "netresearchserver"
116
+ },
117
+ {
118
+ "pattern": "speedy"
119
+ },
120
+ {
121
+ "pattern": "fluffy"
122
+ },
123
+ {
124
+ "pattern": "bibnum.bnf"
125
+ },
126
+ {
127
+ "pattern": "findlink"
128
+ },
129
+ {
130
+ "pattern": "msrbot"
131
+ },
132
+ {
133
+ "pattern": "panscient"
134
+ },
135
+ {
136
+ "pattern": "yacybot"
137
+ },
138
+ {
139
+ "pattern": "AISearchBot"
140
+ },
141
+ {
142
+ "pattern": "IOI"
143
+ },
144
+ {
145
+ "pattern": "ips-agent"
146
+ },
147
+ {
148
+ "pattern": "tagoobot"
149
+ },
150
+ {
151
+ "pattern": "MJ12bot"
152
+ },
153
+ {
154
+ "pattern": "dotbot"
155
+ },
156
+ {
157
+ "pattern": "woriobot"
158
+ },
159
+ {
160
+ "pattern": "yanga"
161
+ },
162
+ {
163
+ "pattern": "buzzbot"
164
+ },
165
+ {
166
+ "pattern": "mlbot"
167
+ },
168
+ {
169
+ "pattern": "yandexbot",
170
+ "url": "http://yandex.com/bots",
171
+ "instances": ["Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"],
172
+ "addition_date": "2015/04/14"
173
+ },
174
+ {
175
+ "pattern": "purebot",
176
+ "addition_date": "2010/01/19"
177
+ },
178
+ {
179
+ "pattern": "Linguee Bot",
180
+ "addition_date": "2010/01/26",
181
+ "url": "http://www.linguee.com/bot"
182
+ },
183
+ {
184
+ "pattern": "Voyager",
185
+ "addition_date": "2010/02/01",
186
+ "url": "http://www.kosmix.com/crawler.html"
187
+ },
188
+ {
189
+ "pattern": "CyberPatrol",
190
+ "addition_date": "2010/02/11",
191
+ "url": "http://www.cyberpatrol.com/cyberpatrolcrawler.asp"
192
+ },
193
+ {
194
+ "pattern": "voilabot",
195
+ "addition_date": "2010/05/18"
196
+ },
197
+ {
198
+ "pattern": "baiduspider",
199
+ "addition_date": "2010/07/15",
200
+ "url": "http://www.baidu.jp/spider/"
201
+ },
202
+ {
203
+ "pattern": "citeseerxbot",
204
+ "addition_date": "2010/07/17"
205
+ },
206
+ {
207
+ "pattern": "spbot",
208
+ "addition_date": "2010/07/31",
209
+ "url": "http://www.seoprofiler.com/bot"
210
+ },
211
+ {
212
+ "pattern": "twengabot",
213
+ "addition_date": "2010/08/03",
214
+ "url": "http://www.twenga.com/bot.html"
215
+ },
216
+ {
217
+ "pattern": "postrank",
218
+ "addition_date": "2010/08/03",
219
+ "url": "http://www.postrank.com"
220
+ },
221
+ {
222
+ "pattern": "turnitinbot",
223
+ "addition_date": "2010/09/26",
224
+ "url": "http://www.turnitin.com"
225
+ },
226
+ {
227
+ "pattern": "scribdbot",
228
+ "addition_date": "2010/09/28",
229
+ "url": "http://www.scribd.com"
230
+ },
231
+ {
232
+ "pattern": "page2rss",
233
+ "addition_date": "2010/10/07",
234
+ "url": "http://www.page2rss.com"
235
+ },
236
+ {
237
+ "pattern": "sitebot",
238
+ "addition_date": "2010/12/15",
239
+ "url": "http://www.sitebot.org"
240
+ },
241
+ {
242
+ "pattern": "linkdex",
243
+ "addition_date": "2011/01/06",
244
+ "url": "http://www.linkdex.com"
245
+ },
246
+ {
247
+ "pattern": "Adidxbot",
248
+ "url": "http://onlinehelp.microsoft.com/en-us/bing/hh204496.aspx"
249
+ },
250
+ {
251
+ "pattern": "blekkobot",
252
+ "url": "http://blekko.com/about/blekkobot"
253
+ },
254
+ {
255
+ "pattern": "ezooms",
256
+ "addition_date": "2011/04/27",
257
+ "url": "http://www.phpbb.com/community/viewtopic.php?f=64&t=935605&start=450#p12948289"
258
+ },
259
+ {
260
+ "pattern": "dotbot",
261
+ "addition_date": "2011/04/27"
262
+ },
263
+ {
264
+ "pattern": "Mail.RU_Bot",
265
+ "addition_date": "2011/04/27",
266
+ "instances" : [
267
+ "Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/",
268
+ "Mozilla/5.0 (compatible; Mail.RU_Bot/2.0; +http://go.mail.ru/"
269
+ ]
270
+ },
271
+ {
272
+ "pattern": "discobot",
273
+ "addition_date": "2011/05/03",
274
+ "url": "http://discoveryengine.com/discobot.html"
275
+ },
276
+ {
277
+ "pattern": "heritrix",
278
+ "addition_date": "2011/06/21",
279
+ "url": "http://crawler.archive.org/"
280
+ },
281
+ {
282
+ "pattern": "findthatfile",
283
+ "addition_date": "2011/06/21",
284
+ "url": "http://www.findthatfile.com/"
285
+ },
286
+ {
287
+ "pattern": "europarchive.org",
288
+ "addition_date": "2011/06/21",
289
+ "url": ""
290
+ },
291
+ {
292
+ "pattern": "NerdByNature.Bot",
293
+ "addition_date": "2011/07/12",
294
+ "url": "http://www.nerdbynature.net/bot"
295
+ },
296
+ {
297
+ "pattern": "sistrix crawler",
298
+ "addition_date": "2011/08/02"
299
+ },
300
+ {
301
+ "pattern": "ahrefsbot",
302
+ "addition_date": "2011/08/28"
303
+ },
304
+ {
305
+ "pattern": "Aboundex",
306
+ "addition_date": "2011/09/28",
307
+ "url": "http://www.aboundex.com/crawler/"
308
+ },
309
+ {
310
+ "pattern": "domaincrawler",
311
+ "addition_date": "2011/10/21"
312
+ },
313
+ {
314
+ "pattern": "wbsearchbot",
315
+ "addition_date": "2011/12/21",
316
+ "url": "http://www.warebay.com/bot.html"
317
+ },
318
+ {
319
+ "pattern": "summify",
320
+ "addition_date": "2012/01/04",
321
+ "url": "http://summify.com"
322
+ },
323
+ {
324
+ "pattern": "ccbot",
325
+ "addition_date": "2012/02/05",
326
+ "url": "http://www.commoncrawl.org/bot.html"
327
+ },
328
+ {
329
+ "pattern": "edisterbot",
330
+ "addition_date": "2012/02/25"
331
+ },
332
+ {
333
+ "pattern": "seznambot",
334
+ "addition_date": "2012/03/14"
335
+ },
336
+ {
337
+ "pattern": "ec2linkfinder",
338
+ "addition_date": "2012/03/22"
339
+ },
340
+ {
341
+ "pattern": "gslfbot",
342
+ "addition_date": "2012/04/03"
343
+ },
344
+ {
345
+ "pattern": "aihitbot",
346
+ "addition_date": "2012/04/16"
347
+ },
348
+ {
349
+ "pattern": "intelium_bot",
350
+ "addition_date": "2012/05/07"
351
+ },
352
+ {
353
+ "pattern": "facebookexternalhit",
354
+ "addition_date": "2012/05/07"
355
+ },
356
+ {
357
+ "pattern": "yeti",
358
+ "addition_date": "2012/05/07"
359
+ },
360
+ {
361
+ "pattern": "RetrevoPageAnalyzer",
362
+ "addition_date": "2012/05/07"
363
+ },
364
+ {
365
+ "pattern": "lb-spider",
366
+ "addition_date": "2012/05/07"
367
+ },
368
+ {
369
+ "pattern": "sogou",
370
+ "addition_date": "2012/05/13",
371
+ "url": "http://www.sogou.com/docs/help/webmasters.htm#07"
372
+ },
373
+ {
374
+ "pattern": "lssbot",
375
+ "addition_date": "2012/05/15"
376
+ },
377
+ {
378
+ "pattern": "careerbot",
379
+ "addition_date": "2012/05/23",
380
+ "url": "http://www.career-x.de/bot.html"
381
+ },
382
+ {
383
+ "pattern": "wotbox",
384
+ "addition_date": "2012/06/12",
385
+ "url": "http://www.wotbox.com"
386
+ },
387
+ {
388
+ "pattern": "wocbot",
389
+ "addition_date": "2012/07/25",
390
+ "url": "http://www.wocodi.com/crawler"
391
+ },
392
+ {
393
+ "pattern": "ichiro",
394
+ "addition_date": "2012/08/28",
395
+ "url": "http://help.goo.ne.jp/help/article/1142"
396
+ },
397
+ {
398
+ "pattern": "DuckDuckBot",
399
+ "addition_date": "2012/09/19",
400
+ "url": "http://duckduckgo.com/duckduckbot.html"
401
+ },
402
+ {
403
+ "pattern": "lssrocketcrawler",
404
+ "addition_date": "2012/09/24"
405
+ },
406
+ {
407
+ "pattern": "drupact",
408
+ "addition_date": "2012/09/27",
409
+ "url": "http://www.arocom.de/drupact"
410
+ },
411
+ {
412
+ "pattern": "webcompanycrawler",
413
+ "addition_date": "2012/10/03"
414
+ },
415
+ {
416
+ "pattern": "acoonbot",
417
+ "addition_date": "2012/10/07",
418
+ "url": "http://www.acoon.de/robot.asp"
419
+ },
420
+ {
421
+ "pattern": "openindexspider",
422
+ "addition_date": "2012/10/26",
423
+ "url": "http://www.openindex.io/en/webmasters/spider.html"
424
+ },
425
+ {
426
+ "pattern": "gnam gnam spider",
427
+ "addition_date": "2012/10/31"
428
+ },
429
+ {
430
+ "pattern": "web-archive-net.com.bot"
431
+ },
432
+ {
433
+ "pattern": "backlinkcrawler",
434
+ "addition_date": "2013/01/04"
435
+ },
436
+ {
437
+ "pattern": "coccoc",
438
+ "addition_date": "2013/01/04",
439
+ "url": "http://help.coccoc.vn/"
440
+ },
441
+ {
442
+ "pattern": "integromedb",
443
+ "addition_date": "2013/01/10",
444
+ "url": "http://www.integromedb.org/Crawler"
445
+ },
446
+ {
447
+ "pattern": "content crawler spider",
448
+ "addition_date": "2013/01/11"
449
+ },
450
+ {
451
+ "pattern": "toplistbot",
452
+ "addition_date": "2013/02/05"
453
+ },
454
+ {
455
+ "pattern": "seokicks-robot",
456
+ "addition_date": "2013/02/25"
457
+ },
458
+ {
459
+ "pattern": "it2media-domain-crawler",
460
+ "addition_date": "2013/03/12"
461
+ },
462
+ {
463
+ "pattern": "ip-web-crawler.com",
464
+ "addition_date": "2013/03/22"
465
+ },
466
+ {
467
+ "pattern": "siteexplorer.info",
468
+ "addition_date": "2013/05/01"
469
+ },
470
+ {
471
+ "pattern": "elisabot",
472
+ "addition_date": "2013/06/27"
473
+ },
474
+ {
475
+ "pattern": "proximic",
476
+ "addition_date": "2013/09/12",
477
+ "url": "http://www.proximic.com/info/spider.php"
478
+ },
479
+ {
480
+ "pattern": "changedetection",
481
+ "addition_date": "2013/09/13",
482
+ "url": "http://www.changedetection.com/bot.html"
483
+ },
484
+ {
485
+ "pattern": "blexbot",
486
+ "addition_date": "2013/10/03",
487
+ "url": "http://webmeup-crawler.com/"
488
+ },
489
+ {
490
+ "pattern": "arabot",
491
+ "addition_date": "2013/10/09"
492
+ },
493
+ {
494
+ "pattern": "WeSEE:Search",
495
+ "addition_date": "2013/11/18"
496
+ },
497
+ {
498
+ "pattern": "niki-bot",
499
+ "addition_date": "2014/01/01"
500
+ },
501
+ {
502
+ "pattern": "CrystalSemanticsBot",
503
+ "addition_date": "2014/02/17",
504
+ "url": "http://www.crystalsemantics.com/user-agent/"
505
+ },
506
+ {
507
+ "pattern": "rogerbot",
508
+ "addition_date": "2014/02/28",
509
+ "url": "http://moz.com/help/pro/what-is-rogerbot-"
510
+ },
511
+ {
512
+ "pattern": "360Spider",
513
+ "addition_date": "2014/03/14",
514
+ "url": "http://needs-be.blogspot.co.uk/2013/02/how-to-block-spider360.html"
515
+ },
516
+ {
517
+ "pattern": "psbot",
518
+ "addition_date": "2014/03/31",
519
+ "url": "http://www.picsearch.com/bot.html"
520
+ },
521
+ {
522
+ "pattern": "InterfaxScanBot",
523
+ "addition_date": "2014/03/31",
524
+ "url": "http://scan-interfax.ru"
525
+ },
526
+ {
527
+ "pattern": "Lipperhey SEO Service",
528
+ "addition_date": "2014/04/01",
529
+ "url": "http://www.lipperhey.com/"
530
+ },
531
+ {
532
+ "pattern": "CC Metadata Scaper",
533
+ "addition_date": "2014/04/01",
534
+ "url": "http://wiki.creativecommons.org/Metadata_Scraper"
535
+ },
536
+ {
537
+ "pattern": "g00g1e.net",
538
+ "addition_date": "2014/04/01",
539
+ "url": "http://www.g00g1e.net/"
540
+ },
541
+ {
542
+ "pattern": "GrapeshotCrawler",
543
+ "addition_date": "2014/04/01",
544
+ "url": "http://www.grapeshot.co.uk/crawler.php"
545
+ },
546
+ {
547
+ "pattern": "urlappendbot",
548
+ "addition_date": "2014/05/10",
549
+ "url": "http://www.profound.net/urlappendbot.html"
550
+ },
551
+ {
552
+ "pattern": "brainobot",
553
+ "addition_date": "2014/06/24"
554
+ },
555
+ {
556
+ "pattern": "fr-crawler",
557
+ "addition_date": "2014/07/31",
558
+ "instances": ["Mozilla/5.0 (compatible; fr-crawler/1.1)"]
559
+ },
560
+ {
561
+ "pattern": "binlar",
562
+ "addition_date": "2014/09/12",
563
+ "instances": [
564
+ "binlar_2.6.3 binlar2.6.3@unspecified.mail",
565
+ "binlar_2.6.3 binlar_2.6.3@unspecified.mail",
566
+ "binlar_2.6.3 larbin2.6.3@unspecified.mail",
567
+ "binlar_2.6.3 phanendra_kalapala@McAfee.com",
568
+ "binlar_2.6.3 test@mgmt.mic"
569
+ ]
570
+ },
571
+ {
572
+ "pattern": "SimpleCrawler",
573
+ "addition_date": "2014/09/12",
574
+ "instances": ["SimpleCrawler/0.1" ]
575
+ },
576
+ {
577
+ "pattern": "Livelapbot",
578
+ "addition_date": "2014/09/12",
579
+ "instances": ["Livelapbot/0.1" ]
580
+ },
581
+ {
582
+ "pattern": "Twitterbot",
583
+ "addition_date": "2014/09/12",
584
+ "instances": ["Twitterbot/0.1", "Twitterbot/1.0" ]
585
+ },
586
+ {
587
+ "pattern": "cXensebot",
588
+ "addition_date": "2014/10/05",
589
+ "instances": ["cXensebot/1.1a"],
590
+ "url": "http://www.cxense.com/bot.html"
591
+ },
592
+ {
593
+ "pattern": "smtbot",
594
+ "addition_date": "2014/10/04",
595
+ "instances": ["Mozilla/5.0 (compatible; SMTBot/1.0; +http://www.similartech.com/smtbot)", "SMTBot (similartech.com/smtbot)"],
596
+ "url": "http://www.similartech.com/smtbot"
597
+ },
598
+ {
599
+ "pattern": "bnf.fr_bot",
600
+ "addition_date": "2014/11/18",
601
+ "url": "http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html",
602
+ "instances": ["Mozilla/5.0 (compatible; bnf.fr_bot; +http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html)"]
603
+ },
604
+ {
605
+ "pattern": "A6-Indexer",
606
+ "addition_date": "2014/12/05",
607
+ "url": "http://www.a6corp.com/a6-web-scraping-policy/",
608
+ "instances": ["A6-Indexer"]
609
+ },
610
+ {
611
+ "pattern": "ADmantX",
612
+ "addition_date": "2014/12/05",
613
+ "url": "http://www.admantx.com",
614
+ "instances": ["ADmantX Platform Semantic Analyzer - ADmantX Inc. - www.admantx.com - support@admantx.com"]
615
+ },
616
+ {
617
+ "pattern": "Facebot",
618
+ "url": "https://developers.facebook.com/docs/sharing/best-practices#crawl",
619
+ "addition_date": "2014/12/30"
620
+ },
621
+ {
622
+ "pattern": "Twitterbot",
623
+ "url": "https://dev.twitter.com/cards/getting-started",
624
+ "addition_date": "2014/12/30"
625
+ },
626
+ {
627
+ "pattern": "OrangeBot",
628
+ "instances": ["Mozilla/5.0 (compatible; OrangeBot/2.0; support.orangebot@orange.com"],
629
+ "addition_date": "2015/01/12"
630
+ },
631
+ {
632
+ "pattern": "memorybot",
633
+ "url": "http://mignify.com/bot.htm",
634
+ "instances": ["Mozilla/5.0 (compatible; memorybot/1.21.14 +http://mignify.com/bot.html)"],
635
+ "addition_date": "2015/02/01"
636
+ },
637
+ {
638
+ "pattern": "AdvBot",
639
+ "url": "http://advbot.net/bot.html",
640
+ "instances": ["Mozilla/5.0 (compatible; AdvBot/2.0; +http://advbot.net/bot.html)"],
641
+ "addition_date": "2015/02/01"
642
+ },
643
+ {
644
+ "pattern": "MegaIndex",
645
+ "url": "https://www.megaindex.ru/?tab=linkAnalyze",
646
+ "instances": ["Mozilla/5.0 (compatible; MegaIndex.ru/2.0; +https://www.megaindex.ru/?tab=linkAnalyze)"],
647
+ "addition_date": "2015/03/28"
648
+ },
649
+ {
650
+ "pattern": "SemanticScholarBot",
651
+ "url": "http://s2.allenai.org/bot.html",
652
+ "instances": ["SemanticScholarBot/1.0 (+http://s2.allenai.org/bot.html)"],
653
+ "addition_date": "2015/03/28"
654
+ },
655
+ {
656
+ "pattern": "ltx71",
657
+ "url": "http://ltx71.com/",
658
+ "instances": ["ltx71 - (http://ltx71.com/)"],
659
+ "addition_date": "2015/04/04"
660
+ },
661
+ {
662
+ "pattern": "nerdybot",
663
+ "url": "http://nerdybot.com/",
664
+ "instances": ["nerdybot"],
665
+ "addition_date": "2015/04/05"
666
+ },
667
+ {
668
+ "pattern": "xovibot",
669
+ "url": "http://www.xovibot.net/",
670
+ "instances": ["Mozilla/5.0 (compatible; XoviBot/2.0; +http://www.xovibot.net/)"],
671
+ "addition_date": "2015/04/05"
672
+ },
673
+ {
674
+ "pattern": "BUbiNG",
675
+ "url": "http://law.di.unimi.it/BUbiNG.html",
676
+ "instances": ["BUbiNG (+http://law.di.unimi.it/BUbiNG.html)"],
677
+ "addition_date": "2015/04/06"
678
+ },
679
+ {
680
+ "pattern": "Qwantify",
681
+ "url": "https://www.qwant.com/",
682
+ "instances": ["Mozilla/5.0 (compatible; Qwantify/2.0n; +https://www.qwant.com/)/*"],
683
+ "addition_date": "2015/04/06"
684
+ },
685
+ {
686
+ "pattern": "archive.org_bot",
687
+ "url": "http://www.archive.org/details/archive.org_bot",
688
+ "instances": ["Mozilla/5.0 (compatible; archive.org_bot +http://www.archive.org/details/archive.org_bot)"],
689
+ "addition_date": "2015/04/14"
690
+ },
691
+ {
692
+ "pattern": "Applebot",
693
+ "url": "http://www.apple.com/go/applebot",
694
+ "addition_date": "2015/04/15"
695
+ },
696
+ {
697
+ "pattern": "TweetmemeBot",
698
+ "url": "http://datasift.com/bot.html",
699
+ "instances": ["Mozilla/5.0 (TweetmemeBot/4.0; +http://datasift.com/bot.html) Gecko/20100101 Firefox/31.0"],
700
+ "addition_date": "2015/04/15"
701
+ },
702
+ {
703
+ "pattern": "crawler4j",
704
+ "url": "https://github.com/yasserg/crawler4j",
705
+ "instances": ["crawler4j (http://code.google.com/p/crawler4j/)"],
706
+ "addition_date": "2015/05/07"
707
+ },
708
+ {
709
+ "pattern": "findxbot",
710
+ "url": "http://www.findxbot.com",
711
+ "instances": ["Mozilla/5.0 (compatible; Findxbot/1.0; +http://www.findxbot.com)"],
712
+ "addition_date": "2015/05/07"
713
+ },
714
+ {
715
+ "pattern": "SemrushBot",
716
+ "url": "http://www.semrush.com/bot.html",
717
+ "instances": ["Mozilla/5.0 (compatible; SemrushBot/0.98~bl; +http://www.semrush.com/bot.html)"],
718
+ "addition_date": "2015/05/26"
719
+ },
720
+ {
721
+ "pattern": "yoozBot",
722
+ "url": "http://yooz.ir",
723
+ "instances": ["Mozilla/5.0 (compatible; yoozBot-2.2; http://yooz.ir; info@yooz.ir)"],
724
+ "addition_date": "2015/05/26"
725
+ },
726
+ {
727
+ "pattern": "lipperhey",
728
+ "url": "http://www.lipperhey.com/",
729
+ "instances": ["Mozilla/5.0 (compatible; Lipperhey Link Explorer; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey SEO Service; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey Site Explorer; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey-Kaus-Australis/5.0; +https://www.lipperhey.com/en/about/)"],
730
+ "addition_date": "2015/08/26"
731
+ },
732
+ {
733
+ "pattern": "y!j-asr",
734
+ "url": "http://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/",
735
+ "instances": ["Y!J-ASR/0.1 crawler (http://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/)"],
736
+ "addition_date": "2015/05/26"
737
+ },
738
+ {
739
+ "pattern": "Domain Re-Animator Bot",
740
+ "url": "http://domainreanimator.com",
741
+ "instances": ["Domain Re-Animator Bot (http://domainreanimator.com) - support@domainreanimator.com"],
742
+ "addition_date": "2015/04/14"
743
+ },
744
+ {
745
+ "pattern": "AddThis",
746
+ "url": "https://www.addthis.com",
747
+ "instances": ["AddThis.com robot tech.support@clearspring.com"],
748
+ "addition_date": "2015/06/02"
749
+ }
750
+ ]