isbot 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e6dce122546bc6c2b1db39b6aad3efc3050a8296
4
+ data.tar.gz: 57bae46cea189c699d4b49b25bc5945a77a753df
5
+ SHA512:
6
+ metadata.gz: 42f6b6810f5b52494ea6c108a20e1414dc454cd433d1a06e9b24cfe1a6d188d093724d1f0593becf2c85a771b028775f3e60a28516e65d55f94ff0432f0716f4
7
+ data.tar.gz: cfd588c79b8275c88b49f14f593e56bd0ac42b11d76b89e48c4676544655d25843a71fcba21bb8a4ff758d59717f5cdb0a7f3bb28e004bb2df4b1ad8073c4328
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # isbot
2
+
3
+ #### Install
4
+ ```` bash
5
+ gem install isbot
6
+ ````
7
+ #### Usage
8
+
9
+ ```` ruby
10
+ require 'isbot'
11
+
12
+ user_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
13
+
14
+ # Three forms of use:
15
+
16
+ # 1. Use the is_bot function
17
+ puts is_bot user_agent # true
18
+
19
+ # 2. Use the monkey-patched String#is_bot? method
20
+ puts user_agent.is_bot? # true
21
+
22
+ # 3. Use the is_bot function with a block (the block receives the match)
23
+ is_bot user_agent do |match_bot|
24
+ puts match_bot # Googlebot/
25
+ end
26
+ ````
data/data/data.json ADDED
@@ -0,0 +1,979 @@
1
+ [
2
+ {
3
+ "pattern": "Googlebot\\/",
4
+ "url": "http://www.google.com/bot.html"
5
+ },
6
+ {
7
+ "pattern": "Googlebot-Mobile"
8
+ },
9
+ {
10
+ "pattern": "Googlebot-Image"
11
+ },
12
+ {
13
+ "pattern": "Googlebot-News"
14
+ },
15
+ {
16
+ "pattern": "Googlebot-Video"
17
+ },
18
+ {
19
+ "pattern": "AdsBot-Google",
20
+ "url": "https://support.google.com/webmasters/answer/1061943?hl=en"
21
+ },
22
+ {
23
+ "pattern": "Mediapartners-Google",
24
+ "url": "https://support.google.com/webmasters/answer/1061943?hl=en"
25
+ },
26
+ {
27
+ "pattern": "bingbot",
28
+ "url": "http://www.bing.com/bingbot.htm"
29
+ },
30
+ {
31
+ "pattern": "slurp",
32
+ "url": "http://help.yahoo.com/help/us/ysearch/slurp"
33
+ },
34
+ {
35
+ "pattern": "java"
36
+ },
37
+ {
38
+ "pattern": "wget"
39
+ },
40
+ {
41
+ "pattern": "curl"
42
+ },
43
+ {
44
+ "pattern": "Commons-HttpClient"
45
+ },
46
+ {
47
+ "pattern": "Python-urllib"
48
+ },
49
+ {
50
+ "pattern": "libwww"
51
+ },
52
+ {
53
+ "pattern": "httpunit"
54
+ },
55
+ {
56
+ "pattern": "nutch"
57
+ },
58
+ {
59
+ "pattern": "Go-http-client",
60
+ "addition_date": "2016/03/26",
61
+ "url": "https://golang.org/pkg/net/http/",
62
+ "instances": ["Go-http-client/1.1"]
63
+ },
64
+ {
65
+ "pattern": "phpcrawl",
66
+ "addition_date": "2012/09/17",
67
+ "url": "http://phpcrawl.cuab.de/"
68
+ },
69
+ {
70
+ "pattern": "msnbot",
71
+ "url": "http://search.msn.com/msnbot.htm"
72
+ },
73
+ {
74
+ "pattern": "jyxobot"
75
+ },
76
+ {
77
+ "pattern": "FAST-WebCrawler"
78
+ },
79
+ {
80
+ "pattern": "FAST Enterprise Crawler"
81
+ },
82
+ {
83
+ "pattern": "biglotron"
84
+ },
85
+ {
86
+ "pattern": "teoma"
87
+ },
88
+ {
89
+ "pattern": "convera"
90
+ },
91
+ {
92
+ "pattern": "seekbot"
93
+ },
94
+ {
95
+ "pattern": "gigabot",
96
+ "instances": ["Gigabot/1.0", "Gigabot/2.0 (http://www.gigablast.com/spider.html)", "Gigabot/2.0 (http://www.gigablast.com/spider.html)"],
97
+ "url": "https://github.com/gigablast/open-source-search-engine"
98
+ },
99
+ {
100
+ "pattern": "gigablast",
101
+ "instances": ["GigablastOpenSource/1.0"],
102
+ "url": "https://github.com/gigablast/open-source-search-engine"
103
+ },
104
+ {
105
+ "pattern": "exabot"
106
+ },
107
+ {
108
+ "pattern": "ngbot"
109
+ },
110
+ {
111
+ "pattern": "ia_archiver"
112
+ },
113
+ {
114
+ "pattern": "GingerCrawler"
115
+ },
116
+ {
117
+ "pattern": "webmon "
118
+ },
119
+ {
120
+ "pattern": "httrack"
121
+ },
122
+ {
123
+ "pattern": "webcrawler"
124
+ },
125
+ {
126
+ "pattern": "grub.org"
127
+ },
128
+ {
129
+ "pattern": "UsineNouvelleCrawler"
130
+ },
131
+ {
132
+ "pattern": "antibot"
133
+ },
134
+ {
135
+ "pattern": "netresearchserver"
136
+ },
137
+ {
138
+ "pattern": "speedy"
139
+ },
140
+ {
141
+ "pattern": "fluffy"
142
+ },
143
+ {
144
+ "pattern": "bibnum.bnf"
145
+ },
146
+ {
147
+ "pattern": "findlink"
148
+ },
149
+ {
150
+ "pattern": "msrbot"
151
+ },
152
+ {
153
+ "pattern": "panscient"
154
+ },
155
+ {
156
+ "pattern": "yacybot"
157
+ },
158
+ {
159
+ "pattern": "AISearchBot"
160
+ },
161
+ {
162
+ "pattern": "IOI"
163
+ },
164
+ {
165
+ "pattern": "ips-agent"
166
+ },
167
+ {
168
+ "pattern": "tagoobot"
169
+ },
170
+ {
171
+ "pattern": "MJ12bot"
172
+ },
173
+ {
174
+ "pattern": "dotbot"
175
+ },
176
+ {
177
+ "pattern": "woriobot"
178
+ },
179
+ {
180
+ "pattern": "yanga"
181
+ },
182
+ {
183
+ "pattern": "buzzbot"
184
+ },
185
+ {
186
+ "pattern": "mlbot"
187
+ },
188
+ {
189
+ "pattern": "yandexbot",
190
+ "url": "http://yandex.com/bots",
191
+ "instances": ["Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"],
192
+ "addition_date": "2015/04/14"
193
+ },
194
+ {
195
+ "pattern": "yandex.com\\/bots",
196
+ "url": "https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.xml#robot-in-logs",
197
+ "instances": ["Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots)"],
198
+ "addition_date": "2016/12/01"
199
+ },
200
+ {
201
+ "pattern": "purebot",
202
+ "addition_date": "2010/01/19"
203
+ },
204
+ {
205
+ "pattern": "Linguee Bot",
206
+ "addition_date": "2010/01/26",
207
+ "url": "http://www.linguee.com/bot"
208
+ },
209
+ {
210
+ "pattern": "CyberPatrol",
211
+ "addition_date": "2010/02/11",
212
+ "url": "http://www.cyberpatrol.com/cyberpatrolcrawler.asp"
213
+ },
214
+ {
215
+ "pattern": "voilabot",
216
+ "addition_date": "2010/05/18"
217
+ },
218
+ {
219
+ "pattern": "baiduspider",
220
+ "addition_date": "2010/07/15",
221
+ "url": "http://www.baidu.jp/spider/"
222
+ },
223
+ {
224
+ "pattern": "citeseerxbot",
225
+ "addition_date": "2010/07/17"
226
+ },
227
+ {
228
+ "pattern": "spbot",
229
+ "addition_date": "2010/07/31",
230
+ "url": "http://www.seoprofiler.com/bot"
231
+ },
232
+ {
233
+ "pattern": "twengabot",
234
+ "addition_date": "2010/08/03",
235
+ "url": "http://www.twenga.com/bot.html"
236
+ },
237
+ {
238
+ "pattern": "postrank",
239
+ "addition_date": "2010/08/03",
240
+ "url": "http://www.postrank.com"
241
+ },
242
+ {
243
+ "pattern": "turnitinbot",
244
+ "addition_date": "2010/09/26",
245
+ "url": "http://www.turnitin.com"
246
+ },
247
+ {
248
+ "pattern": "scribdbot",
249
+ "addition_date": "2010/09/28",
250
+ "url": "http://www.scribd.com"
251
+ },
252
+ {
253
+ "pattern": "page2rss",
254
+ "addition_date": "2010/10/07",
255
+ "url": "http://www.page2rss.com"
256
+ },
257
+ {
258
+ "pattern": "sitebot",
259
+ "addition_date": "2010/12/15",
260
+ "url": "http://www.sitebot.org"
261
+ },
262
+ {
263
+ "pattern": "linkdex",
264
+ "addition_date": "2011/01/06",
265
+ "url": "http://www.linkdex.com"
266
+ },
267
+ {
268
+ "pattern": "Adidxbot",
269
+ "url": "http://onlinehelp.microsoft.com/en-us/bing/hh204496.aspx"
270
+ },
271
+ {
272
+ "pattern": "blekkobot",
273
+ "url": "http://blekko.com/about/blekkobot"
274
+ },
275
+ {
276
+ "pattern": "ezooms",
277
+ "addition_date": "2011/04/27",
278
+ "url": "http://www.phpbb.com/community/viewtopic.php?f=64&t=935605&start=450#p12948289"
279
+ },
280
+ {
281
+ "pattern": "dotbot",
282
+ "addition_date": "2011/04/27"
283
+ },
284
+ {
285
+ "pattern": "Mail.RU_Bot",
286
+ "addition_date": "2011/04/27",
287
+ "instances" : [
288
+ "Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/",
289
+ "Mozilla/5.0 (compatible; Mail.RU_Bot/2.0; +http://go.mail.ru/"
290
+ ]
291
+ },
292
+ {
293
+ "pattern": "discobot",
294
+ "addition_date": "2011/05/03",
295
+ "url": "http://discoveryengine.com/discobot.html"
296
+ },
297
+ {
298
+ "pattern": "heritrix",
299
+ "addition_date": "2011/06/21",
300
+ "url": "http://crawler.archive.org/"
301
+ },
302
+ {
303
+ "pattern": "findthatfile",
304
+ "addition_date": "2011/06/21",
305
+ "url": "http://www.findthatfile.com/"
306
+ },
307
+ {
308
+ "pattern": "europarchive.org",
309
+ "addition_date": "2011/06/21",
310
+ "url": ""
311
+ },
312
+ {
313
+ "pattern": "NerdByNature.Bot",
314
+ "addition_date": "2011/07/12",
315
+ "url": "http://www.nerdbynature.net/bot"
316
+ },
317
+ {
318
+ "pattern": "sistrix crawler",
319
+ "addition_date": "2011/08/02"
320
+ },
321
+ {
322
+ "pattern": "ahrefsbot",
323
+ "addition_date": "2011/08/28"
324
+ },
325
+ {
326
+ "pattern": "Aboundex",
327
+ "addition_date": "2011/09/28",
328
+ "url": "http://www.aboundex.com/crawler/"
329
+ },
330
+ {
331
+ "pattern": "domaincrawler",
332
+ "addition_date": "2011/10/21"
333
+ },
334
+ {
335
+ "pattern": "wbsearchbot",
336
+ "addition_date": "2011/12/21",
337
+ "url": "http://www.warebay.com/bot.html"
338
+ },
339
+ {
340
+ "pattern": "summify",
341
+ "addition_date": "2012/01/04",
342
+ "url": "http://summify.com"
343
+ },
344
+ {
345
+ "pattern": "ccbot",
346
+ "addition_date": "2012/02/05",
347
+ "url": "http://www.commoncrawl.org/bot.html"
348
+ },
349
+ {
350
+ "pattern": "edisterbot",
351
+ "addition_date": "2012/02/25"
352
+ },
353
+ {
354
+ "pattern": "seznambot",
355
+ "addition_date": "2012/03/14"
356
+ },
357
+ {
358
+ "pattern": "ec2linkfinder",
359
+ "addition_date": "2012/03/22"
360
+ },
361
+ {
362
+ "pattern": "gslfbot",
363
+ "addition_date": "2012/04/03"
364
+ },
365
+ {
366
+ "pattern": "aihitbot",
367
+ "addition_date": "2012/04/16"
368
+ },
369
+ {
370
+ "pattern": "intelium_bot",
371
+ "addition_date": "2012/05/07"
372
+ },
373
+ {
374
+ "pattern": "facebookexternalhit",
375
+ "addition_date": "2012/05/07"
376
+ },
377
+ {
378
+ "pattern": "yeti",
379
+ "addition_date": "2012/05/07"
380
+ },
381
+ {
382
+ "pattern": "RetrevoPageAnalyzer",
383
+ "addition_date": "2012/05/07"
384
+ },
385
+ {
386
+ "pattern": "lb-spider",
387
+ "addition_date": "2012/05/07"
388
+ },
389
+ {
390
+ "pattern": "sogou",
391
+ "addition_date": "2012/05/13",
392
+ "url": "http://www.sogou.com/docs/help/webmasters.htm#07"
393
+ },
394
+ {
395
+ "pattern": "lssbot",
396
+ "addition_date": "2012/05/15"
397
+ },
398
+ {
399
+ "pattern": "careerbot",
400
+ "addition_date": "2012/05/23",
401
+ "url": "http://www.career-x.de/bot.html"
402
+ },
403
+ {
404
+ "pattern": "wotbox",
405
+ "addition_date": "2012/06/12",
406
+ "url": "http://www.wotbox.com"
407
+ },
408
+ {
409
+ "pattern": "wocbot",
410
+ "addition_date": "2012/07/25",
411
+ "url": "http://www.wocodi.com/crawler"
412
+ },
413
+ {
414
+ "pattern": "ichiro",
415
+ "addition_date": "2012/08/28",
416
+ "url": "http://help.goo.ne.jp/help/article/1142"
417
+ },
418
+ {
419
+ "pattern": "DuckDuckBot",
420
+ "addition_date": "2012/09/19",
421
+ "url": "http://duckduckgo.com/duckduckbot.html"
422
+ },
423
+ {
424
+ "pattern": "lssrocketcrawler",
425
+ "addition_date": "2012/09/24"
426
+ },
427
+ {
428
+ "pattern": "drupact",
429
+ "addition_date": "2012/09/27",
430
+ "url": "http://www.arocom.de/drupact"
431
+ },
432
+ {
433
+ "pattern": "webcompanycrawler",
434
+ "addition_date": "2012/10/03"
435
+ },
436
+ {
437
+ "pattern": "acoonbot",
438
+ "addition_date": "2012/10/07",
439
+ "url": "http://www.acoon.de/robot.asp"
440
+ },
441
+ {
442
+ "pattern": "openindexspider",
443
+ "addition_date": "2012/10/26",
444
+ "url": "http://www.openindex.io/en/webmasters/spider.html"
445
+ },
446
+ {
447
+ "pattern": "gnam gnam spider",
448
+ "addition_date": "2012/10/31"
449
+ },
450
+ {
451
+ "pattern": "web-archive-net.com.bot"
452
+ },
453
+ {
454
+ "pattern": "backlinkcrawler",
455
+ "addition_date": "2013/01/04"
456
+ },
457
+ {
458
+ "pattern": "coccoc",
459
+ "addition_date": "2013/01/04",
460
+ "url": "http://help.coccoc.vn/"
461
+ },
462
+ {
463
+ "pattern": "integromedb",
464
+ "addition_date": "2013/01/10",
465
+ "url": "http://www.integromedb.org/Crawler"
466
+ },
467
+ {
468
+ "pattern": "content crawler spider",
469
+ "addition_date": "2013/01/11"
470
+ },
471
+ {
472
+ "pattern": "toplistbot",
473
+ "addition_date": "2013/02/05"
474
+ },
475
+ {
476
+ "pattern": "seokicks-robot",
477
+ "addition_date": "2013/02/25"
478
+ },
479
+ {
480
+ "pattern": "it2media-domain-crawler",
481
+ "addition_date": "2013/03/12"
482
+ },
483
+ {
484
+ "pattern": "ip-web-crawler.com",
485
+ "addition_date": "2013/03/22"
486
+ },
487
+ {
488
+ "pattern": "siteexplorer.info",
489
+ "addition_date": "2013/05/01"
490
+ },
491
+ {
492
+ "pattern": "elisabot",
493
+ "addition_date": "2013/06/27"
494
+ },
495
+ {
496
+ "pattern": "proximic",
497
+ "addition_date": "2013/09/12",
498
+ "url": "http://www.proximic.com/info/spider.php"
499
+ },
500
+ {
501
+ "pattern": "changedetection",
502
+ "addition_date": "2013/09/13",
503
+ "url": "http://www.changedetection.com/bot.html"
504
+ },
505
+ {
506
+ "pattern": "blexbot",
507
+ "addition_date": "2013/10/03",
508
+ "url": "http://webmeup-crawler.com/"
509
+ },
510
+ {
511
+ "pattern": "arabot",
512
+ "addition_date": "2013/10/09"
513
+ },
514
+ {
515
+ "pattern": "WeSEE:Search",
516
+ "addition_date": "2013/11/18"
517
+ },
518
+ {
519
+ "pattern": "niki-bot",
520
+ "addition_date": "2014/01/01"
521
+ },
522
+ {
523
+ "pattern": "CrystalSemanticsBot",
524
+ "addition_date": "2014/02/17",
525
+ "url": "http://www.crystalsemantics.com/user-agent/"
526
+ },
527
+ {
528
+ "pattern": "rogerbot",
529
+ "addition_date": "2014/02/28",
530
+ "url": "http://moz.com/help/pro/what-is-rogerbot-"
531
+ },
532
+ {
533
+ "pattern": "360Spider",
534
+ "addition_date": "2014/03/14",
535
+ "url": "http://needs-be.blogspot.co.uk/2013/02/how-to-block-spider360.html"
536
+ },
537
+ {
538
+ "pattern": "psbot",
539
+ "addition_date": "2014/03/31",
540
+ "url": "http://www.picsearch.com/bot.html"
541
+ },
542
+ {
543
+ "pattern": "InterfaxScanBot",
544
+ "addition_date": "2014/03/31",
545
+ "url": "http://scan-interfax.ru"
546
+ },
547
+ {
548
+ "pattern": "Lipperhey SEO Service",
549
+ "addition_date": "2014/04/01",
550
+ "url": "http://www.lipperhey.com/"
551
+ },
552
+ {
553
+ "pattern": "CC Metadata Scaper",
554
+ "addition_date": "2014/04/01",
555
+ "url": "http://wiki.creativecommons.org/Metadata_Scraper"
556
+ },
557
+ {
558
+ "pattern": "g00g1e.net",
559
+ "addition_date": "2014/04/01",
560
+ "url": "http://www.g00g1e.net/"
561
+ },
562
+ {
563
+ "pattern": "GrapeshotCrawler",
564
+ "addition_date": "2014/04/01",
565
+ "url": "http://www.grapeshot.co.uk/crawler.php"
566
+ },
567
+ {
568
+ "pattern": "urlappendbot",
569
+ "addition_date": "2014/05/10",
570
+ "url": "http://www.profound.net/urlappendbot.html"
571
+ },
572
+ {
573
+ "pattern": "brainobot",
574
+ "addition_date": "2014/06/24"
575
+ },
576
+ {
577
+ "pattern": "fr-crawler",
578
+ "addition_date": "2014/07/31",
579
+ "instances": ["Mozilla/5.0 (compatible; fr-crawler/1.1)"]
580
+ },
581
+ {
582
+ "pattern": "binlar",
583
+ "addition_date": "2014/09/12",
584
+ "instances": [
585
+ "binlar_2.6.3 binlar2.6.3@unspecified.mail",
586
+ "binlar_2.6.3 binlar_2.6.3@unspecified.mail",
587
+ "binlar_2.6.3 larbin2.6.3@unspecified.mail",
588
+ "binlar_2.6.3 phanendra_kalapala@McAfee.com",
589
+ "binlar_2.6.3 test@mgmt.mic"
590
+ ]
591
+ },
592
+ {
593
+ "pattern": "SimpleCrawler",
594
+ "addition_date": "2014/09/12",
595
+ "instances": ["SimpleCrawler/0.1" ]
596
+ },
597
+ {
598
+ "pattern": "Livelapbot",
599
+ "addition_date": "2014/09/12",
600
+ "instances": ["Livelapbot/0.1" ]
601
+ },
602
+ {
603
+ "pattern": "Twitterbot",
604
+ "addition_date": "2014/09/12",
605
+ "instances": ["Twitterbot/0.1", "Twitterbot/1.0" ]
606
+ },
607
+ {
608
+ "pattern": "cXensebot",
609
+ "addition_date": "2014/10/05",
610
+ "instances": ["cXensebot/1.1a"],
611
+ "url": "http://www.cxense.com/bot.html"
612
+ },
613
+ {
614
+ "pattern": "smtbot",
615
+ "addition_date": "2014/10/04",
616
+ "instances": ["Mozilla/5.0 (compatible; SMTBot/1.0; +http://www.similartech.com/smtbot)", "SMTBot (similartech.com/smtbot)"],
617
+ "url": "http://www.similartech.com/smtbot"
618
+ },
619
+ {
620
+ "pattern": "bnf.fr_bot",
621
+ "addition_date": "2014/11/18",
622
+ "url": "http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html",
623
+ "instances": ["Mozilla/5.0 (compatible; bnf.fr_bot; +http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html)"]
624
+ },
625
+ {
626
+ "pattern": "A6-Indexer",
627
+ "addition_date": "2014/12/05",
628
+ "url": "http://www.a6corp.com/a6-web-scraping-policy/",
629
+ "instances": ["A6-Indexer"]
630
+ },
631
+ {
632
+ "pattern": "ADmantX",
633
+ "addition_date": "2014/12/05",
634
+ "url": "http://www.admantx.com",
635
+ "instances": ["ADmantX Platform Semantic Analyzer - ADmantX Inc. - www.admantx.com - support@admantx.com"]
636
+ },
637
+ {
638
+ "pattern": "Facebot",
639
+ "url": "https://developers.facebook.com/docs/sharing/best-practices#crawl",
640
+ "addition_date": "2014/12/30"
641
+ },
642
+ {
643
+ "pattern": "Twitterbot",
644
+ "url": "https://dev.twitter.com/cards/getting-started",
645
+ "addition_date": "2014/12/30"
646
+ },
647
+ {
648
+ "pattern": "OrangeBot",
649
+ "instances": ["Mozilla/5.0 (compatible; OrangeBot/2.0; support.orangebot@orange.com"],
650
+ "addition_date": "2015/01/12"
651
+ },
652
+ {
653
+ "pattern": "memorybot",
654
+ "url": "http://mignify.com/bot.htm",
655
+ "instances": ["Mozilla/5.0 (compatible; memorybot/1.21.14 +http://mignify.com/bot.html)"],
656
+ "addition_date": "2015/02/01"
657
+ },
658
+ {
659
+ "pattern": "AdvBot",
660
+ "url": "http://advbot.net/bot.html",
661
+ "instances": ["Mozilla/5.0 (compatible; AdvBot/2.0; +http://advbot.net/bot.html)"],
662
+ "addition_date": "2015/02/01"
663
+ },
664
+ {
665
+ "pattern": "MegaIndex",
666
+ "url": "https://www.megaindex.ru/?tab=linkAnalyze",
667
+ "instances": ["Mozilla/5.0 (compatible; MegaIndex.ru/2.0; +https://www.megaindex.ru/?tab=linkAnalyze)"],
668
+ "addition_date": "2015/03/28"
669
+ },
670
+ {
671
+ "pattern": "SemanticScholarBot",
672
+ "url": "http://s2.allenai.org/bot.html",
673
+ "instances": ["SemanticScholarBot/1.0 (+http://s2.allenai.org/bot.html)"],
674
+ "addition_date": "2015/03/28"
675
+ },
676
+ {
677
+ "pattern": "ltx71",
678
+ "url": "http://ltx71.com/",
679
+ "instances": ["ltx71 - (http://ltx71.com/)"],
680
+ "addition_date": "2015/04/04"
681
+ },
682
+ {
683
+ "pattern": "nerdybot",
684
+ "url": "http://nerdybot.com/",
685
+ "instances": ["nerdybot"],
686
+ "addition_date": "2015/04/05"
687
+ },
688
+ {
689
+ "pattern": "xovibot",
690
+ "url": "http://www.xovibot.net/",
691
+ "instances": ["Mozilla/5.0 (compatible; XoviBot/2.0; +http://www.xovibot.net/)"],
692
+ "addition_date": "2015/04/05"
693
+ },
694
+ {
695
+ "pattern": "BUbiNG",
696
+ "url": "http://law.di.unimi.it/BUbiNG.html",
697
+ "instances": ["BUbiNG (+http://law.di.unimi.it/BUbiNG.html)"],
698
+ "addition_date": "2015/04/06"
699
+ },
700
+ {
701
+ "pattern": "Qwantify",
702
+ "url": "https://www.qwant.com/",
703
+ "instances": ["Mozilla/5.0 (compatible; Qwantify/2.0n; +https://www.qwant.com/)/*"],
704
+ "addition_date": "2015/04/06"
705
+ },
706
+ {
707
+ "pattern": "archive.org_bot",
708
+ "url": "http://www.archive.org/details/archive.org_bot",
709
+ "instances": ["Mozilla/5.0 (compatible; archive.org_bot +http://www.archive.org/details/archive.org_bot)"],
710
+ "addition_date": "2015/04/14"
711
+ },
712
+ {
713
+ "pattern": "Applebot",
714
+ "url": "http://www.apple.com/go/applebot",
715
+ "addition_date": "2015/04/15"
716
+ },
717
+ {
718
+ "pattern": "TweetmemeBot",
719
+ "url": "http://datasift.com/bot.html",
720
+ "instances": ["Mozilla/5.0 (TweetmemeBot/4.0; +http://datasift.com/bot.html) Gecko/20100101 Firefox/31.0"],
721
+ "addition_date": "2015/04/15"
722
+ },
723
+ {
724
+ "pattern": "crawler4j",
725
+ "url": "https://github.com/yasserg/crawler4j",
726
+ "instances": ["crawler4j (http://code.google.com/p/crawler4j/)"],
727
+ "addition_date": "2015/05/07"
728
+ },
729
+ {
730
+ "pattern": "findxbot",
731
+ "url": "http://www.findxbot.com",
732
+ "instances": ["Mozilla/5.0 (compatible; Findxbot/1.0; +http://www.findxbot.com)"],
733
+ "addition_date": "2015/05/07"
734
+ },
735
+ {
736
+ "pattern": "SemrushBot",
737
+ "url": "http://www.semrush.com/bot.html",
738
+ "instances": ["Mozilla/5.0 (compatible; SemrushBot/0.98~bl; +http://www.semrush.com/bot.html)"],
739
+ "addition_date": "2015/05/26"
740
+ },
741
+ {
742
+ "pattern": "yoozBot",
743
+ "url": "http://yooz.ir",
744
+ "instances": ["Mozilla/5.0 (compatible; yoozBot-2.2; http://yooz.ir; info@yooz.ir)"],
745
+ "addition_date": "2015/05/26"
746
+ },
747
+ {
748
+ "pattern": "lipperhey",
749
+ "url": "http://www.lipperhey.com/",
750
+ "instances": ["Mozilla/5.0 (compatible; Lipperhey Link Explorer; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey SEO Service; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey Site Explorer; http://www.lipperhey.com/)", "Mozilla/5.0 (compatible; Lipperhey-Kaus-Australis/5.0; +https://www.lipperhey.com/en/about/)"],
751
+ "addition_date": "2015/08/26"
752
+ },
753
+ {
754
+ "pattern": "y!j-asr",
755
+ "url": "http://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/",
756
+ "instances": ["Y!J-ASR/0.1 crawler (http://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/)"],
757
+ "addition_date": "2015/05/26"
758
+ },
759
+ {
760
+ "pattern": "Domain Re-Animator Bot",
761
+ "url": "http://domainreanimator.com",
762
+ "instances": ["Domain Re-Animator Bot (http://domainreanimator.com) - support@domainreanimator.com"],
763
+ "addition_date": "2015/04/14"
764
+ },
765
+ {
766
+ "pattern": "AddThis",
767
+ "url": "https://www.addthis.com",
768
+ "instances": ["AddThis.com robot tech.support@clearspring.com"],
769
+ "addition_date": "2015/06/02"
770
+ },
771
+ {
772
+ "pattern": "Screaming Frog SEO Spider",
773
+ "url": "http://www.screamingfrog.co.uk/seo-spider",
774
+ "instances": ["Screaming Frog SEO Spider/5.1"],
775
+ "addition_date": "2016/01/08"
776
+ },
777
+ {
778
+ "pattern": "MetaURI",
779
+ "url": "http://www.useragentstring.com/MetaURI_id_17683.php",
780
+ "instances": ["MetaURI API/2.0 +metauri.com"],
781
+ "addition_date": "2016/01/02"
782
+ },
783
+ {
784
+ "pattern": "Scrapy",
785
+ "url": "http://scrapy.org/",
786
+ "instances": ["Scrapy/1.0.3 (+http://scrapy.org)"],
787
+ "addition_date": "2016/01/02"
788
+ },
789
+ {
790
+ "pattern": "LivelapBot",
791
+ "url": "http://site.livelap.com/crawler",
792
+ "instances": ["LivelapBot/0.2 (http://site.livelap.com/crawler)"],
793
+ "addition_date": "2016/01/02"
794
+ },
795
+ {
796
+ "pattern": "OpenHoseBot",
797
+ "url": "http://www.openhose.org/bot.html",
798
+ "instances": ["Mozilla/5.0 (compatible; OpenHoseBot/2.1; +http://www.openhose.org/bot.html)"],
799
+ "addition_date": "2016/01/02"
800
+ },
801
+ {
802
+ "pattern": "CapsuleChecker",
803
+ "url": "http://www.capsulink.com/about",
804
+ "instances": ["CapsuleChecker (http://www.capsulink.com/)"],
805
+ "addition_date": "2016/01/02"
806
+ },
807
+ {
808
+ "pattern": "collection@infegy.com",
809
+ "url": "http://infegy.com/",
810
+ "instances": ["Mozilla/5.0 (compatible) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 collection@infegy.com"],
811
+ "addition_date": "2016/01/03"
812
+ },
813
+ {
814
+ "pattern": "IstellaBot",
815
+ "url": "http://www.tiscali.it/",
816
+ "instances": ["Mozilla/5.0 (compatible; IstellaBot/1.23.15 +http://www.tiscali.it/)"],
817
+ "addition_date": "2016/01/09"
818
+ },
819
+ {
820
+ "pattern": "DeuSu\\/",
821
+ "addition_date": "2016/01/23",
822
+ "url": "https://deusu.de/robot.html"
823
+ },
824
+ {
825
+ "pattern": "betaBot",
826
+ "addition_date": "2016/01/23"
827
+ },
828
+ {
829
+ "pattern": "Cliqzbot\\/",
830
+ "addition_date": "2016/01/23",
831
+ "url": "http://cliqz.com/company/cliqzbot"
832
+ },
833
+ {
834
+ "pattern": "MojeekBot\\/",
835
+ "addition_date": "2016/01/23",
836
+ "url": "https://www.mojeek.com/bot.html"
837
+ },
838
+ {
839
+ "pattern": "netEstate NE Crawler",
840
+ "addition_date": "2016/01/23",
841
+ "url": "http://www.website-datenbank.de/"
842
+ },
843
+ {
844
+ "pattern": "SafeSearch microdata crawler",
845
+ "addition_date": "2016/01/23",
846
+ "url": "https://safesearch.avira.com"
847
+ },
848
+ {
849
+ "pattern": "Gluten Free Crawler\\/",
850
+ "addition_date": "2016/01/23",
851
+ "url": "http://glutenfreepleasure.com/"
852
+ },
853
+ {
854
+ "pattern": "Sonic",
855
+ "addition_date": "2016/02/08",
856
+ "url": "http://www.yama.info.waseda.ac.jp/~crawler/info.html"
857
+ },
858
+ {
859
+ "pattern": "Sysomos",
860
+ "addition_date": "2016/02/08",
861
+ "url": "http://www.sysomos.com"
862
+ },
863
+ {
864
+ "pattern": "Trove",
865
+ "addition_date": "2016/02/08",
866
+ "url": "http://www.trove.com"
867
+ },
868
+ {
869
+ "pattern": "deadlinkchecker",
870
+ "addition_date": "2016/02/08",
871
+ "url": "http://www.deadlinkchecker.com"
872
+ },
873
+ {
874
+ "pattern": "Slack-ImgProxy",
875
+ "addition_date": "2016/04/25",
876
+ "url": "https://api.slack.com/robots"
877
+ },
878
+ {
879
+ "pattern": "Embedly",
880
+ "addition_date": "2016/04/25",
881
+ "url": "http://support.embed.ly"
882
+ },
883
+ {
884
+ "pattern": "RankActiveLinkBot",
885
+ "addition_date": "2016/06/20",
886
+ "url": "https://rankactive.com/resources/rankactive-linkbot"
887
+ },
888
+ {
889
+ "pattern": "iskanie",
890
+ "addition_date": "2016/09/02",
891
+ "url": "http://www.iskanie.com",
892
+ "instances": ["iskanie (+http://www.iskanie.com)"]
893
+ },
894
+ {
895
+ "pattern": "SafeDNSBot",
896
+ "addition_date": "2016/09/10",
897
+ "url": "https://www.safedns.com/searchbot",
898
+ "instances": ["SafeDNSBot (https://www.safedns.com/searchbot)"]
899
+ },
900
+ {
901
+ "pattern": "SkypeUriPreview",
902
+ "addition_date": "2016/10/10",
903
+ "instances": ["Mozilla/5.0 (Windows NT 6.1; WOW64) SkypeUriPreview Preview/0.5"]
904
+ },
905
+ {
906
+ "pattern": "Veoozbot",
907
+ "addition_date": "2016/11/03",
908
+ "url": "http://www.veooz.com/veoozbot.html"
909
+ },
910
+ {
911
+ "pattern": "Slackbot",
912
+ "addition_date": "2016/11/03",
913
+ "url": "https://api.slack.com/robots"
914
+ },
915
+ {
916
+ "pattern": "redditbot",
917
+ "addition_date": "2016/11/03",
918
+ "url": "http://www.reddit.com/feedback"
919
+ },
920
+ {
921
+ "pattern": "datagnionbot",
922
+ "addition_date": "2016/11/03",
923
+ "url": "http://www.datagnion.com/bot.html"
924
+ },
925
+ {
926
+ "pattern": "Google-Adwords-Instant",
927
+ "addition_date": "2016/11/03",
928
+ "url": "http://www.google.com/adsbot.html"
929
+ },
930
+ {
931
+ "pattern": "adbeat_bot",
932
+ "addition_date": "2016/11/04"
933
+ },
934
+ {
935
+ "pattern": "Scanbot",
936
+ "addition_date": "2016/11/04"
937
+ },
938
+ {
939
+ "pattern": "WhatsApp",
940
+ "addition_date": "2016/11/15",
941
+ "url": "https://www.whatsapp.com/"
942
+ },
943
+ {
944
+ "pattern": "contxbot",
945
+ "addition_date": "2017/02/25",
946
+ "instances": ["Mozilla/5.0 (compatible;contxbot/1.0)"]
947
+ },
948
+ {
949
+ "pattern": "pinterest",
950
+ "addition_date": "2017/03/03",
951
+ "instances": ["Pinterest/0.2 (+http://www.pinterest.com/bot.html)"],
952
+ "url": "http://www.pinterest.com/bot.html"
953
+ },
954
+ {
955
+ "pattern": "electricmonk",
956
+ "addition_date": "2017/03/04",
957
+ "instances": ["Mozilla/5.0 (compatible; electricmonk/3.2.0 +https://www.duedil.com/our-crawler/)"],
958
+ "url": "https://www.duedil.com/our-crawler/"
959
+ },
960
+ {
961
+ "pattern": "GarlikCrawler",
962
+ "addition_date": "2017/03/18",
963
+ "instances": ["GarlikCrawler/1.2 (http://garlik.com/, crawler@garlik.com)"],
964
+ "url": "http://garlik.com/"
965
+ },
966
+ {
967
+ "pattern": "BingPreview\\/",
968
+ "addition_date": "2017/04/23",
969
+ "url": "https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0"
970
+ },
971
+ {
972
+ "pattern": "vebidoobot",
973
+ "addition_date": "2017/05/08",
974
+ "instances": ["Mozilla/5.0 (compatible; vebidoobot/1.0; +https://blog.vebidoo.de/vebidoobot/"],
975
+ "url": "https://blog.vebidoo.de/vebidoobot/"
976
+ }
977
+
978
+
979
+ ]
data/isbot.gemspec ADDED
@@ -0,0 +1,13 @@
1
# Gem specification for isbot: name, version, authorship and the list of
# files shipped in the package.
Gem::Specification.new do |s|
  s.name        = 'isbot'
  s.version     = '0.1.0'
  s.date        = '2017-05-16'
  s.summary     = 'detects bots/crawlers/spiders via the user agent.'
  s.description = 'A simple library for detecting bots/crawlers/spiders through user-agent strings.'
  s.authors     = ['Hentioe']
  # NOTE(review): the domain reads "gmai.com" -- possibly a typo for
  # "gmail.com"; left unchanged, confirm with the author.
  s.email       = 'meow.i5.br@gmai.com'
  # Glob relative to the directory holding this gemspec (not the caller's
  # working directory) and leave out directories and previously built
  # *.gem archives so they are not packaged into the next build.
  s.files       = Dir.chdir(File.expand_path(File.dirname(__FILE__))) do
    Dir['**/*'].reject { |path| File.directory?(path) || path.end_with?('.gem') }
  end
  s.homepage    = 'https://github.com/Hentioe/isbot'
  s.license     = 'MIT'
end
data/lib/Gemfile ADDED
@@ -0,0 +1 @@
1
+ source 'https://rubygems.org/'
data/lib/fetch.sh ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env bash
#
# Downloads the upstream crawler user-agent database.
#
# Usage: fetch.sh [OUTPUT_FILE]
#   OUTPUT_FILE  where to store the JSON (default: data.json in the
#                current directory, as before).
#
# The download goes to a temporary file first and is only moved into place
# when wget succeeds, so a failed download never truncates an existing
# database.
set -euo pipefail

url='https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/crawler-user-agents.json'
output="${1:-data.json}"
partial="${output}.part"

# Remove the partial file on any exit; after a successful mv this is a no-op.
trap 'rm -f "$partial"' EXIT

wget "$url" \
  -O "$partial"
mv "$partial" "$output"
data/lib/isbot.rb ADDED
@@ -0,0 +1,38 @@
1
+ require_relative 'parser'
2
+
3
# Builds and caches the regular expression used to recognise bot user agents.
#
# The cache lives in a module-level instance variable instead of process-wide
# globals, so it cannot collide with globals of the same name defined by the
# host application.
module IsBot
  # Regexp that never matches. Used when the pattern list is empty, because an
  # empty alternation would otherwise match every string.
  MATCH_NOTHING = /(?!)/

  @regex = nil

  # Returns the case-insensitive regexp matching any known bot pattern.
  # The patterns are loaded through IsBotParser.parse on the first call and
  # the compiled regexp is reused afterwards.
  #
  # @return [Regexp]
  def self.regex
    @regex ||= compile(IsBotParser.parse)
  end

  # Joins the pattern sources into one alternation. Each pattern keeps its own
  # capture group, so the resulting regexp source is "(p1)|(p2)|...".
  #
  # @param patterns [Array<String>] regexp source fragments
  # @return [Regexp]
  def self.compile(patterns)
    return MATCH_NOTHING if patterns.empty?

    Regexp.new(patterns.map { |pattern| "(#{pattern})" }.join('|'), Regexp::IGNORECASE)
  end
  private_class_method :compile
end
22
+
23
# Tells whether a user-agent string belongs to a known bot/crawler/spider.
#
# The value is converted with +to_s+ and matched on a stripped copy, so the
# caller's object is never modified and frozen strings are accepted.
#
# @param user_agent [#to_s, nil] the User-Agent header value
# @yieldparam match_data [MatchData] the match, yielded only when a bot
#   pattern matched
# @return [Boolean, nil] +false+ when +user_agent+ is nil or nothing matched;
#   +true+ when it matched and no block was given; +nil+ when it matched and
#   the block was called
def is_bot(user_agent, &block)
  return false if user_agent.nil?

  match_data = IsBot.regex.match(user_agent.to_s.strip)
  return false if match_data.nil?
  return true if block.nil?

  block.call(match_data)
  nil
end

# Convenience predicate so a user-agent string can be queried directly.
class String
  # @return [Boolean] whether this string matches a known bot pattern
  def is_bot?
    is_bot(self)
  end
end
data/lib/parser.rb ADDED
@@ -0,0 +1,22 @@
1
+ require 'json'
2
+ require 'pathname'
3
+
4
+
5
# Loads the bot user-agent patterns from the JSON database shipped with the gem.
module IsBotParser
  # Root directory of the gem: the parent of the directory containing this
  # file, resolved to an absolute path when the file is loaded.
  BASEDIR = Pathname.new(__FILE__).expand_path.dirname.parent

  # Patterns returned by the most recent call to +parse+.
  PATTERN_LIST = []

  # @return [Pathname] location of the bundled pattern database
  def self.data_file
    BASEDIR + 'data' + 'data.json'
  end

  # Reads the JSON database and returns the regexp source of every entry.
  #
  # Entries without a pattern are skipped (an empty pattern would match every
  # user agent) and duplicates are dropped. PATTERN_LIST is replaced rather
  # than appended to, so calling this repeatedly does not grow the list.
  #
  # @param path [String, Pathname] JSON file to read; defaults to the bundled
  #   database
  # @return [Array<String>] the PATTERN_LIST constant with its new contents
  def self.parse(path = data_file)
    entries = JSON.parse(File.read(path.to_s))
    patterns = entries.map { |entry| entry['pattern'].to_s }
    PATTERN_LIST.replace(patterns.reject(&:empty?).uniq)
  end
end
@@ -0,0 +1,22 @@
1
+ require 'test/unit'
2
+ require_relative '../lib/isbot'
3
+
4
# Exercises the three entry points: String#is_bot?, is_bot, and is_bot with a
# block.
class IsBotTest < Test::Unit::TestCase
  # Pairs of [user agent, text the bot regexp is expected to match in it].
  BOT_SAMPLES = [
    ['Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 'Googlebot/'],
    ['Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)', 'bingbot'],
    ['Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)', 'Baiduspider']
  ].freeze

  # A user agent that contains none of the bundled patterns.
  HUMAN_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'

  def test_is_bot
    BOT_SAMPLES.each { |ua, _expected| assert_true ua.is_bot? }
  end

  def test_is_not_bot
    assert_false HUMAN_USER_AGENT.is_bot?
    assert_false is_bot(nil)
  end

  def test_is_bot_with_block
    BOT_SAMPLES.each do |ua, expected|
      yielded = nil
      is_bot ua do |match_bot|
        yielded = match_bot.to_s
      end
      assert_equal expected, yielded
    end
  end
end
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: isbot
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Hentioe
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-16 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A simple library for detecting bots/crawlers/spiders through user-agent
14
+ strings.
15
+ email: meow.i5.br@gmai.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - data/data.json
22
+ - isbot.gemspec
23
+ - lib/Gemfile
24
+ - lib/fetch.sh
25
+ - lib/isbot.rb
26
+ - lib/parser.rb
27
+ - tests/isbot_test.rb
28
+ homepage: https://github.com/Hentioe/isbot
29
+ licenses:
30
+ - MIT
31
+ metadata: {}
32
+ post_install_message:
33
+ rdoc_options: []
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ requirements: []
47
+ rubyforge_project:
48
+ rubygems_version: 2.5.1
49
+ signing_key:
50
+ specification_version: 4
51
+ summary: detects bots/crawlers/spiders via the user agent.
52
+ test_files: []