crawler-user-agents 1.0.128 → 1.0.129
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/crawler-user-agents.json +24 -24
- package/package.json +1 -1
- package/validate.py +6 -0
package/crawler-user-agents.json
CHANGED
|
@@ -405,7 +405,7 @@
|
|
|
405
405
|
}
|
|
406
406
|
,
|
|
407
407
|
{
|
|
408
|
-
"pattern": "grub.org",
|
|
408
|
+
"pattern": "grub\\.org",
|
|
409
409
|
"instances": [
|
|
410
410
|
"Mozilla/4.0 (compatible; grub-client-0.3.0; Crawl your own stuff with http://grub.org)",
|
|
411
411
|
"Mozilla/4.0 (compatible; grub-client-1.0.4; Crawl your own stuff with http://grub.org)",
|
|
@@ -855,7 +855,7 @@
|
|
|
855
855
|
}
|
|
856
856
|
,
|
|
857
857
|
{
|
|
858
|
-
"pattern": "Mail.RU_Bot",
|
|
858
|
+
"pattern": "Mail\\.RU_Bot",
|
|
859
859
|
"addition_date": "2011/04/27",
|
|
860
860
|
"instances": [
|
|
861
861
|
"Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)",
|
|
@@ -914,7 +914,7 @@
|
|
|
914
914
|
}
|
|
915
915
|
,
|
|
916
916
|
{
|
|
917
|
-
"pattern": "europarchive.org",
|
|
917
|
+
"pattern": "europarchive\\.org",
|
|
918
918
|
"addition_date": "2011/06/21",
|
|
919
919
|
"url": "",
|
|
920
920
|
"instances": [
|
|
@@ -923,7 +923,7 @@
|
|
|
923
923
|
}
|
|
924
924
|
,
|
|
925
925
|
{
|
|
926
|
-
"pattern": "NerdByNature.Bot",
|
|
926
|
+
"pattern": "NerdByNature\\.Bot",
|
|
927
927
|
"addition_date": "2011/07/12",
|
|
928
928
|
"url": "http://www.nerdbynature.net/bot",
|
|
929
929
|
"instances": [
|
|
@@ -1299,7 +1299,7 @@
|
|
|
1299
1299
|
}
|
|
1300
1300
|
,
|
|
1301
1301
|
{
|
|
1302
|
-
"pattern": "web-archive-net.com.bot",
|
|
1302
|
+
"pattern": "web-archive-net\\.com\\.bot",
|
|
1303
1303
|
"instances": []
|
|
1304
1304
|
}
|
|
1305
1305
|
,
|
|
@@ -1359,13 +1359,13 @@
|
|
|
1359
1359
|
}
|
|
1360
1360
|
,
|
|
1361
1361
|
{
|
|
1362
|
-
"pattern": "ip-web-crawler.com",
|
|
1362
|
+
"pattern": "ip-web-crawler\\.com",
|
|
1363
1363
|
"addition_date": "2013/03/22",
|
|
1364
1364
|
"instances": []
|
|
1365
1365
|
}
|
|
1366
1366
|
,
|
|
1367
1367
|
{
|
|
1368
|
-
"pattern": "siteexplorer.info",
|
|
1368
|
+
"pattern": "siteexplorer\\.info",
|
|
1369
1369
|
"addition_date": "2013/05/01",
|
|
1370
1370
|
"instances": [
|
|
1371
1371
|
"Mozilla/5.0 (compatible; SiteExplorer/1.0b; +http://siteexplorer.info/)",
|
|
@@ -1493,7 +1493,7 @@
|
|
|
1493
1493
|
}
|
|
1494
1494
|
,
|
|
1495
1495
|
{
|
|
1496
|
-
"pattern": "g00g1e.net",
|
|
1496
|
+
"pattern": "g00g1e\\.net",
|
|
1497
1497
|
"addition_date": "2014/04/01",
|
|
1498
1498
|
"url": "http://www.g00g1e.net/",
|
|
1499
1499
|
"instances": []
|
|
@@ -1584,7 +1584,7 @@
|
|
|
1584
1584
|
}
|
|
1585
1585
|
,
|
|
1586
1586
|
{
|
|
1587
|
-
"pattern": "bnf.fr_bot",
|
|
1587
|
+
"pattern": "bnf\\.fr_bot",
|
|
1588
1588
|
"addition_date": "2014/11/18",
|
|
1589
1589
|
"url": "http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html",
|
|
1590
1590
|
"instances": [
|
|
@@ -1715,7 +1715,7 @@
|
|
|
1715
1715
|
}
|
|
1716
1716
|
,
|
|
1717
1717
|
{
|
|
1718
|
-
"pattern": "archive.org_bot",
|
|
1718
|
+
"pattern": "archive\\.org_bot",
|
|
1719
1719
|
"url": "http://www.archive.org/details/archive.org_bot",
|
|
1720
1720
|
"depends_on": ["heritrix"],
|
|
1721
1721
|
"instances": [
|
|
@@ -1895,7 +1895,7 @@
|
|
|
1895
1895
|
}
|
|
1896
1896
|
,
|
|
1897
1897
|
{
|
|
1898
|
-
"pattern": "collection@infegy.com",
|
|
1898
|
+
"pattern": "collection@infegy\\.com",
|
|
1899
1899
|
"url": "http://infegy.com/",
|
|
1900
1900
|
"instances": [
|
|
1901
1901
|
"Mozilla/5.0 (compatible) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 collection@infegy.com"
|
|
@@ -2179,7 +2179,7 @@
|
|
|
2179
2179
|
}
|
|
2180
2180
|
,
|
|
2181
2181
|
{
|
|
2182
|
-
"pattern": "pinterest.com\\/bot",
|
|
2182
|
+
"pattern": "pinterest\\.com\\/bot",
|
|
2183
2183
|
"addition_date": "2017/03/03",
|
|
2184
2184
|
"instances": [
|
|
2185
2185
|
"Mozilla/5.0 (compatible; Pinterestbot/1.0; +http://www.pinterest.com/bot.html)",
|
|
@@ -2805,7 +2805,7 @@
|
|
|
2805
2805
|
}
|
|
2806
2806
|
,
|
|
2807
2807
|
{
|
|
2808
|
-
"pattern": "Traackr.com",
|
|
2808
|
+
"pattern": "Traackr\\.com",
|
|
2809
2809
|
"addition_date": "2017/11/02",
|
|
2810
2810
|
"url": "Traackr.com",
|
|
2811
2811
|
"instances": [
|
|
@@ -2941,7 +2941,7 @@
|
|
|
2941
2941
|
}
|
|
2942
2942
|
,
|
|
2943
2943
|
{
|
|
2944
|
-
"pattern": "filterdb.iss.net\\/crawler",
|
|
2944
|
+
"pattern": "filterdb\\.iss\\.net\\/crawler",
|
|
2945
2945
|
"addition_date": "2018/03/16",
|
|
2946
2946
|
"instances": [
|
|
2947
2947
|
"Mozilla/5.0 (compatible; oBot/2.3.1; +http://filterdb.iss.net/crawler/)"
|
|
@@ -3210,7 +3210,7 @@
|
|
|
3210
3210
|
}
|
|
3211
3211
|
,
|
|
3212
3212
|
{
|
|
3213
|
-
"pattern": "Bot.AraTurka.com",
|
|
3213
|
+
"pattern": "Bot\\.AraTurka\\.com",
|
|
3214
3214
|
"addition_date": "2018/06/27",
|
|
3215
3215
|
"instances": [
|
|
3216
3216
|
"Bot.AraTurka.com/0.0.1"
|
|
@@ -3219,7 +3219,7 @@
|
|
|
3219
3219
|
}
|
|
3220
3220
|
,
|
|
3221
3221
|
{
|
|
3222
|
-
"pattern": "bot-pge.chlooe.com",
|
|
3222
|
+
"pattern": "bot-pge\\.chlooe\\.com",
|
|
3223
3223
|
"addition_date": "2018/06/27",
|
|
3224
3224
|
"instances": [
|
|
3225
3225
|
"bot-pge.chlooe.com/1.0.0 (+http://www.chlooe.com/)"
|
|
@@ -3397,7 +3397,7 @@
|
|
|
3397
3397
|
}
|
|
3398
3398
|
,
|
|
3399
3399
|
{
|
|
3400
|
-
"pattern": "Siteimprove.com",
|
|
3400
|
+
"pattern": "Siteimprove\\.com",
|
|
3401
3401
|
"addition_date": "2018/06/22",
|
|
3402
3402
|
"instances": [
|
|
3403
3403
|
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) LinkCheck by Siteimprove.com",
|
|
@@ -3506,7 +3506,7 @@
|
|
|
3506
3506
|
}
|
|
3507
3507
|
,
|
|
3508
3508
|
{
|
|
3509
|
-
"pattern": "PR-CY.RU",
|
|
3509
|
+
"pattern": "PR-CY\\.RU",
|
|
3510
3510
|
"addition_date": "2018/08/30",
|
|
3511
3511
|
"instances": [
|
|
3512
3512
|
"Mozilla/5.0 (compatible; PR-CY.RU; + https://a.pr-cy.ru)"
|
|
@@ -3827,7 +3827,7 @@
|
|
|
3827
3827
|
]
|
|
3828
3828
|
},
|
|
3829
3829
|
{
|
|
3830
|
-
"pattern": "Dataprovider.com",
|
|
3830
|
+
"pattern": "Dataprovider\\.com",
|
|
3831
3831
|
"addition_date": "2018/11/24",
|
|
3832
3832
|
"instances": [
|
|
3833
3833
|
"Mozilla/5.0 (compatible; Dataprovider.com)"
|
|
@@ -3843,7 +3843,7 @@
|
|
|
3843
3843
|
"url": "http://www.grouphigh.com/"
|
|
3844
3844
|
},
|
|
3845
3845
|
{
|
|
3846
|
-
"pattern": "theoldreader.com",
|
|
3846
|
+
"pattern": "theoldreader\\.com",
|
|
3847
3847
|
"addition_date": "2018/12/02",
|
|
3848
3848
|
"instances": [
|
|
3849
3849
|
"Mozilla/5.0 (compatible; theoldreader.com)"
|
|
@@ -3879,7 +3879,7 @@
|
|
|
3879
3879
|
}
|
|
3880
3880
|
,
|
|
3881
3881
|
{
|
|
3882
|
-
"pattern": "2ip.ru",
|
|
3882
|
+
"pattern": "2ip\\.ru",
|
|
3883
3883
|
"addition_date": "2019/02/12",
|
|
3884
3884
|
"instances": [
|
|
3885
3885
|
"2ip.ru CMS Detector (https://2ip.ru/cms/)"
|
|
@@ -5000,7 +5000,7 @@
|
|
|
5000
5000
|
"url": "https://metrics-tools.de/robot.html"
|
|
5001
5001
|
},
|
|
5002
5002
|
{
|
|
5003
|
-
"pattern": "hyscore.io",
|
|
5003
|
+
"pattern": "hyscore\\.io",
|
|
5004
5004
|
"addition_date": "2023/09/08",
|
|
5005
5005
|
"instances": [
|
|
5006
5006
|
"Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1. 4 (compatible; HyScore/1.0; +https://hyscore.io/crawler/)"
|
|
@@ -5104,7 +5104,7 @@
|
|
|
5104
5104
|
"url": "https://torus.company/bot.html"
|
|
5105
5105
|
},
|
|
5106
5106
|
{
|
|
5107
|
-
"pattern": "sempi.tech",
|
|
5107
|
+
"pattern": "sempi\\.tech",
|
|
5108
5108
|
"addition_date": "2023/09/08",
|
|
5109
5109
|
"instances": [
|
|
5110
5110
|
"Mozilla/5.0 (compatible; Semanticbot/1.0; +http://sempi.tech/bot.html)"
|
|
@@ -5160,7 +5160,7 @@
|
|
|
5160
5160
|
"url": "https://opengraphcheck.com"
|
|
5161
5161
|
},
|
|
5162
5162
|
{
|
|
5163
|
-
"pattern": "developers.google.com\\/\\+\\/web\\/snippet",
|
|
5163
|
+
"pattern": "developers\\.google\\.com\\/\\+\\/web\\/snippet",
|
|
5164
5164
|
"addition_date": "2023/09/08",
|
|
5165
5165
|
"instances": [
|
|
5166
5166
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)",
|
package/package.json
CHANGED
package/validate.py
CHANGED
|
@@ -55,6 +55,12 @@ def main():
|
|
|
55
55
|
if re.search('[^\\\\]/', pattern):
|
|
56
56
|
raise ValueError('Pattern {!r} has an unescaped slash character'.format(pattern))
|
|
57
57
|
|
|
58
|
+
# check that no pattern contains unescaped dot .
|
|
59
|
+
for entry in json_data:
|
|
60
|
+
pattern = entry['pattern']
|
|
61
|
+
if re.search('[^\\\\]\\.', pattern):
|
|
62
|
+
raise ValueError('Pattern {!r} has an unescaped dot character'.format(pattern))
|
|
63
|
+
|
|
58
64
|
# check that we match the given instances
|
|
59
65
|
num_instances = 0
|
|
60
66
|
for entry in json_data:
|