crawler-user-agents 1.0.146 → 1.0.148

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,7 +20,8 @@ jobs:
       with:
         node-version: 20
     - run: node format.js --check
-    - run: pip3 install -r requirements.txt
+    - run: python3 -mpip install -U pip
+    - run: python3 -mpip install -e .[dev]
    - run: py.test -vv
    - run: python3 validate.py
    - run: php validate.php
package/README.md CHANGED
@@ -41,10 +41,24 @@ Then:
 
 ```python
 import crawleruseragents
-if crawleruseragents.is_crawler("googlebot/"):
+if crawleruseragents.is_crawler("Googlebot/"):
    # do something
 ```
 
+or:
+
+```python
+import crawleruseragents
+indices = crawleruseragents.matching_crawlers("bingbot/2.0")
+print("crawlers' indices:", indices)
+print(
+    "crawler's URL:",
+    crawleruseragents.CRAWLER_USER_AGENTS_DATA[indices[0]]["url"]
+)
+```
+
+Note that `matching_crawlers` is much slower than `is_crawler`, if the given User-Agent does indeed match any crawlers.
+
 ### Go
 
 Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),
@@ -70,7 +84,7 @@ func main() {
 
    indices := agents.MatchingCrawlers(userAgent)
    fmt.Println("crawlers' indices:", indices)
-   fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL)
+   fmt.Println("crawler's URL:", agents.Crawlers[indices[0]].URL)
 }
 ```
 
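The performance note added above can be illustrated with a rough micro-benchmark. This is only a sketch: the `bingbot/2.0` string is taken from the README example, the call count is arbitrary, and absolute timings will vary by machine.

```python
# Sketch of the README's performance note: on a matching User-Agent,
# matching_crawlers() does strictly more work than is_crawler().
import timeit

from crawleruseragents import is_crawler, matching_crawlers

ua = "bingbot/2.0"  # matching User-Agent, as in the README example

fast = timeit.timeit(lambda: is_crawler(ua), number=1000)
slow = timeit.timeit(lambda: matching_crawlers(ua), number=1000)
print(f"is_crawler:        {fast:.3f} s / 1000 calls")
print(f"matching_crawlers: {slow:.3f} s / 1000 calls")
```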
package/__init__.py CHANGED
@@ -11,15 +11,24 @@ def load_json():
 
 
 CRAWLER_USER_AGENTS_DATA = load_json()
+CRAWLER_USER_AGENTS_REGEXP = re.compile(
+    "|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA)
+)
 
 
 def is_crawler(user_agent: str) -> bool:
-    for crawler_user_agent in CRAWLER_USER_AGENTS_DATA:
-        if re.search(crawler_user_agent["pattern"], user_agent, re.IGNORECASE):
-            return True
-    return False
-
-
-def is_crawler2(s):
-    regexp = re.compile("|".join([i["pattern"] for i in CRAWLER_USER_AGENTS_DATA]))
-    return regexp.search(s) is not None
+    """Return True if the given User-Agent matches a known crawler."""
+    return bool(CRAWLER_USER_AGENTS_REGEXP.search(user_agent))
+
+
+def matching_crawlers(user_agent: str) -> list[int]:
+    """
+    Return a list of the indices in CRAWLER_USER_AGENTS_DATA of any crawlers
+    matching the given User-Agent.
+    """
+    result = []
+    if is_crawler(user_agent):
+        for num, crawler_user_agent in enumerate(CRAWLER_USER_AGENTS_DATA):
+            if re.search(crawler_user_agent["pattern"], user_agent):
+                result.append(num)
+    return result
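One behavioural consequence of this rewrite is that matching is now case-sensitive: the old loop passed `re.IGNORECASE`, while the precompiled `CRAWLER_USER_AGENTS_REGEXP` and the per-pattern checks in `matching_crawlers` do not. The updated README example (`Googlebot/` instead of `googlebot/`) and the new `test_case` test reflect this. A minimal sketch of the resulting behaviour, reusing the User-Agent strings from the new tests:

```python
# Minimal sketch of the 1.0.148 behaviour; strings are taken from test_harness.py.
import crawleruseragents

print(crawleruseragents.is_crawler("test Googlebot/2.0 test"))  # True
print(crawleruseragents.is_crawler("test googlebot/2.0 test"))  # False: matching is case-sensitive

# matching_crawlers() returns indices into CRAWLER_USER_AGENTS_DATA, so each
# hit can be resolved back to its pattern and (optional) documentation URL.
for idx in crawleruseragents.matching_crawlers("test Googlebot/2.0 test"):
    entry = crawleruseragents.CRAWLER_USER_AGENTS_DATA[idx]
    print(idx, entry["pattern"], entry.get("url"))
```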
package/crawler-user-agents.json CHANGED
@@ -757,7 +757,7 @@
   },
   {
     "pattern": "Adidxbot",
-    "url": "http://onlinehelp.microsoft.com/en-us/bing/hh204496.aspx",
+    "url": "https://www.bing.com/webmasters/help/which-crawlers-does-bing-use-8c184ec0",
     "instances": []
   },
   {
@@ -848,9 +848,12 @@
     ]
   },
   {
-    "pattern": "sistrix crawler",
+    "pattern": "(sistrix|SISTRIX) [cC]rawler",
     "addition_date": "2011/08/02",
-    "instances": []
+    "url": "https://www.sistrix.com/tutorials/crawling-errors-in-the-optimizer/",
+    "instances": [
+      "Mozilla/5.0 (compatible; SISTRIX Crawler; http://crawler.sistrix.net/)"
+    ]
   },
   {
     "pattern": "Ahrefs(Bot|SiteAudit)",
@@ -1080,6 +1083,7 @@
   {
     "pattern": "lssbot",
     "addition_date": "2012/05/15",
+    "url": "https://www.lssbot.com/",
     "instances": []
   },
   {
@@ -1178,6 +1182,7 @@
   {
     "pattern": "backlinkcrawler",
     "addition_date": "2013/01/04",
+    "url": "http://www.backlinktest.com/crawler.html",
     "instances": []
   },
   {
@@ -2274,6 +2279,7 @@
   {
     "pattern": "LinkArchiver",
     "addition_date": "2017/09/24",
+    "url": "https://github.com/thisisparker/linkarchiver",
     "instances": [
       "@LinkArchiver twitter bot"
     ]
@@ -2306,6 +2312,7 @@
   {
     "pattern": "dcrawl",
     "addition_date": "2017/09/22",
+    "url": "https://github.com/kgretzky/dcrawl",
     "instances": [
       "dcrawl/1.0"
     ]
@@ -2454,6 +2461,7 @@
   {
     "pattern": "AHC\\/",
     "addition_date": "2017/11/02",
+    "url": "https://github.com/AsyncHttpClient/async-http-client",
     "instances": [
       "AHC/2.0"
     ]
@@ -2525,7 +2533,7 @@
   {
     "pattern": "Traackr\\.com",
     "addition_date": "2017/11/02",
-    "url": "Traackr.com",
+    "url": "https://www.traackr.com/",
     "instances": [
       "Traackr.com"
     ]
@@ -3146,7 +3154,7 @@
     "instances": [
       "Mozilla/5.0 zgrab/0.x"
     ],
-    "url": "https://zmap.io/"
+    "url": "https://github.com/zmap/zgrab2"
   },
   {
     "pattern": "PR-CY\\.RU",
@@ -3270,6 +3278,7 @@
   {
     "pattern": "VelenPublicWebCrawler",
     "addition_date": "2018/10/09",
+    "url": "https://velen.io/",
     "instances": [
       "VelenPublicWebCrawler (velen.io)"
     ]
@@ -3932,7 +3941,7 @@
     "instances": [
       "SentiBot www.sentibot.eu (compatible with Googlebot)"
     ],
-    "url": "https://www.sentibot.eu"
+    "url": "https://sites.google.com/senti1.com/sentibot-eu/home"
   },
   {
     "pattern": "Domains Project\\/",
@@ -4018,7 +4027,7 @@
     "instances": [
       "rssbot/1.4.3 (+https://t.me/RustRssBot)"
     ],
-    "url": "https://t.me/RustRssBot"
+    "url": "https://github.com/iovxw/rssbot"
   },
   {
     "pattern": "startmebot\\/",
@@ -4082,7 +4091,7 @@
       "Mozilla/5.0 (compatible; RidderBot/1.0; bot@ridder.co)",
       "Mozilla/5.0 (compatible; RidderBot/1.0; bot@ridder.co) (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) Mobile/12H321"
     ],
-    "url": "http://brandonmedia.net"
+    "url": "https://ridder.co/"
   },
   {
     "pattern": "Taboolabot",
@@ -4206,8 +4215,7 @@
     "addition_date": "2022/04/26",
     "instances": [
       "Mozilla/5.0 (compatible; Go-http-client/1.1; +centurybot9@gmail.com)"
-    ],
-    "url": "unknown"
+    ]
   },
   {
     "pattern": "Viber",
@@ -4220,6 +4228,7 @@
   {
     "pattern": "e\\.ventures Investment Crawler",
     "addition_date": "2021/06/05",
+    "url": "https://www.eventures.vc/",
     "instances": [
       "e.ventures Investment Crawler (eventures.vc)"
     ]
@@ -4227,6 +4236,7 @@
   {
     "pattern": "evc-batch",
     "addition_date": "2021/06/07",
+    "url": "https://www.eventures.vc/",
     "instances": [
       "Mozilla/5.0 (compatible; evc-batch/2.0)"
     ]
@@ -4837,7 +4847,7 @@
     "instances": [
       "Mozilla/5.0 (compatible; ImagesiftBot; +imagesift.com)"
     ],
-    "url": "https://imagesift.com"
+    "url": "https://imagesift.com/about"
   },
   {
     "pattern": "Expanse",
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "crawler-user-agents",
-  "version": "1.0.146",
+  "version": "1.0.148",
   "main": "crawler-user-agents.json",
   "typings": "./index.d.ts",
   "author": "Martin Monperrus <martin.monperrus@gnieh.org>",
package/pyproject.toml CHANGED
@@ -8,6 +8,22 @@ authors = [
 
 readme = "README.md"
 
+[project.optional-dependencies]
+dev = [
+    "attrs==23.2.0",
+    "iniconfig==2.0.0",
+    "jsonschema==4.22.0",
+    "jsonschema-specifications==2023.12.1",
+    "packaging==24.0",
+    "pluggy==1.5.0",
+    "pytest==8.2.0",
+    "referencing==0.35.0",
+    "rpds-py==0.18.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/monperrus/crawler-user-agents"
+
 [tool.setuptools]
 package-dir = {"crawleruseragents" = "."}
 
package/test_harness.py ADDED
@@ -0,0 +1,39 @@
+"""
+Simple tests for python harness
+
+Usage:
+$ pytest test_harness.py
+
+"""
+from crawleruseragents import is_crawler, matching_crawlers
+
+
+def test_match():
+    assert is_crawler("test Googlebot/2.0 test") is True
+
+
+def test_nomatch():
+    assert is_crawler("!!!!!!!!!!!!") is False
+
+
+def test_case():
+    assert is_crawler("test googlebot/2.0 test") is False
+
+
+def test_matching_crawlers_match():
+    result = matching_crawlers("test Googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) > 0
+    assert all(isinstance(val, int) for val in result)
+
+
+def test_matching_crawlers_nomatch():
+    result = matching_crawlers("!!!!!!!!!!!!")
+    assert isinstance(result, list)
+    assert len(result) == 0
+
+
+def test_matching_crawlers_case():
+    result = matching_crawlers("test googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) == 0
package/requirements.txt DELETED
@@ -1,9 +0,0 @@
-attrs==23.2.0
-iniconfig==2.0.0
-jsonschema==4.22.0
-jsonschema-specifications==2023.12.1
-packaging==24.0
-pluggy==1.5.0
-pytest==8.2.0
-referencing==0.35.0
-rpds-py==0.18.0