crawler-user-agents 1.0.147 → 1.0.149

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,7 +20,8 @@ jobs:
  with:
    node-version: 20
  - run: node format.js --check
- - run: pip3 install -r requirements.txt
+ - run: python3 -mpip install -U pip
+ - run: python3 -mpip install -e .[dev]
  - run: py.test -vv
  - run: python3 validate.py
  - run: php validate.php
package/README.md CHANGED
@@ -6,7 +6,9 @@ This repository contains a list of of HTTP user-agents used by robots, crawlers,
  * Go package: <https://pkg.go.dev/github.com/monperrus/crawler-user-agents>
  * PyPi package: <https://pypi.org/project/crawler-user-agents/>
 
- Each `pattern` is a regular expression. It should work out-of-the-box wih your favorite regex library:
+ Each `pattern` is a regular expression. It should work out-of-the-box wih your favorite regex library.
+
+ If you use this project in a commercial product, [please sponsor it](https://github.com/sponsors/monperrus).
 
  ## Install
 
@@ -41,10 +43,24 @@ Then:
 
  ```python
  import crawleruseragents
- if crawleruseragents.is_crawler("googlebot/"):
+ if crawleruseragents.is_crawler("Googlebot/"):
      # do something
  ```
 
+ or:
+
+ ```python
+ import crawleruseragents
+ indices = crawleruseragents.matching_crawlers("bingbot/2.0")
+ print("crawlers' indices:", indices)
+ print(
+     "crawler's URL:",
+     crawleruseragents.CRAWLER_USER_AGENTS_DATA[indices[0]]["url"]
+ )
+ ```
+
+ Note that `matching_crawlers` is much slower than `is_crawler`, if the given User-Agent does indeed match any crawlers.
+
  ### Go
 
  Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),
@@ -70,7 +86,7 @@ func main() {
 
  	indices := agents.MatchingCrawlers(userAgent)
  	fmt.Println("crawlers' indices:", indices)
- 	fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL)
+ 	fmt.Println("crawler's URL:", agents.Crawlers[indices[0]].URL)
  }
  ```
 
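The README change above emphasizes that each `pattern` in `crawler-user-agents.json` is a plain regular expression usable with any regex library. As a rough sketch (not part of the package), this is how the raw JSON list could be consumed directly with Python's `re` module; the file name and the `pattern`/`url` keys come from the package metadata in this diff, while the file path is assumed:

```python
import json
import re

# Path is assumed; point it at the crawler-user-agents.json shipped with the package.
with open("crawler-user-agents.json", encoding="utf-8") as f:
    crawlers = json.load(f)

user_agent = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"

# Each entry's "pattern" field is itself a regular expression.
matches = [entry for entry in crawlers if re.search(entry["pattern"], user_agent)]
for entry in matches:
    print(entry["pattern"], "->", entry.get("url"))
```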
package/__init__.py CHANGED
@@ -11,15 +11,24 @@ def load_json():
 
 
  CRAWLER_USER_AGENTS_DATA = load_json()
+ CRAWLER_USER_AGENTS_REGEXP = re.compile(
+     "|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA)
+ )
 
 
  def is_crawler(user_agent: str) -> bool:
-     for crawler_user_agent in CRAWLER_USER_AGENTS_DATA:
-         if re.search(crawler_user_agent["pattern"], user_agent, re.IGNORECASE):
-             return True
-     return False
-
-
- def is_crawler2(s):
-     regexp = re.compile("|".join([i["pattern"] for i in CRAWLER_USER_AGENTS_DATA]))
-     return regexp.search(s) is not None
+     """Return True if the given User-Agent matches a known crawler."""
+     return bool(CRAWLER_USER_AGENTS_REGEXP.search(user_agent))
+
+
+ def matching_crawlers(user_agent: str) -> list[int]:
+     """
+     Return a list of the indices in CRAWLER_USER_AGENTS_DATA of any crawlers
+     matching the given User-Agent.
+     """
+     result = []
+     if is_crawler(user_agent):
+         for num, crawler_user_agent in enumerate(CRAWLER_USER_AGENTS_DATA):
+             if re.search(crawler_user_agent["pattern"], user_agent):
+                 result.append(num)
+     return result
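The rewrite of `__init__.py` above replaces the per-pattern loop (which used `re.IGNORECASE`) with a single alternation compiled once at import time, so matching is now case-sensitive; this is also why the README example changed from `googlebot/` to `Googlebot/`. A minimal, self-contained sketch of the same technique on an illustrative hard-coded pattern list (these names are not the package's API):

```python
import re

# Illustrative patterns; the real package builds this list from crawler-user-agents.json.
PATTERNS = ["Googlebot", "bingbot", "DuckDuckBot"]

# One combined, precompiled alternation, as in the new __init__.py.
# No re.IGNORECASE, so matching is case-sensitive.
COMBINED = re.compile("|".join(PATTERNS))


def is_crawler(user_agent: str) -> bool:
    return bool(COMBINED.search(user_agent))


def matching_crawlers(user_agent: str) -> list[int]:
    # Fast path first; only scan pattern by pattern when something matched.
    if not is_crawler(user_agent):
        return []
    return [i for i, p in enumerate(PATTERNS) if re.search(p, user_agent)]


print(is_crawler("test Googlebot/2.0 test"))   # True
print(is_crawler("test googlebot/2.0 test"))   # False: case-sensitive
print(matching_crawlers("bingbot/2.0"))        # [1]
```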
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "crawler-user-agents",
-   "version": "1.0.147",
+   "version": "1.0.149",
    "main": "crawler-user-agents.json",
    "typings": "./index.d.ts",
    "author": "Martin Monperrus <martin.monperrus@gnieh.org>",
package/pyproject.toml CHANGED
@@ -8,6 +8,22 @@ authors = [
 
  readme = "README.md"
 
+ [project.optional-dependencies]
+ dev = [
+     "attrs==23.2.0",
+     "iniconfig==2.0.0",
+     "jsonschema==4.22.0",
+     "jsonschema-specifications==2023.12.1",
+     "packaging==24.0",
+     "pluggy==1.5.0",
+     "pytest==8.2.0",
+     "referencing==0.35.0",
+     "rpds-py==0.18.0",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/monperrus/crawler-user-agents"
+
  [tool.setuptools]
  package-dir = {"crawleruseragents" = "."}
 
package/test_harness.py ADDED
@@ -0,0 +1,39 @@
+ """
+ Simple tests for python harness
+
+ Usage:
+ $ pytest test_harness.py
+
+ """
+ from crawleruseragents import is_crawler, matching_crawlers
+
+
+ def test_match():
+     assert is_crawler("test Googlebot/2.0 test") is True
+
+
+ def test_nomatch():
+     assert is_crawler("!!!!!!!!!!!!") is False
+
+
+ def test_case():
+     assert is_crawler("test googlebot/2.0 test") is False
+
+
+ def test_matching_crawlers_match():
+     result = matching_crawlers("test Googlebot/2.0 test")
+     assert isinstance(result, list)
+     assert len(result) > 0
+     assert all(isinstance(val, int) for val in result)
+
+
+ def test_matching_crawlers_nomatch():
+     result = matching_crawlers("!!!!!!!!!!!!")
+     assert isinstance(result, list)
+     assert len(result) == 0
+
+
+ def test_matching_crawlers_case():
+     result = matching_crawlers("test googlebot/2.0 test")
+     assert isinstance(result, list)
+     assert len(result) == 0
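The new test harness checks only the types and emptiness of the results. One further check that could be added locally (not shipped in this release) would tie the returned indices back to the crawler metadata, mirroring the README example above; it assumes the public names `CRAWLER_USER_AGENTS_DATA` and `matching_crawlers` and the `url` key used there:

```python
from crawleruseragents import CRAWLER_USER_AGENTS_DATA, matching_crawlers


def test_matching_crawlers_url_lookup():
    # Hypothetical extra test, not part of the package's test_harness.py.
    indices = matching_crawlers("test Googlebot/2.0 test")
    assert indices, "expected at least one matching crawler"
    entry = CRAWLER_USER_AGENTS_DATA[indices[0]]
    assert "pattern" in entry                   # every entry carries its regex
    print("crawler's URL:", entry.get("url"))   # most entries also carry a URL
```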
package/requirements.txt DELETED
@@ -1,9 +0,0 @@
- attrs==23.2.0
- iniconfig==2.0.0
- jsonschema==4.22.0
- jsonschema-specifications==2023.12.1
- packaging==24.0
- pluggy==1.5.0
- pytest==8.2.0
- referencing==0.35.0
- rpds-py==0.18.0