crawler-user-agents 1.0.147 → 1.0.149
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci-validation.yml +2 -1
- package/README.md +19 -3
- package/__init__.py +18 -9
- package/package.json +1 -1
- package/pyproject.toml +16 -0
- package/test_harness.py +39 -0
- package/requirements.txt +0 -9
package/.github/workflows/ci-validation.yml
CHANGED

```diff
@@ -20,7 +20,8 @@ jobs:
       with:
         node-version: 20
     - run: node format.js --check
-    - run:
+    - run: python3 -mpip install -U pip
+    - run: python3 -mpip install -e .[dev]
    - run: py.test -vv
     - run: python3 validate.py
     - run: php validate.php
```
package/README.md
CHANGED

````diff
@@ -6,7 +6,9 @@ This repository contains a list of of HTTP user-agents used by robots, crawlers,
 * Go package: <https://pkg.go.dev/github.com/monperrus/crawler-user-agents>
 * PyPi package: <https://pypi.org/project/crawler-user-agents/>

-Each `pattern` is a regular expression. It should work out-of-the-box wih your favorite regex library
+Each `pattern` is a regular expression. It should work out-of-the-box wih your favorite regex library.
+
+If you use this project in a commercial product, [please sponsor it](https://github.com/sponsors/monperrus).

 ## Install

@@ -41,10 +43,24 @@ Then:

 ```python
 import crawleruseragents
-if crawleruseragents.is_crawler("
+if crawleruseragents.is_crawler("Googlebot/"):
     # do something
 ```

+or:
+
+```python
+import crawleruseragents
+indices = crawleruseragents.matching_crawlers("bingbot/2.0")
+print("crawlers' indices:", indices)
+print(
+    "crawler's URL:",
+    crawleruseragents.CRAWLER_USER_AGENTS_DATA[indices[0]]["url"]
+)
+```
+
+Note that `matching_crawlers` is much slower than `is_crawler`, if the given User-Agent does indeed match any crawlers.
+
 ### Go

 Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),

@@ -70,7 +86,7 @@ func main() {

 	indices := agents.MatchingCrawlers(userAgent)
 	fmt.Println("crawlers' indices:", indices)
-	fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL)
+	fmt.Println("crawler's URL:", agents.Crawlers[indices[0]].URL)
 }
 ```
````
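The two README examples can be combined in exactly the way the new note suggests: use `is_crawler` as the cheap guard and only call `matching_crawlers` when the caller needs to know which entries matched. A minimal sketch against the documented API (the function name, header lookup, and `.get("url", "")` fallback are illustrative, not part of the package):

```python
import crawleruseragents

def crawler_urls(headers: dict) -> list[str]:
    """Return the URLs of the known crawlers matching a request's User-Agent."""
    user_agent = headers.get("User-Agent", "")
    # Fast path: one search over the combined regex (see __init__.py below).
    if not crawleruseragents.is_crawler(user_agent):
        return []
    # Slow path: recover which entries matched, then read their metadata.
    return [
        crawleruseragents.CRAWLER_USER_AGENTS_DATA[i].get("url", "")
        for i in crawleruseragents.matching_crawlers(user_agent)
    ]

print(crawler_urls({"User-Agent": "bingbot/2.0"}))
```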
package/__init__.py
CHANGED

```diff
@@ -11,15 +11,24 @@ def load_json():


 CRAWLER_USER_AGENTS_DATA = load_json()
+CRAWLER_USER_AGENTS_REGEXP = re.compile(
+    "|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA)
+)


 def is_crawler(user_agent: str) -> bool:
-
-
-
-
-
-
-
-
-
+    """Return True if the given User-Agent matches a known crawler."""
+    return bool(CRAWLER_USER_AGENTS_REGEXP.search(user_agent))
+
+
+def matching_crawlers(user_agent: str) -> list[int]:
+    """
+    Return a list of the indices in CRAWLER_USER_AGENTS_DATA of any crawlers
+    matching the given User-Agent.
+    """
+    result = []
+    if is_crawler(user_agent):
+        for num, crawler_user_agent in enumerate(CRAWLER_USER_AGENTS_DATA):
+            if re.search(crawler_user_agent["pattern"], user_agent):
+                result.append(num)
+    return result
```
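(The contents of the nine removed lines, the previous `is_crawler` body, are not captured in this diff view.) The new module-level `CRAWLER_USER_AGENTS_REGEXP` is what makes `is_crawler` cheap: all patterns are joined into a single alternation compiled once at import time, so a lookup is one `search` call instead of a loop over every entry. `matching_crawlers` keeps the per-pattern loop but gates it behind `is_crawler`, which is why the README now warns it is only slower when the User-Agent does match something. A standalone sketch of the same structure, with made-up patterns standing in for the JSON data:

```python
import re

# Hypothetical stand-ins for the "pattern" fields in crawler-user-agents.json.
PATTERNS = ["Googlebot", "bingbot", "Yandex"]

# Compile one combined regex up front; "|".join assumes every entry is a
# regex fragment that composes safely under alternation.
COMBINED = re.compile("|".join(PATTERNS))

def any_match(text: str) -> bool:
    # A single scan answers "does anything match?".
    return bool(COMBINED.search(text))

def which_match(text: str) -> list[int]:
    # Only pay for the per-pattern loop when the cheap check succeeds,
    # mirroring the shape of matching_crawlers() above.
    if not any_match(text):
        return []
    return [i for i, p in enumerate(PATTERNS) if re.search(p, text)]

print(any_match("Mozilla/5.0 bingbot/2.0"))    # True
print(which_match("Mozilla/5.0 bingbot/2.0"))  # [1]
```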
package/package.json
CHANGED
package/pyproject.toml
CHANGED

```diff
@@ -8,6 +8,22 @@ authors = [

 readme = "README.md"

+[project.optional-dependencies]
+dev = [
+    "attrs==23.2.0",
+    "iniconfig==2.0.0",
+    "jsonschema==4.22.0",
+    "jsonschema-specifications==2023.12.1",
+    "packaging==24.0",
+    "pluggy==1.5.0",
+    "pytest==8.2.0",
+    "referencing==0.35.0",
+    "rpds-py==0.18.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/monperrus/crawler-user-agents"
+
 [tool.setuptools]
 package-dir = {"crawleruseragents" = "."}

```
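The pinned `dev` list is a straight test-and-validation toolchain: `pytest` (with its `iniconfig`, `packaging`, and `pluggy` dependencies) for the new `test_harness.py`, and `jsonschema` (with `attrs`, `referencing`, `rpds-py`, and `jsonschema-specifications`), presumably for the `validate.py` step in CI. This set is what `python3 -mpip install -e .[dev]` in the updated workflow resolves, replacing the deleted `requirements.txt`.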
package/test_harness.py
ADDED

```diff
@@ -0,0 +1,39 @@
+"""
+Simple tests for python harness
+
+Usage:
+    $ pytest test_harness.py
+
+"""
+from crawleruseragents import is_crawler, matching_crawlers
+
+
+def test_match():
+    assert is_crawler("test Googlebot/2.0 test") is True
+
+
+def test_nomatch():
+    assert is_crawler("!!!!!!!!!!!!") is False
+
+
+def test_case():
+    assert is_crawler("test googlebot/2.0 test") is False
+
+
+def test_matching_crawlers_match():
+    result = matching_crawlers("test Googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) > 0
+    assert all(isinstance(val, int) for val in result)
+
+
+def test_matching_crawlers_nomatch():
+    result = matching_crawlers("!!!!!!!!!!!!")
+    assert isinstance(result, list)
+    assert len(result) == 0
+
+
+def test_matching_crawlers_case():
+    result = matching_crawlers("test googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) == 0
```