crawler-user-agents 1.0.156 → 1.0.158
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__init__.py +25 -8
- package/crawler-user-agents.json +1 -1
- package/package.json +1 -1
- package/test_harness.py +19 -5
package/__init__.py
CHANGED
|
@@ -1,8 +1,25 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import json
|
|
3
|
+
from functools import cached_property
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
|
|
7
|
+
class CrawlerPatterns:
    """Lazily compiled regexes that OR together every crawler pattern.

    Each property joins the ``"pattern"`` entry of every record with ``|``
    and compiles the result on first access; ``cached_property`` then stores
    the compiled object on the instance so later accesses reuse it.

    Args:
        data: Optional sequence of records, each a mapping with a
            ``"pattern"`` key holding a regex string. It is iterated once
            per property, so it must be re-iterable. When omitted, the
            module-level ``CRAWLER_USER_AGENTS_DATA`` is used.
    """

    def __init__(self, data=None):
        # Keep ``None`` rather than resolving the default here: the
        # module-level data is only looked up when a pattern is first built.
        self._data = data

    def _joined_pattern(self):
        """Return all record patterns combined into a single alternation."""
        records = CRAWLER_USER_AGENTS_DATA if self._data is None else self._data
        return "|".join(record["pattern"] for record in records)

    @cached_property
    def case_insensitive(self):
        """Compiled alternation of all patterns, using ``re.IGNORECASE``."""
        return re.compile(self._joined_pattern(), re.IGNORECASE)

    @cached_property
    def case_sensitive(self):
        """Compiled alternation of all patterns, using default flags."""
        return re.compile(self._joined_pattern())
|
|
21
|
+
|
|
22
|
+
|
|
6
23
|
def load_json():
|
|
7
24
|
cwd = Path(__file__).parent
|
|
8
25
|
user_agents_file_path = cwd / "crawler-user-agents.json"
|
|
@@ -11,24 +28,24 @@ def load_json():
|
|
|
11
28
|
|
|
12
29
|
|
|
13
30
|
CRAWLER_USER_AGENTS_DATA = load_json()
|
|
14
|
-
|
|
15
|
-
"|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA)
|
|
16
|
-
)
|
|
31
|
+
CRAWLER_PATTERNS = CrawlerPatterns()
|
|
17
32
|
|
|
18
33
|
|
|
19
|
-
def is_crawler(user_agent: str) -> bool:
|
|
34
|
+
def is_crawler(user_agent: str, case_sensitive: bool = True) -> bool:
    """
    Tell whether a User-Agent string matches any known crawler pattern.

    The search uses the combined regex held by CRAWLER_PATTERNS: the
    case-sensitive one by default, or the case-insensitive one when
    ``case_sensitive`` is false.
    """
    if case_sensitive:
        combined = CRAWLER_PATTERNS.case_sensitive
    else:
        combined = CRAWLER_PATTERNS.case_insensitive
    return combined.search(user_agent) is not None
|
|
22
39
|
|
|
23
40
|
|
|
24
|
-
def matching_crawlers(user_agent: str) -> list[int]:
|
|
41
|
+
def matching_crawlers(user_agent: str, case_sensitive: bool = True) -> list[int]:
    """
    Find every known crawler whose pattern matches the given User-Agent.

    Returns the positions of the matching entries within
    CRAWLER_USER_AGENTS_DATA, in ascending order; the list is empty when
    nothing matches. Case is ignored only when ``case_sensitive`` is false.
    """
    # One check via is_crawler first; the per-entry scan only runs on a hit.
    if not is_crawler(user_agent, case_sensitive):
        return []
    flags = 0 if case_sensitive else re.IGNORECASE
    return [
        index
        for index, entry in enumerate(CRAWLER_USER_AGENTS_DATA)
        if re.search(entry["pattern"], user_agent, flags)
    ]
|
package/crawler-user-agents.json
CHANGED
|
@@ -2780,7 +2780,7 @@
|
|
|
2780
2780
|
"instances": [
|
|
2781
2781
|
"Mozilla/5.0 (compatible; seoscanners.net/1; +spider@seoscanners.net)"
|
|
2782
2782
|
],
|
|
2783
|
-
"url": "
|
|
2783
|
+
"url": "https://github.com/monperrus/crawler-user-agents/issues/384#issuecomment-2575367162"
|
|
2784
2784
|
},
|
|
2785
2785
|
{
|
|
2786
2786
|
"pattern": "Hatena",
|
package/package.json
CHANGED
package/test_harness.py
CHANGED
|
@@ -5,28 +5,42 @@ Usage:
|
|
|
5
5
|
$ pytest test_harness.py
|
|
6
6
|
|
|
7
7
|
"""
|
|
8
|
-
from crawleruseragents import is_crawler, matching_crawlers
|
|
9
8
|
|
|
9
|
+
from crawleruseragents import is_crawler, matching_crawlers
|
|
10
10
|
|
|
11
|
-
def test_match():
|
|
12
|
-
assert is_crawler("test Googlebot/2.0 test") is True
|
|
13
11
|
|
|
14
12
|
|
|
15
13
|
def test_nomatch():
    """A string of punctuation is not reported as a crawler."""
    verdict = is_crawler("!!!!!!!!!!!!")
    assert verdict is False
|
|
17
15
|
|
|
18
16
|
|
|
19
|
-
def
|
|
17
|
+
def test_case_sensitive():
    """With default arguments, a lower-cased "googlebot" is not detected."""
    lowered_agent = "test googlebot/2.0 test"
    assert is_crawler(lowered_agent) is False
|
|
21
19
|
|
|
22
20
|
|
|
23
|
-
def
|
|
21
|
+
def test_case_insensitive():
    """With case_sensitive=False, a lower-cased "googlebot" is detected."""
    lowered_agent = "test googlebot/2.0 test"
    verdict = is_crawler(lowered_agent, case_sensitive=False)
    assert verdict is True
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_matching_crawlers_match_case_sensitive():
    """A correctly cased Googlebot agent yields a non-empty list of ints."""
    indices = matching_crawlers("test Googlebot/2.0 test")
    assert isinstance(indices, list)
    assert len(indices) > 0
    for index in indices:
        assert isinstance(index, int)
|
|
28
30
|
|
|
29
31
|
|
|
32
|
+
def test_matching_crawlers_match_case_insensitive():
    """A lower-cased agent still yields matches when case is ignored."""
    # Pass the flag by keyword: a bare positional ``False`` hides its meaning,
    # and test_case_insensitive already spells it out as a keyword.
    result = matching_crawlers("test googlebot/2.0 test", case_sensitive=False)
    assert isinstance(result, list)
    assert len(result) > 0
    assert all(isinstance(val, int) for val in result)
|
|
37
|
+
|
|
38
|
+
def test_matching_crawlers_match_lower_case_agent():
    """With default arguments, a lower-cased agent yields an empty list."""
    indices = matching_crawlers("test googlebot/2.0 test")
    assert isinstance(indices, list)
    assert len(indices) == 0
|
|
42
|
+
|
|
43
|
+
|
|
30
44
|
def test_matching_crawlers_nomatch():
|
|
31
45
|
result = matching_crawlers("!!!!!!!!!!!!")
|
|
32
46
|
assert isinstance(result, list)
|