crawler-user-agents 1.43.0 → 1.44.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/__init__.py +22 -2
- package/package.json +1 -1
package/__init__.py
CHANGED
|
@@ -4,6 +4,9 @@ from functools import cached_property
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
6
|
|
|
7
|
+
CHUNK_SIZE = 25
|
|
8
|
+
|
|
9
|
+
|
|
7
10
|
class CrawlerPatterns:
|
|
8
11
|
def __init__(self):
|
|
9
12
|
pass
|
|
@@ -19,6 +22,22 @@ class CrawlerPatterns:
|
|
|
19
22
|
def case_sensitive(self):
|
|
20
23
|
return re.compile("|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA))
|
|
21
24
|
|
|
25
|
+
@cached_property
|
|
26
|
+
def _chunks_case_sensitive(self):
|
|
27
|
+
patterns = [i["pattern"] for i in CRAWLER_USER_AGENTS_DATA]
|
|
28
|
+
return [
|
|
29
|
+
re.compile("|".join(patterns[i:i + CHUNK_SIZE]))
|
|
30
|
+
for i in range(0, len(patterns), CHUNK_SIZE)
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
@cached_property
|
|
34
|
+
def _chunks_case_insensitive(self):
|
|
35
|
+
patterns = [i["pattern"].lower() for i in CRAWLER_USER_AGENTS_DATA]
|
|
36
|
+
return [
|
|
37
|
+
re.compile("|".join(patterns[i:i + CHUNK_SIZE]))
|
|
38
|
+
for i in range(0, len(patterns), CHUNK_SIZE)
|
|
39
|
+
]
|
|
40
|
+
|
|
22
41
|
|
|
23
42
|
def load_json():
|
|
24
43
|
cwd = Path(__file__).parent
|
|
@@ -34,8 +53,9 @@ CRAWLER_PATTERNS = CrawlerPatterns()
|
|
|
34
53
|
def is_crawler(user_agent: str, case_sensitive: bool = True) -> bool:
|
|
35
54
|
"""Return True if the given User-Agent matches a known crawler."""
|
|
36
55
|
if case_sensitive:
|
|
37
|
-
return
|
|
38
|
-
|
|
56
|
+
return any(p.search(user_agent) for p in CRAWLER_PATTERNS._chunks_case_sensitive)
|
|
57
|
+
ua = user_agent.lower()
|
|
58
|
+
return any(p.search(ua) for p in CRAWLER_PATTERNS._chunks_case_insensitive)
|
|
39
59
|
|
|
40
60
|
|
|
41
61
|
def matching_crawlers(user_agent: str, case_sensitive: bool = True) -> list[int]:
|