crawler-user-agents 1.42.0 → 1.44.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/__init__.py +22 -2
- package/crawler-user-agents.json +5 -1
- package/package.json +1 -1
package/__init__.py
CHANGED
|
@@ -4,6 +4,9 @@ from functools import cached_property
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
6
|
|
|
7
|
+
CHUNK_SIZE = 25
|
|
8
|
+
|
|
9
|
+
|
|
7
10
|
class CrawlerPatterns:
|
|
8
11
|
def __init__(self):
|
|
9
12
|
pass
|
|
@@ -19,6 +22,22 @@ class CrawlerPatterns:
|
|
|
19
22
|
def case_sensitive(self):
|
|
20
23
|
return re.compile("|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA))
|
|
21
24
|
|
|
25
|
+
@cached_property
|
|
26
|
+
def _chunks_case_sensitive(self):
|
|
27
|
+
patterns = [i["pattern"] for i in CRAWLER_USER_AGENTS_DATA]
|
|
28
|
+
return [
|
|
29
|
+
re.compile("|".join(patterns[i:i + CHUNK_SIZE]))
|
|
30
|
+
for i in range(0, len(patterns), CHUNK_SIZE)
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
@cached_property
|
|
34
|
+
def _chunks_case_insensitive(self):
|
|
35
|
+
patterns = [i["pattern"].lower() for i in CRAWLER_USER_AGENTS_DATA]
|
|
36
|
+
return [
|
|
37
|
+
re.compile("|".join(patterns[i:i + CHUNK_SIZE]))
|
|
38
|
+
for i in range(0, len(patterns), CHUNK_SIZE)
|
|
39
|
+
]
|
|
40
|
+
|
|
22
41
|
|
|
23
42
|
def load_json():
|
|
24
43
|
cwd = Path(__file__).parent
|
|
@@ -34,8 +53,9 @@ CRAWLER_PATTERNS = CrawlerPatterns()
|
|
|
34
53
|
def is_crawler(user_agent: str, case_sensitive: bool = True) -> bool:
|
|
35
54
|
"""Return True if the given User-Agent matches a known crawler."""
|
|
36
55
|
if case_sensitive:
|
|
37
|
-
return
|
|
38
|
-
|
|
56
|
+
return any(p.search(user_agent) for p in CRAWLER_PATTERNS._chunks_case_sensitive)
|
|
57
|
+
ua = user_agent.lower()
|
|
58
|
+
return any(p.search(ua) for p in CRAWLER_PATTERNS._chunks_case_insensitive)
|
|
39
59
|
|
|
40
60
|
|
|
41
61
|
def matching_crawlers(user_agent: str, case_sensitive: bool = True) -> list[int]:
|
package/crawler-user-agents.json
CHANGED
|
@@ -8118,6 +8118,10 @@
|
|
|
8118
8118
|
"instances": [
|
|
8119
8119
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 newsai/1.0 Safari/537.36"
|
|
8120
8120
|
],
|
|
8121
|
-
"description": "NewsAI crawler for news content aggregation and indexing"
|
|
8121
|
+
"description": "NewsAI crawler for news content aggregation and indexing",
|
|
8122
|
+
"tags": [
|
|
8123
|
+
"ai-crawler",
|
|
8124
|
+
"feed-reader"
|
|
8125
|
+
]
|
|
8122
8126
|
}
|
|
8123
8127
|
]
|