crawler-user-agents 1.0.147 → 1.0.149
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci-validation.yml +2 -1
- package/README.md +19 -3
- package/__init__.py +18 -9
- package/package.json +1 -1
- package/pyproject.toml +16 -0
- package/test_harness.py +39 -0
- package/requirements.txt +0 -9
package/.github/workflows/ci-validation.yml
CHANGED

```diff
@@ -20,7 +20,8 @@ jobs:
       with:
         node-version: 20
     - run: node format.js --check
-    - run:
+    - run: python3 -mpip install -U pip
+    - run: python3 -mpip install -e .[dev]
    - run: py.test -vv
     - run: python3 validate.py
     - run: php validate.php
```
package/README.md
CHANGED

````diff
@@ -6,7 +6,9 @@ This repository contains a list of of HTTP user-agents used by robots, crawlers,
 * Go package: <https://pkg.go.dev/github.com/monperrus/crawler-user-agents>
 * PyPi package: <https://pypi.org/project/crawler-user-agents/>

-Each `pattern` is a regular expression. It should work out-of-the-box wih your favorite regex library
+Each `pattern` is a regular expression. It should work out-of-the-box wih your favorite regex library.
+
+If you use this project in a commercial product, [please sponsor it](https://github.com/sponsors/monperrus).

 ## Install

@@ -41,10 +43,24 @@ Then:

 ```python
 import crawleruseragents
-if crawleruseragents.is_crawler("
+if crawleruseragents.is_crawler("Googlebot/"):
     # do something
 ```

+or:
+
+```python
+import crawleruseragents
+indices = crawleruseragents.matching_crawlers("bingbot/2.0")
+print("crawlers' indices:", indices)
+print(
+    "crawler's URL:",
+    crawleruseragents.CRAWLER_USER_AGENTS_DATA[indices[0]]["url"]
+)
+```
+
+Note that `matching_crawlers` is much slower than `is_crawler`, if the given User-Agent does indeed match any crawlers.
+
 ### Go

 Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),

@@ -70,7 +86,7 @@ func main() {

 	indices := agents.MatchingCrawlers(userAgent)
 	fmt.Println("crawlers' indices:", indices)
-	fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL)
+	fmt.Println("crawler's URL:", agents.Crawlers[indices[0]].URL)
 }
 ```
````
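The two README examples can be combined in exactly the way the new note suggests: use `is_crawler` as the cheap guard and only call `matching_crawlers` when the caller needs to know which entries matched. A minimal sketch against the documented API (the function name, header lookup, and `.get("url", "")` fallback are illustrative, not part of the package):

```python
import crawleruseragents

def crawler_urls(headers: dict) -> list[str]:
    """Return the URLs of the known crawlers matching a request's User-Agent."""
    user_agent = headers.get("User-Agent", "")
    # Fast path: one search over the combined regex (see __init__.py below).
    if not crawleruseragents.is_crawler(user_agent):
        return []
    # Slow path: recover which entries matched, then read their metadata.
    return [
        crawleruseragents.CRAWLER_USER_AGENTS_DATA[i].get("url", "")
        for i in crawleruseragents.matching_crawlers(user_agent)
    ]

print(crawler_urls({"User-Agent": "bingbot/2.0"}))
```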
package/__init__.py
CHANGED

```diff
@@ -11,15 +11,24 @@ def load_json():


 CRAWLER_USER_AGENTS_DATA = load_json()
+CRAWLER_USER_AGENTS_REGEXP = re.compile(
+    "|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA)
+)


 def is_crawler(user_agent: str) -> bool:
-
-
-
-
-
-
-
-
-
+    """Return True if the given User-Agent matches a known crawler."""
+    return bool(CRAWLER_USER_AGENTS_REGEXP.search(user_agent))
+
+
+def matching_crawlers(user_agent: str) -> list[int]:
+    """
+    Return a list of the indices in CRAWLER_USER_AGENTS_DATA of any crawlers
+    matching the given User-Agent.
+    """
+    result = []
+    if is_crawler(user_agent):
+        for num, crawler_user_agent in enumerate(CRAWLER_USER_AGENTS_DATA):
+            if re.search(crawler_user_agent["pattern"], user_agent):
+                result.append(num)
+    return result
```
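(The contents of the nine removed lines, the previous `is_crawler` body, are not captured in this diff view.) The new module-level `CRAWLER_USER_AGENTS_REGEXP` is what makes `is_crawler` cheap: all patterns are joined into a single alternation compiled once at import time, so a lookup is one `search` call instead of a loop over every entry. `matching_crawlers` keeps the per-pattern loop but gates it behind `is_crawler`, which is why the README now warns it is only slower when the User-Agent does match something. A standalone sketch of the same structure, with made-up patterns standing in for the JSON data:

```python
import re

# Hypothetical stand-ins for the "pattern" fields in crawler-user-agents.json.
PATTERNS = ["Googlebot", "bingbot", "Yandex"]

# Compile one combined regex up front; "|".join assumes every entry is a
# regex fragment that composes safely under alternation.
COMBINED = re.compile("|".join(PATTERNS))

def any_match(text: str) -> bool:
    # A single scan answers "does anything match?".
    return bool(COMBINED.search(text))

def which_match(text: str) -> list[int]:
    # Only pay for the per-pattern loop when the cheap check succeeds,
    # mirroring the shape of matching_crawlers() above.
    if not any_match(text):
        return []
    return [i for i, p in enumerate(PATTERNS) if re.search(p, text)]

print(any_match("Mozilla/5.0 bingbot/2.0"))    # True
print(which_match("Mozilla/5.0 bingbot/2.0"))  # [1]
```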
package/package.json
CHANGED
package/pyproject.toml
CHANGED

```diff
@@ -8,6 +8,22 @@ authors = [

 readme = "README.md"

+[project.optional-dependencies]
+dev = [
+    "attrs==23.2.0",
+    "iniconfig==2.0.0",
+    "jsonschema==4.22.0",
+    "jsonschema-specifications==2023.12.1",
+    "packaging==24.0",
+    "pluggy==1.5.0",
+    "pytest==8.2.0",
+    "referencing==0.35.0",
+    "rpds-py==0.18.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/monperrus/crawler-user-agents"
+
 [tool.setuptools]
 package-dir = {"crawleruseragents" = "."}

```
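The pinned `dev` list is a straight test-and-validation toolchain: `pytest` (with its `iniconfig`, `packaging`, and `pluggy` dependencies) for the new `test_harness.py`, and `jsonschema` (with `attrs`, `referencing`, `rpds-py`, and `jsonschema-specifications`), presumably for the `validate.py` step in CI. This set is what `python3 -mpip install -e .[dev]` in the updated workflow resolves, replacing the deleted `requirements.txt`.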
package/test_harness.py
ADDED

```diff
@@ -0,0 +1,39 @@
+"""
+Simple tests for python harness
+
+Usage:
+    $ pytest test_harness.py
+
+"""
+from crawleruseragents import is_crawler, matching_crawlers
+
+
+def test_match():
+    assert is_crawler("test Googlebot/2.0 test") is True
+
+
+def test_nomatch():
+    assert is_crawler("!!!!!!!!!!!!") is False
+
+
+def test_case():
+    assert is_crawler("test googlebot/2.0 test") is False
+
+
+def test_matching_crawlers_match():
+    result = matching_crawlers("test Googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) > 0
+    assert all(isinstance(val, int) for val in result)
+
+
+def test_matching_crawlers_nomatch():
+    result = matching_crawlers("!!!!!!!!!!!!")
+    assert isinstance(result, list)
+    assert len(result) == 0
+
+
+def test_matching_crawlers_case():
+    result = matching_crawlers("test googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) == 0
```