crawler-user-agents 1.0.146 → 1.0.148

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,7 +20,8 @@ jobs:
       with:
         node-version: 20
     - run: node format.js --check
-    - run: pip3 install -r requirements.txt
+    - run: python3 -mpip install -U pip
+    - run: python3 -mpip install -e .[dev]
    - run: py.test -vv
    - run: python3 validate.py
    - run: php validate.php
package/README.md CHANGED
@@ -41,10 +41,24 @@ Then:
 
 ```python
 import crawleruseragents
-if crawleruseragents.is_crawler("googlebot/"):
+if crawleruseragents.is_crawler("Googlebot/"):
    # do something
 ```
 
+or:
+
+```python
+import crawleruseragents
+indices = crawleruseragents.matching_crawlers("bingbot/2.0")
+print("crawlers' indices:", indices)
+print(
+    "crawler's URL:",
+    crawleruseragents.CRAWLER_USER_AGENTS_DATA[indices[0]]["url"]
+)
+```
+
+Note that `matching_crawlers` is much slower than `is_crawler`, if the given User-Agent does indeed match any crawlers.
+
 ### Go
 
 Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),
@@ -70,7 +84,7 @@ func main() {
 
    indices := agents.MatchingCrawlers(userAgent)
    fmt.Println("crawlers' indices:", indices)
-   fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL)
+   fmt.Println("crawler's URL:", agents.Crawlers[indices[0]].URL)
 }
 ```
 
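The performance note added above can be illustrated with a rough micro-benchmark. This is only a sketch: the `bingbot/2.0` string is taken from the README example, the call count is arbitrary, and absolute timings will vary by machine.

```python
# Sketch of the README's performance note: on a matching User-Agent,
# matching_crawlers() does strictly more work than is_crawler().
import timeit

from crawleruseragents import is_crawler, matching_crawlers

ua = "bingbot/2.0"  # matching User-Agent, as in the README example

fast = timeit.timeit(lambda: is_crawler(ua), number=1000)
slow = timeit.timeit(lambda: matching_crawlers(ua), number=1000)
print(f"is_crawler:        {fast:.3f} s / 1000 calls")
print(f"matching_crawlers: {slow:.3f} s / 1000 calls")
```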
package/__init__.py CHANGED
@@ -11,15 +11,24 @@ def load_json():
 
 
 CRAWLER_USER_AGENTS_DATA = load_json()
+CRAWLER_USER_AGENTS_REGEXP = re.compile(
+    "|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA)
+)
 
 
 def is_crawler(user_agent: str) -> bool:
-    for crawler_user_agent in CRAWLER_USER_AGENTS_DATA:
-        if re.search(crawler_user_agent["pattern"], user_agent, re.IGNORECASE):
-            return True
-    return False
-
-
-def is_crawler2(s):
-    regexp = re.compile("|".join([i["pattern"] for i in CRAWLER_USER_AGENTS_DATA]))
-    return regexp.search(s) is not None
+    """Return True if the given User-Agent matches a known crawler."""
+    return bool(CRAWLER_USER_AGENTS_REGEXP.search(user_agent))
+
+
+def matching_crawlers(user_agent: str) -> list[int]:
+    """
+    Return a list of the indices in CRAWLER_USER_AGENTS_DATA of any crawlers
+    matching the given User-Agent.
+    """
+    result = []
+    if is_crawler(user_agent):
+        for num, crawler_user_agent in enumerate(CRAWLER_USER_AGENTS_DATA):
+            if re.search(crawler_user_agent["pattern"], user_agent):
+                result.append(num)
+    return result
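One behavioural consequence of this rewrite is that matching is now case-sensitive: the old loop passed `re.IGNORECASE`, while the precompiled `CRAWLER_USER_AGENTS_REGEXP` and the per-pattern checks in `matching_crawlers` do not. The updated README example (`Googlebot/` instead of `googlebot/`) and the new `test_case` test reflect this. A minimal sketch of the resulting behaviour, reusing the User-Agent strings from the new tests:

```python
# Minimal sketch of the 1.0.148 behaviour; strings are taken from test_harness.py.
import crawleruseragents

print(crawleruseragents.is_crawler("test Googlebot/2.0 test"))  # True
print(crawleruseragents.is_crawler("test googlebot/2.0 test"))  # False: matching is case-sensitive

# matching_crawlers() returns indices into CRAWLER_USER_AGENTS_DATA, so each
# hit can be resolved back to its pattern and (optional) documentation URL.
for idx in crawleruseragents.matching_crawlers("test Googlebot/2.0 test"):
    entry = crawleruseragents.CRAWLER_USER_AGENTS_DATA[idx]
    print(idx, entry["pattern"], entry.get("url"))
```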
package/crawler-user-agents.json CHANGED
@@ -757,7 +757,7 @@
   },
   {
     "pattern": "Adidxbot",
-    "url": "http://onlinehelp.microsoft.com/en-us/bing/hh204496.aspx",
+    "url": "https://www.bing.com/webmasters/help/which-crawlers-does-bing-use-8c184ec0",
     "instances": []
   },
   {
@@ -848,9 +848,12 @@
     ]
   },
   {
-    "pattern": "sistrix crawler",
+    "pattern": "(sistrix|SISTRIX) [cC]rawler",
     "addition_date": "2011/08/02",
-    "instances": []
+    "url": "https://www.sistrix.com/tutorials/crawling-errors-in-the-optimizer/",
+    "instances": [
+      "Mozilla/5.0 (compatible; SISTRIX Crawler; http://crawler.sistrix.net/)"
+    ]
   },
   {
     "pattern": "Ahrefs(Bot|SiteAudit)",
@@ -1080,6 +1083,7 @@
   {
     "pattern": "lssbot",
     "addition_date": "2012/05/15",
+    "url": "https://www.lssbot.com/",
     "instances": []
   },
   {
@@ -1178,6 +1182,7 @@
   {
     "pattern": "backlinkcrawler",
     "addition_date": "2013/01/04",
+    "url": "http://www.backlinktest.com/crawler.html",
     "instances": []
   },
   {
@@ -2274,6 +2279,7 @@
   {
     "pattern": "LinkArchiver",
     "addition_date": "2017/09/24",
+    "url": "https://github.com/thisisparker/linkarchiver",
     "instances": [
       "@LinkArchiver twitter bot"
     ]
@@ -2306,6 +2312,7 @@
   {
     "pattern": "dcrawl",
     "addition_date": "2017/09/22",
+    "url": "https://github.com/kgretzky/dcrawl",
     "instances": [
       "dcrawl/1.0"
     ]
@@ -2454,6 +2461,7 @@
   {
     "pattern": "AHC\\/",
     "addition_date": "2017/11/02",
+    "url": "https://github.com/AsyncHttpClient/async-http-client",
     "instances": [
       "AHC/2.0"
     ]
@@ -2525,7 +2533,7 @@
   {
     "pattern": "Traackr\\.com",
     "addition_date": "2017/11/02",
-    "url": "Traackr.com",
+    "url": "https://www.traackr.com/",
     "instances": [
       "Traackr.com"
     ]
@@ -3146,7 +3154,7 @@
     "instances": [
       "Mozilla/5.0 zgrab/0.x"
     ],
-    "url": "https://zmap.io/"
+    "url": "https://github.com/zmap/zgrab2"
   },
   {
     "pattern": "PR-CY\\.RU",
@@ -3270,6 +3278,7 @@
   {
     "pattern": "VelenPublicWebCrawler",
     "addition_date": "2018/10/09",
+    "url": "https://velen.io/",
     "instances": [
       "VelenPublicWebCrawler (velen.io)"
     ]
@@ -3932,7 +3941,7 @@
     "instances": [
       "SentiBot www.sentibot.eu (compatible with Googlebot)"
     ],
-    "url": "https://www.sentibot.eu"
+    "url": "https://sites.google.com/senti1.com/sentibot-eu/home"
   },
   {
     "pattern": "Domains Project\\/",
@@ -4018,7 +4027,7 @@
     "instances": [
       "rssbot/1.4.3 (+https://t.me/RustRssBot)"
     ],
-    "url": "https://t.me/RustRssBot"
+    "url": "https://github.com/iovxw/rssbot"
   },
   {
     "pattern": "startmebot\\/",
@@ -4082,7 +4091,7 @@
       "Mozilla/5.0 (compatible; RidderBot/1.0; bot@ridder.co)",
       "Mozilla/5.0 (compatible; RidderBot/1.0; bot@ridder.co) (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) Mobile/12H321"
     ],
-    "url": "http://brandonmedia.net"
+    "url": "https://ridder.co/"
   },
   {
     "pattern": "Taboolabot",
@@ -4206,8 +4215,7 @@
     "addition_date": "2022/04/26",
     "instances": [
       "Mozilla/5.0 (compatible; Go-http-client/1.1; +centurybot9@gmail.com)"
-    ],
-    "url": "unknown"
+    ]
   },
   {
     "pattern": "Viber",
@@ -4220,6 +4228,7 @@
   {
     "pattern": "e\\.ventures Investment Crawler",
     "addition_date": "2021/06/05",
+    "url": "https://www.eventures.vc/",
     "instances": [
       "e.ventures Investment Crawler (eventures.vc)"
     ]
@@ -4227,6 +4236,7 @@
   {
     "pattern": "evc-batch",
     "addition_date": "2021/06/07",
+    "url": "https://www.eventures.vc/",
     "instances": [
       "Mozilla/5.0 (compatible; evc-batch/2.0)"
     ]
@@ -4837,7 +4847,7 @@
     "instances": [
       "Mozilla/5.0 (compatible; ImagesiftBot; +imagesift.com)"
     ],
-    "url": "https://imagesift.com"
+    "url": "https://imagesift.com/about"
   },
   {
     "pattern": "Expanse",
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "crawler-user-agents",
-  "version": "1.0.146",
+  "version": "1.0.148",
   "main": "crawler-user-agents.json",
   "typings": "./index.d.ts",
   "author": "Martin Monperrus <martin.monperrus@gnieh.org>",
package/pyproject.toml CHANGED
@@ -8,6 +8,22 @@ authors = [
 
 readme = "README.md"
 
+[project.optional-dependencies]
+dev = [
+    "attrs==23.2.0",
+    "iniconfig==2.0.0",
+    "jsonschema==4.22.0",
+    "jsonschema-specifications==2023.12.1",
+    "packaging==24.0",
+    "pluggy==1.5.0",
+    "pytest==8.2.0",
+    "referencing==0.35.0",
+    "rpds-py==0.18.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/monperrus/crawler-user-agents"
+
 [tool.setuptools]
 package-dir = {"crawleruseragents" = "."}
 
package/test_harness.py ADDED
@@ -0,0 +1,39 @@
+"""
+Simple tests for python harness
+
+Usage:
+$ pytest test_harness.py
+
+"""
+from crawleruseragents import is_crawler, matching_crawlers
+
+
+def test_match():
+    assert is_crawler("test Googlebot/2.0 test") is True
+
+
+def test_nomatch():
+    assert is_crawler("!!!!!!!!!!!!") is False
+
+
+def test_case():
+    assert is_crawler("test googlebot/2.0 test") is False
+
+
+def test_matching_crawlers_match():
+    result = matching_crawlers("test Googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) > 0
+    assert all(isinstance(val, int) for val in result)
+
+
+def test_matching_crawlers_nomatch():
+    result = matching_crawlers("!!!!!!!!!!!!")
+    assert isinstance(result, list)
+    assert len(result) == 0
+
+
+def test_matching_crawlers_case():
+    result = matching_crawlers("test googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) == 0
package/requirements.txt DELETED
@@ -1,9 +0,0 @@
-attrs==23.2.0
-iniconfig==2.0.0
-jsonschema==4.22.0
-jsonschema-specifications==2023.12.1
-packaging==24.0
-pluggy==1.5.0
-pytest==8.2.0
-referencing==0.35.0
-rpds-py==0.18.0