PyPI - linktrace - Versions diffs - 0.1.2__tar.gz → 0.2.0__tar.gz - Mend

linktrace 0.1.2.tar.gz → 0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{linktrace-0.1.2 → linktrace-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: linktrace
-Version: 0.1.2
+Version: 0.2.0
 Summary: Async web crawler with rate limiting, robots.txt support, and broken link tracking
 License-File: LICENSE
 Requires-Python: >=3.12
@@ -37,7 +37,7 @@ Lightweight async web crawler for link analysis and HTML document processing.
 - 🔄 **Persistent sessions** — Connection pooling for 10-100x faster same-domain crawls
 - 🔁 **Retries + backoff** — Exponential backoff for transient errors (timeouts, 5xx)
 - ⏱️ **Rate limiting** — Per-domain rate limiting with asyncio.Lock, no thundering herd
-- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives per domain
+- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives and Disallow rules per domain
 - 🔍 **Broken link tracking** — Audit 404s and 5xx errors for site structure validation
 - 💾 **Optional caching** — Disk-based cache (1-day TTL) for repeat crawls
 - 🔐 **SSL verification** — Secure by default, with corporate proxy support
@@ -209,7 +209,7 @@ Use DFS for deep hierarchies (documentation sites, nested directories). Use BFS
 ### Rate Limiting & robots.txt
-By default, linktrace automatically respects robots.txt `Crawl-delay` directives and enforces per-domain rate limiting:
+By default, linktrace automatically respects robots.txt `Crawl-delay` directives and `Disallow` rules, and enforces per-domain rate limiting:
 ```python
 # Automatic robots.txt respect (default)
@@ -234,35 +234,34 @@ spider = Spider(
 await spider.run_async()
 ```
-### Broken Link Audit
+### Track Crawl Status
-Track 404s and 5xx errors for site maintenance:
+Monitor which pages returned error status codes:
 ```python
 spider = Spider(start_url="https://example.com", max_depth=2)
 documents = await spider.run_async()
-for doc in documents:
-    # Broken internal links (fix these first!)
-    for broken in doc.broken_internal_links:
-        print(f"{doc.url} → {broken.url} (HTTP {broken.status_code})")
-    # Broken external links (check if still valid)
-    for broken in doc.broken_external_links:
-        print(f"External: {broken.url} (HTTP {broken.status_code})")
+# Find pages with error responses
+error_pages = [doc for doc in documents if doc.status_code >= 400]
+for doc in error_pages:
+    print(f"Error: {doc.url} (HTTP {doc.status_code})")
+# Monitor disallowed pages (robots.txt blocks are reported as HTTP 403; a 403 sent by the server looks the same)
+disallowed = [doc for doc in documents if doc.status_code == 403]
+print(f"Disallowed by robots.txt: {len(disallowed)} pages")
 ```
-Stream broken links in real-time:
+Stream crawl status in real-time:
 ```python
-async def audit_broken(doc):
-    broken_count = len(doc.broken_internal_links) + len(doc.broken_external_links)
-    if broken_count > 0:
-        print(f"{doc.url}: {broken_count} broken links")
+async def track_errors(doc):
+    if doc.status_code >= 400:
+        print(f"❌ {doc.url} (HTTP {doc.status_code})")
 spider = Spider(
     start_url="https://example.com",
-    on_page_crawled=audit_broken,
+    on_page_crawled=track_errors,
     accumulate_results=False,
 )
 await spider.run_async()

{linktrace-0.1.2 → linktrace-0.2.0}/README.md RENAMED Viewed

@@ -14,7 +14,7 @@ Lightweight async web crawler for link analysis and HTML document processing.
 - 🔄 **Persistent sessions** — Connection pooling for 10-100x faster same-domain crawls
 - 🔁 **Retries + backoff** — Exponential backoff for transient errors (timeouts, 5xx)
 - ⏱️ **Rate limiting** — Per-domain rate limiting with asyncio.Lock, no thundering herd
-- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives per domain
+- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives and Disallow rules per domain
 - 🔍 **Broken link tracking** — Audit 404s and 5xx errors for site structure validation
 - 💾 **Optional caching** — Disk-based cache (1-day TTL) for repeat crawls
 - 🔐 **SSL verification** — Secure by default, with corporate proxy support
@@ -186,7 +186,7 @@ Use DFS for deep hierarchies (documentation sites, nested directories). Use BFS
 ### Rate Limiting & robots.txt
-By default, linktrace automatically respects robots.txt `Crawl-delay` directives and enforces per-domain rate limiting:
+By default, linktrace automatically respects robots.txt `Crawl-delay` directives and `Disallow` rules, and enforces per-domain rate limiting:
 ```python
 # Automatic robots.txt respect (default)
@@ -211,35 +211,34 @@ spider = Spider(
 await spider.run_async()
 ```
-### Broken Link Audit
+### Track Crawl Status
-Track 404s and 5xx errors for site maintenance:
+Monitor which pages returned error status codes:
 ```python
 spider = Spider(start_url="https://example.com", max_depth=2)
 documents = await spider.run_async()
-for doc in documents:
-    # Broken internal links (fix these first!)
-    for broken in doc.broken_internal_links:
-        print(f"{doc.url} → {broken.url} (HTTP {broken.status_code})")
-    # Broken external links (check if still valid)
-    for broken in doc.broken_external_links:
-        print(f"External: {broken.url} (HTTP {broken.status_code})")
+# Find pages with error responses
+error_pages = [doc for doc in documents if doc.status_code >= 400]
+for doc in error_pages:
+    print(f"Error: {doc.url} (HTTP {doc.status_code})")
+# Monitor disallowed pages (robots.txt blocks are reported as HTTP 403; a 403 sent by the server looks the same)
+disallowed = [doc for doc in documents if doc.status_code == 403]
+print(f"Disallowed by robots.txt: {len(disallowed)} pages")
 ```
-Stream broken links in real-time:
+Stream crawl status in real-time:
 ```python
-async def audit_broken(doc):
-    broken_count = len(doc.broken_internal_links) + len(doc.broken_external_links)
-    if broken_count > 0:
-        print(f"{doc.url}: {broken_count} broken links")
+async def track_errors(doc):
+    if doc.status_code >= 400:
+        print(f"❌ {doc.url} (HTTP {doc.status_code})")
 spider = Spider(
     start_url="https://example.com",
-    on_page_crawled=audit_broken,
+    on_page_crawled=track_errors,
     accumulate_results=False,
 )
 await spider.run_async()

{linktrace-0.1.2 → linktrace-0.2.0}/docs/examples.md RENAMED Viewed

@@ -562,9 +562,9 @@ async def main():
 asyncio.run(main())
 ```
-## 19. Track and Audit Broken Links
+## 19. Track Pages with Error Status
-Use new Document.broken_internal_links and broken_external_links for site audits.
+Find pages that returned 4xx/5xx errors during crawling.
 ```python
 import asyncio
@@ -577,56 +577,44 @@ async def main():
     )
     documents = await spider.run_async()
-    # Audit broken internal links (site structure issues)
-    print("=== BROKEN INTERNAL LINKS ===")
-    for doc in documents:
-        if doc.broken_internal_links:
-            for broken in doc.broken_internal_links:
-                print(f"  Page: {doc.url}")
-                print(f"    Bad link: {broken.url} (HTTP {broken.status_code})")
+    # Find pages with errors
+    print("=== PAGES WITH ERRORS ===")
+    error_pages = [doc for doc in documents if doc.status_code >= 400]
+    for doc in error_pages:
+        print(f"  {doc.url} (HTTP {doc.status_code})")
-    # Audit broken external links (might be out of date or blocked)
-    print("\n=== BROKEN EXTERNAL LINKS ===")
-    for doc in documents:
-        if doc.broken_external_links:
-            for broken in doc.broken_external_links:
-                print(f"  Page: {doc.url}")
-                print(f"    Bad link: {broken.url} (HTTP {broken.status_code})")
-    # Export broken links report
-    broken_count = sum(
-        len(doc.broken_internal_links) + len(doc.broken_external_links)
-        for doc in documents
-    )
-    print(f"\nTotal broken links found: {broken_count}")
+    # Check for disallowed pages (robots.txt blocks are reported as HTTP 403; a 403 sent by the server looks the same)
+    print("\n=== DISALLOWED BY ROBOTS.TXT ===")
+    disallowed = [doc for doc in documents if doc.status_code == 403]
+    for doc in disallowed:
+        print(f"  {doc.url}")
+    # Summary
+    print(f"\nTotal: {len(documents)} pages")
+    print(f"Errors: {len(error_pages)}")
+    print(f"Disallowed: {len(disallowed)}")
 asyncio.run(main())
 ```
-## 20. Stream Broken Links to Report
+## 20. Stream Error Pages to Report
-Monitor broken links as crawl progresses using callbacks.
+Monitor pages with error responses as crawl progresses using callbacks.
 ```python
 import asyncio
 import json
 from linktrace import Spider
-async def track_broken_links(doc):
-    """Log broken links as they're discovered."""
-    if doc.broken_internal_links or doc.broken_external_links:
+async def track_errors(doc):
+    """Log error pages as they're discovered."""
+    if doc.status_code >= 400:
         report = {
             "url": doc.url,
-            "broken_internal": [
-                {"url": b.url, "status": b.status_code}
-                for b in doc.broken_internal_links
-            ],
-            "broken_external": [
-                {"url": b.url, "status": b.status_code}
-                for b in doc.broken_external_links
-            ]
+            "status": doc.status_code,
+            "status_text": "Disallowed" if doc.status_code == 403 else "Error"
         }
-        with open("broken_links_report.jsonl", "a") as f:
+        with open("error_pages.jsonl", "a") as f:
             json.dump(report, f)
             f.write("\n")
@@ -634,13 +622,13 @@ async def main():
     spider = Spider(
         start_url="https://example.com",
         max_depth=2,
-        on_page_crawled=track_broken_links,
+        on_page_crawled=track_errors,
         accumulate_results=False,  # Memory efficient
         request_delay=0.5          # Be polite
     )
     await spider.run_async()
-    print("Broken links report saved to broken_links_report.jsonl")
+    print("Error pages report saved to error_pages.jsonl")
 asyncio.run(main())
 ```

{linktrace-0.1.2 → linktrace-0.2.0}/justfile RENAMED Viewed

@@ -1,4 +1,4 @@
-# webcrawler — common tasks. Run `just` to list.
+# linktrace — common tasks. Run `just` to list.
 # Show available recipes
 default:
@@ -27,11 +27,11 @@ test:
 # Run tests with coverage
 test-cov:
-    uv run pytest -v --cov=WebCrawler --cov-report=term-missing --cov-report=html
+    uv run pytest -v --cov=linktrace --cov-report=term-missing --cov-report=html
 # Run the spider against the demo URL
 run:
-    uv run python -m WebCrawler.Spider
+    uv run python -m linktrace.Spider
 # Build the wheel + sdist into dist/
 build:

{linktrace-0.1.2 → linktrace-0.2.0}/linktrace/Crawler.py RENAMED Viewed

@@ -252,6 +252,14 @@ class Crawler:
                 doc.response_headers = cached.response_headers
                 return doc
+        # Check robots.txt before fetching
+        if self.respect_robots_txt and self.robots_manager:
+            if not await self.robots_manager.is_allowed(url):
+                self._logger.info(f"Disallowed by robots.txt: {url}")
+                doc = Document(url, None)
+                doc.status_code = 403
+                return doc
         # Get effective delay (robots.txt or configured)
         if self.respect_robots_txt and self.robots_manager:
             delay = await self.robots_manager.get_crawl_delay(url)
@@ -341,6 +349,8 @@ class Crawler:
                 if link.attrib["href"] and link.attrib["href"][:1] not in skip_words
             ]
+            # Use local list per document to avoid state leakage
+            found_links: list[HtmlLink] = []
             for link in links:
                 link_url = link.attrib["href"]
                 link_url = urljoin(url, link_url)
@@ -353,11 +363,11 @@ class Crawler:
                         link_url not in self.visited_urls
                         and link_url not in self._queue
                     ):
-                        self._links.append(HtmlLink(link_url, title))
+                        found_links.append(HtmlLink(link_url, title))
             doc.internal_links = [
                 link
-                for link in self._links
+                for link in found_links
                 if self.get_domain_parts(link.url) == self.get_domain_parts(doc.url)
                 or link.url[:1] == "/"
                 or link.url[:1] == ""
@@ -366,7 +376,7 @@ class Crawler:
             doc.external_links = [
                 link
-                for link in self._links
+                for link in found_links
                 if self.get_domain_parts(link.url) != self.get_domain_parts(doc.url)
                 and urlparse(link.url).scheme != ""
             ]

{linktrace-0.1.2 → linktrace-0.2.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "linktrace"
-version = "0.1.2"
+version = "0.2.0"
 description = "Async web crawler with rate limiting, robots.txt support, and broken link tracking"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -44,7 +44,7 @@ dev = [
 packages = ["linktrace"]
 [tool.ruff]
-target-version = "0.1.2"
+target-version = "py312"
 line-length = 88
 [tool.ruff.lint]
@@ -82,7 +82,7 @@ precision = 2
 directory = "htmlcov"
 [tool.mypy]
-python_version = "0.1.2"
+python_version = "3.12"
 warn_return_any = true
 warn_unused_configs = true
 disallow_untyped_defs = false

{linktrace-0.1.2 → linktrace-0.2.0}/uv.lock RENAMED Viewed

@@ -796,7 +796,7 @@ wheels = [
 [[package]]
 name = "linktrace"
-version = "0.1.0"
+version = "0.1.2"
 source = { editable = "." }
 dependencies = [
     { name = "aiofiles" },