linktrace 0.1.2__tar.gz → 0.2.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {linktrace-0.1.2 → linktrace-0.2.0}/PKG-INFO +18 -19
  2. {linktrace-0.1.2 → linktrace-0.2.0}/README.md +17 -18
  3. {linktrace-0.1.2 → linktrace-0.2.0}/docs/examples.md +27 -39
  4. {linktrace-0.1.2 → linktrace-0.2.0}/justfile +3 -3
  5. {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/Crawler.py +13 -3
  6. {linktrace-0.1.2 → linktrace-0.2.0}/pyproject.toml +3 -3
  7. {linktrace-0.1.2 → linktrace-0.2.0}/uv.lock +1 -1
  8. {linktrace-0.1.2 → linktrace-0.2.0}/.coverage +0 -0
  9. {linktrace-0.1.2 → linktrace-0.2.0}/.github/workflows/publish.yml +0 -0
  10. {linktrace-0.1.2 → linktrace-0.2.0}/.gitignore +0 -0
  11. {linktrace-0.1.2 → linktrace-0.2.0}/.pre-commit-config.yaml +0 -0
  12. {linktrace-0.1.2 → linktrace-0.2.0}/.python-version +0 -0
  13. {linktrace-0.1.2 → linktrace-0.2.0}/.vscode/launch.json +0 -0
  14. {linktrace-0.1.2 → linktrace-0.2.0}/.vscode/settings.json +0 -0
  15. {linktrace-0.1.2 → linktrace-0.2.0}/LICENSE +0 -0
  16. {linktrace-0.1.2 → linktrace-0.2.0}/docs/api-reference.md +0 -0
  17. {linktrace-0.1.2 → linktrace-0.2.0}/docs/core-concepts.md +0 -0
  18. {linktrace-0.1.2 → linktrace-0.2.0}/docs/getting-started.md +0 -0
  19. {linktrace-0.1.2 → linktrace-0.2.0}/docs/troubleshooting.md +0 -0
  20. {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/Serializers.py +0 -0
  21. {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/Spider.py +0 -0
  22. {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/__init__.py +0 -0
  23. {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/cache.py +0 -0
  24. {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/py.typed +0 -0
  25. {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/robots.py +0 -0
  26. {linktrace-0.1.2 → linktrace-0.2.0}/notebooks/crawl_cnn.ipynb +0 -0
  27. {linktrace-0.1.2 → linktrace-0.2.0}/notebooks/crawl_cnn_callbacks.ipynb +0 -0
  28. {linktrace-0.1.2 → linktrace-0.2.0}/notebooks/crawl_tax_assessor.ipynb +0 -0
  29. {linktrace-0.1.2 → linktrace-0.2.0}/settings.yaml +0 -0
  30. {linktrace-0.1.2 → linktrace-0.2.0}/tests/__init__.py +0 -0
  31. {linktrace-0.1.2 → linktrace-0.2.0}/tests/conftest.py +0 -0
  32. {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_crawler.py +0 -0
  33. {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_models.py +0 -0
  34. {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_rate_limiting_and_broken_links.py +0 -0
  35. {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_serializers.py +0 -0
  36. {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_spider.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: linktrace
3
- Version: 0.1.2
3
+ Version: 0.2.0
4
4
  Summary: Async web crawler with rate limiting, robots.txt support, and broken link tracking
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.12
@@ -37,7 +37,7 @@ Lightweight async web crawler for link analysis and HTML document processing.
37
37
  - 🔄 **Persistent sessions** — Connection pooling for 10-100x faster same-domain crawls
38
38
  - 🔁 **Retries + backoff** — Exponential backoff for transient errors (timeouts, 5xx)
39
39
  - ⏱️ **Rate limiting** — Per-domain rate limiting with asyncio.Lock, no thundering herd
40
- - 🤖 **robots.txt support** — Automatically respect Crawl-delay directives per domain
40
+ - 🤖 **robots.txt support** — Automatically respect Crawl-delay directives and Disallow rules per domain
41
41
  - 🔍 **Broken link tracking** — Audit 404s and 5xx errors for site structure validation
42
42
  - 💾 **Optional caching** — Disk-based cache (1-day TTL) for repeat crawls
43
43
  - 🔐 **SSL verification** — Secure by default, with corporate proxy support
@@ -209,7 +209,7 @@ Use DFS for deep hierarchies (documentation sites, nested directories). Use BFS
209
209
 
210
210
  ### Rate Limiting & robots.txt
211
211
 
212
- By default, linktrace automatically respects robots.txt `Crawl-delay` directives and enforces per-domain rate limiting:
212
+ By default, linktrace automatically respects robots.txt `Crawl-delay` directives and `Disallow` rules, enforcing per-domain rate limiting:
213
213
 
214
214
  ```python
215
215
  # Automatic robots.txt respect (default)
@@ -234,35 +234,34 @@ spider = Spider(
234
234
  await spider.run_async()
235
235
  ```
236
236
 
237
- ### Broken Link Audit
237
+ ### Track Crawl Status
238
238
 
239
- Track 404s and 5xx errors for site maintenance:
239
+ Monitor which pages returned error status codes:
240
240
 
241
241
  ```python
242
242
  spider = Spider(start_url="https://example.com", max_depth=2)
243
243
  documents = await spider.run_async()
244
244
 
245
- for doc in documents:
246
- # Broken internal links (fix these first!)
247
- for broken in doc.broken_internal_links:
248
- print(f"{doc.url} → {broken.url} (HTTP {broken.status_code})")
249
-
250
- # Broken external links (check if still valid)
251
- for broken in doc.broken_external_links:
252
- print(f"External: {broken.url} (HTTP {broken.status_code})")
245
+ # Find pages with error responses
246
+ error_pages = [doc for doc in documents if doc.status_code >= 400]
247
+ for doc in error_pages:
248
+ print(f"Error: {doc.url} (HTTP {doc.status_code})")
249
+
250
+ # Monitor disallowed pages (403 from robots.txt)
251
+ disallowed = [doc for doc in documents if doc.status_code == 403]
252
+ print(f"Disallowed by robots.txt: {len(disallowed)} pages")
253
253
  ```
254
254
 
255
- Stream broken links in real-time:
255
+ Stream crawl status in real-time:
256
256
 
257
257
  ```python
258
- async def audit_broken(doc):
259
- broken_count = len(doc.broken_internal_links) + len(doc.broken_external_links)
260
- if broken_count > 0:
261
- print(f"{doc.url}: {broken_count} broken links")
258
+ async def track_errors(doc):
259
+ if doc.status_code >= 400:
260
+ print(f"❌ {doc.url} (HTTP {doc.status_code})")
262
261
 
263
262
  spider = Spider(
264
263
  start_url="https://example.com",
265
- on_page_crawled=audit_broken,
264
+ on_page_crawled=track_errors,
266
265
  accumulate_results=False,
267
266
  )
268
267
  await spider.run_async()
@@ -14,7 +14,7 @@ Lightweight async web crawler for link analysis and HTML document processing.
14
14
  - 🔄 **Persistent sessions** — Connection pooling for 10-100x faster same-domain crawls
15
15
  - 🔁 **Retries + backoff** — Exponential backoff for transient errors (timeouts, 5xx)
16
16
  - ⏱️ **Rate limiting** — Per-domain rate limiting with asyncio.Lock, no thundering herd
17
- - 🤖 **robots.txt support** — Automatically respect Crawl-delay directives per domain
17
+ - 🤖 **robots.txt support** — Automatically respect Crawl-delay directives and Disallow rules per domain
18
18
  - 🔍 **Broken link tracking** — Audit 404s and 5xx errors for site structure validation
19
19
  - 💾 **Optional caching** — Disk-based cache (1-day TTL) for repeat crawls
20
20
  - 🔐 **SSL verification** — Secure by default, with corporate proxy support
@@ -186,7 +186,7 @@ Use DFS for deep hierarchies (documentation sites, nested directories). Use BFS
186
186
 
187
187
  ### Rate Limiting & robots.txt
188
188
 
189
- By default, linktrace automatically respects robots.txt `Crawl-delay` directives and enforces per-domain rate limiting:
189
+ By default, linktrace automatically respects robots.txt `Crawl-delay` directives and `Disallow` rules, enforcing per-domain rate limiting:
190
190
 
191
191
  ```python
192
192
  # Automatic robots.txt respect (default)
@@ -211,35 +211,34 @@ spider = Spider(
211
211
  await spider.run_async()
212
212
  ```
213
213
 
214
- ### Broken Link Audit
214
+ ### Track Crawl Status
215
215
 
216
- Track 404s and 5xx errors for site maintenance:
216
+ Monitor which pages returned error status codes:
217
217
 
218
218
  ```python
219
219
  spider = Spider(start_url="https://example.com", max_depth=2)
220
220
  documents = await spider.run_async()
221
221
 
222
- for doc in documents:
223
- # Broken internal links (fix these first!)
224
- for broken in doc.broken_internal_links:
225
- print(f"{doc.url} → {broken.url} (HTTP {broken.status_code})")
226
-
227
- # Broken external links (check if still valid)
228
- for broken in doc.broken_external_links:
229
- print(f"External: {broken.url} (HTTP {broken.status_code})")
222
+ # Find pages with error responses
223
+ error_pages = [doc for doc in documents if doc.status_code >= 400]
224
+ for doc in error_pages:
225
+ print(f"Error: {doc.url} (HTTP {doc.status_code})")
226
+
227
+ # Monitor disallowed pages (403 from robots.txt)
228
+ disallowed = [doc for doc in documents if doc.status_code == 403]
229
+ print(f"Disallowed by robots.txt: {len(disallowed)} pages")
230
230
  ```
231
231
 
232
- Stream broken links in real-time:
232
+ Stream crawl status in real-time:
233
233
 
234
234
  ```python
235
- async def audit_broken(doc):
236
- broken_count = len(doc.broken_internal_links) + len(doc.broken_external_links)
237
- if broken_count > 0:
238
- print(f"{doc.url}: {broken_count} broken links")
235
+ async def track_errors(doc):
236
+ if doc.status_code >= 400:
237
+ print(f"❌ {doc.url} (HTTP {doc.status_code})")
239
238
 
240
239
  spider = Spider(
241
240
  start_url="https://example.com",
242
- on_page_crawled=audit_broken,
241
+ on_page_crawled=track_errors,
243
242
  accumulate_results=False,
244
243
  )
245
244
  await spider.run_async()
@@ -562,9 +562,9 @@ async def main():
562
562
  asyncio.run(main())
563
563
  ```
564
564
 
565
- ## 19. Track and Audit Broken Links
565
+ ## 19. Track Pages with Error Status
566
566
 
567
- Use new Document.broken_internal_links and broken_external_links for site audits.
567
+ Find pages that returned 4xx/5xx errors during crawling.
568
568
 
569
569
  ```python
570
570
  import asyncio
@@ -577,56 +577,44 @@ async def main():
577
577
  )
578
578
  documents = await spider.run_async()
579
579
 
580
- # Audit broken internal links (site structure issues)
581
- print("=== BROKEN INTERNAL LINKS ===")
582
- for doc in documents:
583
- if doc.broken_internal_links:
584
- for broken in doc.broken_internal_links:
585
- print(f" Page: {doc.url}")
586
- print(f" Bad link: {broken.url} (HTTP {broken.status_code})")
580
+ # Find pages with errors
581
+ print("=== PAGES WITH ERRORS ===")
582
+ error_pages = [doc for doc in documents if doc.status_code >= 400]
583
+ for doc in error_pages:
584
+ print(f" {doc.url} (HTTP {doc.status_code})")
587
585
 
588
- # Audit broken external links (might be out of date or blocked)
589
- print("\n=== BROKEN EXTERNAL LINKS ===")
590
- for doc in documents:
591
- if doc.broken_external_links:
592
- for broken in doc.broken_external_links:
593
- print(f" Page: {doc.url}")
594
- print(f" Bad link: {broken.url} (HTTP {broken.status_code})")
595
-
596
- # Export broken links report
597
- broken_count = sum(
598
- len(doc.broken_internal_links) + len(doc.broken_external_links)
599
- for doc in documents
600
- )
601
- print(f"\nTotal broken links found: {broken_count}")
586
+ # Check for disallowed pages (robots.txt)
587
+ print("\n=== DISALLOWED BY ROBOTS.TXT ===")
588
+ disallowed = [doc for doc in documents if doc.status_code == 403]
589
+ for doc in disallowed:
590
+ print(f" {doc.url}")
591
+
592
+ # Summary
593
+ print(f"\nTotal: {len(documents)} pages")
594
+ print(f"Errors: {len(error_pages)}")
595
+ print(f"Disallowed: {len(disallowed)}")
602
596
 
603
597
  asyncio.run(main())
604
598
  ```
605
599
 
606
- ## 20. Stream Broken Links to Report
600
+ ## 20. Stream Error Pages to Report
607
601
 
608
- Monitor broken links as crawl progresses using callbacks.
602
+ Monitor pages with error responses as crawl progresses using callbacks.
609
603
 
610
604
  ```python
611
605
  import asyncio
612
606
  import json
613
607
  from linktrace import Spider
614
608
 
615
- async def track_broken_links(doc):
616
- """Log broken links as they're discovered."""
617
- if doc.broken_internal_links or doc.broken_external_links:
609
+ async def track_errors(doc):
610
+ """Log error pages as they're discovered."""
611
+ if doc.status_code >= 400:
618
612
  report = {
619
613
  "url": doc.url,
620
- "broken_internal": [
621
- {"url": b.url, "status": b.status_code}
622
- for b in doc.broken_internal_links
623
- ],
624
- "broken_external": [
625
- {"url": b.url, "status": b.status_code}
626
- for b in doc.broken_external_links
627
- ]
614
+ "status": doc.status_code,
615
+ "status_text": "Disallowed" if doc.status_code == 403 else "Error"
628
616
  }
629
- with open("broken_links_report.jsonl", "a") as f:
617
+ with open("error_pages.jsonl", "a") as f:
630
618
  json.dump(report, f)
631
619
  f.write("\n")
632
620
 
@@ -634,13 +622,13 @@ async def main():
634
622
  spider = Spider(
635
623
  start_url="https://example.com",
636
624
  max_depth=2,
637
- on_page_crawled=track_broken_links,
625
+ on_page_crawled=track_errors,
638
626
  accumulate_results=False, # Memory efficient
639
627
  request_delay=0.5 # Be polite
640
628
  )
641
629
 
642
630
  await spider.run_async()
643
- print("Broken links report saved to broken_links_report.jsonl")
631
+ print("Error pages report saved to error_pages.jsonl")
644
632
 
645
633
  asyncio.run(main())
646
634
  ```
@@ -1,4 +1,4 @@
1
- # webcrawler — common tasks. Run `just` to list.
1
+ # linktrace — common tasks. Run `just` to list.
2
2
 
3
3
  # Show available recipes
4
4
  default:
@@ -27,11 +27,11 @@ test:
27
27
 
28
28
  # Run tests with coverage
29
29
  test-cov:
30
- uv run pytest -v --cov=WebCrawler --cov-report=term-missing --cov-report=html
30
+ uv run pytest -v --cov=linktrace --cov-report=term-missing --cov-report=html
31
31
 
32
32
  # Run the spider against the demo URL
33
33
  run:
34
- uv run python -m WebCrawler.Spider
34
+ uv run python -m linktrace.Spider
35
35
 
36
36
  # Build the wheel + sdist into dist/
37
37
  build:
@@ -252,6 +252,14 @@ class Crawler:
252
252
  doc.response_headers = cached.response_headers
253
253
  return doc
254
254
 
255
+ # Check robots.txt before fetching
256
+ if self.respect_robots_txt and self.robots_manager:
257
+ if not await self.robots_manager.is_allowed(url):
258
+ self._logger.info(f"Disallowed by robots.txt: {url}")
259
+ doc = Document(url, None)
260
+ doc.status_code = 403
261
+ return doc
262
+
255
263
  # Get effective delay (robots.txt or configured)
256
264
  if self.respect_robots_txt and self.robots_manager:
257
265
  delay = await self.robots_manager.get_crawl_delay(url)
@@ -341,6 +349,8 @@ class Crawler:
341
349
  if link.attrib["href"] and link.attrib["href"][:1] not in skip_words
342
350
  ]
343
351
 
352
+ # Use local list per document to avoid state leakage
353
+ found_links: list[HtmlLink] = []
344
354
  for link in links:
345
355
  link_url = link.attrib["href"]
346
356
  link_url = urljoin(url, link_url)
@@ -353,11 +363,11 @@ class Crawler:
353
363
  link_url not in self.visited_urls
354
364
  and link_url not in self._queue
355
365
  ):
356
- self._links.append(HtmlLink(link_url, title))
366
+ found_links.append(HtmlLink(link_url, title))
357
367
 
358
368
  doc.internal_links = [
359
369
  link
360
- for link in self._links
370
+ for link in found_links
361
371
  if self.get_domain_parts(link.url) == self.get_domain_parts(doc.url)
362
372
  or link.url[:1] == "/"
363
373
  or link.url[:1] == ""
@@ -366,7 +376,7 @@ class Crawler:
366
376
 
367
377
  doc.external_links = [
368
378
  link
369
- for link in self._links
379
+ for link in found_links
370
380
  if self.get_domain_parts(link.url) != self.get_domain_parts(doc.url)
371
381
  and urlparse(link.url).scheme != ""
372
382
  ]
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "linktrace"
7
- version = "0.1.2"
7
+ version = "0.2.0"
8
8
  description = "Async web crawler with rate limiting, robots.txt support, and broken link tracking"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -44,7 +44,7 @@ dev = [
44
44
  packages = ["linktrace"]
45
45
 
46
46
  [tool.ruff]
47
- target-version = "0.1.2"
47
+ target-version = "py312"
48
48
  line-length = 88
49
49
 
50
50
  [tool.ruff.lint]
@@ -82,7 +82,7 @@ precision = 2
82
82
  directory = "htmlcov"
83
83
 
84
84
  [tool.mypy]
85
- python_version = "0.1.2"
85
+ python_version = "3.12"
86
86
  warn_return_any = true
87
87
  warn_unused_configs = true
88
88
  disallow_untyped_defs = false
@@ -796,7 +796,7 @@ wheels = [
796
796
 
797
797
  [[package]]
798
798
  name = "linktrace"
799
- version = "0.1.0"
799
+ version = "0.1.2"
800
800
  source = { editable = "." }
801
801
  dependencies = [
802
802
  { name = "aiofiles" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes