linktrace 0.1.2__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {linktrace-0.1.2 → linktrace-0.2.0}/PKG-INFO +18 -19
- {linktrace-0.1.2 → linktrace-0.2.0}/README.md +17 -18
- {linktrace-0.1.2 → linktrace-0.2.0}/docs/examples.md +27 -39
- {linktrace-0.1.2 → linktrace-0.2.0}/justfile +3 -3
- {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/Crawler.py +13 -3
- {linktrace-0.1.2 → linktrace-0.2.0}/pyproject.toml +3 -3
- {linktrace-0.1.2 → linktrace-0.2.0}/uv.lock +1 -1
- {linktrace-0.1.2 → linktrace-0.2.0}/.coverage +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/.github/workflows/publish.yml +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/.gitignore +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/.pre-commit-config.yaml +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/.python-version +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/.vscode/launch.json +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/.vscode/settings.json +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/LICENSE +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/docs/api-reference.md +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/docs/core-concepts.md +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/docs/getting-started.md +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/docs/troubleshooting.md +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/Serializers.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/Spider.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/__init__.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/cache.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/py.typed +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/linktrace/robots.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/notebooks/crawl_cnn.ipynb +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/notebooks/crawl_cnn_callbacks.ipynb +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/notebooks/crawl_tax_assessor.ipynb +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/settings.yaml +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/tests/__init__.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/tests/conftest.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_crawler.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_models.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_rate_limiting_and_broken_links.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_serializers.py +0 -0
- {linktrace-0.1.2 → linktrace-0.2.0}/tests/test_spider.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: linktrace
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Async web crawler with rate limiting, robots.txt support, and broken link tracking
|
|
5
5
|
License-File: LICENSE
|
|
6
6
|
Requires-Python: >=3.12
|
|
@@ -37,7 +37,7 @@ Lightweight async web crawler for link analysis and HTML document processing.
|
|
|
37
37
|
- 🔄 **Persistent sessions** — Connection pooling for 10-100x faster same-domain crawls
|
|
38
38
|
- 🔁 **Retries + backoff** — Exponential backoff for transient errors (timeouts, 5xx)
|
|
39
39
|
- ⏱️ **Rate limiting** — Per-domain rate limiting with asyncio.Lock, no thundering herd
|
|
40
|
-
- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives per domain
|
|
40
|
+
- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives and Disallow rules per domain
|
|
41
41
|
- 🔍 **Broken link tracking** — Audit 404s and 5xx errors for site structure validation
|
|
42
42
|
- 💾 **Optional caching** — Disk-based cache (1-day TTL) for repeat crawls
|
|
43
43
|
- 🔐 **SSL verification** — Secure by default, with corporate proxy support
|
|
@@ -209,7 +209,7 @@ Use DFS for deep hierarchies (documentation sites, nested directories). Use BFS
|
|
|
209
209
|
|
|
210
210
|
### Rate Limiting & robots.txt
|
|
211
211
|
|
|
212
|
-
By default, linktrace automatically respects robots.txt `Crawl-delay` directives and
|
|
212
|
+
By default, linktrace automatically respects robots.txt `Crawl-delay` directives and `Disallow` rules, enforcing per-domain rate limiting:
|
|
213
213
|
|
|
214
214
|
```python
|
|
215
215
|
# Automatic robots.txt respect (default)
|
|
@@ -234,35 +234,34 @@ spider = Spider(
|
|
|
234
234
|
await spider.run_async()
|
|
235
235
|
```
|
|
236
236
|
|
|
237
|
-
###
|
|
237
|
+
### Track Crawl Status
|
|
238
238
|
|
|
239
|
-
|
|
239
|
+
Monitor which pages returned error status codes:
|
|
240
240
|
|
|
241
241
|
```python
|
|
242
242
|
spider = Spider(start_url="https://example.com", max_depth=2)
|
|
243
243
|
documents = await spider.run_async()
|
|
244
244
|
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
245
|
+
# Find pages with error responses
|
|
246
|
+
error_pages = [doc for doc in documents if doc.status_code >= 400]
|
|
247
|
+
for doc in error_pages:
|
|
248
|
+
print(f"Error: {doc.url} (HTTP {doc.status_code})")
|
|
249
|
+
|
|
250
|
+
# Monitor disallowed pages (403 from robots.txt)
|
|
251
|
+
disallowed = [doc for doc in documents if doc.status_code == 403]
|
|
252
|
+
print(f"Disallowed by robots.txt: {len(disallowed)} pages")
|
|
253
253
|
```
|
|
254
254
|
|
|
255
|
-
Stream
|
|
255
|
+
Stream crawl status in real-time:
|
|
256
256
|
|
|
257
257
|
```python
|
|
258
|
-
async def
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
print(f"{doc.url}: {broken_count} broken links")
|
|
258
|
+
async def track_errors(doc):
|
|
259
|
+
if doc.status_code >= 400:
|
|
260
|
+
print(f"❌ {doc.url} (HTTP {doc.status_code})")
|
|
262
261
|
|
|
263
262
|
spider = Spider(
|
|
264
263
|
start_url="https://example.com",
|
|
265
|
-
on_page_crawled=
|
|
264
|
+
on_page_crawled=track_errors,
|
|
266
265
|
accumulate_results=False,
|
|
267
266
|
)
|
|
268
267
|
await spider.run_async()
|
|
@@ -14,7 +14,7 @@ Lightweight async web crawler for link analysis and HTML document processing.
|
|
|
14
14
|
- 🔄 **Persistent sessions** — Connection pooling for 10-100x faster same-domain crawls
|
|
15
15
|
- 🔁 **Retries + backoff** — Exponential backoff for transient errors (timeouts, 5xx)
|
|
16
16
|
- ⏱️ **Rate limiting** — Per-domain rate limiting with asyncio.Lock, no thundering herd
|
|
17
|
-
- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives per domain
|
|
17
|
+
- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives and Disallow rules per domain
|
|
18
18
|
- 🔍 **Broken link tracking** — Audit 404s and 5xx errors for site structure validation
|
|
19
19
|
- 💾 **Optional caching** — Disk-based cache (1-day TTL) for repeat crawls
|
|
20
20
|
- 🔐 **SSL verification** — Secure by default, with corporate proxy support
|
|
@@ -186,7 +186,7 @@ Use DFS for deep hierarchies (documentation sites, nested directories). Use BFS
|
|
|
186
186
|
|
|
187
187
|
### Rate Limiting & robots.txt
|
|
188
188
|
|
|
189
|
-
By default, linktrace automatically respects robots.txt `Crawl-delay` directives and
|
|
189
|
+
By default, linktrace automatically respects robots.txt `Crawl-delay` directives and `Disallow` rules, enforcing per-domain rate limiting:
|
|
190
190
|
|
|
191
191
|
```python
|
|
192
192
|
# Automatic robots.txt respect (default)
|
|
@@ -211,35 +211,34 @@ spider = Spider(
|
|
|
211
211
|
await spider.run_async()
|
|
212
212
|
```
|
|
213
213
|
|
|
214
|
-
###
|
|
214
|
+
### Track Crawl Status
|
|
215
215
|
|
|
216
|
-
|
|
216
|
+
Monitor which pages returned error status codes:
|
|
217
217
|
|
|
218
218
|
```python
|
|
219
219
|
spider = Spider(start_url="https://example.com", max_depth=2)
|
|
220
220
|
documents = await spider.run_async()
|
|
221
221
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
222
|
+
# Find pages with error responses
|
|
223
|
+
error_pages = [doc for doc in documents if doc.status_code >= 400]
|
|
224
|
+
for doc in error_pages:
|
|
225
|
+
print(f"Error: {doc.url} (HTTP {doc.status_code})")
|
|
226
|
+
|
|
227
|
+
# Monitor disallowed pages (403 from robots.txt)
|
|
228
|
+
disallowed = [doc for doc in documents if doc.status_code == 403]
|
|
229
|
+
print(f"Disallowed by robots.txt: {len(disallowed)} pages")
|
|
230
230
|
```
|
|
231
231
|
|
|
232
|
-
Stream
|
|
232
|
+
Stream crawl status in real-time:
|
|
233
233
|
|
|
234
234
|
```python
|
|
235
|
-
async def
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
print(f"{doc.url}: {broken_count} broken links")
|
|
235
|
+
async def track_errors(doc):
|
|
236
|
+
if doc.status_code >= 400:
|
|
237
|
+
print(f"❌ {doc.url} (HTTP {doc.status_code})")
|
|
239
238
|
|
|
240
239
|
spider = Spider(
|
|
241
240
|
start_url="https://example.com",
|
|
242
|
-
on_page_crawled=
|
|
241
|
+
on_page_crawled=track_errors,
|
|
243
242
|
accumulate_results=False,
|
|
244
243
|
)
|
|
245
244
|
await spider.run_async()
|
|
@@ -562,9 +562,9 @@ async def main():
|
|
|
562
562
|
asyncio.run(main())
|
|
563
563
|
```
|
|
564
564
|
|
|
565
|
-
## 19. Track
|
|
565
|
+
## 19. Track Pages with Error Status
|
|
566
566
|
|
|
567
|
-
|
|
567
|
+
Find pages that returned 4xx/5xx errors during crawling.
|
|
568
568
|
|
|
569
569
|
```python
|
|
570
570
|
import asyncio
|
|
@@ -577,56 +577,44 @@ async def main():
|
|
|
577
577
|
)
|
|
578
578
|
documents = await spider.run_async()
|
|
579
579
|
|
|
580
|
-
#
|
|
581
|
-
print("===
|
|
582
|
-
for doc in documents
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
print(f" Page: {doc.url}")
|
|
586
|
-
print(f" Bad link: {broken.url} (HTTP {broken.status_code})")
|
|
580
|
+
# Find pages with errors
|
|
581
|
+
print("=== PAGES WITH ERRORS ===")
|
|
582
|
+
error_pages = [doc for doc in documents if doc.status_code >= 400]
|
|
583
|
+
for doc in error_pages:
|
|
584
|
+
print(f" {doc.url} (HTTP {doc.status_code})")
|
|
587
585
|
|
|
588
|
-
#
|
|
589
|
-
print("\n===
|
|
590
|
-
for doc in documents
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
len(doc.broken_internal_links) + len(doc.broken_external_links)
|
|
599
|
-
for doc in documents
|
|
600
|
-
)
|
|
601
|
-
print(f"\nTotal broken links found: {broken_count}")
|
|
586
|
+
# Check for disallowed pages (robots.txt)
|
|
587
|
+
print("\n=== DISALLOWED BY ROBOTS.TXT ===")
|
|
588
|
+
disallowed = [doc for doc in documents if doc.status_code == 403]
|
|
589
|
+
for doc in disallowed:
|
|
590
|
+
print(f" {doc.url}")
|
|
591
|
+
|
|
592
|
+
# Summary
|
|
593
|
+
print(f"\nTotal: {len(documents)} pages")
|
|
594
|
+
print(f"Errors: {len(error_pages)}")
|
|
595
|
+
print(f"Disallowed: {len(disallowed)}")
|
|
602
596
|
|
|
603
597
|
asyncio.run(main())
|
|
604
598
|
```
|
|
605
599
|
|
|
606
|
-
## 20. Stream
|
|
600
|
+
## 20. Stream Error Pages to Report
|
|
607
601
|
|
|
608
|
-
Monitor
|
|
602
|
+
Monitor pages with error responses as crawl progresses using callbacks.
|
|
609
603
|
|
|
610
604
|
```python
|
|
611
605
|
import asyncio
|
|
612
606
|
import json
|
|
613
607
|
from linktrace import Spider
|
|
614
608
|
|
|
615
|
-
async def
|
|
616
|
-
"""Log
|
|
617
|
-
if doc.
|
|
609
|
+
async def track_errors(doc):
|
|
610
|
+
"""Log error pages as they're discovered."""
|
|
611
|
+
if doc.status_code >= 400:
|
|
618
612
|
report = {
|
|
619
613
|
"url": doc.url,
|
|
620
|
-
"
|
|
621
|
-
|
|
622
|
-
for b in doc.broken_internal_links
|
|
623
|
-
],
|
|
624
|
-
"broken_external": [
|
|
625
|
-
{"url": b.url, "status": b.status_code}
|
|
626
|
-
for b in doc.broken_external_links
|
|
627
|
-
]
|
|
614
|
+
"status": doc.status_code,
|
|
615
|
+
"status_text": "Disallowed" if doc.status_code == 403 else "Error"
|
|
628
616
|
}
|
|
629
|
-
with open("
|
|
617
|
+
with open("error_pages.jsonl", "a") as f:
|
|
630
618
|
json.dump(report, f)
|
|
631
619
|
f.write("\n")
|
|
632
620
|
|
|
@@ -634,13 +622,13 @@ async def main():
|
|
|
634
622
|
spider = Spider(
|
|
635
623
|
start_url="https://example.com",
|
|
636
624
|
max_depth=2,
|
|
637
|
-
on_page_crawled=
|
|
625
|
+
on_page_crawled=track_errors,
|
|
638
626
|
accumulate_results=False, # Memory efficient
|
|
639
627
|
request_delay=0.5 # Be polite
|
|
640
628
|
)
|
|
641
629
|
|
|
642
630
|
await spider.run_async()
|
|
643
|
-
print("
|
|
631
|
+
print("Error pages report saved to error_pages.jsonl")
|
|
644
632
|
|
|
645
633
|
asyncio.run(main())
|
|
646
634
|
```
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#
|
|
1
|
+
# linktrace — common tasks. Run `just` to list.
|
|
2
2
|
|
|
3
3
|
# Show available recipes
|
|
4
4
|
default:
|
|
@@ -27,11 +27,11 @@ test:
|
|
|
27
27
|
|
|
28
28
|
# Run tests with coverage
|
|
29
29
|
test-cov:
|
|
30
|
-
uv run pytest -v --cov=
|
|
30
|
+
uv run pytest -v --cov=linktrace --cov-report=term-missing --cov-report=html
|
|
31
31
|
|
|
32
32
|
# Run the spider against the demo URL
|
|
33
33
|
run:
|
|
34
|
-
uv run python -m
|
|
34
|
+
uv run python -m linktrace.Spider
|
|
35
35
|
|
|
36
36
|
# Build the wheel + sdist into dist/
|
|
37
37
|
build:
|
|
@@ -252,6 +252,14 @@ class Crawler:
|
|
|
252
252
|
doc.response_headers = cached.response_headers
|
|
253
253
|
return doc
|
|
254
254
|
|
|
255
|
+
# Check robots.txt before fetching
|
|
256
|
+
if self.respect_robots_txt and self.robots_manager:
|
|
257
|
+
if not await self.robots_manager.is_allowed(url):
|
|
258
|
+
self._logger.info(f"Disallowed by robots.txt: {url}")
|
|
259
|
+
doc = Document(url, None)
|
|
260
|
+
doc.status_code = 403
|
|
261
|
+
return doc
|
|
262
|
+
|
|
255
263
|
# Get effective delay (robots.txt or configured)
|
|
256
264
|
if self.respect_robots_txt and self.robots_manager:
|
|
257
265
|
delay = await self.robots_manager.get_crawl_delay(url)
|
|
@@ -341,6 +349,8 @@ class Crawler:
|
|
|
341
349
|
if link.attrib["href"] and link.attrib["href"][:1] not in skip_words
|
|
342
350
|
]
|
|
343
351
|
|
|
352
|
+
# Use local list per document to avoid state leakage
|
|
353
|
+
found_links: list[HtmlLink] = []
|
|
344
354
|
for link in links:
|
|
345
355
|
link_url = link.attrib["href"]
|
|
346
356
|
link_url = urljoin(url, link_url)
|
|
@@ -353,11 +363,11 @@ class Crawler:
|
|
|
353
363
|
link_url not in self.visited_urls
|
|
354
364
|
and link_url not in self._queue
|
|
355
365
|
):
|
|
356
|
-
|
|
366
|
+
found_links.append(HtmlLink(link_url, title))
|
|
357
367
|
|
|
358
368
|
doc.internal_links = [
|
|
359
369
|
link
|
|
360
|
-
for link in
|
|
370
|
+
for link in found_links
|
|
361
371
|
if self.get_domain_parts(link.url) == self.get_domain_parts(doc.url)
|
|
362
372
|
or link.url[:1] == "/"
|
|
363
373
|
or link.url[:1] == ""
|
|
@@ -366,7 +376,7 @@ class Crawler:
|
|
|
366
376
|
|
|
367
377
|
doc.external_links = [
|
|
368
378
|
link
|
|
369
|
-
for link in
|
|
379
|
+
for link in found_links
|
|
370
380
|
if self.get_domain_parts(link.url) != self.get_domain_parts(doc.url)
|
|
371
381
|
and urlparse(link.url).scheme != ""
|
|
372
382
|
]
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "linktrace"
|
|
7
|
-
version = "0.1.2"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "Async web crawler with rate limiting, robots.txt support, and broken link tracking"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -44,7 +44,7 @@ dev = [
|
|
|
44
44
|
packages = ["linktrace"]
|
|
45
45
|
|
|
46
46
|
[tool.ruff]
|
|
47
|
-
target-version = "
|
|
47
|
+
target-version = "py312"
|
|
48
48
|
line-length = 88
|
|
49
49
|
|
|
50
50
|
[tool.ruff.lint]
|
|
@@ -82,7 +82,7 @@ precision = 2
|
|
|
82
82
|
directory = "htmlcov"
|
|
83
83
|
|
|
84
84
|
[tool.mypy]
|
|
85
|
-
python_version = "
|
|
85
|
+
python_version = "3.12"
|
|
86
86
|
warn_return_any = true
|
|
87
87
|
warn_unused_configs = true
|
|
88
88
|
disallow_untyped_defs = false
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|