linktrace 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. linktrace-0.1.0/.coverage +0 -0
  2. linktrace-0.1.0/.github/workflows/publish.yml +31 -0
  3. linktrace-0.1.0/.gitignore +13 -0
  4. linktrace-0.1.0/.pre-commit-config.yaml +27 -0
  5. linktrace-0.1.0/.python-version +1 -0
  6. linktrace-0.1.0/.vscode/launch.json +15 -0
  7. linktrace-0.1.0/.vscode/settings.json +17 -0
  8. linktrace-0.1.0/LICENSE +21 -0
  9. linktrace-0.1.0/PKG-INFO +390 -0
  10. linktrace-0.1.0/README.md +367 -0
  11. linktrace-0.1.0/WebCrawler/Crawler.py +396 -0
  12. linktrace-0.1.0/WebCrawler/Serializers.py +165 -0
  13. linktrace-0.1.0/WebCrawler/Spider.py +213 -0
  14. linktrace-0.1.0/WebCrawler/__init__.py +17 -0
  15. linktrace-0.1.0/WebCrawler/cache.py +109 -0
  16. linktrace-0.1.0/WebCrawler/py.typed +0 -0
  17. linktrace-0.1.0/WebCrawler/robots.py +117 -0
  18. linktrace-0.1.0/docs/api-reference.md +490 -0
  19. linktrace-0.1.0/docs/core-concepts.md +282 -0
  20. linktrace-0.1.0/docs/examples.md +646 -0
  21. linktrace-0.1.0/docs/getting-started.md +163 -0
  22. linktrace-0.1.0/docs/troubleshooting.md +413 -0
  23. linktrace-0.1.0/justfile +42 -0
  24. linktrace-0.1.0/notebooks/crawl_cnn.ipynb +2132 -0
  25. linktrace-0.1.0/notebooks/crawl_cnn_callbacks.ipynb +1842 -0
  26. linktrace-0.1.0/notebooks/crawl_tax_assessor.ipynb +5800 -0
  27. linktrace-0.1.0/pyproject.toml +94 -0
  28. linktrace-0.1.0/settings.yaml +31 -0
  29. linktrace-0.1.0/tests/__init__.py +0 -0
  30. linktrace-0.1.0/tests/conftest.py +51 -0
  31. linktrace-0.1.0/tests/test_crawler.py +201 -0
  32. linktrace-0.1.0/tests/test_models.py +120 -0
  33. linktrace-0.1.0/tests/test_rate_limiting_and_broken_links.py +245 -0
  34. linktrace-0.1.0/tests/test_serializers.py +184 -0
  35. linktrace-0.1.0/tests/test_spider.py +286 -0
  36. linktrace-0.1.0/uv.lock +1985 -0
Binary file
@@ -0,0 +1,31 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ deploy:
9
+ runs-on: ubuntu-latest
10
+ permissions:
11
+ id-token: write
12
+ environment: pypi
13
+
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.12"
21
+
22
+ - name: Install dependencies
23
+ run: |
24
+ python -m pip install --upgrade pip
25
+ pip install build
26
+
27
+ - name: Build package
28
+ run: python -m build
29
+
30
+ - name: Publish to PyPI
31
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,13 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .ruff_cache/
5
+ *.log
6
+ dist/
7
+ .ipynb_checkpoints/
8
+ *.tmp
9
+ .claude_cache/
10
+ .assessor_cache/
11
+ .claude/
12
+ .mypy_cache/
13
+ .pytest_cache/
@@ -0,0 +1,27 @@
1
+ # Run `uv run pre-commit install` once to enable. Hooks run on `git commit`.
2
+ repos:
3
+ - repo: https://github.com/astral-sh/ruff-pre-commit
4
+ rev: v0.15.16
5
+ hooks:
6
+ # Lint + autofix anything safe (sorts imports, removes unused, etc.)
7
+ - id: ruff
8
+ args: [--fix]
9
+ exclude: "^notebooks/"
10
+ # Format (black-compatible)
11
+ - id: ruff-format
12
+ exclude: "^notebooks/"
13
+ - repo: https://github.com/pre-commit/mirrors-mypy
14
+ rev: v1.14.1
15
+ hooks:
16
+ - id: mypy
17
+ args: [--ignore-missing-imports]
18
+ additional_dependencies: [aiohttp, lxml, tldextract, tenacity, aiofiles]
19
+ - repo: local
20
+ hooks:
21
+ - id: pytest
22
+ name: pytest
23
+ entry: uv run pytest -q
24
+ language: system
25
+ types: [python]
26
+ pass_filenames: false
27
+ stages: [commit]
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,15 @@
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Python Debugger: Current File",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "program": "${file}",
12
+ "console": "integratedTerminal"
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,17 @@
1
+ {
2
+ "python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
3
+ "python.testing.pytestArgs": [
4
+ "tests"
5
+ ],
6
+ "python.testing.pytestPath": "${workspaceFolder}/.venv/bin/pytest",
7
+ "python.testing.unittestEnabled": false,
8
+ "python.testing.pytestEnabled": true,
9
+ "chat.tools.terminal.autoApprove": {
10
+ "git add": true,
11
+ "git commit": true,
12
+ "/^python -m pytest tests/ -v --tb=short 2>&1 \\| head -100$/": {
13
+ "approve": true,
14
+ "matchCommandLine": true
15
+ }
16
+ }
17
+ }
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jay Baywatch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,390 @@
1
+ Metadata-Version: 2.4
2
+ Name: linktrace
3
+ Version: 0.1.0
4
+ Summary: Async web crawler with rate limiting, robots.txt support, and broken link tracking
5
+ License-File: LICENSE
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: aiofiles>=23.0
8
+ Requires-Dist: aiohttp>=3.10
9
+ Requires-Dist: lxml>=5.0
10
+ Requires-Dist: tenacity>=8.2.3
11
+ Requires-Dist: tldextract>=5.0
12
+ Provides-Extra: pandas
13
+ Requires-Dist: pandas>=2.0; extra == 'pandas'
14
+ Provides-Extra: polars
15
+ Requires-Dist: polars>=1.0; extra == 'polars'
16
+ Provides-Extra: pyarrow
17
+ Requires-Dist: pyarrow>=14.0; extra == 'pyarrow'
18
+ Provides-Extra: serializers
19
+ Requires-Dist: pandas>=2.0; extra == 'serializers'
20
+ Requires-Dist: polars>=1.0; extra == 'serializers'
21
+ Requires-Dist: pyarrow>=14.0; extra == 'serializers'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # WebCrawler
25
+
26
+ Lightweight async web crawler for link analysis and HTML document processing.
27
+
28
+ **Perfect for:** Site structure analysis, link tracking, concurrent page fetching, HTML document transformation.
29
+
30
+ **Not:** A replacement for Scrapy. Use this when you need simple, focused crawling with automatic link classification and clean document models.
31
+
32
+ ## Key Features
33
+
34
+ - ⚡ **Async/await native** — Built on asyncio + aiohttp for concurrent requests
35
+ - 🔗 **Automatic link classification** — Distinguishes internal vs external links by domain
36
+ - 📄 **Rich document model** — Full HTML source, parsed links, metadata, headers
37
+ - 🔄 **Persistent sessions** — Connection pooling for 10-100x faster same-domain crawls
38
+ - 🔁 **Retries + backoff** — Exponential backoff for transient errors (timeouts, 5xx)
39
+ - ⏱️ **Rate limiting** — Per-domain rate limiting with asyncio.Lock, no thundering herd
40
+ - 🤖 **robots.txt support** — Automatically respect Crawl-delay directives per domain
41
+ - 🔍 **Broken link tracking** — Audit 404s and 5xx errors for site structure validation
42
+ - 💾 **Optional caching** — Disk-based cache (1-day TTL) for repeat crawls
43
+ - 🔐 **SSL verification** — Secure by default, with corporate proxy support
44
+ - 🍪 **Automatic cookies** — Set-Cookie extraction and sending built-in
45
+ - 🔀 **Traversal strategies** — BFS (broad) or DFS (deep) crawling
46
+ - 📊 **Multi-format export** — JSON, Pandas, Polars, PyArrow for data analysis
47
+ - 📍 **Callbacks & streaming** — Process results as crawled without memory buildup
48
+
49
+ ## Quick Start
50
+
51
+ ```python
52
+ import asyncio
53
+ from WebCrawler import Spider
54
+
55
+ async def main():
56
+ spider = Spider(start_url="https://example.com", max_depth=2)
57
+ documents = await spider.run_async()
58
+
59
+ for doc in documents:
60
+ print(f"{doc.url}")
61
+ print(f" Internal links: {len(doc.internal_links)}")
62
+ print(f" External links: {len(doc.external_links)}")
63
+
64
+ asyncio.run(main())
65
+ ```
66
+
67
+ ## Installation
68
+
69
+ ```bash
70
+ pip install linktrace
71
+ ```
72
+
73
+ **Optional export formats:**
74
+ ```bash
75
+ pip install linktrace[serializers] # pandas + polars + pyarrow
76
+ pip install linktrace[pandas] # Just pandas
77
+ ```
78
+
79
+ ## Core Concepts
80
+
81
+ ### Spider
82
+ High-level orchestrator that crawls multiple pages using BFS (breadth-first) or DFS (depth-first) traversal.
83
+
84
+ ### Crawler
85
+ Low-level engine that fetches and parses individual documents. Handles retries, caching, SSL, cookies, sessions.
86
+
87
+ ### Document
88
+ Rich object containing:
89
+ - `url` — page URL
90
+ - `title` — HTML title tag
91
+ - `source` — raw HTML
92
+ - `internal_links` — links to same domain
93
+ - `external_links` — links to other domains
94
+ - `status_code`, `response_headers`, `domain` — metadata
95
+
96
+ See [Core Concepts](docs/core-concepts.md) for more.
97
+
98
+ ## Configuration
99
+
100
+ ### Basic Crawl
101
+
102
+ ```python
103
+ spider = Spider(
104
+ start_url="https://example.com",
105
+ max_depth=3, # How deep to follow links
106
+ traversal_strategy="bfs" # "bfs" (default) or "dfs"
107
+ )
108
+ documents = await spider.run_async()
109
+ ```
110
+
111
+ ### Retries & Timeouts
112
+
113
+ ```python
114
+ spider = Spider(
115
+ start_url="https://example.com",
116
+ request_timeout=15, # Seconds per request (default: 30)
117
+ max_retries=5, # Retry transient errors (default: 3)
118
+ )
119
+ ```
120
+
121
+ ### Caching
122
+
123
+ ```python
124
+ spider = Spider(
125
+ start_url="https://example.com",
126
+ cache_dir=".webcrawler_cache" # Enable disk caching (default: None/disabled)
127
+ )
128
+ # 2nd run will be 10-50x faster for same URLs
129
+ ```
130
+
131
+ ### SSL & Corporate Proxies
132
+
133
+ ```python
134
+ # Default: verify SSL with system CA
135
+ spider = Spider(start_url="https://example.com")
136
+
137
+ # Corporate proxy with custom CA bundle
138
+ spider = Spider(
139
+ start_url="https://example.com",
140
+ ssl_verify="/path/to/corporate-ca.pem"
141
+ )
142
+
143
+ # Self-signed certs (testing only)
144
+ spider = Spider(
145
+ start_url="https://example.com",
146
+ ssl_verify=False # ⚠️ Insecure
147
+ )
148
+ ```
149
+
150
+ Cookies are handled automatically — no configuration needed.
151
+
152
+ ### Callbacks: Process Results in Real-Time
153
+
154
+ For large crawls, avoid memory buildup by processing documents as they're crawled:
155
+
156
+ ```python
157
+ # Stream results to disk
158
+ async def save_result(doc):
159
+ with open("results.jsonl", "a") as f:
160
+ f.write(json.dumps({"url": doc.url, "title": doc.title}) + "\n")
161
+
162
+ spider = Spider(
163
+ start_url="https://example.com",
164
+ on_page_crawled=save_result,
165
+ accumulate_results=False, # Don't keep in memory
166
+ )
167
+ await spider.run_async() # Returns [], file has results
168
+ ```
169
+
170
+ **Callback Hooks:**
171
+ - `on_page_crawled(doc)` — Called after each successful crawl. Its return value is accumulated when `accumulate_results=True`
172
+ - `on_error(url, exc)` — Called on crawl failures
173
+ - `on_crawl_complete()` — Called when crawl finishes (cleanup hook)
174
+
175
+ **Async Callbacks Supported:**
176
+ ```python
177
+ async def save_to_db(doc):
178
+ await db.insert(doc.url, doc.title)
179
+ return doc.url
180
+
181
+ spider = Spider(
182
+ start_url="https://example.com",
183
+ on_page_crawled=save_to_db, # Async callback
184
+ accumulate_results=True,
185
+ )
186
+ results = await spider.run_async() # Returns list of URLs
187
+ ```
188
+
189
+ **Return Logic:**
190
+ - No callback → returns all documents (default)
191
+ - Callback + `accumulate_results=False` → returns [] (streaming mode)
192
+ - Callback + `accumulate_results=True` → returns callback results
193
+
194
+ ### Traversal Strategies
195
+
196
+ **BFS (Breadth-First) — Default**
197
+ ```python
198
+ # Explores level by level: all depth-1 links, then depth-2, etc.
199
+ spider = Spider(start_url="https://example.com", max_depth=3, traversal_strategy="bfs")
200
+ ```
201
+
202
+ **DFS (Depth-First)**
203
+ ```python
204
+ # Follows single paths all the way down before exploring siblings
205
+ spider = Spider(start_url="https://example.com", max_depth=5, traversal_strategy="dfs")
206
+ ```
207
+
208
+ Use DFS for deep hierarchies (documentation sites, nested directories). Use BFS for broad exploration.
209
+
210
+ ### Rate Limiting & robots.txt
211
+
212
+ By default, WebCrawler automatically respects robots.txt `Crawl-delay` directives and enforces per-domain rate limiting:
213
+
214
+ ```python
215
+ # Automatic robots.txt respect (default)
216
+ spider = Spider(
217
+ start_url="https://example.com",
218
+ user_agent="MyBot/1.0", # Identifies your bot to robots.txt rules
219
+ )
220
+ await spider.run_async()
221
+ ```
222
+
223
+ Customize rate limiting:
224
+
225
+ ```python
226
+ # Enforce explicit delay (ignores robots.txt)
227
+ spider = Spider(
228
+ start_url="https://example.com",
229
+ request_delay=1.0, # 1 second between requests to same domain
230
+ respect_robots_txt=False, # Don't fetch robots.txt
231
+ )
232
+
233
+ # Concurrent requests to different domains, serialized to same domain
234
+ await spider.run_async()
235
+ ```
236
+
237
+ ### Broken Link Audit
238
+
239
+ Track 404s and 5xx errors for site maintenance:
240
+
241
+ ```python
242
+ spider = Spider(start_url="https://example.com", max_depth=2)
243
+ documents = await spider.run_async()
244
+
245
+ for doc in documents:
246
+ # Broken internal links (fix these first!)
247
+ for broken in doc.broken_internal_links:
248
+ print(f"{doc.url} → {broken.url} (HTTP {broken.status_code})")
249
+
250
+ # Broken external links (check if still valid)
251
+ for broken in doc.broken_external_links:
252
+ print(f"External: {broken.url} (HTTP {broken.status_code})")
253
+ ```
254
+
255
+ Stream broken links in real-time:
256
+
257
+ ```python
258
+ async def audit_broken(doc):
259
+ broken_count = len(doc.broken_internal_links) + len(doc.broken_external_links)
260
+ if broken_count > 0:
261
+ print(f"{doc.url}: {broken_count} broken links")
262
+
263
+ spider = Spider(
264
+ start_url="https://example.com",
265
+ on_page_crawled=audit_broken,
266
+ accumulate_results=False,
267
+ )
268
+ await spider.run_async()
269
+ ```
270
+
271
+ ### Export Data
272
+
273
+ ```python
274
+ from WebCrawler import Spider, Serializers
275
+
276
+ spider = Spider(start_url="https://example.com", max_depth=2)
277
+ documents = await spider.run_async()
278
+
279
+ # Export to JSON
280
+ serializer = Serializers(documents)
281
+ serializer.to_json("crawl.json", include_html=False)
282
+
283
+ # Export to Pandas (one row per link)
284
+ df = serializer.to_pandas()
285
+ print(df[["url", "title", "link_url", "link_type"]])
286
+
287
+ # Export to Polars (faster for large datasets)
288
+ df_polars = serializer.to_polars()
289
+
290
+ # Export to PyArrow (for data pipelines)
291
+ table = serializer.to_arrow()
292
+ ```
293
+
294
+ ### Link Analysis
295
+
296
+ ```python
297
+ from collections import Counter
298
+
299
+ spider = Spider(start_url="https://example.com", max_depth=2)
300
+ documents = await spider.run_async()
301
+
302
+ # Count external domains
303
+ external_domains = Counter()
304
+ for doc in documents:
305
+ for link in doc.external_links:
306
+ domain = link.url.split("/")[2]
307
+ external_domains[domain] += 1
308
+
309
+ print(external_domains.most_common(10))
310
+ ```
311
+
312
+ See [Examples](docs/examples.md) for more patterns.
313
+
314
+ ## Notebooks
315
+
316
+ Interactive examples in `notebooks/`:
317
+ - `crawl_cnn.ipynb` — Crawls CNN.com, analyzes link structure, demonstrates all export formats
318
+
319
+ ## API Reference
320
+
321
+ See [API Reference](docs/api-reference.md) for complete method documentation.
322
+
323
+ ## Troubleshooting
324
+
325
+ ### "SSL: CERTIFICATE_VERIFY_FAILED"
326
+ Use `ssl_verify=False` for self-signed certs (testing only), or `ssl_verify="/path/to/ca.pem"` for corporate proxies.
327
+
328
+ ### "Too many connections"
329
+ Reduce request volume by lowering `max_retries` or setting a larger `request_delay`, or increase `request_timeout`. Default settings are conservative.
330
+
331
+ ### "Crawler hits timeout on deep sites"
332
+ Try DFS traversal instead of BFS, or increase `request_timeout`.
333
+
334
+ See [Troubleshooting](docs/troubleshooting.md) for more.
335
+
336
+ ## Performance
337
+
338
+ Typical performance (single-domain crawl):
339
+ - **First run:** ~50-500ms per page (network-bound)
340
+ - **Cached run:** ~1-10ms per page (10-50x faster)
341
+ - **Memory:** ~1MB per 100 pages
342
+
343
+ With persistent sessions + connection pooling, same-domain requests are 10-100x faster than per-request session setup.
344
+
345
+ ## Architecture
346
+
347
+ ```
348
+ Spider (orchestrator)
349
+ └─ Crawler (persistent session)
350
+ ├─ aiohttp (HTTP requests + connection pooling)
351
+ ├─ lxml (HTML parsing)
352
+ ├─ ResponseCache (optional disk caching)
353
+ └─ CookieJar (automatic cookie handling)
354
+ ```
355
+
356
+ Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance.
357
+
358
+ ## Why WebCrawler?
359
+
360
+ **vs Scrapy:** Lightweight, focused, simpler API for link analysis. Scrapy is better for complex extraction pipelines.
361
+
362
+ **vs requests + BeautifulSoup:** Built-in async concurrency, automatic session reuse, retries, caching. Better for crawling multiple pages.
363
+
364
+ **vs Selenium:** Pure HTTP crawler (no JS execution). Faster, lighter, but can't handle dynamic sites.
365
+
366
+ ## Testing
367
+
368
+ ```bash
369
+ just test # Run all tests
370
+ just test-cov # Run with coverage report
371
+ ```
372
+
373
+ All 91 tests pass. 100% of core crawling paths tested (rate limiting, broken link tracking, robots.txt, callbacks).
374
+
375
+ ## Contributing
376
+
377
+ Bug reports and pull requests welcome on GitHub.
378
+
379
+ ## License
380
+
381
+ MIT
382
+
383
+ ---
384
+
385
+ **Documentation:**
386
+ - [Getting Started](docs/getting-started.md)
387
+ - [Core Concepts](docs/core-concepts.md)
388
+ - [API Reference](docs/api-reference.md)
389
+ - [Examples](docs/examples.md)
390
+ - [Troubleshooting](docs/troubleshooting.md)