linktrace 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linktrace-0.1.0/.coverage +0 -0
- linktrace-0.1.0/.github/workflows/publish.yml +31 -0
- linktrace-0.1.0/.gitignore +13 -0
- linktrace-0.1.0/.pre-commit-config.yaml +27 -0
- linktrace-0.1.0/.python-version +1 -0
- linktrace-0.1.0/.vscode/launch.json +15 -0
- linktrace-0.1.0/.vscode/settings.json +17 -0
- linktrace-0.1.0/LICENSE +21 -0
- linktrace-0.1.0/PKG-INFO +390 -0
- linktrace-0.1.0/README.md +367 -0
- linktrace-0.1.0/WebCrawler/Crawler.py +396 -0
- linktrace-0.1.0/WebCrawler/Serializers.py +165 -0
- linktrace-0.1.0/WebCrawler/Spider.py +213 -0
- linktrace-0.1.0/WebCrawler/__init__.py +17 -0
- linktrace-0.1.0/WebCrawler/cache.py +109 -0
- linktrace-0.1.0/WebCrawler/py.typed +0 -0
- linktrace-0.1.0/WebCrawler/robots.py +117 -0
- linktrace-0.1.0/docs/api-reference.md +490 -0
- linktrace-0.1.0/docs/core-concepts.md +282 -0
- linktrace-0.1.0/docs/examples.md +646 -0
- linktrace-0.1.0/docs/getting-started.md +163 -0
- linktrace-0.1.0/docs/troubleshooting.md +413 -0
- linktrace-0.1.0/justfile +42 -0
- linktrace-0.1.0/notebooks/crawl_cnn.ipynb +2132 -0
- linktrace-0.1.0/notebooks/crawl_cnn_callbacks.ipynb +1842 -0
- linktrace-0.1.0/notebooks/crawl_tax_assessor.ipynb +5800 -0
- linktrace-0.1.0/pyproject.toml +94 -0
- linktrace-0.1.0/settings.yaml +31 -0
- linktrace-0.1.0/tests/__init__.py +0 -0
- linktrace-0.1.0/tests/conftest.py +51 -0
- linktrace-0.1.0/tests/test_crawler.py +201 -0
- linktrace-0.1.0/tests/test_models.py +120 -0
- linktrace-0.1.0/tests/test_rate_limiting_and_broken_links.py +245 -0
- linktrace-0.1.0/tests/test_serializers.py +184 -0
- linktrace-0.1.0/tests/test_spider.py +286 -0
- linktrace-0.1.0/uv.lock +1985 -0
|
Binary file
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
deploy:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
permissions:
|
|
11
|
+
id-token: write
|
|
12
|
+
environment: pypi
|
|
13
|
+
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Set up Python
|
|
18
|
+
uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
|
|
22
|
+
- name: Install dependencies
|
|
23
|
+
run: |
|
|
24
|
+
python -m pip install --upgrade pip
|
|
25
|
+
pip install build
|
|
26
|
+
|
|
27
|
+
- name: Build package
|
|
28
|
+
run: python -m build
|
|
29
|
+
|
|
30
|
+
- name: Publish to PyPI
|
|
31
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Run `uv run pre-commit install` once to enable. Hooks run on `git commit`.
|
|
2
|
+
repos:
|
|
3
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
4
|
+
rev: v0.15.16
|
|
5
|
+
hooks:
|
|
6
|
+
# Lint + autofix anything safe (sorts imports, removes unused, etc.)
|
|
7
|
+
- id: ruff
|
|
8
|
+
args: [--fix]
|
|
9
|
+
exclude: "^notebooks/"
|
|
10
|
+
# Format (black-compatible)
|
|
11
|
+
- id: ruff-format
|
|
12
|
+
exclude: "^notebooks/"
|
|
13
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
14
|
+
rev: v1.14.1
|
|
15
|
+
hooks:
|
|
16
|
+
- id: mypy
|
|
17
|
+
args: [--ignore-missing-imports]
|
|
18
|
+
additional_dependencies: [aiohttp, lxml, tldextract, tenacity, aiofiles]
|
|
19
|
+
- repo: local
|
|
20
|
+
hooks:
|
|
21
|
+
- id: pytest
|
|
22
|
+
name: pytest
|
|
23
|
+
entry: uv run pytest -q
|
|
24
|
+
language: system
|
|
25
|
+
types: [python]
|
|
26
|
+
pass_filenames: false
|
|
27
|
+
stages: [commit]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
// Use IntelliSense to learn about possible attributes.
|
|
3
|
+
// Hover to view descriptions of existing attributes.
|
|
4
|
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
|
5
|
+
"version": "0.2.0",
|
|
6
|
+
"configurations": [
|
|
7
|
+
{
|
|
8
|
+
"name": "Python Debugger: Current File",
|
|
9
|
+
"type": "debugpy",
|
|
10
|
+
"request": "launch",
|
|
11
|
+
"program": "${file}",
|
|
12
|
+
"console": "integratedTerminal"
|
|
13
|
+
}
|
|
14
|
+
]
|
|
15
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
|
|
3
|
+
"python.testing.pytestArgs": [
|
|
4
|
+
"tests"
|
|
5
|
+
],
|
|
6
|
+
"python.testing.pytestPath": "${workspaceFolder}/.venv/bin/pytest",
|
|
7
|
+
"python.testing.unittestEnabled": false,
|
|
8
|
+
"python.testing.pytestEnabled": true,
|
|
9
|
+
"chat.tools.terminal.autoApprove": {
|
|
10
|
+
"git add": true,
|
|
11
|
+
"git commit": true,
|
|
12
|
+
"/^python -m pytest tests/ -v --tb=short 2>&1 \\| head -100$/": {
|
|
13
|
+
"approve": true,
|
|
14
|
+
"matchCommandLine": true
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
}
|
linktrace-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jay Baywatch
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
linktrace-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: linktrace
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Async web crawler with rate limiting, robots.txt support, and broken link tracking
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: aiofiles>=23.0
|
|
8
|
+
Requires-Dist: aiohttp>=3.10
|
|
9
|
+
Requires-Dist: lxml>=5.0
|
|
10
|
+
Requires-Dist: tenacity>=8.2.3
|
|
11
|
+
Requires-Dist: tldextract>=5.0
|
|
12
|
+
Provides-Extra: pandas
|
|
13
|
+
Requires-Dist: pandas>=2.0; extra == 'pandas'
|
|
14
|
+
Provides-Extra: polars
|
|
15
|
+
Requires-Dist: polars>=1.0; extra == 'polars'
|
|
16
|
+
Provides-Extra: pyarrow
|
|
17
|
+
Requires-Dist: pyarrow>=14.0; extra == 'pyarrow'
|
|
18
|
+
Provides-Extra: serializers
|
|
19
|
+
Requires-Dist: pandas>=2.0; extra == 'serializers'
|
|
20
|
+
Requires-Dist: polars>=1.0; extra == 'serializers'
|
|
21
|
+
Requires-Dist: pyarrow>=14.0; extra == 'serializers'
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# WebCrawler
|
|
25
|
+
|
|
26
|
+
Lightweight async web crawler for link analysis and HTML document processing.
|
|
27
|
+
|
|
28
|
+
**Perfect for:** Site structure analysis, link tracking, concurrent page fetching, HTML document transformation.
|
|
29
|
+
|
|
30
|
+
**Not:** A replacement for Scrapy. Use this when you need simple, focused crawling with automatic link classification and clean document models.
|
|
31
|
+
|
|
32
|
+
## Key Features
|
|
33
|
+
|
|
34
|
+
- ⚡ **Async/await native** — Built on asyncio + aiohttp for concurrent requests
|
|
35
|
+
- 🔗 **Automatic link classification** — Distinguishes internal vs external links by domain
|
|
36
|
+
- 📄 **Rich document model** — Full HTML source, parsed links, metadata, headers
|
|
37
|
+
- 🔄 **Persistent sessions** — Connection pooling for 10-100x faster same-domain crawls
|
|
38
|
+
- 🔁 **Retries + backoff** — Exponential backoff for transient errors (timeouts, 5xx)
|
|
39
|
+
- ⏱️ **Rate limiting** — Per-domain rate limiting with asyncio.Lock, no thundering herd
|
|
40
|
+
- 🤖 **robots.txt support** — Automatically respect Crawl-delay directives per domain
|
|
41
|
+
- 🔍 **Broken link tracking** — Audit 404s and 5xx errors for site structure validation
|
|
42
|
+
- 💾 **Optional caching** — Disk-based cache (1-day TTL) for repeat crawls
|
|
43
|
+
- 🔐 **SSL verification** — Secure by default, with corporate proxy support
|
|
44
|
+
- 🍪 **Automatic cookies** — Set-Cookie extraction and sending built-in
|
|
45
|
+
- 🔀 **Traversal strategies** — BFS (broad) or DFS (deep) crawling
|
|
46
|
+
- 📊 **Multi-format export** — JSON, Pandas, Polars, PyArrow for data analysis
|
|
47
|
+
- 📍 **Callbacks & streaming** — Process results as crawled without memory buildup
|
|
48
|
+
|
|
49
|
+
## Quick Start
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
import asyncio
|
|
53
|
+
from WebCrawler import Spider
|
|
54
|
+
|
|
55
|
+
async def main():
|
|
56
|
+
spider = Spider(start_url="https://example.com", max_depth=2)
|
|
57
|
+
documents = await spider.run_async()
|
|
58
|
+
|
|
59
|
+
for doc in documents:
|
|
60
|
+
print(f"{doc.url}")
|
|
61
|
+
print(f" Internal links: {len(doc.internal_links)}")
|
|
62
|
+
print(f" External links: {len(doc.external_links)}")
|
|
63
|
+
|
|
64
|
+
asyncio.run(main())
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install linktrace
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
**Optional export formats:**
|
|
74
|
+
```bash
|
|
75
|
+
pip install linktrace[serializers] # pandas + polars + pyarrow
|
|
76
|
+
pip install linktrace[pandas] # Just pandas
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Core Concepts
|
|
80
|
+
|
|
81
|
+
### Spider
|
|
82
|
+
High-level orchestrator that crawls multiple pages using BFS (breadth-first) or DFS (depth-first) traversal.
|
|
83
|
+
|
|
84
|
+
### Crawler
|
|
85
|
+
Low-level engine that fetches and parses individual documents. Handles retries, caching, SSL, cookies, sessions.
|
|
86
|
+
|
|
87
|
+
### Document
|
|
88
|
+
Rich object containing:
|
|
89
|
+
- `url` — page URL
|
|
90
|
+
- `title` — HTML title tag
|
|
91
|
+
- `source` — raw HTML
|
|
92
|
+
- `internal_links` — links to same domain
|
|
93
|
+
- `external_links` — links to other domains
|
|
94
|
+
- `status_code`, `response_headers`, `domain` — metadata
|
|
95
|
+
|
|
96
|
+
See [Core Concepts](docs/core-concepts.md) for more.
|
|
97
|
+
|
|
98
|
+
## Configuration
|
|
99
|
+
|
|
100
|
+
### Basic Crawl
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
spider = Spider(
|
|
104
|
+
start_url="https://example.com",
|
|
105
|
+
max_depth=3, # How deep to follow links
|
|
106
|
+
traversal_strategy="bfs" # "bfs" (default) or "dfs"
|
|
107
|
+
)
|
|
108
|
+
documents = await spider.run_async()
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Retries & Timeouts
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
spider = Spider(
|
|
115
|
+
start_url="https://example.com",
|
|
116
|
+
request_timeout=15, # Seconds per request (default: 30)
|
|
117
|
+
max_retries=5, # Retry transient errors (default: 3)
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Caching
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
spider = Spider(
|
|
125
|
+
start_url="https://example.com",
|
|
126
|
+
cache_dir=".webcrawler_cache" # Enable disk caching (default: None/disabled)
|
|
127
|
+
)
|
|
128
|
+
# 2nd run will be 10-50x faster for same URLs
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### SSL & Corporate Proxies
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
# Default: verify SSL with system CA
|
|
135
|
+
spider = Spider(start_url="https://example.com")
|
|
136
|
+
|
|
137
|
+
# Corporate proxy with custom CA bundle
|
|
138
|
+
spider = Spider(
|
|
139
|
+
start_url="https://example.com",
|
|
140
|
+
ssl_verify="/path/to/corporate-ca.pem"
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
# Self-signed certs (testing only)
|
|
144
|
+
spider = Spider(
|
|
145
|
+
start_url="https://example.com",
|
|
146
|
+
ssl_verify=False # ⚠️ Insecure
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Cookies are handled automatically — no configuration needed.
|
|
151
|
+
|
|
152
|
+
### Callbacks: Process Results in Real-Time
|
|
153
|
+
|
|
154
|
+
For large crawls, avoid memory buildup by processing documents as they're crawled:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
# Stream results to disk
|
|
158
|
+
async def save_result(doc):
|
|
159
|
+
with open("results.jsonl", "a") as f:
|
|
160
|
+
f.write(json.dumps({"url": doc.url, "title": doc.title}) + "\n")
|
|
161
|
+
|
|
162
|
+
spider = Spider(
|
|
163
|
+
start_url="https://example.com",
|
|
164
|
+
on_page_crawled=save_result,
|
|
165
|
+
accumulate_results=False, # Don't keep in memory
|
|
166
|
+
)
|
|
167
|
+
await spider.run_async() # Returns [], file has results
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
**Callback Hooks:**
|
|
171
|
+
- `on_page_crawled(doc)` — Called after each successful crawl. Its return value is accumulated if `accumulate_results=True`
|
|
172
|
+
- `on_error(url, exc)` — Called on crawl failures
|
|
173
|
+
- `on_crawl_complete()` — Called when crawl finishes (cleanup hook)
|
|
174
|
+
|
|
175
|
+
**Async Callbacks Supported:**
|
|
176
|
+
```python
|
|
177
|
+
async def save_to_db(doc):
|
|
178
|
+
await db.insert(doc.url, doc.title)
|
|
179
|
+
return doc.url
|
|
180
|
+
|
|
181
|
+
spider = Spider(
|
|
182
|
+
start_url="https://example.com",
|
|
183
|
+
on_page_crawled=save_to_db, # Async callback
|
|
184
|
+
accumulate_results=True,
|
|
185
|
+
)
|
|
186
|
+
results = await spider.run_async() # Returns list of URLs
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**Return Logic:**
|
|
190
|
+
- No callback → returns all documents (default)
|
|
191
|
+
- Callback + `accumulate_results=False` → returns [] (streaming mode)
|
|
192
|
+
- Callback + `accumulate_results=True` → returns callback results
|
|
193
|
+
|
|
194
|
+
### Traversal Strategies
|
|
195
|
+
|
|
196
|
+
**BFS (Breadth-First) — Default**
|
|
197
|
+
```python
|
|
198
|
+
# Explores level by level: all depth-1 links, then depth-2, etc.
|
|
199
|
+
spider = Spider(start_url="https://example.com", max_depth=3, traversal_strategy="bfs")
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
**DFS (Depth-First)**
|
|
203
|
+
```python
|
|
204
|
+
# Follows single paths all the way down before exploring siblings
|
|
205
|
+
spider = Spider(start_url="https://example.com", max_depth=5, traversal_strategy="dfs")
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Use DFS for deep hierarchies (documentation sites, nested directories). Use BFS for broad exploration.
|
|
209
|
+
|
|
210
|
+
### Rate Limiting & robots.txt
|
|
211
|
+
|
|
212
|
+
By default, WebCrawler automatically respects robots.txt `Crawl-delay` directives and enforces per-domain rate limiting:
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
# Automatic robots.txt respect (default)
|
|
216
|
+
spider = Spider(
|
|
217
|
+
start_url="https://example.com",
|
|
218
|
+
user_agent="MyBot/1.0", # Identifies your bot to robots.txt rules
|
|
219
|
+
)
|
|
220
|
+
await spider.run_async()
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
Customize rate limiting:
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
# Enforce explicit delay (ignores robots.txt)
|
|
227
|
+
spider = Spider(
|
|
228
|
+
start_url="https://example.com",
|
|
229
|
+
request_delay=1.0, # 1 second between requests to same domain
|
|
230
|
+
respect_robots_txt=False, # Don't fetch robots.txt
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
# Concurrent requests to different domains, serialized to same domain
|
|
234
|
+
await spider.run_async()
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### Broken Link Audit
|
|
238
|
+
|
|
239
|
+
Track 404s and 5xx errors for site maintenance:
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
spider = Spider(start_url="https://example.com", max_depth=2)
|
|
243
|
+
documents = await spider.run_async()
|
|
244
|
+
|
|
245
|
+
for doc in documents:
|
|
246
|
+
# Broken internal links (fix these first!)
|
|
247
|
+
for broken in doc.broken_internal_links:
|
|
248
|
+
print(f"{doc.url} → {broken.url} (HTTP {broken.status_code})")
|
|
249
|
+
|
|
250
|
+
# Broken external links (check if still valid)
|
|
251
|
+
for broken in doc.broken_external_links:
|
|
252
|
+
print(f"External: {broken.url} (HTTP {broken.status_code})")
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
Stream broken links in real-time:
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
async def audit_broken(doc):
|
|
259
|
+
broken_count = len(doc.broken_internal_links) + len(doc.broken_external_links)
|
|
260
|
+
if broken_count > 0:
|
|
261
|
+
print(f"{doc.url}: {broken_count} broken links")
|
|
262
|
+
|
|
263
|
+
spider = Spider(
|
|
264
|
+
start_url="https://example.com",
|
|
265
|
+
on_page_crawled=audit_broken,
|
|
266
|
+
accumulate_results=False,
|
|
267
|
+
)
|
|
268
|
+
await spider.run_async()
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### Export Data
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
from WebCrawler import Spider, Serializers
|
|
275
|
+
|
|
276
|
+
spider = Spider(start_url="https://example.com", max_depth=2)
|
|
277
|
+
documents = await spider.run_async()
|
|
278
|
+
|
|
279
|
+
# Export to JSON
|
|
280
|
+
serializer = Serializers(documents)
|
|
281
|
+
serializer.to_json("crawl.json", include_html=False)
|
|
282
|
+
|
|
283
|
+
# Export to Pandas (one row per link)
|
|
284
|
+
df = serializer.to_pandas()
|
|
285
|
+
print(df[["url", "title", "link_url", "link_type"]])
|
|
286
|
+
|
|
287
|
+
# Export to Polars (faster for large datasets)
|
|
288
|
+
df_polars = serializer.to_polars()
|
|
289
|
+
|
|
290
|
+
# Export to PyArrow (for data pipelines)
|
|
291
|
+
table = serializer.to_arrow()
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### Link Analysis
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
from collections import Counter
|
|
298
|
+
|
|
299
|
+
spider = Spider(start_url="https://example.com", max_depth=2)
|
|
300
|
+
documents = await spider.run_async()
|
|
301
|
+
|
|
302
|
+
# Count external domains
|
|
303
|
+
external_domains = Counter()
|
|
304
|
+
for doc in documents:
|
|
305
|
+
for link in doc.external_links:
|
|
306
|
+
domain = link.url.split("/")[2]
|
|
307
|
+
external_domains[domain] += 1
|
|
308
|
+
|
|
309
|
+
print(external_domains.most_common(10))
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
See [Examples](docs/examples.md) for more patterns.
|
|
313
|
+
|
|
314
|
+
## Notebooks
|
|
315
|
+
|
|
316
|
+
Interactive examples in `notebooks/`:
|
|
317
|
+
- `crawl_cnn.ipynb` — Crawls CNN.com, analyzes link structure, demonstrates all export formats
|
|
318
|
+
|
|
319
|
+
## API Reference
|
|
320
|
+
|
|
321
|
+
See [API Reference](docs/api-reference.md) for complete method documentation.
|
|
322
|
+
|
|
323
|
+
## Troubleshooting
|
|
324
|
+
|
|
325
|
+
### "SSL: CERTIFICATE_VERIFY_FAILED"
|
|
326
|
+
Use `ssl_verify=False` for self-signed certs (testing only), or `ssl_verify="/path/to/ca.pem"` for corporate proxies.
|
|
327
|
+
|
|
328
|
+
### "Too many connections"
|
|
329
|
+
Space out requests to the same domain with `request_delay`, lower `max_retries` to cut repeat attempts, or increase `request_timeout`. Default settings are conservative.
|
|
330
|
+
|
|
331
|
+
### "Crawler hits timeout on deep sites"
|
|
332
|
+
Try DFS traversal instead of BFS, or increase `request_timeout`.
|
|
333
|
+
|
|
334
|
+
See [Troubleshooting](docs/troubleshooting.md) for more.
|
|
335
|
+
|
|
336
|
+
## Performance
|
|
337
|
+
|
|
338
|
+
Typical performance (single-domain crawl):
|
|
339
|
+
- **First run:** ~50-500ms per page (network-bound)
|
|
340
|
+
- **Cached run:** ~1-10ms per page (2-50x faster)
|
|
341
|
+
- **Memory:** ~1MB per 100 pages
|
|
342
|
+
|
|
343
|
+
With persistent sessions + connection pooling, same-domain requests are 10-100x faster than per-request session setup.
|
|
344
|
+
|
|
345
|
+
## Architecture
|
|
346
|
+
|
|
347
|
+
```
|
|
348
|
+
Spider (orchestrator)
|
|
349
|
+
└─ Crawler (persistent session)
|
|
350
|
+
├─ aiohttp (HTTP requests + connection pooling)
|
|
351
|
+
├─ lxml (HTML parsing)
|
|
352
|
+
├─ ResponseCache (optional disk caching)
|
|
353
|
+
└─ CookieJar (automatic cookie handling)
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance.
|
|
357
|
+
|
|
358
|
+
## Why WebCrawler?
|
|
359
|
+
|
|
360
|
+
**vs Scrapy:** Lightweight, focused, simpler API for link analysis. Scrapy is better for complex extraction pipelines.
|
|
361
|
+
|
|
362
|
+
**vs requests + BeautifulSoup:** Built-in async concurrency, automatic session reuse, retries, caching. Better for crawling multiple pages.
|
|
363
|
+
|
|
364
|
+
**vs Selenium:** Pure HTTP crawler (no JS execution). Faster, lighter, but can't handle dynamic sites.
|
|
365
|
+
|
|
366
|
+
## Testing
|
|
367
|
+
|
|
368
|
+
```bash
|
|
369
|
+
just test # Run all tests
|
|
370
|
+
just test-cov # Run with coverage report
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
All 91 tests pass. 100% of core crawling paths tested (rate limiting, broken link tracking, robots.txt, callbacks).
|
|
374
|
+
|
|
375
|
+
## Contributing
|
|
376
|
+
|
|
377
|
+
Bug reports and pull requests welcome on GitHub.
|
|
378
|
+
|
|
379
|
+
## License
|
|
380
|
+
|
|
381
|
+
MIT
|
|
382
|
+
|
|
383
|
+
---
|
|
384
|
+
|
|
385
|
+
**Documentation:**
|
|
386
|
+
- [Getting Started](docs/getting-started.md)
|
|
387
|
+
- [Core Concepts](docs/core-concepts.md)
|
|
388
|
+
- [API Reference](docs/api-reference.md)
|
|
389
|
+
- [Examples](docs/examples.md)
|
|
390
|
+
- [Troubleshooting](docs/troubleshooting.md)
|