linktrace 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {linktrace-0.2.0 → linktrace-0.2.1}/PKG-INFO +15 -5
- {linktrace-0.2.0 → linktrace-0.2.1}/README.md +14 -4
- {linktrace-0.2.0 → linktrace-0.2.1}/pyproject.toml +3 -3
- {linktrace-0.2.0 → linktrace-0.2.1}/uv.lock +1 -1
- {linktrace-0.2.0 → linktrace-0.2.1}/.coverage +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/.github/workflows/publish.yml +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/.gitignore +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/.pre-commit-config.yaml +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/.python-version +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/.vscode/launch.json +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/.vscode/settings.json +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/LICENSE +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/docs/api-reference.md +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/docs/core-concepts.md +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/docs/examples.md +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/docs/getting-started.md +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/docs/troubleshooting.md +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/justfile +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/Crawler.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/Serializers.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/Spider.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/__init__.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/cache.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/py.typed +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/robots.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/notebooks/crawl_cnn.ipynb +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/notebooks/crawl_cnn_callbacks.ipynb +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/notebooks/crawl_tax_assessor.ipynb +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/settings.yaml +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/tests/__init__.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/tests/conftest.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_crawler.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_models.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_rate_limiting_and_broken_links.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_serializers.py +0 -0
- {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_spider.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: linktrace
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Async web crawler with rate limiting, robots.txt support, and broken link tracking
|
|
5
5
|
License-File: LICENSE
|
|
6
6
|
Requires-Python: >=3.12
|
|
@@ -123,7 +123,7 @@ spider = Spider(
|
|
|
123
123
|
```python
|
|
124
124
|
spider = Spider(
|
|
125
125
|
start_url="https://example.com",
|
|
126
|
-
cache_dir=".
|
|
126
|
+
cache_dir=".linktrace_cache" # Enable disk caching (default: None/disabled)
|
|
127
127
|
)
|
|
128
128
|
# 2nd run will be 10-50x faster for same URLs
|
|
129
129
|
```
|
|
@@ -352,13 +352,23 @@ Spider (orchestrator)
|
|
|
352
352
|
└─ CookieJar (automatic cookie handling)
|
|
353
353
|
```
|
|
354
354
|
|
|
355
|
-
Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance.
|
|
355
|
+
Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance, so connection pooling, cookies, SSL configuration, and DNS caching are reused across the crawl.
|
|
356
356
|
|
|
357
357
|
## Why linktrace?
|
|
358
358
|
|
|
359
|
-
|
|
359
|
+
Scrapy is an excellent full crawling and extraction framework. `linktrace` is designed for a narrower job: fast async link analysis with minimal setup.
|
|
360
360
|
|
|
361
|
-
|
|
361
|
+
Instead of building a Scrapy project around spiders, requests, responses, callbacks, items, pipelines, middleware, and settings, `linktrace` gives you a direct document-centric API. Each crawled URL becomes a `Document` object containing the page source, title, status code, response headers, domain, internal links, external links, and crawl status metadata.
|
|
362
|
+
|
|
363
|
+
That makes `linktrace` useful when your goal is to inspect site structure, trace links, audit crawl status, or export crawl results to dataframe-oriented tools without creating a larger scraping project.
|
|
364
|
+
|
|
365
|
+
`linktrace` also reuses a persistent `aiohttp` session during a crawl. Connection pooling, cookie reuse, SSL configuration, request timeouts, per-host limits, and DNS caching are carried across requests, which can make repeated same-domain crawls much faster than creating a fresh client/session per URL.
|
|
366
|
+
|
|
367
|
+
**Use Scrapy when:** you need a mature scraping framework with item pipelines, middleware, schedulers, broad ecosystem support, and complex extraction workflows.
|
|
368
|
+
|
|
369
|
+
**Use linktrace when:** you want a focused async crawler that turns URLs into analyzable `Document` objects with automatic link classification and simple exports.
|
|
370
|
+
|
|
371
|
+
**vs requests + BeautifulSoup:** Built-in async concurrency, automatic session reuse, retries, caching, rate limiting, and structured document objects. Better for crawling multiple pages.
|
|
362
372
|
|
|
363
373
|
**vs Selenium:** Pure HTTP crawler (no JS execution). Faster, lighter, but can't handle dynamic sites.
|
|
364
374
|
|
|
@@ -100,7 +100,7 @@ spider = Spider(
|
|
|
100
100
|
```python
|
|
101
101
|
spider = Spider(
|
|
102
102
|
start_url="https://example.com",
|
|
103
|
-
cache_dir=".
|
|
103
|
+
cache_dir=".linktrace_cache" # Enable disk caching (default: None/disabled)
|
|
104
104
|
)
|
|
105
105
|
# 2nd run will be 10-50x faster for same URLs
|
|
106
106
|
```
|
|
@@ -329,13 +329,23 @@ Spider (orchestrator)
|
|
|
329
329
|
└─ CookieJar (automatic cookie handling)
|
|
330
330
|
```
|
|
331
331
|
|
|
332
|
-
Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance.
|
|
332
|
+
Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance, so connection pooling, cookies, SSL configuration, and DNS caching are reused across the crawl.
|
|
333
333
|
|
|
334
334
|
## Why linktrace?
|
|
335
335
|
|
|
336
|
-
|
|
336
|
+
Scrapy is an excellent full crawling and extraction framework. `linktrace` is designed for a narrower job: fast async link analysis with minimal setup.
|
|
337
337
|
|
|
338
|
-
|
|
338
|
+
Instead of building a Scrapy project around spiders, requests, responses, callbacks, items, pipelines, middleware, and settings, `linktrace` gives you a direct document-centric API. Each crawled URL becomes a `Document` object containing the page source, title, status code, response headers, domain, internal links, external links, and crawl status metadata.
|
|
339
|
+
|
|
340
|
+
That makes `linktrace` useful when your goal is to inspect site structure, trace links, audit crawl status, or export crawl results to dataframe-oriented tools without creating a larger scraping project.
|
|
341
|
+
|
|
342
|
+
`linktrace` also reuses a persistent `aiohttp` session during a crawl. Connection pooling, cookie reuse, SSL configuration, request timeouts, per-host limits, and DNS caching are carried across requests, which can make repeated same-domain crawls much faster than creating a fresh client/session per URL.
|
|
343
|
+
|
|
344
|
+
**Use Scrapy when:** you need a mature scraping framework with item pipelines, middleware, schedulers, broad ecosystem support, and complex extraction workflows.
|
|
345
|
+
|
|
346
|
+
**Use linktrace when:** you want a focused async crawler that turns URLs into analyzable `Document` objects with automatic link classification and simple exports.
|
|
347
|
+
|
|
348
|
+
**vs requests + BeautifulSoup:** Built-in async concurrency, automatic session reuse, retries, caching, rate limiting, and structured document objects. Better for crawling multiple pages.
|
|
339
349
|
|
|
340
350
|
**vs Selenium:** Pure HTTP crawler (no JS execution). Faster, lighter, but can't handle dynamic sites.
|
|
341
351
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "linktrace"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.1"
|
|
8
8
|
description = "Async web crawler with rate limiting, robots.txt support, and broken link tracking"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -44,7 +44,7 @@ dev = [
|
|
|
44
44
|
packages = ["linktrace"]
|
|
45
45
|
|
|
46
46
|
[tool.ruff]
|
|
47
|
-
target-version = "
|
|
47
|
+
target-version = "0.2.1"
|
|
48
48
|
line-length = 88
|
|
49
49
|
|
|
50
50
|
[tool.ruff.lint]
|
|
@@ -82,7 +82,7 @@ precision = 2
|
|
|
82
82
|
directory = "htmlcov"
|
|
83
83
|
|
|
84
84
|
[tool.mypy]
|
|
85
|
-
python_version = "
|
|
85
|
+
python_version = "0.2.1"
|
|
86
86
|
warn_return_any = true
|
|
87
87
|
warn_unused_configs = true
|
|
88
88
|
disallow_untyped_defs = false
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|