linktrace 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {linktrace-0.2.0 → linktrace-0.2.1}/PKG-INFO +15 -5
  2. {linktrace-0.2.0 → linktrace-0.2.1}/README.md +14 -4
  3. {linktrace-0.2.0 → linktrace-0.2.1}/pyproject.toml +3 -3
  4. {linktrace-0.2.0 → linktrace-0.2.1}/uv.lock +1 -1
  5. {linktrace-0.2.0 → linktrace-0.2.1}/.coverage +0 -0
  6. {linktrace-0.2.0 → linktrace-0.2.1}/.github/workflows/publish.yml +0 -0
  7. {linktrace-0.2.0 → linktrace-0.2.1}/.gitignore +0 -0
  8. {linktrace-0.2.0 → linktrace-0.2.1}/.pre-commit-config.yaml +0 -0
  9. {linktrace-0.2.0 → linktrace-0.2.1}/.python-version +0 -0
  10. {linktrace-0.2.0 → linktrace-0.2.1}/.vscode/launch.json +0 -0
  11. {linktrace-0.2.0 → linktrace-0.2.1}/.vscode/settings.json +0 -0
  12. {linktrace-0.2.0 → linktrace-0.2.1}/LICENSE +0 -0
  13. {linktrace-0.2.0 → linktrace-0.2.1}/docs/api-reference.md +0 -0
  14. {linktrace-0.2.0 → linktrace-0.2.1}/docs/core-concepts.md +0 -0
  15. {linktrace-0.2.0 → linktrace-0.2.1}/docs/examples.md +0 -0
  16. {linktrace-0.2.0 → linktrace-0.2.1}/docs/getting-started.md +0 -0
  17. {linktrace-0.2.0 → linktrace-0.2.1}/docs/troubleshooting.md +0 -0
  18. {linktrace-0.2.0 → linktrace-0.2.1}/justfile +0 -0
  19. {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/Crawler.py +0 -0
  20. {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/Serializers.py +0 -0
  21. {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/Spider.py +0 -0
  22. {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/__init__.py +0 -0
  23. {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/cache.py +0 -0
  24. {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/py.typed +0 -0
  25. {linktrace-0.2.0 → linktrace-0.2.1}/linktrace/robots.py +0 -0
  26. {linktrace-0.2.0 → linktrace-0.2.1}/notebooks/crawl_cnn.ipynb +0 -0
  27. {linktrace-0.2.0 → linktrace-0.2.1}/notebooks/crawl_cnn_callbacks.ipynb +0 -0
  28. {linktrace-0.2.0 → linktrace-0.2.1}/notebooks/crawl_tax_assessor.ipynb +0 -0
  29. {linktrace-0.2.0 → linktrace-0.2.1}/settings.yaml +0 -0
  30. {linktrace-0.2.0 → linktrace-0.2.1}/tests/__init__.py +0 -0
  31. {linktrace-0.2.0 → linktrace-0.2.1}/tests/conftest.py +0 -0
  32. {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_crawler.py +0 -0
  33. {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_models.py +0 -0
  34. {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_rate_limiting_and_broken_links.py +0 -0
  35. {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_serializers.py +0 -0
  36. {linktrace-0.2.0 → linktrace-0.2.1}/tests/test_spider.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: linktrace
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Async web crawler with rate limiting, robots.txt support, and broken link tracking
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.12
@@ -123,7 +123,7 @@ spider = Spider(
123
123
  ```python
124
124
  spider = Spider(
125
125
  start_url="https://example.com",
126
- cache_dir=".webcrawler_cache" # Enable disk caching (default: None/disabled)
126
+ cache_dir=".linktrace_cache" # Enable disk caching (default: None/disabled)
127
127
  )
128
128
  # 2nd run will be 10-50x faster for same URLs
129
129
  ```
@@ -352,13 +352,23 @@ Spider (orchestrator)
352
352
  └─ CookieJar (automatic cookie handling)
353
353
  ```
354
354
 
355
- Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance.
355
+ Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance, so connection pooling, cookies, SSL configuration, and DNS caching are reused across the crawl.
356
356
 
357
357
  ## Why linktrace?
358
358
 
359
- **vs Scrapy:** Lightweight, focused, simpler API for link analysis. Scrapy is better for complex extraction pipelines.
359
+ Scrapy is an excellent full crawling and extraction framework. `linktrace` is designed for a narrower job: fast async link analysis with minimal setup.
360
360
 
361
- **vs requests + BeautifulSoup:** Built-in async concurrency, automatic session reuse, retries, caching. Better for crawling multiple pages.
361
+ Instead of building a Scrapy project around spiders, requests, responses, callbacks, items, pipelines, middleware, and settings, `linktrace` gives you a direct document-centric API. Each crawled URL becomes a `Document` object containing the page source, title, status code, response headers, domain, internal links, external links, and crawl status metadata.
362
+
363
+ That makes `linktrace` useful when your goal is to inspect site structure, trace links, audit crawl status, or export crawl results to dataframe-oriented tools without creating a larger scraping project.
364
+
365
+ `linktrace` also reuses a persistent `aiohttp` session during a crawl. Connection pooling, cookie reuse, SSL configuration, request timeouts, per-host limits, and DNS caching are carried across requests, which can make repeated same-domain crawls much faster than creating a fresh client/session per URL.
366
+
367
+ **Use Scrapy when:** you need a mature scraping framework with item pipelines, middleware, schedulers, broad ecosystem support, and complex extraction workflows.
368
+
369
+ **Use linktrace when:** you want a focused async crawler that turns URLs into analyzable `Document` objects with automatic link classification and simple exports.
370
+
371
+ **vs requests + BeautifulSoup:** Built-in async concurrency, automatic session reuse, retries, caching, rate limiting, and structured document objects. Better for crawling multiple pages.
362
372
 
363
373
  **vs Selenium:** Pure HTTP crawler (no JS execution). Faster, lighter, but can't handle dynamic sites.
364
374
 
@@ -100,7 +100,7 @@ spider = Spider(
100
100
  ```python
101
101
  spider = Spider(
102
102
  start_url="https://example.com",
103
- cache_dir=".webcrawler_cache" # Enable disk caching (default: None/disabled)
103
+ cache_dir=".linktrace_cache" # Enable disk caching (default: None/disabled)
104
104
  )
105
105
  # 2nd run will be 10-50x faster for same URLs
106
106
  ```
@@ -329,13 +329,23 @@ Spider (orchestrator)
329
329
  └─ CookieJar (automatic cookie handling)
330
330
  ```
331
331
 
332
- Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance.
332
+ Spider manages the crawl queue and traversal. Crawler handles individual document fetching/parsing. All requests share one persistent aiohttp session per Spider instance, so connection pooling, cookies, SSL configuration, and DNS caching are reused across the crawl.
333
333
 
334
334
  ## Why linktrace?
335
335
 
336
- **vs Scrapy:** Lightweight, focused, simpler API for link analysis. Scrapy is better for complex extraction pipelines.
336
+ Scrapy is an excellent full crawling and extraction framework. `linktrace` is designed for a narrower job: fast async link analysis with minimal setup.
337
337
 
338
- **vs requests + BeautifulSoup:** Built-in async concurrency, automatic session reuse, retries, caching. Better for crawling multiple pages.
338
+ Instead of building a Scrapy project around spiders, requests, responses, callbacks, items, pipelines, middleware, and settings, `linktrace` gives you a direct document-centric API. Each crawled URL becomes a `Document` object containing the page source, title, status code, response headers, domain, internal links, external links, and crawl status metadata.
339
+
340
+ That makes `linktrace` useful when your goal is to inspect site structure, trace links, audit crawl status, or export crawl results to dataframe-oriented tools without creating a larger scraping project.
341
+
342
+ `linktrace` also reuses a persistent `aiohttp` session during a crawl. Connection pooling, cookie reuse, SSL configuration, request timeouts, per-host limits, and DNS caching are carried across requests, which can make repeated same-domain crawls much faster than creating a fresh client/session per URL.
343
+
344
+ **Use Scrapy when:** you need a mature scraping framework with item pipelines, middleware, schedulers, broad ecosystem support, and complex extraction workflows.
345
+
346
+ **Use linktrace when:** you want a focused async crawler that turns URLs into analyzable `Document` objects with automatic link classification and simple exports.
347
+
348
+ **vs requests + BeautifulSoup:** Built-in async concurrency, automatic session reuse, retries, caching, rate limiting, and structured document objects. Better for crawling multiple pages.
339
349
 
340
350
  **vs Selenium:** Pure HTTP crawler (no JS execution). Faster, lighter, but can't handle dynamic sites.
341
351
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "linktrace"
7
- version = "0.2.0"
7
+ version = "0.2.1"
8
8
  description = "Async web crawler with rate limiting, robots.txt support, and broken link tracking"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -44,7 +44,7 @@ dev = [
44
44
  packages = ["linktrace"]
45
45
 
46
46
  [tool.ruff]
47
- target-version = "py312"
47
+ target-version = "py312"
48
48
  line-length = 88
49
49
 
50
50
  [tool.ruff.lint]
@@ -82,7 +82,7 @@ precision = 2
82
82
  directory = "htmlcov"
83
83
 
84
84
  [tool.mypy]
85
- python_version = "3.12"
85
+ python_version = "3.12"
86
86
  warn_return_any = true
87
87
  warn_unused_configs = true
88
88
  disallow_untyped_defs = false
@@ -796,7 +796,7 @@ wheels = [
796
796
 
797
797
  [[package]]
798
798
  name = "linktrace"
799
- version = "0.1.2"
799
+ version = "0.2.1"
800
800
  source = { editable = "." }
801
801
  dependencies = [
802
802
  { name = "aiofiles" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes