docpull 3.0.2__tar.gz → 4.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. {docpull-3.0.2/src/docpull.egg-info → docpull-4.0.1}/PKG-INFO +42 -26
  2. {docpull-3.0.2 → docpull-4.0.1}/README.md +38 -24
  3. {docpull-3.0.2 → docpull-4.0.1}/pyproject.toml +4 -2
  4. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/__init__.py +1 -6
  5. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/cache/__init__.py +4 -0
  6. docpull-4.0.1/src/docpull/cache/frontier.py +199 -0
  7. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/cache/manager.py +139 -134
  8. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/cache/streaming_dedup.py +0 -17
  9. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/cli.py +32 -17
  10. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/markdown.py +21 -6
  11. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/core/fetcher.py +105 -49
  12. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/__init__.py +0 -4
  13. docpull-4.0.1/src/docpull/discovery/_fetch.py +33 -0
  14. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/crawler.py +2 -47
  15. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/filters.py +1 -6
  16. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/link_extractors/enhanced.py +2 -27
  17. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/link_extractors/static.py +2 -28
  18. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/sitemap.py +2 -5
  19. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/doctor.py +0 -13
  20. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/http/client.py +40 -46
  21. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/mcp/tools.py +103 -18
  22. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/models/__init__.py +16 -3
  23. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/models/config.py +0 -5
  24. docpull-4.0.1/src/docpull/models/document.py +78 -0
  25. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/models/events.py +41 -2
  26. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/models/profiles.py +6 -8
  27. docpull-4.0.1/src/docpull/models/run.py +60 -0
  28. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/base.py +58 -23
  29. docpull-4.0.1/src/docpull/pipeline/manifest.py +74 -0
  30. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/convert.py +39 -6
  31. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/dedup.py +11 -4
  32. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/fetch.py +28 -10
  33. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/save.py +53 -6
  34. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/save_json.py +30 -11
  35. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/save_ndjson.py +39 -21
  36. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/save_sqlite.py +68 -10
  37. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/validate.py +10 -7
  38. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/security/robots.py +7 -5
  39. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/security/url_validator.py +87 -38
  40. {docpull-3.0.2 → docpull-4.0.1/src/docpull.egg-info}/PKG-INFO +42 -26
  41. {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/SOURCES.txt +9 -3
  42. {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/requires.txt +3 -1
  43. {docpull-3.0.2 → docpull-4.0.1}/tests/test_cache_conditional_get.py +28 -0
  44. docpull-4.0.1/tests/test_ci_policy.py +38 -0
  45. docpull-4.0.1/tests/test_cli.py +86 -0
  46. {docpull-3.0.2 → docpull-4.0.1}/tests/test_conversion.py +34 -6
  47. {docpull-3.0.2 → docpull-4.0.1}/tests/test_convert_step_new.py +6 -8
  48. {docpull-3.0.2 → docpull-4.0.1}/tests/test_discovery.py +33 -0
  49. docpull-4.0.1/tests/test_frontier_resume.py +35 -0
  50. {docpull-3.0.2 → docpull-4.0.1}/tests/test_integration.py +67 -0
  51. {docpull-3.0.2 → docpull-4.0.1}/tests/test_mcp_tools.py +59 -1
  52. docpull-4.0.1/tests/test_outputs_e2e.py +120 -0
  53. {docpull-3.0.2 → docpull-4.0.1}/tests/test_save_ndjson.py +2 -0
  54. docpull-4.0.1/tests/test_save_sqlite.py +52 -0
  55. {docpull-3.0.2 → docpull-4.0.1}/tests/test_security_hardening.py +154 -0
  56. docpull-3.0.2/src/docpull/concurrency/__init__.py +0 -7
  57. docpull-3.0.2/src/docpull/concurrency/manager.py +0 -123
  58. docpull-3.0.2/src/docpull/logging_config.py +0 -53
  59. docpull-3.0.2/tests/test_cli.py +0 -20
  60. {docpull-3.0.2 → docpull-4.0.1}/LICENSE +0 -0
  61. {docpull-3.0.2 → docpull-4.0.1}/setup.cfg +0 -0
  62. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/__main__.py +0 -0
  63. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/__init__.py +0 -0
  64. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/chunking.py +0 -0
  65. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/extractor.py +0 -0
  66. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/protocols.py +0 -0
  67. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/special_cases.py +0 -0
  68. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  69. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/core/__init__.py +0 -0
  70. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/composite.py +0 -0
  71. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  72. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  73. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/protocols.py +0 -0
  74. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/http/__init__.py +0 -0
  75. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/http/protocols.py +0 -0
  76. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/http/rate_limiter.py +0 -0
  77. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/mcp/__init__.py +0 -0
  78. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/mcp/server.py +0 -0
  79. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/mcp/sources.py +0 -0
  80. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/metadata_extractor.py +0 -0
  81. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/__init__.py +0 -0
  82. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/__init__.py +0 -0
  83. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/chunk.py +0 -0
  84. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/metadata.py +0 -0
  85. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/py.typed +0 -0
  86. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/security/__init__.py +0 -0
  87. {docpull-3.0.2 → docpull-4.0.1}/src/docpull/time_utils.py +0 -0
  88. {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/dependency_links.txt +0 -0
  89. {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/entry_points.txt +0 -0
  90. {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/top_level.txt +0 -0
  91. {docpull-3.0.2 → docpull-4.0.1}/tests/test_chunking.py +0 -0
  92. {docpull-3.0.2 → docpull-4.0.1}/tests/test_link_extractors.py +0 -0
  93. {docpull-3.0.2 → docpull-4.0.1}/tests/test_mcp_server.py +0 -0
  94. {docpull-3.0.2 → docpull-4.0.1}/tests/test_naming.py +0 -0
  95. {docpull-3.0.2 → docpull-4.0.1}/tests/test_pipeline.py +0 -0
  96. {docpull-3.0.2 → docpull-4.0.1}/tests/test_real_site_regressions.py +0 -0
  97. {docpull-3.0.2 → docpull-4.0.1}/tests/test_special_cases.py +0 -0
  98. {docpull-3.0.2 → docpull-4.0.1}/tests/test_time_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 3.0.2
3
+ Version: 4.0.1
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -42,7 +42,7 @@ Requires-Dist: beautifulsoup4>=4.12.0
42
42
  Requires-Dist: html2text>=2020.1.16
43
43
  Requires-Dist: defusedxml>=0.7.1
44
44
  Requires-Dist: extruct>=0.15.0
45
- Requires-Dist: aiohttp>=3.9.0
45
+ Requires-Dist: aiohttp>=3.14.0
46
46
  Requires-Dist: idna>=3.15
47
47
  Requires-Dist: regex>=2024.11.6
48
48
  Requires-Dist: rich>=13.0.0
@@ -59,6 +59,7 @@ Provides-Extra: tokens
59
59
  Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
60
60
  Provides-Extra: mcp
61
61
  Requires-Dist: mcp>=1.0.0; extra == "mcp"
62
+ Requires-Dist: pyjwt>=2.13.0; extra == "mcp"
62
63
  Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
63
64
  Requires-Dist: starlette>=1.0.1; extra == "mcp"
64
65
  Provides-Extra: llm
@@ -69,6 +70,7 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
69
70
  Requires-Dist: trafilatura>=1.12.0; extra == "all"
70
71
  Requires-Dist: tiktoken>=0.7.0; extra == "all"
71
72
  Requires-Dist: mcp>=1.0.0; extra == "all"
73
+ Requires-Dist: pyjwt>=2.13.0; extra == "all"
72
74
  Requires-Dist: python-multipart>=0.0.27; extra == "all"
73
75
  Requires-Dist: starlette>=1.0.1; extra == "all"
74
76
  Provides-Extra: dev
@@ -150,7 +152,7 @@ content directly from framework data feeds:
150
152
  | Mintlify | `__NEXT_DATA__` with Mintlify tagging |
151
153
  | OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
152
154
  | Docusaurus| Detected and tagged; generic extractor produces Markdown |
153
- | Sphinx | Detected and tagged; generic extractor produces Markdown |
155
+ | Sphinx | Detected from generator metadata / Read the Docs hosts and tagged; generic extractor produces Markdown |
154
156
 
155
157
  JS-only SPAs with no server-rendered content are detected and skipped with a
156
158
  clear reason (or, with `--strict-js-required`, reported as an error so agents
@@ -213,8 +215,8 @@ async def tool_call(url: str) -> str:
213
215
 
214
216
  ```bash
215
217
  docpull https://site.com --profile rag # Default. Dedup, rich metadata.
216
- docpull https://site.com --profile llm # NDJSON + chunks + metadata.
217
- docpull https://site.com --profile mirror # Full archive, polite, cached.
218
+ docpull https://site.com --profile llm # NDJSON + chunks + metadata; JS-only pages skip unless --strict-js-required is passed.
219
+ docpull https://site.com --profile mirror # Full archive, polite, cached, hierarchical paths.
218
220
  docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
219
221
  ```
220
222
 
@@ -283,7 +285,9 @@ Write:
283
285
  - `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
284
286
  - `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
285
287
 
286
- All tools that carry data also return `structuredContent` validated against an `outputSchema` for clients that prefer typed output.
288
+ All schema-backed tools return `structuredContent` validated against an
289
+ `outputSchema` for clients that prefer typed output. `fetch_url` intentionally
290
+ returns Markdown text directly.
287
291
 
288
292
  User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
289
293
 
@@ -296,16 +300,17 @@ sources:
296
300
  maxPages: 200
297
301
  ```
298
302
 
299
- ### About the `mcp/` directory in this repo
303
+ ### Supported MCP path
300
304
 
301
- The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
302
- server backed by PostgreSQL with pgvector for semantic search. It is not
303
- the Python MCP server shipped in the `docpull` package described above
304
- — that one is the right choice for almost every user and is installed
305
- with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
306
- own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
307
- unless you specifically need pgvector-backed semantic search, ignore it
308
- and use `docpull mcp`.
305
+ The supported MCP server is the Python stdio server started by `docpull mcp`.
306
+ That is the only MCP path covered by the `docpull` package release contract and
307
+ the one agents, plugin users, Claude Code, Cursor, and Claude Desktop should
308
+ use.
309
+
310
+ This repository also contains an `mcp/` directory with an internal TypeScript +
311
+ Bun lab for PostgreSQL/pgvector semantic search. It is not shipped by the Python
312
+ package, is not documented as a user install path, and should be ignored unless
313
+ you are explicitly developing that lab.
309
314
 
310
315
  ## Output
311
316
 
@@ -325,9 +330,14 @@ source_type: "nextjs"
325
330
  NDJSON (one record per page or chunk):
326
331
 
327
332
  ```json
328
- {"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
333
+ {"document_id": "doc_...", "chunk_id": "chunk_...", "url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
329
334
  ```
330
335
 
336
+ Every output format also writes `corpus.manifest.json` next to the generated
337
+ documents. The manifest records the run identity, output format, stable
338
+ `document_id` / `chunk_id` values, content hashes, relative output paths, and
339
+ chunk counts so regenerated corpora can be diffed and cited by agents.
340
+
331
341
  ## Security
332
342
 
333
343
  - HTTPS-only, mandatory robots.txt compliance
@@ -347,7 +357,7 @@ Run `docpull --help` for the full list. Highlights:
347
357
 
348
358
  ```
349
359
  Core:
350
- --profile {rag,mirror,quick,llm,custom}
360
+ --profile {rag,mirror,quick,llm}
351
361
  --single Fetch one URL (no crawl)
352
362
  --format {markdown,json,ndjson,sqlite}
353
363
  --stream Stream NDJSON to stdout
@@ -366,27 +376,33 @@ Cache:
366
376
  --cache Enable incremental updates
367
377
  --cache-dir DIR
368
378
  --cache-ttl DAYS
379
+
380
+ Crawl:
381
+ --max-concurrent N Global request concurrency
382
+ --per-host-concurrent N Per-host request concurrency
369
383
  ```
370
384
 
371
385
  ## Performance
372
386
 
373
387
  End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
374
388
  synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
375
- HTTP keep-alive, 5% injected duplicate content):
389
+ `per_host_concurrent=50`, HTTP keep-alive, 5% injected duplicate content).
390
+ The benchmark emits progress every 1,000 pages plus a final JSON report for
391
+ trend tooling.
376
392
 
377
393
  | Metric | Value |
378
394
  |---|---|
379
- | Total wall time | ~27 s |
380
- | Discovery (sitemap parse) | ~80 ms |
381
- | Fetch + convert + save | ~27 s |
382
- | Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
383
- | Peak RSS delta from baseline | ~28 MB |
384
- | Cache manifest size on disk | ~3.4 MB |
395
+ | Total wall time | ~333 s |
396
+ | Pages fetched / skipped / failed | 9,501 / 499 / 0 |
397
+ | Time to first saved page | ~130 ms |
398
+ | Per-page latency p50 / p95 / p99 | ~0 / 166 / 232 ms |
399
+ | Peak RSS delta from baseline | ~94 MB |
400
+ | Cache manifest size on disk | ~8.9 MB |
385
401
  | Duplicates detected (5% injected) | 499 / 500 |
386
402
 
387
403
  Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
388
- benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
389
- into trend tooling).
404
+ benchmark in `tests/benchmarks/` and prints progress plus a JSON line you can
405
+ pipe into trend tooling).
390
406
 
391
407
  ## Troubleshooting
392
408
 
@@ -62,7 +62,7 @@ content directly from framework data feeds:
62
62
  | Mintlify | `__NEXT_DATA__` with Mintlify tagging |
63
63
  | OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
64
64
  | Docusaurus| Detected and tagged; generic extractor produces Markdown |
65
- | Sphinx | Detected and tagged; generic extractor produces Markdown |
65
+ | Sphinx | Detected from generator metadata / Read the Docs hosts and tagged; generic extractor produces Markdown |
66
66
 
67
67
  JS-only SPAs with no server-rendered content are detected and skipped with a
68
68
  clear reason (or, with `--strict-js-required`, reported as an error so agents
@@ -125,8 +125,8 @@ async def tool_call(url: str) -> str:
125
125
 
126
126
  ```bash
127
127
  docpull https://site.com --profile rag # Default. Dedup, rich metadata.
128
- docpull https://site.com --profile llm # NDJSON + chunks + metadata.
129
- docpull https://site.com --profile mirror # Full archive, polite, cached.
128
+ docpull https://site.com --profile llm # NDJSON + chunks + metadata; JS-only pages skip unless --strict-js-required is passed.
129
+ docpull https://site.com --profile mirror # Full archive, polite, cached, hierarchical paths.
130
130
  docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
131
131
  ```
132
132
 
@@ -195,7 +195,9 @@ Write:
195
195
  - `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
196
196
  - `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
197
197
 
198
- All tools that carry data also return `structuredContent` validated against an `outputSchema` for clients that prefer typed output.
198
+ All schema-backed tools return `structuredContent` validated against an
199
+ `outputSchema` for clients that prefer typed output. `fetch_url` intentionally
200
+ returns Markdown text directly.
199
201
 
200
202
  User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
201
203
 
@@ -208,16 +210,17 @@ sources:
208
210
  maxPages: 200
209
211
  ```
210
212
 
211
- ### About the `mcp/` directory in this repo
213
+ ### Supported MCP path
212
214
 
213
- The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
214
- server backed by PostgreSQL with pgvector for semantic search. It is not
215
- the Python MCP server shipped in the `docpull` package described above
216
- — that one is the right choice for almost every user and is installed
217
- with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
218
- own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
219
- unless you specifically need pgvector-backed semantic search, ignore it
220
- and use `docpull mcp`.
215
+ The supported MCP server is the Python stdio server started by `docpull mcp`.
216
+ That is the only MCP path covered by the `docpull` package release contract and
217
+ the one agents, plugin users, Claude Code, Cursor, and Claude Desktop should
218
+ use.
219
+
220
+ This repository also contains an `mcp/` directory with an internal TypeScript +
221
+ Bun lab for PostgreSQL/pgvector semantic search. It is not shipped by the Python
222
+ package, is not documented as a user install path, and should be ignored unless
223
+ you are explicitly developing that lab.
221
224
 
222
225
  ## Output
223
226
 
@@ -237,9 +240,14 @@ source_type: "nextjs"
237
240
  NDJSON (one record per page or chunk):
238
241
 
239
242
  ```json
240
- {"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
243
+ {"document_id": "doc_...", "chunk_id": "chunk_...", "url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
241
244
  ```
242
245
 
246
+ Every output format also writes `corpus.manifest.json` next to the generated
247
+ documents. The manifest records the run identity, output format, stable
248
+ `document_id` / `chunk_id` values, content hashes, relative output paths, and
249
+ chunk counts so regenerated corpora can be diffed and cited by agents.
250
+
243
251
  ## Security
244
252
 
245
253
  - HTTPS-only, mandatory robots.txt compliance
@@ -259,7 +267,7 @@ Run `docpull --help` for the full list. Highlights:
259
267
 
260
268
  ```
261
269
  Core:
262
- --profile {rag,mirror,quick,llm,custom}
270
+ --profile {rag,mirror,quick,llm}
263
271
  --single Fetch one URL (no crawl)
264
272
  --format {markdown,json,ndjson,sqlite}
265
273
  --stream Stream NDJSON to stdout
@@ -278,27 +286,33 @@ Cache:
278
286
  --cache Enable incremental updates
279
287
  --cache-dir DIR
280
288
  --cache-ttl DAYS
289
+
290
+ Crawl:
291
+ --max-concurrent N Global request concurrency
292
+ --per-host-concurrent N Per-host request concurrency
281
293
  ```
282
294
 
283
295
  ## Performance
284
296
 
285
297
  End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
286
298
  synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
287
- HTTP keep-alive, 5% injected duplicate content):
299
+ `per_host_concurrent=50`, HTTP keep-alive, 5% injected duplicate content).
300
+ The benchmark emits progress every 1,000 pages plus a final JSON report for
301
+ trend tooling.
288
302
 
289
303
  | Metric | Value |
290
304
  |---|---|
291
- | Total wall time | ~27 s |
292
- | Discovery (sitemap parse) | ~80 ms |
293
- | Fetch + convert + save | ~27 s |
294
- | Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
295
- | Peak RSS delta from baseline | ~28 MB |
296
- | Cache manifest size on disk | ~3.4 MB |
305
+ | Total wall time | ~333 s |
306
+ | Pages fetched / skipped / failed | 9,501 / 499 / 0 |
307
+ | Time to first saved page | ~130 ms |
308
+ | Per-page latency p50 / p95 / p99 | ~0 / 166 / 232 ms |
309
+ | Peak RSS delta from baseline | ~94 MB |
310
+ | Cache manifest size on disk | ~8.9 MB |
297
311
  | Duplicates detected (5% injected) | 499 / 500 |
298
312
 
299
313
  Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
300
- benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
301
- into trend tooling).
314
+ benchmark in `tests/benchmarks/` and prints progress plus a JSON line you can
315
+ pipe into trend tooling).
302
316
 
303
317
  ## Troubleshooting
304
318
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "3.0.2"
7
+ version = "4.0.1"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -66,7 +66,7 @@ dependencies = [
66
66
  "html2text>=2020.1.16",
67
67
  "defusedxml>=0.7.1",
68
68
  "extruct>=0.15.0",
69
- "aiohttp>=3.9.0",
69
+ "aiohttp>=3.14.0", # 3.14.0 fixes CVE-2026-34993 and CVE-2026-47265
70
70
  "idna>=3.15",
71
71
  "regex>=2024.11.6",
72
72
  "rich>=13.0.0",
@@ -90,6 +90,7 @@ tokens = [
90
90
  ]
91
91
  mcp = [
92
92
  "mcp>=1.0.0",
93
+ "pyjwt>=2.13.0",
93
94
  "python-multipart>=0.0.27",
94
95
  "starlette>=1.0.1",
95
96
  ]
@@ -102,6 +103,7 @@ all = [
102
103
  "trafilatura>=1.12.0",
103
104
  "tiktoken>=0.7.0",
104
105
  "mcp>=1.0.0",
106
+ "pyjwt>=2.13.0",
105
107
  "python-multipart>=0.0.27",
106
108
  "starlette>=1.0.1",
107
109
  ]
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "3.0.2"
17
+ __version__ = "4.0.1"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -34,12 +34,10 @@ from .pipeline.base import PageContext
34
34
 
35
35
  __all__ = [
36
36
  "__version__",
37
- # Core
38
37
  "Fetcher",
39
38
  "fetch_blocking",
40
39
  "fetch_one",
41
40
  "PageContext",
42
- # Config
43
41
  "DocpullConfig",
44
42
  "ProfileName",
45
43
  "CrawlConfig",
@@ -48,14 +46,11 @@ __all__ = [
48
46
  "NetworkConfig",
49
47
  "PerformanceConfig",
50
48
  "CacheConfig",
51
- # Events
52
49
  "EventType",
53
50
  "FetchEvent",
54
51
  "FetchStats",
55
- # Cache
56
52
  "CacheManager",
57
53
  "StreamingDeduplicator",
58
- # Chunking
59
54
  "Chunk",
60
55
  "TokenCounter",
61
56
  "chunk_markdown",
@@ -1,5 +1,6 @@
1
1
  """Caching and deduplication for docpull."""
2
2
 
3
+ from .frontier import FrontierEntry, FrontierState, FrontierStore
3
4
  from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
4
5
  from .streaming_dedup import StreamingDeduplicator
5
6
 
@@ -7,6 +8,9 @@ __all__ = [
7
8
  "CacheManager",
8
9
  "CacheState",
9
10
  "ManifestEntry",
11
+ "FrontierEntry",
12
+ "FrontierState",
13
+ "FrontierStore",
10
14
  "StreamingDeduplicator",
11
15
  "DEFAULT_TTL_DAYS",
12
16
  ]
@@ -0,0 +1,199 @@
1
+ """Durable crawl frontier state for pause/resume and provenance."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from dataclasses import dataclass, field
8
+ from enum import Enum
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from ..models.run import FRONTIER_SCHEMA_VERSION
13
+ from ..time_utils import utc_now_iso
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class FrontierState(str, Enum):
    """Lifecycle state for a URL in the crawl frontier.

    The enum mixes in ``str``, so every member compares equal to its plain
    string value, which is the form stored under the ``"state"`` key when a
    frontier entry is serialized.
    """

    # Default state of a freshly created entry.
    QUEUED = "queued"
    # Entry has been picked up for processing.
    PROCESSING = "processing"
    # Outcome states recorded once work on the URL has finished.
    SUCCEEDED = "succeeded"
    SKIPPED = "skipped"
    FAILED = "failed"
26
+
27
+
28
@dataclass
class FrontierEntry:
    """One URL tracked by the crawl frontier, with its lifecycle bookkeeping.

    Timestamps are stored as strings produced by ``utc_now_iso``.
    """

    url: str
    state: FrontierState = FrontierState.QUEUED
    # Optional crawl depth; stays None when the caller does not supply one.
    depth: int | None = None
    # Optional label for where the URL came from (presumably the discovery
    # mechanism -- confirm against callers).
    source: str | None = None
    discovered_at: str = field(default_factory=utc_now_iso)
    updated_at: str = field(default_factory=utc_now_iso)
    # Number of times the entry has been moved to PROCESSING.
    attempts: int = 0
    last_error: str | None = None

    @classmethod
    def from_json(cls, data: dict[str, Any]) -> FrontierEntry | None:
        """Build an entry from a decoded JSON object, tolerating bad fields.

        Args:
            data: Mapping as produced by :meth:`to_json` (possibly hand-edited
                or written by another version).

        Returns:
            The reconstructed entry, or ``None`` when ``"url"`` is missing or
            not a string. Any other field with an unexpected type falls back
            to its default instead of failing the whole entry.
        """
        url = data.get("url")
        if not isinstance(url, str):
            return None

        def _typed(key: str, kind: type) -> Any:
            """Return ``data[key]`` only when it is a genuine ``kind`` value."""
            value = data.get(key)
            # bool is a subclass of int, so JSON true/false would otherwise be
            # accepted as a depth or an attempt counter.
            if isinstance(value, bool) or not isinstance(value, kind):
                return None
            return value

        try:
            state = FrontierState(str(data.get("state", FrontierState.QUEUED.value)))
        except ValueError:
            # Unknown state strings are treated as not-yet-processed.
            state = FrontierState.QUEUED

        discovered_at = _typed("discovered_at", str)
        updated_at = _typed("updated_at", str)
        attempts = _typed("attempts", int)
        return cls(
            url=url,
            state=state,
            depth=_typed("depth", int),
            source=_typed("source", str),
            discovered_at=discovered_at if discovered_at is not None else utc_now_iso(),
            updated_at=updated_at if updated_at is not None else utc_now_iso(),
            attempts=attempts if attempts is not None else 0,
            last_error=_typed("last_error", str),
        )

    def to_json(self) -> dict[str, Any]:
        """Return a JSON-serializable dict; ``state`` is emitted as its string value."""
        return {
            "url": self.url,
            "state": self.state.value,
            "depth": self.depth,
            "source": self.source,
            "discovered_at": self.discovered_at,
            "updated_at": self.updated_at,
            "attempts": self.attempts,
            "last_error": self.last_error,
        }
73
+
74
+
75
class FrontierStore:
    """Small JSON-backed frontier store.

    The store is intentionally simple because docpull is single-process today.
    It gives us explicit URL lifecycle state and a compatibility fingerprint
    without introducing a queue service or SQLite dependency for markdown users.

    NOTE(review): every ``mark_*`` call rewrites the whole JSON file via
    :meth:`save`, so the cost of each state change grows with the number of
    tracked URLs.
    """

    def __init__(self, path: Path):
        """Create the store and load any existing state found at ``path``."""
        self.path = Path(path)
        self.entries: dict[str, FrontierEntry] = {}
        self.start_url: str | None = None
        self.run_fingerprint: dict[str, object] | None = None
        self.created_at: str | None = None
        self.updated_at: str | None = None
        self._load()

    def _load(self) -> None:
        """Populate the store from disk, ignoring missing or unusable files.

        An unreadable, undecodable, wrong-schema, or malformed file leaves the
        store empty (with a warning) rather than raising, so a damaged store
        never prevents a run from starting.
        """
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError as well as the
            # UnicodeDecodeError raised by read_text() on invalid UTF-8.
            logger.warning("Could not load frontier store %s: %s", self.path, err)
            return
        if not isinstance(data, dict) or data.get("schema_version") != FRONTIER_SCHEMA_VERSION:
            logger.warning("Ignoring frontier store %s: unsupported schema version", self.path)
            return
        entries = data.get("entries")
        if not isinstance(entries, list):
            logger.warning("Ignoring frontier store %s: malformed entries", self.path)
            return

        start_url = data.get("start_url")
        self.start_url = start_url if isinstance(start_url, str) else None
        fingerprint = data.get("run_fingerprint")
        self.run_fingerprint = fingerprint if isinstance(fingerprint, dict) else None
        created_at = data.get("created_at")
        self.created_at = created_at if isinstance(created_at, str) else None
        updated_at = data.get("updated_at")
        self.updated_at = updated_at if isinstance(updated_at, str) else None

        for item in entries:
            if not isinstance(item, dict):
                continue
            entry = FrontierEntry.from_json(item)
            if entry:
                self.entries[entry.url] = entry

    def save(self) -> None:
        """Write the full store to ``self.path``.

        The payload is written to a sibling ``.tmp`` file and then renamed
        over the target, so a failed write does not leave a truncated store
        behind. On any error the temporary file is removed and the exception
        is re-raised.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        now = utc_now_iso()
        if self.created_at is None:
            self.created_at = now
        self.updated_at = now
        data = {
            "schema_version": FRONTIER_SCHEMA_VERSION,
            "start_url": self.start_url,
            "run_fingerprint": self.run_fingerprint,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "entries": [entry.to_json() for entry in self.entries.values()],
        }
        tmp = self.path.with_suffix(self.path.suffix + ".tmp")
        try:
            tmp.write_text(json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
            tmp.replace(self.path)
        except Exception:
            tmp.unlink(missing_ok=True)
            raise

    def initialize(self, *, start_url: str, run_fingerprint: dict[str, object]) -> None:
        """Bind the store to a run and persist it.

        Existing entries are kept only when both ``start_url`` and
        ``run_fingerprint`` match what was loaded; otherwise they are dropped
        and ``created_at`` is reset.
        """
        if not self.compatible(start_url=start_url, run_fingerprint=run_fingerprint):
            self.entries.clear()
            self.created_at = utc_now_iso()
        self.start_url = start_url
        self.run_fingerprint = run_fingerprint
        self.save()

    def compatible(self, *, start_url: str, run_fingerprint: dict[str, object]) -> bool:
        """Return True when the stored run matches the given URL and fingerprint."""
        return self.start_url == start_url and self.run_fingerprint == run_fingerprint

    def add(self, url: str, *, depth: int | None = None, source: str | None = None) -> None:
        """Track ``url`` as a new entry; a URL already present is left untouched.

        This only updates memory; nothing is written until :meth:`save` runs.
        """
        if url in self.entries:
            return
        self.entries[url] = FrontierEntry(url=url, depth=depth, source=source)

    def add_many(self, urls: list[str], *, source: str | None = None) -> None:
        """Track every URL in ``urls`` (no depth recorded), skipping known ones."""
        for url in urls:
            self.add(url, source=source)

    def _ensure_entry(self, url: str) -> FrontierEntry:
        """Return the entry for ``url``, creating a default one if it is unknown."""
        if url not in self.entries:
            self.add(url)
        return self.entries[url]

    def mark_processing(self, url: str) -> None:
        """Move ``url`` to PROCESSING, count the attempt, and persist."""
        entry = self._ensure_entry(url)
        entry.state = FrontierState.PROCESSING
        entry.attempts += 1
        entry.updated_at = utc_now_iso()
        self.save()

    def mark_succeeded(self, url: str) -> None:
        """Record ``url`` as SUCCEEDED and persist."""
        self._mark_terminal(url, FrontierState.SUCCEEDED)

    def mark_skipped(self, url: str) -> None:
        """Record ``url`` as SKIPPED and persist."""
        self._mark_terminal(url, FrontierState.SKIPPED)

    def mark_failed(self, url: str, error: str | None = None) -> None:
        """Record ``url`` as FAILED with an optional error message and persist."""
        self._mark_terminal(url, FrontierState.FAILED, error=error)

    def _mark_terminal(self, url: str, state: FrontierState, error: str | None = None) -> None:
        """Set an outcome state on ``url`` and persist.

        ``last_error`` is always overwritten, so a non-failure outcome clears
        any error left by an earlier attempt.
        """
        entry = self._ensure_entry(url)
        entry.state = state
        entry.last_error = error
        entry.updated_at = utc_now_iso()
        self.save()

    def pending_urls(self) -> list[str]:
        """Return URLs whose state is neither SUCCEEDED nor SKIPPED.

        FAILED entries are therefore reported as pending along with QUEUED and
        PROCESSING ones (presumably so a resumed run retries them -- confirm
        with callers).
        """
        done = {FrontierState.SUCCEEDED, FrontierState.SKIPPED}
        return [url for url, entry in self.entries.items() if entry.state not in done]

    def clear(self) -> None:
        """Delete the store file (if any) and reset all in-memory state."""
        # missing_ok avoids the check-then-unlink race and matches save().
        self.path.unlink(missing_ok=True)
        self.entries.clear()
        self.start_url = None
        self.run_fingerprint = None
        self.created_at = None
        self.updated_at = None