docpull 4.0.0__tar.gz → 4.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. {docpull-4.0.0/src/docpull.egg-info → docpull-4.0.1}/PKG-INFO +39 -25
  2. {docpull-4.0.0 → docpull-4.0.1}/README.md +38 -24
  3. {docpull-4.0.0 → docpull-4.0.1}/pyproject.toml +1 -1
  4. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/__init__.py +1 -6
  5. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/cache/__init__.py +4 -0
  6. docpull-4.0.1/src/docpull/cache/frontier.py +199 -0
  7. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/cache/manager.py +140 -23
  8. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/cli.py +32 -17
  9. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/core/fetcher.py +105 -49
  10. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/__init__.py +0 -4
  11. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/crawler.py +0 -19
  12. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/filters.py +1 -6
  13. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/sitemap.py +0 -4
  14. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/doctor.py +0 -13
  15. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/mcp/tools.py +92 -17
  16. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/models/__init__.py +16 -3
  17. docpull-4.0.1/src/docpull/models/document.py +78 -0
  18. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/models/events.py +41 -2
  19. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/models/profiles.py +6 -8
  20. docpull-4.0.1/src/docpull/models/run.py +60 -0
  21. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/base.py +60 -12
  22. docpull-4.0.1/src/docpull/pipeline/manifest.py +74 -0
  23. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/convert.py +39 -6
  24. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/dedup.py +11 -4
  25. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/fetch.py +12 -8
  26. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/save.py +53 -6
  27. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/save_json.py +30 -11
  28. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/save_ndjson.py +39 -21
  29. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/save_sqlite.py +68 -10
  30. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/validate.py +10 -7
  31. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/security/url_validator.py +0 -6
  32. {docpull-4.0.0 → docpull-4.0.1/src/docpull.egg-info}/PKG-INFO +39 -25
  33. {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/SOURCES.txt +7 -0
  34. docpull-4.0.1/tests/test_cli.py +86 -0
  35. {docpull-4.0.0 → docpull-4.0.1}/tests/test_convert_step_new.py +6 -8
  36. docpull-4.0.1/tests/test_frontier_resume.py +35 -0
  37. {docpull-4.0.0 → docpull-4.0.1}/tests/test_integration.py +67 -0
  38. {docpull-4.0.0 → docpull-4.0.1}/tests/test_mcp_tools.py +39 -1
  39. docpull-4.0.1/tests/test_outputs_e2e.py +120 -0
  40. {docpull-4.0.0 → docpull-4.0.1}/tests/test_save_ndjson.py +2 -0
  41. docpull-4.0.1/tests/test_save_sqlite.py +52 -0
  42. {docpull-4.0.0 → docpull-4.0.1}/tests/test_security_hardening.py +62 -0
  43. docpull-4.0.0/tests/test_cli.py +0 -20
  44. {docpull-4.0.0 → docpull-4.0.1}/LICENSE +0 -0
  45. {docpull-4.0.0 → docpull-4.0.1}/setup.cfg +0 -0
  46. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/__main__.py +0 -0
  47. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/cache/streaming_dedup.py +0 -0
  48. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/__init__.py +0 -0
  49. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/chunking.py +0 -0
  50. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/extractor.py +0 -0
  51. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/markdown.py +0 -0
  52. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/protocols.py +0 -0
  53. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/special_cases.py +0 -0
  54. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  55. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/core/__init__.py +0 -0
  56. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/_fetch.py +0 -0
  57. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/composite.py +0 -0
  58. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  59. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
  60. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  61. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/link_extractors/static.py +0 -0
  62. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/protocols.py +0 -0
  63. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/http/__init__.py +0 -0
  64. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/http/client.py +0 -0
  65. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/http/protocols.py +0 -0
  66. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/http/rate_limiter.py +0 -0
  67. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/mcp/__init__.py +0 -0
  68. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/mcp/server.py +0 -0
  69. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/mcp/sources.py +0 -0
  70. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/metadata_extractor.py +0 -0
  71. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/models/config.py +0 -0
  72. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/__init__.py +0 -0
  73. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/__init__.py +0 -0
  74. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/chunk.py +0 -0
  75. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/metadata.py +0 -0
  76. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/py.typed +0 -0
  77. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/security/__init__.py +0 -0
  78. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/security/robots.py +0 -0
  79. {docpull-4.0.0 → docpull-4.0.1}/src/docpull/time_utils.py +0 -0
  80. {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/dependency_links.txt +0 -0
  81. {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/entry_points.txt +0 -0
  82. {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/requires.txt +0 -0
  83. {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/top_level.txt +0 -0
  84. {docpull-4.0.0 → docpull-4.0.1}/tests/test_cache_conditional_get.py +0 -0
  85. {docpull-4.0.0 → docpull-4.0.1}/tests/test_chunking.py +0 -0
  86. {docpull-4.0.0 → docpull-4.0.1}/tests/test_ci_policy.py +0 -0
  87. {docpull-4.0.0 → docpull-4.0.1}/tests/test_conversion.py +0 -0
  88. {docpull-4.0.0 → docpull-4.0.1}/tests/test_discovery.py +0 -0
  89. {docpull-4.0.0 → docpull-4.0.1}/tests/test_link_extractors.py +0 -0
  90. {docpull-4.0.0 → docpull-4.0.1}/tests/test_mcp_server.py +0 -0
  91. {docpull-4.0.0 → docpull-4.0.1}/tests/test_naming.py +0 -0
  92. {docpull-4.0.0 → docpull-4.0.1}/tests/test_pipeline.py +0 -0
  93. {docpull-4.0.0 → docpull-4.0.1}/tests/test_real_site_regressions.py +0 -0
  94. {docpull-4.0.0 → docpull-4.0.1}/tests/test_special_cases.py +0 -0
  95. {docpull-4.0.0 → docpull-4.0.1}/tests/test_time_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 4.0.0
3
+ Version: 4.0.1
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -152,7 +152,7 @@ content directly from framework data feeds:
152
152
  | Mintlify | `__NEXT_DATA__` with Mintlify tagging |
153
153
  | OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
154
154
  | Docusaurus| Detected and tagged; generic extractor produces Markdown |
155
- | Sphinx | Detected and tagged; generic extractor produces Markdown |
155
+ | Sphinx | Detected from generator metadata / Read the Docs hosts and tagged; generic extractor produces Markdown |
156
156
 
157
157
  JS-only SPAs with no server-rendered content are detected and skipped with a
158
158
  clear reason (or, with `--strict-js-required`, reported as an error so agents
@@ -215,8 +215,8 @@ async def tool_call(url: str) -> str:
215
215
 
216
216
  ```bash
217
217
  docpull https://site.com --profile rag # Default. Dedup, rich metadata.
218
- docpull https://site.com --profile llm # NDJSON + chunks + metadata.
219
- docpull https://site.com --profile mirror # Full archive, polite, cached.
218
+ docpull https://site.com --profile llm # NDJSON + chunks + metadata; JS-only pages skip unless --strict-js-required is passed.
219
+ docpull https://site.com --profile mirror # Full archive, polite, cached, hierarchical paths.
220
220
  docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
221
221
  ```
222
222
 
@@ -285,7 +285,9 @@ Write:
285
285
  - `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
286
286
  - `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
287
287
 
288
- All tools that carry data also return `structuredContent` validated against an `outputSchema` for clients that prefer typed output.
288
+ All schema-backed tools return `structuredContent` validated against an
289
+ `outputSchema` for clients that prefer typed output. `fetch_url` intentionally
290
+ returns Markdown text directly.
289
291
 
290
292
  User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
291
293
 
@@ -298,16 +300,17 @@ sources:
298
300
  maxPages: 200
299
301
  ```
300
302
 
301
- ### About the `mcp/` directory in this repo
303
+ ### Supported MCP path
302
304
 
303
- The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
304
- server backed by PostgreSQL with pgvector for semantic search. It is not
305
- the Python MCP server shipped in the `docpull` package described above
306
- — that one is the right choice for almost every user and is installed
307
- with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
308
- own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
309
- unless you specifically need pgvector-backed semantic search, ignore it
310
- and use `docpull mcp`.
305
+ The supported MCP server is the Python stdio server started by `docpull mcp`.
306
+ That is the only MCP path covered by the `docpull` package release contract and
307
+ the one agents, plugin users, Claude Code, Cursor, and Claude Desktop should
308
+ use.
309
+
310
+ This repository also contains an `mcp/` directory with an internal TypeScript +
311
+ Bun lab for PostgreSQL/pgvector semantic search. It is not shipped by the Python
312
+ package, is not documented as a user install path, and should be ignored unless
313
+ you are explicitly developing that lab.
311
314
 
312
315
  ## Output
313
316
 
@@ -327,9 +330,14 @@ source_type: "nextjs"
327
330
  NDJSON (one record per page or chunk):
328
331
 
329
332
  ```json
330
- {"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
333
+ {"document_id": "doc_...", "chunk_id": "chunk_...", "url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
331
334
  ```
332
335
 
336
+ Every output format also writes `corpus.manifest.json` next to the generated
337
+ documents. The manifest records the run identity, output format, stable
338
+ `document_id` / `chunk_id` values, content hashes, relative output paths, and
339
+ chunk counts so regenerated corpora can be diffed and cited by agents.
340
+
333
341
  ## Security
334
342
 
335
343
  - HTTPS-only, mandatory robots.txt compliance
@@ -349,7 +357,7 @@ Run `docpull --help` for the full list. Highlights:
349
357
 
350
358
  ```
351
359
  Core:
352
- --profile {rag,mirror,quick,llm,custom}
360
+ --profile {rag,mirror,quick,llm}
353
361
  --single Fetch one URL (no crawl)
354
362
  --format {markdown,json,ndjson,sqlite}
355
363
  --stream Stream NDJSON to stdout
@@ -368,27 +376,33 @@ Cache:
368
376
  --cache Enable incremental updates
369
377
  --cache-dir DIR
370
378
  --cache-ttl DAYS
379
+
380
+ Crawl:
381
+ --max-concurrent N Global request concurrency
382
+ --per-host-concurrent N Per-host request concurrency
371
383
  ```
372
384
 
373
385
  ## Performance
374
386
 
375
387
  End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
376
388
  synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
377
- HTTP keep-alive, 5% injected duplicate content):
389
+ `per_host_concurrent=50`, HTTP keep-alive, 5% injected duplicate content).
390
+ The benchmark emits progress every 1,000 pages plus a final JSON report for
391
+ trend tooling.
378
392
 
379
393
  | Metric | Value |
380
394
  |---|---|
381
- | Total wall time | ~27 s |
382
- | Discovery (sitemap parse) | ~80 ms |
383
- | Fetch + convert + save | ~27 s |
384
- | Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
385
- | Peak RSS delta from baseline | ~28 MB |
386
- | Cache manifest size on disk | ~3.4 MB |
395
+ | Total wall time | ~333 s |
396
+ | Pages fetched / skipped / failed | 9,501 / 499 / 0 |
397
+ | Time to first saved page | ~130 ms |
398
+ | Per-page latency p50 / p95 / p99 | ~0 / 166 / 232 ms |
399
+ | Peak RSS delta from baseline | ~94 MB |
400
+ | Cache manifest size on disk | ~8.9 MB |
387
401
  | Duplicates detected (5% injected) | 499 / 500 |
388
402
 
389
403
  Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
390
- benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
391
- into trend tooling).
404
+ benchmark in `tests/benchmarks/` and prints progress plus a JSON line you can
405
+ pipe into trend tooling).
392
406
 
393
407
  ## Troubleshooting
394
408
 
@@ -62,7 +62,7 @@ content directly from framework data feeds:
62
62
  | Mintlify | `__NEXT_DATA__` with Mintlify tagging |
63
63
  | OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
64
64
  | Docusaurus| Detected and tagged; generic extractor produces Markdown |
65
- | Sphinx | Detected and tagged; generic extractor produces Markdown |
65
+ | Sphinx | Detected from generator metadata / Read the Docs hosts and tagged; generic extractor produces Markdown |
66
66
 
67
67
  JS-only SPAs with no server-rendered content are detected and skipped with a
68
68
  clear reason (or, with `--strict-js-required`, reported as an error so agents
@@ -125,8 +125,8 @@ async def tool_call(url: str) -> str:
125
125
 
126
126
  ```bash
127
127
  docpull https://site.com --profile rag # Default. Dedup, rich metadata.
128
- docpull https://site.com --profile llm # NDJSON + chunks + metadata.
129
- docpull https://site.com --profile mirror # Full archive, polite, cached.
128
+ docpull https://site.com --profile llm # NDJSON + chunks + metadata; JS-only pages skip unless --strict-js-required is passed.
129
+ docpull https://site.com --profile mirror # Full archive, polite, cached, hierarchical paths.
130
130
  docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
131
131
  ```
132
132
 
@@ -195,7 +195,9 @@ Write:
195
195
  - `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
196
196
  - `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
197
197
 
198
- All tools that carry data also return `structuredContent` validated against an `outputSchema` for clients that prefer typed output.
198
+ All schema-backed tools return `structuredContent` validated against an
199
+ `outputSchema` for clients that prefer typed output. `fetch_url` intentionally
200
+ returns Markdown text directly.
199
201
 
200
202
  User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
201
203
 
@@ -208,16 +210,17 @@ sources:
208
210
  maxPages: 200
209
211
  ```
210
212
 
211
- ### About the `mcp/` directory in this repo
213
+ ### Supported MCP path
212
214
 
213
- The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
214
- server backed by PostgreSQL with pgvector for semantic search. It is not
215
- the Python MCP server shipped in the `docpull` package described above
216
- — that one is the right choice for almost every user and is installed
217
- with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
218
- own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
219
- unless you specifically need pgvector-backed semantic search, ignore it
220
- and use `docpull mcp`.
215
+ The supported MCP server is the Python stdio server started by `docpull mcp`.
216
+ That is the only MCP path covered by the `docpull` package release contract and
217
+ the one agents, plugin users, Claude Code, Cursor, and Claude Desktop should
218
+ use.
219
+
220
+ This repository also contains an `mcp/` directory with an internal TypeScript +
221
+ Bun lab for PostgreSQL/pgvector semantic search. It is not shipped by the Python
222
+ package, is not documented as a user install path, and should be ignored unless
223
+ you are explicitly developing that lab.
221
224
 
222
225
  ## Output
223
226
 
@@ -237,9 +240,14 @@ source_type: "nextjs"
237
240
  NDJSON (one record per page or chunk):
238
241
 
239
242
  ```json
240
- {"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
243
+ {"document_id": "doc_...", "chunk_id": "chunk_...", "url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
241
244
  ```
242
245
 
246
+ Every output format also writes `corpus.manifest.json` next to the generated
247
+ documents. The manifest records the run identity, output format, stable
248
+ `document_id` / `chunk_id` values, content hashes, relative output paths, and
249
+ chunk counts so regenerated corpora can be diffed and cited by agents.
250
+
243
251
  ## Security
244
252
 
245
253
  - HTTPS-only, mandatory robots.txt compliance
@@ -259,7 +267,7 @@ Run `docpull --help` for the full list. Highlights:
259
267
 
260
268
  ```
261
269
  Core:
262
- --profile {rag,mirror,quick,llm,custom}
270
+ --profile {rag,mirror,quick,llm}
263
271
  --single Fetch one URL (no crawl)
264
272
  --format {markdown,json,ndjson,sqlite}
265
273
  --stream Stream NDJSON to stdout
@@ -278,27 +286,33 @@ Cache:
278
286
  --cache Enable incremental updates
279
287
  --cache-dir DIR
280
288
  --cache-ttl DAYS
289
+
290
+ Crawl:
291
+ --max-concurrent N Global request concurrency
292
+ --per-host-concurrent N Per-host request concurrency
281
293
  ```
282
294
 
283
295
  ## Performance
284
296
 
285
297
  End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
286
298
  synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
287
- HTTP keep-alive, 5% injected duplicate content):
299
+ `per_host_concurrent=50`, HTTP keep-alive, 5% injected duplicate content).
300
+ The benchmark emits progress every 1,000 pages plus a final JSON report for
301
+ trend tooling.
288
302
 
289
303
  | Metric | Value |
290
304
  |---|---|
291
- | Total wall time | ~27 s |
292
- | Discovery (sitemap parse) | ~80 ms |
293
- | Fetch + convert + save | ~27 s |
294
- | Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
295
- | Peak RSS delta from baseline | ~28 MB |
296
- | Cache manifest size on disk | ~3.4 MB |
305
+ | Total wall time | ~333 s |
306
+ | Pages fetched / skipped / failed | 9,501 / 499 / 0 |
307
+ | Time to first saved page | ~130 ms |
308
+ | Per-page latency p50 / p95 / p99 | ~0 / 166 / 232 ms |
309
+ | Peak RSS delta from baseline | ~94 MB |
310
+ | Cache manifest size on disk | ~8.9 MB |
297
311
  | Duplicates detected (5% injected) | 499 / 500 |
298
312
 
299
313
  Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
300
- benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
301
- into trend tooling).
314
+ benchmark in `tests/benchmarks/` and prints progress plus a JSON line you can
315
+ pipe into trend tooling).
302
316
 
303
317
  ## Troubleshooting
304
318
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "4.0.0"
7
+ version = "4.0.1"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "4.0.0"
17
+ __version__ = "4.0.1"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -34,12 +34,10 @@ from .pipeline.base import PageContext
34
34
 
35
35
  __all__ = [
36
36
  "__version__",
37
- # Core
38
37
  "Fetcher",
39
38
  "fetch_blocking",
40
39
  "fetch_one",
41
40
  "PageContext",
42
- # Config
43
41
  "DocpullConfig",
44
42
  "ProfileName",
45
43
  "CrawlConfig",
@@ -48,14 +46,11 @@ __all__ = [
48
46
  "NetworkConfig",
49
47
  "PerformanceConfig",
50
48
  "CacheConfig",
51
- # Events
52
49
  "EventType",
53
50
  "FetchEvent",
54
51
  "FetchStats",
55
- # Cache
56
52
  "CacheManager",
57
53
  "StreamingDeduplicator",
58
- # Chunking
59
54
  "Chunk",
60
55
  "TokenCounter",
61
56
  "chunk_markdown",
@@ -1,5 +1,6 @@
1
1
  """Caching and deduplication for docpull."""
2
2
 
3
+ from .frontier import FrontierEntry, FrontierState, FrontierStore
3
4
  from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
4
5
  from .streaming_dedup import StreamingDeduplicator
5
6
 
@@ -7,6 +8,9 @@ __all__ = [
7
8
  "CacheManager",
8
9
  "CacheState",
9
10
  "ManifestEntry",
11
+ "FrontierEntry",
12
+ "FrontierState",
13
+ "FrontierStore",
10
14
  "StreamingDeduplicator",
11
15
  "DEFAULT_TTL_DAYS",
12
16
  ]
@@ -0,0 +1,199 @@
1
+ """Durable crawl frontier state for pause/resume and provenance."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from dataclasses import dataclass, field
8
+ from enum import Enum
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from ..models.run import FRONTIER_SCHEMA_VERSION
13
+ from ..time_utils import utc_now_iso
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class FrontierState(str, Enum):
    """Lifecycle position of a single URL tracked by the crawl frontier.

    The enum mixes in ``str``, so every member compares equal to its raw
    string value and serializes as that plain string.
    """

    # Default state for a newly created frontier entry.
    QUEUED = "queued"
    # Work on the URL has started; no outcome has been recorded yet.
    PROCESSING = "processing"
    # The URL finished successfully.
    SUCCEEDED = "succeeded"
    # The URL was deliberately not processed.
    SKIPPED = "skipped"
    # The URL was attempted and ended in an error.
    FAILED = "failed"
26
+
27
+
28
@dataclass
class FrontierEntry:
    """One URL tracked by the frontier, plus its lifecycle bookkeeping.

    ``discovered_at`` and ``updated_at`` default to the value returned by
    ``utc_now_iso()`` at construction time.
    """

    url: str
    state: FrontierState = FrontierState.QUEUED
    depth: int | None = None
    source: str | None = None
    discovered_at: str = field(default_factory=utc_now_iso)
    updated_at: str = field(default_factory=utc_now_iso)
    attempts: int = 0
    last_error: str | None = None

    @classmethod
    def from_json(cls, data: dict[str, Any]) -> FrontierEntry | None:
        """Build an entry from a decoded JSON object, tolerating bad fields.

        Args:
            data: Mapping as produced by :meth:`to_json` (possibly hand-edited
                or written by another version).

        Returns:
            The reconstructed entry, or ``None`` when ``url`` is missing or is
            not a string. Any other field with an unexpected type falls back
            to its default instead of failing the whole entry.
        """
        url = data.get("url")
        if not isinstance(url, str):
            return None

        def typed(key: str, expected: type) -> Any:
            """Return ``data[key]`` if it has the expected type, else ``None``."""
            value = data.get(key)
            # bool is a subclass of int, so a JSON true/false would otherwise
            # be accepted as a depth or attempt count of 1/0.
            if expected is int and isinstance(value, bool):
                return None
            return value if isinstance(value, expected) else None

        try:
            state = FrontierState(str(data.get("state", FrontierState.QUEUED.value)))
        except ValueError:
            # Unknown state names are treated as not-yet-processed.
            state = FrontierState.QUEUED

        attempts = typed("attempts", int)
        discovered_at = typed("discovered_at", str)
        updated_at = typed("updated_at", str)
        return cls(
            url=url,
            state=state,
            depth=typed("depth", int),
            source=typed("source", str),
            discovered_at=discovered_at if discovered_at is not None else utc_now_iso(),
            updated_at=updated_at if updated_at is not None else utc_now_iso(),
            attempts=attempts if attempts is not None else 0,
            last_error=typed("last_error", str),
        )

    def to_json(self) -> dict[str, Any]:
        """Return a JSON-serializable dict; ``state`` is stored as its string value."""
        return {
            "url": self.url,
            "state": self.state.value,
            "depth": self.depth,
            "source": self.source,
            "discovered_at": self.discovered_at,
            "updated_at": self.updated_at,
            "attempts": self.attempts,
            "last_error": self.last_error,
        }
73
+
74
+
75
class FrontierStore:
    """Small JSON-backed frontier store.

    The store is intentionally simple because docpull is single-process today.
    It gives us explicit URL lifecycle state and a compatibility fingerprint
    without introducing a queue service or SQLite dependency for markdown users.

    Every state-changing ``mark_*`` call rewrites the whole JSON file, so the
    on-disk copy always reflects the latest recorded transition.
    """

    def __init__(self, path: Path):
        """Create the store and load any existing state found at ``path``."""
        self.path = Path(path)
        self.entries: dict[str, FrontierEntry] = {}
        self.start_url: str | None = None
        self.run_fingerprint: dict[str, object] | None = None
        self.created_at: str | None = None
        self.updated_at: str | None = None
        self._load()

    def _load(self) -> None:
        """Populate the store from disk; leave it empty if the file is unusable.

        An unreadable, undecodable, wrong-schema or malformed file is never
        fatal: it is logged and the store starts empty.
        """
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as err:
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for a file that is not valid UTF-8.
            logger.warning("Could not load frontier store %s: %s", self.path, err)
            return
        if not isinstance(data, dict) or data.get("schema_version") != FRONTIER_SCHEMA_VERSION:
            logger.warning("Ignoring frontier store %s: unsupported schema", self.path)
            return
        entries = data.get("entries")
        if not isinstance(entries, list):
            logger.warning("Ignoring frontier store %s: malformed entries", self.path)
            return

        start_url = data.get("start_url")
        fingerprint = data.get("run_fingerprint")
        created_at = data.get("created_at")
        updated_at = data.get("updated_at")
        self.start_url = start_url if isinstance(start_url, str) else None
        self.run_fingerprint = fingerprint if isinstance(fingerprint, dict) else None
        self.created_at = created_at if isinstance(created_at, str) else None
        self.updated_at = updated_at if isinstance(updated_at, str) else None

        for item in entries:
            if not isinstance(item, dict):
                continue
            entry = FrontierEntry.from_json(item)
            if entry:
                # Later duplicates of the same URL overwrite earlier ones.
                self.entries[entry.url] = entry

    def save(self) -> None:
        """Write the full store to ``self.path``.

        The payload goes to a sibling ``.tmp`` file first and is then moved
        over the target; on any failure the temp file is removed and the
        exception is re-raised.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        now = utc_now_iso()
        if self.created_at is None:
            self.created_at = now
        self.updated_at = now
        data = {
            "schema_version": FRONTIER_SCHEMA_VERSION,
            "start_url": self.start_url,
            "run_fingerprint": self.run_fingerprint,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "entries": [entry.to_json() for entry in self.entries.values()],
        }
        tmp = self.path.with_suffix(self.path.suffix + ".tmp")
        try:
            tmp.write_text(json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
            tmp.replace(self.path)
        except Exception:
            tmp.unlink(missing_ok=True)
            raise

    def initialize(self, *, start_url: str, run_fingerprint: dict[str, object]) -> None:
        """Bind the store to a run and persist it.

        If the loaded state belongs to a different start URL or fingerprint,
        all entries are dropped and ``created_at`` is reset before saving.
        """
        if not self.compatible(start_url=start_url, run_fingerprint=run_fingerprint):
            self.entries.clear()
            self.created_at = utc_now_iso()
        self.start_url = start_url
        self.run_fingerprint = run_fingerprint
        self.save()

    def compatible(self, *, start_url: str, run_fingerprint: dict[str, object]) -> bool:
        """Return True when the stored run matches both the URL and fingerprint."""
        return self.start_url == start_url and self.run_fingerprint == run_fingerprint

    def add(self, url: str, *, depth: int | None = None, source: str | None = None) -> None:
        """Register ``url`` as queued; an already-known URL is left untouched.

        This only updates memory; nothing is written until :meth:`save` runs.
        """
        if url in self.entries:
            return
        self.entries[url] = FrontierEntry(url=url, depth=depth, source=source)

    def add_many(self, urls: list[str], *, source: str | None = None) -> None:
        """Register every URL in ``urls`` via :meth:`add` with a shared ``source``."""
        for url in urls:
            self.add(url, source=source)

    def _entry_for(self, url: str) -> FrontierEntry:
        """Return the entry for ``url``, registering it first if unknown."""
        if url not in self.entries:
            self.add(url)
        return self.entries[url]

    def mark_processing(self, url: str) -> None:
        """Record that work on ``url`` started, count the attempt, and save."""
        entry = self._entry_for(url)
        entry.state = FrontierState.PROCESSING
        entry.attempts += 1
        entry.updated_at = utc_now_iso()
        self.save()

    def mark_succeeded(self, url: str) -> None:
        """Record a successful outcome for ``url`` and save."""
        self._mark_terminal(url, FrontierState.SUCCEEDED)

    def mark_skipped(self, url: str) -> None:
        """Record that ``url`` was skipped and save."""
        self._mark_terminal(url, FrontierState.SKIPPED)

    def mark_failed(self, url: str, error: str | None = None) -> None:
        """Record a failure for ``url`` (with an optional message) and save."""
        self._mark_terminal(url, FrontierState.FAILED, error=error)

    def _mark_terminal(self, url: str, state: FrontierState, error: str | None = None) -> None:
        """Set the outcome ``state`` for ``url`` and save.

        ``last_error`` is overwritten with ``error``, so recording an outcome
        without an error clears any previously stored message.
        """
        entry = self._entry_for(url)
        entry.state = state
        entry.last_error = error
        entry.updated_at = utc_now_iso()
        self.save()

    def pending_urls(self) -> list[str]:
        """Return URLs whose state is neither succeeded nor skipped.

        Queued, processing and failed URLs are all reported as pending.
        """
        terminal = {FrontierState.SUCCEEDED, FrontierState.SKIPPED}
        return [url for url, entry in self.entries.items() if entry.state not in terminal]

    def clear(self) -> None:
        """Delete the store file (if any) and reset all in-memory state."""
        # missing_ok avoids the exists()/unlink() race and matches save().
        self.path.unlink(missing_ok=True)
        self.entries.clear()
        self.start_url = None
        self.run_fingerprint = None
        self.created_at = None
        self.updated_at = None