docpull 3.0.2__tar.gz → 4.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-3.0.2/src/docpull.egg-info → docpull-4.0.1}/PKG-INFO +42 -26
- {docpull-3.0.2 → docpull-4.0.1}/README.md +38 -24
- {docpull-3.0.2 → docpull-4.0.1}/pyproject.toml +4 -2
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/__init__.py +1 -6
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/cache/__init__.py +4 -0
- docpull-4.0.1/src/docpull/cache/frontier.py +199 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/cache/manager.py +139 -134
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/cache/streaming_dedup.py +0 -17
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/cli.py +32 -17
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/markdown.py +21 -6
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/core/fetcher.py +105 -49
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/__init__.py +0 -4
- docpull-4.0.1/src/docpull/discovery/_fetch.py +33 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/crawler.py +2 -47
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/filters.py +1 -6
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/link_extractors/enhanced.py +2 -27
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/link_extractors/static.py +2 -28
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/sitemap.py +2 -5
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/doctor.py +0 -13
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/http/client.py +40 -46
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/mcp/tools.py +103 -18
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/models/__init__.py +16 -3
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/models/config.py +0 -5
- docpull-4.0.1/src/docpull/models/document.py +78 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/models/events.py +41 -2
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/models/profiles.py +6 -8
- docpull-4.0.1/src/docpull/models/run.py +60 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/base.py +58 -23
- docpull-4.0.1/src/docpull/pipeline/manifest.py +74 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/convert.py +39 -6
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/dedup.py +11 -4
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/fetch.py +28 -10
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/save.py +53 -6
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/save_json.py +30 -11
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/save_ndjson.py +39 -21
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/save_sqlite.py +68 -10
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/validate.py +10 -7
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/security/robots.py +7 -5
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/security/url_validator.py +87 -38
- {docpull-3.0.2 → docpull-4.0.1/src/docpull.egg-info}/PKG-INFO +42 -26
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/SOURCES.txt +9 -3
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/requires.txt +3 -1
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_cache_conditional_get.py +28 -0
- docpull-4.0.1/tests/test_ci_policy.py +38 -0
- docpull-4.0.1/tests/test_cli.py +86 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_conversion.py +34 -6
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_convert_step_new.py +6 -8
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_discovery.py +33 -0
- docpull-4.0.1/tests/test_frontier_resume.py +35 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_integration.py +67 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_mcp_tools.py +59 -1
- docpull-4.0.1/tests/test_outputs_e2e.py +120 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_save_ndjson.py +2 -0
- docpull-4.0.1/tests/test_save_sqlite.py +52 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_security_hardening.py +154 -0
- docpull-3.0.2/src/docpull/concurrency/__init__.py +0 -7
- docpull-3.0.2/src/docpull/concurrency/manager.py +0 -123
- docpull-3.0.2/src/docpull/logging_config.py +0 -53
- docpull-3.0.2/tests/test_cli.py +0 -20
- {docpull-3.0.2 → docpull-4.0.1}/LICENSE +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/setup.cfg +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/__main__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/__init__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/chunking.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/extractor.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/protocols.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/special_cases.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/conversion/trafilatura_extractor.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/core/__init__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/composite.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/link_extractors/__init__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/link_extractors/protocols.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/discovery/protocols.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/http/__init__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/http/protocols.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/http/rate_limiter.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/mcp/__init__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/mcp/server.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/mcp/sources.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/metadata_extractor.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/__init__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/chunk.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/py.typed +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/security/__init__.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull/time_utils.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/src/docpull.egg-info/top_level.txt +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_chunking.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_link_extractors.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_mcp_server.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_naming.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_pipeline.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_real_site_regressions.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_special_cases.py +0 -0
- {docpull-3.0.2 → docpull-4.0.1}/tests/test_time_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version:
|
|
3
|
+
Version: 4.0.1
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -42,7 +42,7 @@ Requires-Dist: beautifulsoup4>=4.12.0
|
|
|
42
42
|
Requires-Dist: html2text>=2020.1.16
|
|
43
43
|
Requires-Dist: defusedxml>=0.7.1
|
|
44
44
|
Requires-Dist: extruct>=0.15.0
|
|
45
|
-
Requires-Dist: aiohttp>=3.
|
|
45
|
+
Requires-Dist: aiohttp>=3.14.0
|
|
46
46
|
Requires-Dist: idna>=3.15
|
|
47
47
|
Requires-Dist: regex>=2024.11.6
|
|
48
48
|
Requires-Dist: rich>=13.0.0
|
|
@@ -59,6 +59,7 @@ Provides-Extra: tokens
|
|
|
59
59
|
Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
|
|
60
60
|
Provides-Extra: mcp
|
|
61
61
|
Requires-Dist: mcp>=1.0.0; extra == "mcp"
|
|
62
|
+
Requires-Dist: pyjwt>=2.13.0; extra == "mcp"
|
|
62
63
|
Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
|
|
63
64
|
Requires-Dist: starlette>=1.0.1; extra == "mcp"
|
|
64
65
|
Provides-Extra: llm
|
|
@@ -69,6 +70,7 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
|
69
70
|
Requires-Dist: trafilatura>=1.12.0; extra == "all"
|
|
70
71
|
Requires-Dist: tiktoken>=0.7.0; extra == "all"
|
|
71
72
|
Requires-Dist: mcp>=1.0.0; extra == "all"
|
|
73
|
+
Requires-Dist: pyjwt>=2.13.0; extra == "all"
|
|
72
74
|
Requires-Dist: python-multipart>=0.0.27; extra == "all"
|
|
73
75
|
Requires-Dist: starlette>=1.0.1; extra == "all"
|
|
74
76
|
Provides-Extra: dev
|
|
@@ -150,7 +152,7 @@ content directly from framework data feeds:
|
|
|
150
152
|
| Mintlify | `__NEXT_DATA__` with Mintlify tagging |
|
|
151
153
|
| OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
|
|
152
154
|
| Docusaurus| Detected and tagged; generic extractor produces Markdown |
|
|
153
|
-
| Sphinx | Detected and tagged; generic extractor produces Markdown |
|
|
155
|
+
| Sphinx | Detected from generator metadata / Read the Docs hosts and tagged; generic extractor produces Markdown |
|
|
154
156
|
|
|
155
157
|
JS-only SPAs with no server-rendered content are detected and skipped with a
|
|
156
158
|
clear reason (or, with `--strict-js-required`, reported as an error so agents
|
|
@@ -213,8 +215,8 @@ async def tool_call(url: str) -> str:
|
|
|
213
215
|
|
|
214
216
|
```bash
|
|
215
217
|
docpull https://site.com --profile rag # Default. Dedup, rich metadata.
|
|
216
|
-
docpull https://site.com --profile llm # NDJSON + chunks + metadata.
|
|
217
|
-
docpull https://site.com --profile mirror # Full archive, polite, cached.
|
|
218
|
+
docpull https://site.com --profile llm # NDJSON + chunks + metadata; JS-only pages skip unless --strict-js-required is passed.
|
|
219
|
+
docpull https://site.com --profile mirror # Full archive, polite, cached, hierarchical paths.
|
|
218
220
|
docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
|
|
219
221
|
```
|
|
220
222
|
|
|
@@ -283,7 +285,9 @@ Write:
|
|
|
283
285
|
- `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
|
|
284
286
|
- `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
|
|
285
287
|
|
|
286
|
-
All tools
|
|
288
|
+
All schema-backed tools return `structuredContent` validated against an
|
|
289
|
+
`outputSchema` for clients that prefer typed output. `fetch_url` intentionally
|
|
290
|
+
returns Markdown text directly.
|
|
287
291
|
|
|
288
292
|
User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
|
|
289
293
|
|
|
@@ -296,16 +300,17 @@ sources:
|
|
|
296
300
|
maxPages: 200
|
|
297
301
|
```
|
|
298
302
|
|
|
299
|
-
###
|
|
303
|
+
### Supported MCP path
|
|
300
304
|
|
|
301
|
-
The
|
|
302
|
-
|
|
303
|
-
the
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
and
|
|
305
|
+
The supported MCP server is the Python stdio server started by `docpull mcp`.
|
|
306
|
+
That is the only MCP path covered by the `docpull` package release contract and
|
|
307
|
+
the one agents, plugin users, Claude Code, Cursor, and Claude Desktop should
|
|
308
|
+
use.
|
|
309
|
+
|
|
310
|
+
This repository also contains an `mcp/` directory with an internal TypeScript +
|
|
311
|
+
Bun lab for PostgreSQL/pgvector semantic search. It is not shipped by the Python
|
|
312
|
+
package, is not documented as a user install path, and should be ignored unless
|
|
313
|
+
you are explicitly developing that lab.
|
|
309
314
|
|
|
310
315
|
## Output
|
|
311
316
|
|
|
@@ -325,9 +330,14 @@ source_type: "nextjs"
|
|
|
325
330
|
NDJSON (one record per page or chunk):
|
|
326
331
|
|
|
327
332
|
```json
|
|
328
|
-
{"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
333
|
+
{"document_id": "doc_...", "chunk_id": "chunk_...", "url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
329
334
|
```
|
|
330
335
|
|
|
336
|
+
Every output format also writes `corpus.manifest.json` next to the generated
|
|
337
|
+
documents. The manifest records the run identity, output format, stable
|
|
338
|
+
`document_id` / `chunk_id` values, content hashes, relative output paths, and
|
|
339
|
+
chunk counts so regenerated corpora can be diffed and cited by agents.
|
|
340
|
+
|
|
331
341
|
## Security
|
|
332
342
|
|
|
333
343
|
- HTTPS-only, mandatory robots.txt compliance
|
|
@@ -347,7 +357,7 @@ Run `docpull --help` for the full list. Highlights:
|
|
|
347
357
|
|
|
348
358
|
```
|
|
349
359
|
Core:
|
|
350
|
-
--profile {rag,mirror,quick,llm
|
|
360
|
+
--profile {rag,mirror,quick,llm}
|
|
351
361
|
--single Fetch one URL (no crawl)
|
|
352
362
|
--format {markdown,json,ndjson,sqlite}
|
|
353
363
|
--stream Stream NDJSON to stdout
|
|
@@ -366,27 +376,33 @@ Cache:
|
|
|
366
376
|
--cache Enable incremental updates
|
|
367
377
|
--cache-dir DIR
|
|
368
378
|
--cache-ttl DAYS
|
|
379
|
+
|
|
380
|
+
Crawl:
|
|
381
|
+
--max-concurrent N Global request concurrency
|
|
382
|
+
--per-host-concurrent N Per-host request concurrency
|
|
369
383
|
```
|
|
370
384
|
|
|
371
385
|
## Performance
|
|
372
386
|
|
|
373
387
|
End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
|
|
374
388
|
synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
|
|
375
|
-
HTTP keep-alive, 5% injected duplicate content)
|
|
389
|
+
`per_host_concurrent=50`, HTTP keep-alive, 5% injected duplicate content).
|
|
390
|
+
The benchmark emits progress every 1,000 pages plus a final JSON report for
|
|
391
|
+
trend tooling.
|
|
376
392
|
|
|
377
393
|
| Metric | Value |
|
|
378
394
|
|---|---|
|
|
379
|
-
| Total wall time | ~
|
|
380
|
-
|
|
|
381
|
-
|
|
|
382
|
-
| Per-page latency p50 / p95 / p99 | ~
|
|
383
|
-
| Peak RSS delta from baseline | ~
|
|
384
|
-
| Cache manifest size on disk | ~
|
|
395
|
+
| Total wall time | ~333 s |
|
|
396
|
+
| Pages fetched / skipped / failed | 9,501 / 499 / 0 |
|
|
397
|
+
| Time to first saved page | ~130 ms |
|
|
398
|
+
| Per-page latency p50 / p95 / p99 | ~0 / 166 / 232 ms |
|
|
399
|
+
| Peak RSS delta from baseline | ~94 MB |
|
|
400
|
+
| Cache manifest size on disk | ~8.9 MB |
|
|
385
401
|
| Duplicates detected (5% injected) | 499 / 500 |
|
|
386
402
|
|
|
387
403
|
Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
|
|
388
|
-
benchmark in `tests/benchmarks/` and prints a JSON line you can
|
|
389
|
-
into trend tooling).
|
|
404
|
+
benchmark in `tests/benchmarks/` and prints progress plus a JSON line you can
|
|
405
|
+
pipe into trend tooling).
|
|
390
406
|
|
|
391
407
|
## Troubleshooting
|
|
392
408
|
|
|
@@ -62,7 +62,7 @@ content directly from framework data feeds:
|
|
|
62
62
|
| Mintlify | `__NEXT_DATA__` with Mintlify tagging |
|
|
63
63
|
| OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
|
|
64
64
|
| Docusaurus| Detected and tagged; generic extractor produces Markdown |
|
|
65
|
-
| Sphinx | Detected and tagged; generic extractor produces Markdown |
|
|
65
|
+
| Sphinx | Detected from generator metadata / Read the Docs hosts and tagged; generic extractor produces Markdown |
|
|
66
66
|
|
|
67
67
|
JS-only SPAs with no server-rendered content are detected and skipped with a
|
|
68
68
|
clear reason (or, with `--strict-js-required`, reported as an error so agents
|
|
@@ -125,8 +125,8 @@ async def tool_call(url: str) -> str:
|
|
|
125
125
|
|
|
126
126
|
```bash
|
|
127
127
|
docpull https://site.com --profile rag # Default. Dedup, rich metadata.
|
|
128
|
-
docpull https://site.com --profile llm # NDJSON + chunks + metadata.
|
|
129
|
-
docpull https://site.com --profile mirror # Full archive, polite, cached.
|
|
128
|
+
docpull https://site.com --profile llm # NDJSON + chunks + metadata; JS-only pages skip unless --strict-js-required is passed.
|
|
129
|
+
docpull https://site.com --profile mirror # Full archive, polite, cached, hierarchical paths.
|
|
130
130
|
docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
|
|
131
131
|
```
|
|
132
132
|
|
|
@@ -195,7 +195,9 @@ Write:
|
|
|
195
195
|
- `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
|
|
196
196
|
- `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
|
|
197
197
|
|
|
198
|
-
All tools
|
|
198
|
+
All schema-backed tools return `structuredContent` validated against an
|
|
199
|
+
`outputSchema` for clients that prefer typed output. `fetch_url` intentionally
|
|
200
|
+
returns Markdown text directly.
|
|
199
201
|
|
|
200
202
|
User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
|
|
201
203
|
|
|
@@ -208,16 +210,17 @@ sources:
|
|
|
208
210
|
maxPages: 200
|
|
209
211
|
```
|
|
210
212
|
|
|
211
|
-
###
|
|
213
|
+
### Supported MCP path
|
|
212
214
|
|
|
213
|
-
The
|
|
214
|
-
|
|
215
|
-
the
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
and
|
|
215
|
+
The supported MCP server is the Python stdio server started by `docpull mcp`.
|
|
216
|
+
That is the only MCP path covered by the `docpull` package release contract and
|
|
217
|
+
the one agents, plugin users, Claude Code, Cursor, and Claude Desktop should
|
|
218
|
+
use.
|
|
219
|
+
|
|
220
|
+
This repository also contains an `mcp/` directory with an internal TypeScript +
|
|
221
|
+
Bun lab for PostgreSQL/pgvector semantic search. It is not shipped by the Python
|
|
222
|
+
package, is not documented as a user install path, and should be ignored unless
|
|
223
|
+
you are explicitly developing that lab.
|
|
221
224
|
|
|
222
225
|
## Output
|
|
223
226
|
|
|
@@ -237,9 +240,14 @@ source_type: "nextjs"
|
|
|
237
240
|
NDJSON (one record per page or chunk):
|
|
238
241
|
|
|
239
242
|
```json
|
|
240
|
-
{"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
243
|
+
{"document_id": "doc_...", "chunk_id": "chunk_...", "url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
241
244
|
```
|
|
242
245
|
|
|
246
|
+
Every output format also writes `corpus.manifest.json` next to the generated
|
|
247
|
+
documents. The manifest records the run identity, output format, stable
|
|
248
|
+
`document_id` / `chunk_id` values, content hashes, relative output paths, and
|
|
249
|
+
chunk counts so regenerated corpora can be diffed and cited by agents.
|
|
250
|
+
|
|
243
251
|
## Security
|
|
244
252
|
|
|
245
253
|
- HTTPS-only, mandatory robots.txt compliance
|
|
@@ -259,7 +267,7 @@ Run `docpull --help` for the full list. Highlights:
|
|
|
259
267
|
|
|
260
268
|
```
|
|
261
269
|
Core:
|
|
262
|
-
--profile {rag,mirror,quick,llm
|
|
270
|
+
--profile {rag,mirror,quick,llm}
|
|
263
271
|
--single Fetch one URL (no crawl)
|
|
264
272
|
--format {markdown,json,ndjson,sqlite}
|
|
265
273
|
--stream Stream NDJSON to stdout
|
|
@@ -278,27 +286,33 @@ Cache:
|
|
|
278
286
|
--cache Enable incremental updates
|
|
279
287
|
--cache-dir DIR
|
|
280
288
|
--cache-ttl DAYS
|
|
289
|
+
|
|
290
|
+
Crawl:
|
|
291
|
+
--max-concurrent N Global request concurrency
|
|
292
|
+
--per-host-concurrent N Per-host request concurrency
|
|
281
293
|
```
|
|
282
294
|
|
|
283
295
|
## Performance
|
|
284
296
|
|
|
285
297
|
End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
|
|
286
298
|
synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
|
|
287
|
-
HTTP keep-alive, 5% injected duplicate content)
|
|
299
|
+
`per_host_concurrent=50`, HTTP keep-alive, 5% injected duplicate content).
|
|
300
|
+
The benchmark emits progress every 1,000 pages plus a final JSON report for
|
|
301
|
+
trend tooling.
|
|
288
302
|
|
|
289
303
|
| Metric | Value |
|
|
290
304
|
|---|---|
|
|
291
|
-
| Total wall time | ~
|
|
292
|
-
|
|
|
293
|
-
|
|
|
294
|
-
| Per-page latency p50 / p95 / p99 | ~
|
|
295
|
-
| Peak RSS delta from baseline | ~
|
|
296
|
-
| Cache manifest size on disk | ~
|
|
305
|
+
| Total wall time | ~333 s |
|
|
306
|
+
| Pages fetched / skipped / failed | 9,501 / 499 / 0 |
|
|
307
|
+
| Time to first saved page | ~130 ms |
|
|
308
|
+
| Per-page latency p50 / p95 / p99 | ~0 / 166 / 232 ms |
|
|
309
|
+
| Peak RSS delta from baseline | ~94 MB |
|
|
310
|
+
| Cache manifest size on disk | ~8.9 MB |
|
|
297
311
|
| Duplicates detected (5% injected) | 499 / 500 |
|
|
298
312
|
|
|
299
313
|
Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
|
|
300
|
-
benchmark in `tests/benchmarks/` and prints a JSON line you can
|
|
301
|
-
into trend tooling).
|
|
314
|
+
benchmark in `tests/benchmarks/` and prints progress plus a JSON line you can
|
|
315
|
+
pipe into trend tooling).
|
|
302
316
|
|
|
303
317
|
## Troubleshooting
|
|
304
318
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "
|
|
7
|
+
version = "4.0.1"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -66,7 +66,7 @@ dependencies = [
|
|
|
66
66
|
"html2text>=2020.1.16",
|
|
67
67
|
"defusedxml>=0.7.1",
|
|
68
68
|
"extruct>=0.15.0",
|
|
69
|
-
"aiohttp>=3.
|
|
69
|
+
"aiohttp>=3.14.0", # 3.14.0 fixes CVE-2026-34993 and CVE-2026-47265
|
|
70
70
|
"idna>=3.15",
|
|
71
71
|
"regex>=2024.11.6",
|
|
72
72
|
"rich>=13.0.0",
|
|
@@ -90,6 +90,7 @@ tokens = [
|
|
|
90
90
|
]
|
|
91
91
|
mcp = [
|
|
92
92
|
"mcp>=1.0.0",
|
|
93
|
+
"pyjwt>=2.13.0",
|
|
93
94
|
"python-multipart>=0.0.27",
|
|
94
95
|
"starlette>=1.0.1",
|
|
95
96
|
]
|
|
@@ -102,6 +103,7 @@ all = [
|
|
|
102
103
|
"trafilatura>=1.12.0",
|
|
103
104
|
"tiktoken>=0.7.0",
|
|
104
105
|
"mcp>=1.0.0",
|
|
106
|
+
"pyjwt>=2.13.0",
|
|
105
107
|
"python-multipart>=0.0.27",
|
|
106
108
|
"starlette>=1.0.1",
|
|
107
109
|
]
|
|
@@ -14,7 +14,7 @@ Usage:
|
|
|
14
14
|
print(event)
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
__version__ = "
|
|
17
|
+
__version__ = "4.0.1"
|
|
18
18
|
|
|
19
19
|
from .cache import CacheManager, StreamingDeduplicator
|
|
20
20
|
from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
|
|
@@ -34,12 +34,10 @@ from .pipeline.base import PageContext
|
|
|
34
34
|
|
|
35
35
|
__all__ = [
|
|
36
36
|
"__version__",
|
|
37
|
-
# Core
|
|
38
37
|
"Fetcher",
|
|
39
38
|
"fetch_blocking",
|
|
40
39
|
"fetch_one",
|
|
41
40
|
"PageContext",
|
|
42
|
-
# Config
|
|
43
41
|
"DocpullConfig",
|
|
44
42
|
"ProfileName",
|
|
45
43
|
"CrawlConfig",
|
|
@@ -48,14 +46,11 @@ __all__ = [
|
|
|
48
46
|
"NetworkConfig",
|
|
49
47
|
"PerformanceConfig",
|
|
50
48
|
"CacheConfig",
|
|
51
|
-
# Events
|
|
52
49
|
"EventType",
|
|
53
50
|
"FetchEvent",
|
|
54
51
|
"FetchStats",
|
|
55
|
-
# Cache
|
|
56
52
|
"CacheManager",
|
|
57
53
|
"StreamingDeduplicator",
|
|
58
|
-
# Chunking
|
|
59
54
|
"Chunk",
|
|
60
55
|
"TokenCounter",
|
|
61
56
|
"chunk_markdown",
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Caching and deduplication for docpull."""
|
|
2
2
|
|
|
3
|
+
from .frontier import FrontierEntry, FrontierState, FrontierStore
|
|
3
4
|
from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
|
|
4
5
|
from .streaming_dedup import StreamingDeduplicator
|
|
5
6
|
|
|
@@ -7,6 +8,9 @@ __all__ = [
|
|
|
7
8
|
"CacheManager",
|
|
8
9
|
"CacheState",
|
|
9
10
|
"ManifestEntry",
|
|
11
|
+
"FrontierEntry",
|
|
12
|
+
"FrontierState",
|
|
13
|
+
"FrontierStore",
|
|
10
14
|
"StreamingDeduplicator",
|
|
11
15
|
"DEFAULT_TTL_DAYS",
|
|
12
16
|
]
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""Durable crawl frontier state for pause/resume and provenance."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from ..models.run import FRONTIER_SCHEMA_VERSION
|
|
13
|
+
from ..time_utils import utc_now_iso
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FrontierState(str, Enum):
    """Lifecycle state for a URL in the crawl frontier.

    Members mix in ``str``, so each member compares equal to its plain string
    value and can be rebuilt from that value with ``FrontierState("queued")``.
    """

    # State given to a newly created entry.
    QUEUED = "queued"
    # The URL has been picked up and an attempt is in progress.
    PROCESSING = "processing"
    # The URL was handled successfully.
    SUCCEEDED = "succeeded"
    # The URL was deliberately not handled.
    SKIPPED = "skipped"
    # The last attempt at the URL ended in an error.
    FAILED = "failed"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class FrontierEntry:
    """A single URL tracked by the crawl frontier, with its bookkeeping.

    Timestamps are stored as the strings produced by ``utc_now_iso``; both
    default to the moment the entry object is created.
    """

    url: str
    state: FrontierState = FrontierState.QUEUED
    depth: int | None = None
    source: str | None = None
    discovered_at: str = field(default_factory=utc_now_iso)
    updated_at: str = field(default_factory=utc_now_iso)
    attempts: int = 0
    last_error: str | None = None

    @classmethod
    def from_json(cls, data: dict[str, Any]) -> FrontierEntry | None:
        """Build an entry from a decoded JSON object, tolerating bad fields.

        Args:
            data: Mapping as produced by ``to_json`` (possibly hand-edited or
                written by an older version).

        Returns:
            The reconstructed entry, or ``None`` when ``url`` is missing or is
            not a string. Any other field of the wrong type falls back to its
            default rather than rejecting the whole entry; an unrecognised
            ``state`` falls back to ``QUEUED``.
        """
        url = data.get("url")
        if not isinstance(url, str):
            return None

        try:
            state = FrontierState(str(data.get("state", FrontierState.QUEUED.value)))
        except ValueError:
            state = FrontierState.QUEUED

        def read_str(key: str) -> str | None:
            value = data.get(key)
            return value if isinstance(value, str) else None

        def read_int(key: str) -> int | None:
            value = data.get(key)
            # bool is a subclass of int: without this guard a JSON ``true``
            # would be stored as a depth/attempt count and re-serialised as a
            # boolean on the next save.
            if isinstance(value, bool) or not isinstance(value, int):
                return None
            return value

        attempts = read_int("attempts")
        discovered_at = read_str("discovered_at")
        updated_at = read_str("updated_at")
        return cls(
            url=url,
            state=state,
            depth=read_int("depth"),
            source=read_str("source"),
            discovered_at=discovered_at if discovered_at is not None else utc_now_iso(),
            updated_at=updated_at if updated_at is not None else utc_now_iso(),
            attempts=attempts if attempts is not None else 0,
            last_error=read_str("last_error"),
        )

    def to_json(self) -> dict[str, Any]:
        """Return a JSON-serialisable dict; ``state`` is written as its string value."""
        return {
            "url": self.url,
            "state": self.state.value,
            "depth": self.depth,
            "source": self.source,
            "discovered_at": self.discovered_at,
            "updated_at": self.updated_at,
            "attempts": self.attempts,
            "last_error": self.last_error,
        }
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class FrontierStore:
    """Small JSON-backed frontier store.

    The store is intentionally simple because docpull is single-process today.
    It gives us explicit URL lifecycle state and a compatibility fingerprint
    without introducing a queue service or SQLite dependency for markdown users.

    Existing state is loaded from ``path`` on construction. ``add`` and
    ``add_many`` only change the in-memory state; ``initialize`` and the
    ``mark_*`` methods rewrite the whole file through ``save``.
    """

    def __init__(self, path: Path):
        """Create a store bound to ``path`` and load any state already there."""
        self.path = Path(path)
        self.entries: dict[str, FrontierEntry] = {}
        self.start_url: str | None = None
        self.run_fingerprint: dict[str, object] | None = None
        self.created_at: str | None = None
        self.updated_at: str | None = None
        self._load()

    def _load(self) -> None:
        """Populate the store from disk, leaving it empty if the file is unusable.

        A missing file, an unreadable or undecodable file, a schema version
        other than ``FRONTIER_SCHEMA_VERSION``, or a non-list ``entries`` value
        all leave the store in its empty initial state.
        """
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as err:
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised by read_text() on non-UTF-8 bytes, so a
            # corrupt file degrades to "no saved state" instead of crashing
            # the constructor.
            logger.warning("Could not load frontier store %s: %s", self.path, err)
            return
        if not isinstance(data, dict) or data.get("schema_version") != FRONTIER_SCHEMA_VERSION:
            return
        entries = data.get("entries")
        if not isinstance(entries, list):
            return

        def read_str(key: str) -> str | None:
            value = data.get(key)
            return value if isinstance(value, str) else None

        self.start_url = read_str("start_url")
        fingerprint = data.get("run_fingerprint")
        self.run_fingerprint = fingerprint if isinstance(fingerprint, dict) else None
        self.created_at = read_str("created_at")
        self.updated_at = read_str("updated_at")
        for item in entries:
            if not isinstance(item, dict):
                continue
            entry = FrontierEntry.from_json(item)
            if entry is not None:
                # Later duplicates of a URL overwrite earlier ones.
                self.entries[entry.url] = entry

    def save(self) -> None:
        """Write the full store to ``path``, creating parent directories.

        The payload is written to a sibling ``.tmp`` file and then moved over
        the target with ``Path.replace``. If writing or replacing fails, the
        temporary file is removed and the exception is re-raised.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        now = utc_now_iso()
        if self.created_at is None:
            self.created_at = now
        self.updated_at = now
        data = {
            "schema_version": FRONTIER_SCHEMA_VERSION,
            "start_url": self.start_url,
            "run_fingerprint": self.run_fingerprint,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "entries": [entry.to_json() for entry in self.entries.values()],
        }
        tmp = self.path.with_suffix(self.path.suffix + ".tmp")
        try:
            tmp.write_text(json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
            tmp.replace(self.path)
        except Exception:
            tmp.unlink(missing_ok=True)
            raise

    def initialize(self, *, start_url: str, run_fingerprint: dict[str, object]) -> None:
        """Bind the store to a run and persist it.

        If the loaded ``start_url`` or ``run_fingerprint`` differs from the
        arguments, all entries are dropped and ``created_at`` is reset before
        the new identity is stored.
        """
        if self.start_url != start_url or self.run_fingerprint != run_fingerprint:
            self.entries.clear()
            self.created_at = utc_now_iso()
        self.start_url = start_url
        self.run_fingerprint = run_fingerprint
        self.save()

    def compatible(self, *, start_url: str, run_fingerprint: dict[str, object]) -> bool:
        """Return True when the stored run identity equals the given one."""
        return self.start_url == start_url and self.run_fingerprint == run_fingerprint

    def add(self, url: str, *, depth: int | None = None, source: str | None = None) -> None:
        """Register ``url`` as a new entry; a URL already present is left untouched.

        Does not write to disk.
        """
        if url in self.entries:
            return
        self.entries[url] = FrontierEntry(url=url, depth=depth, source=source)

    def add_many(self, urls: list[str], *, source: str | None = None) -> None:
        """Register each URL in ``urls`` via ``add`` with the same ``source``."""
        for url in urls:
            self.add(url, source=source)

    def _entry_for(self, url: str) -> FrontierEntry:
        """Return the entry for ``url``, creating a default one if it is unknown."""
        entry = self.entries.get(url)
        if entry is None:
            self.add(url)
            entry = self.entries[url]
        return entry

    def mark_processing(self, url: str) -> None:
        """Move ``url`` to PROCESSING, count the attempt, and save."""
        entry = self._entry_for(url)
        entry.state = FrontierState.PROCESSING
        entry.attempts += 1
        entry.updated_at = utc_now_iso()
        self.save()

    def mark_succeeded(self, url: str) -> None:
        """Record ``url`` as SUCCEEDED and save."""
        self._mark_terminal(url, FrontierState.SUCCEEDED)

    def mark_skipped(self, url: str) -> None:
        """Record ``url`` as SKIPPED and save."""
        self._mark_terminal(url, FrontierState.SKIPPED)

    def mark_failed(self, url: str, error: str | None = None) -> None:
        """Record ``url`` as FAILED with an optional error message and save."""
        self._mark_terminal(url, FrontierState.FAILED, error=error)

    def _mark_terminal(self, url: str, state: FrontierState, error: str | None = None) -> None:
        """Set the outcome state for ``url`` and save.

        ``last_error`` is always overwritten with ``error``, so a success or
        skip clears any message left by an earlier failure.
        """
        entry = self._entry_for(url)
        entry.state = state
        entry.last_error = error
        entry.updated_at = utc_now_iso()
        self.save()

    def pending_urls(self) -> list[str]:
        """Return URLs whose state is neither SUCCEEDED nor SKIPPED.

        QUEUED, PROCESSING and FAILED entries are all reported as pending.
        """
        terminal = {FrontierState.SUCCEEDED, FrontierState.SKIPPED}
        return [url for url, entry in self.entries.items() if entry.state not in terminal]

    def clear(self) -> None:
        """Delete the backing file (if any) and reset all in-memory state."""
        # missing_ok avoids the check-then-delete race of exists() + unlink().
        self.path.unlink(missing_ok=True)
        self.entries.clear()
        self.start_url = None
        self.run_fingerprint = None
        self.created_at = None
        self.updated_at = None
|