docpull 4.0.0__tar.gz → 4.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-4.0.0/src/docpull.egg-info → docpull-4.0.1}/PKG-INFO +39 -25
- {docpull-4.0.0 → docpull-4.0.1}/README.md +38 -24
- {docpull-4.0.0 → docpull-4.0.1}/pyproject.toml +1 -1
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/__init__.py +1 -6
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/cache/__init__.py +4 -0
- docpull-4.0.1/src/docpull/cache/frontier.py +199 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/cache/manager.py +140 -23
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/cli.py +32 -17
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/core/fetcher.py +105 -49
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/__init__.py +0 -4
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/crawler.py +0 -19
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/filters.py +1 -6
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/sitemap.py +0 -4
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/doctor.py +0 -13
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/mcp/tools.py +92 -17
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/models/__init__.py +16 -3
- docpull-4.0.1/src/docpull/models/document.py +78 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/models/events.py +41 -2
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/models/profiles.py +6 -8
- docpull-4.0.1/src/docpull/models/run.py +60 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/base.py +60 -12
- docpull-4.0.1/src/docpull/pipeline/manifest.py +74 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/convert.py +39 -6
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/dedup.py +11 -4
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/fetch.py +12 -8
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/save.py +53 -6
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/save_json.py +30 -11
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/save_ndjson.py +39 -21
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/save_sqlite.py +68 -10
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/validate.py +10 -7
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/security/url_validator.py +0 -6
- {docpull-4.0.0 → docpull-4.0.1/src/docpull.egg-info}/PKG-INFO +39 -25
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/SOURCES.txt +7 -0
- docpull-4.0.1/tests/test_cli.py +86 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_convert_step_new.py +6 -8
- docpull-4.0.1/tests/test_frontier_resume.py +35 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_integration.py +67 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_mcp_tools.py +39 -1
- docpull-4.0.1/tests/test_outputs_e2e.py +120 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_save_ndjson.py +2 -0
- docpull-4.0.1/tests/test_save_sqlite.py +52 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_security_hardening.py +62 -0
- docpull-4.0.0/tests/test_cli.py +0 -20
- {docpull-4.0.0 → docpull-4.0.1}/LICENSE +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/setup.cfg +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/__main__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/cache/streaming_dedup.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/__init__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/chunking.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/extractor.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/markdown.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/protocols.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/special_cases.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/conversion/trafilatura_extractor.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/core/__init__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/_fetch.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/composite.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/link_extractors/__init__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/link_extractors/protocols.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/link_extractors/static.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/discovery/protocols.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/http/__init__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/http/client.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/http/protocols.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/http/rate_limiter.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/mcp/__init__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/mcp/server.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/mcp/sources.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/metadata_extractor.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/models/config.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/__init__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/chunk.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/py.typed +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/security/__init__.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/security/robots.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull/time_utils.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/requires.txt +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/src/docpull.egg-info/top_level.txt +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_cache_conditional_get.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_chunking.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_ci_policy.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_conversion.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_discovery.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_link_extractors.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_mcp_server.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_naming.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_pipeline.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_real_site_regressions.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_special_cases.py +0 -0
- {docpull-4.0.0 → docpull-4.0.1}/tests/test_time_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 4.0.0
|
|
3
|
+
Version: 4.0.1
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -152,7 +152,7 @@ content directly from framework data feeds:
|
|
|
152
152
|
| Mintlify | `__NEXT_DATA__` with Mintlify tagging |
|
|
153
153
|
| OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
|
|
154
154
|
| Docusaurus| Detected and tagged; generic extractor produces Markdown |
|
|
155
|
-
| Sphinx | Detected and tagged; generic extractor produces Markdown |
|
|
155
|
+
| Sphinx | Detected from generator metadata / Read the Docs hosts and tagged; generic extractor produces Markdown |
|
|
156
156
|
|
|
157
157
|
JS-only SPAs with no server-rendered content are detected and skipped with a
|
|
158
158
|
clear reason (or, with `--strict-js-required`, reported as an error so agents
|
|
@@ -215,8 +215,8 @@ async def tool_call(url: str) -> str:
|
|
|
215
215
|
|
|
216
216
|
```bash
|
|
217
217
|
docpull https://site.com --profile rag # Default. Dedup, rich metadata.
|
|
218
|
-
docpull https://site.com --profile llm # NDJSON + chunks + metadata.
|
|
219
|
-
docpull https://site.com --profile mirror # Full archive, polite, cached.
|
|
218
|
+
docpull https://site.com --profile llm # NDJSON + chunks + metadata; JS-only pages skip unless --strict-js-required is passed.
|
|
219
|
+
docpull https://site.com --profile mirror # Full archive, polite, cached, hierarchical paths.
|
|
220
220
|
docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
|
|
221
221
|
```
|
|
222
222
|
|
|
@@ -285,7 +285,9 @@ Write:
|
|
|
285
285
|
- `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
|
|
286
286
|
- `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
|
|
287
287
|
|
|
288
|
-
All tools
|
|
288
|
+
All schema-backed tools return `structuredContent` validated against an
|
|
289
|
+
`outputSchema` for clients that prefer typed output. `fetch_url` intentionally
|
|
290
|
+
returns Markdown text directly.
|
|
289
291
|
|
|
290
292
|
User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
|
|
291
293
|
|
|
@@ -298,16 +300,17 @@ sources:
|
|
|
298
300
|
maxPages: 200
|
|
299
301
|
```
|
|
300
302
|
|
|
301
|
-
###
|
|
303
|
+
### Supported MCP path
|
|
302
304
|
|
|
303
|
-
The
|
|
304
|
-
|
|
305
|
-
the
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
and
|
|
305
|
+
The supported MCP server is the Python stdio server started by `docpull mcp`.
|
|
306
|
+
That is the only MCP path covered by the `docpull` package release contract and
|
|
307
|
+
the one agents, plugin users, Claude Code, Cursor, and Claude Desktop should
|
|
308
|
+
use.
|
|
309
|
+
|
|
310
|
+
This repository also contains an `mcp/` directory with an internal TypeScript +
|
|
311
|
+
Bun lab for PostgreSQL/pgvector semantic search. It is not shipped by the Python
|
|
312
|
+
package, is not documented as a user install path, and should be ignored unless
|
|
313
|
+
you are explicitly developing that lab.
|
|
311
314
|
|
|
312
315
|
## Output
|
|
313
316
|
|
|
@@ -327,9 +330,14 @@ source_type: "nextjs"
|
|
|
327
330
|
NDJSON (one record per page or chunk):
|
|
328
331
|
|
|
329
332
|
```json
|
|
330
|
-
{"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
333
|
+
{"document_id": "doc_...", "chunk_id": "chunk_...", "url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
331
334
|
```
|
|
332
335
|
|
|
336
|
+
Every output format also writes `corpus.manifest.json` next to the generated
|
|
337
|
+
documents. The manifest records the run identity, output format, stable
|
|
338
|
+
`document_id` / `chunk_id` values, content hashes, relative output paths, and
|
|
339
|
+
chunk counts so regenerated corpora can be diffed and cited by agents.
|
|
340
|
+
|
|
333
341
|
## Security
|
|
334
342
|
|
|
335
343
|
- HTTPS-only, mandatory robots.txt compliance
|
|
@@ -349,7 +357,7 @@ Run `docpull --help` for the full list. Highlights:
|
|
|
349
357
|
|
|
350
358
|
```
|
|
351
359
|
Core:
|
|
352
|
-
--profile {rag,mirror,quick,llm
|
|
360
|
+
--profile {rag,mirror,quick,llm}
|
|
353
361
|
--single Fetch one URL (no crawl)
|
|
354
362
|
--format {markdown,json,ndjson,sqlite}
|
|
355
363
|
--stream Stream NDJSON to stdout
|
|
@@ -368,27 +376,33 @@ Cache:
|
|
|
368
376
|
--cache Enable incremental updates
|
|
369
377
|
--cache-dir DIR
|
|
370
378
|
--cache-ttl DAYS
|
|
379
|
+
|
|
380
|
+
Crawl:
|
|
381
|
+
--max-concurrent N Global request concurrency
|
|
382
|
+
--per-host-concurrent N Per-host request concurrency
|
|
371
383
|
```
|
|
372
384
|
|
|
373
385
|
## Performance
|
|
374
386
|
|
|
375
387
|
End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
|
|
376
388
|
synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
|
|
377
|
-
HTTP keep-alive, 5% injected duplicate content)
|
|
389
|
+
`per_host_concurrent=50`, HTTP keep-alive, 5% injected duplicate content).
|
|
390
|
+
The benchmark emits progress every 1,000 pages plus a final JSON report for
|
|
391
|
+
trend tooling.
|
|
378
392
|
|
|
379
393
|
| Metric | Value |
|
|
380
394
|
|---|---|
|
|
381
|
-
| Total wall time | ~
|
|
382
|
-
|
|
|
383
|
-
|
|
|
384
|
-
| Per-page latency p50 / p95 / p99 | ~
|
|
385
|
-
| Peak RSS delta from baseline | ~
|
|
386
|
-
| Cache manifest size on disk | ~
|
|
395
|
+
| Total wall time | ~333 s |
|
|
396
|
+
| Pages fetched / skipped / failed | 9,501 / 499 / 0 |
|
|
397
|
+
| Time to first saved page | ~130 ms |
|
|
398
|
+
| Per-page latency p50 / p95 / p99 | ~0 / 166 / 232 ms |
|
|
399
|
+
| Peak RSS delta from baseline | ~94 MB |
|
|
400
|
+
| Cache manifest size on disk | ~8.9 MB |
|
|
387
401
|
| Duplicates detected (5% injected) | 499 / 500 |
|
|
388
402
|
|
|
389
403
|
Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
|
|
390
|
-
benchmark in `tests/benchmarks/` and prints a JSON line you can
|
|
391
|
-
into trend tooling).
|
|
404
|
+
benchmark in `tests/benchmarks/` and prints progress plus a JSON line you can
|
|
405
|
+
pipe into trend tooling).
|
|
392
406
|
|
|
393
407
|
## Troubleshooting
|
|
394
408
|
|
|
@@ -62,7 +62,7 @@ content directly from framework data feeds:
|
|
|
62
62
|
| Mintlify | `__NEXT_DATA__` with Mintlify tagging |
|
|
63
63
|
| OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
|
|
64
64
|
| Docusaurus| Detected and tagged; generic extractor produces Markdown |
|
|
65
|
-
| Sphinx | Detected and tagged; generic extractor produces Markdown |
|
|
65
|
+
| Sphinx | Detected from generator metadata / Read the Docs hosts and tagged; generic extractor produces Markdown |
|
|
66
66
|
|
|
67
67
|
JS-only SPAs with no server-rendered content are detected and skipped with a
|
|
68
68
|
clear reason (or, with `--strict-js-required`, reported as an error so agents
|
|
@@ -125,8 +125,8 @@ async def tool_call(url: str) -> str:
|
|
|
125
125
|
|
|
126
126
|
```bash
|
|
127
127
|
docpull https://site.com --profile rag # Default. Dedup, rich metadata.
|
|
128
|
-
docpull https://site.com --profile llm # NDJSON + chunks + metadata.
|
|
129
|
-
docpull https://site.com --profile mirror # Full archive, polite, cached.
|
|
128
|
+
docpull https://site.com --profile llm # NDJSON + chunks + metadata; JS-only pages skip unless --strict-js-required is passed.
|
|
129
|
+
docpull https://site.com --profile mirror # Full archive, polite, cached, hierarchical paths.
|
|
130
130
|
docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
|
|
131
131
|
```
|
|
132
132
|
|
|
@@ -195,7 +195,9 @@ Write:
|
|
|
195
195
|
- `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
|
|
196
196
|
- `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
|
|
197
197
|
|
|
198
|
-
All tools
|
|
198
|
+
All schema-backed tools return `structuredContent` validated against an
|
|
199
|
+
`outputSchema` for clients that prefer typed output. `fetch_url` intentionally
|
|
200
|
+
returns Markdown text directly.
|
|
199
201
|
|
|
200
202
|
User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
|
|
201
203
|
|
|
@@ -208,16 +210,17 @@ sources:
|
|
|
208
210
|
maxPages: 200
|
|
209
211
|
```
|
|
210
212
|
|
|
211
|
-
###
|
|
213
|
+
### Supported MCP path
|
|
212
214
|
|
|
213
|
-
The
|
|
214
|
-
|
|
215
|
-
the
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
and
|
|
215
|
+
The supported MCP server is the Python stdio server started by `docpull mcp`.
|
|
216
|
+
That is the only MCP path covered by the `docpull` package release contract and
|
|
217
|
+
the one agents, plugin users, Claude Code, Cursor, and Claude Desktop should
|
|
218
|
+
use.
|
|
219
|
+
|
|
220
|
+
This repository also contains an `mcp/` directory with an internal TypeScript +
|
|
221
|
+
Bun lab for PostgreSQL/pgvector semantic search. It is not shipped by the Python
|
|
222
|
+
package, is not documented as a user install path, and should be ignored unless
|
|
223
|
+
you are explicitly developing that lab.
|
|
221
224
|
|
|
222
225
|
## Output
|
|
223
226
|
|
|
@@ -237,9 +240,14 @@ source_type: "nextjs"
|
|
|
237
240
|
NDJSON (one record per page or chunk):
|
|
238
241
|
|
|
239
242
|
```json
|
|
240
|
-
{"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
243
|
+
{"document_id": "doc_...", "chunk_id": "chunk_...", "url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
241
244
|
```
|
|
242
245
|
|
|
246
|
+
Every output format also writes `corpus.manifest.json` next to the generated
|
|
247
|
+
documents. The manifest records the run identity, output format, stable
|
|
248
|
+
`document_id` / `chunk_id` values, content hashes, relative output paths, and
|
|
249
|
+
chunk counts so regenerated corpora can be diffed and cited by agents.
|
|
250
|
+
|
|
243
251
|
## Security
|
|
244
252
|
|
|
245
253
|
- HTTPS-only, mandatory robots.txt compliance
|
|
@@ -259,7 +267,7 @@ Run `docpull --help` for the full list. Highlights:
|
|
|
259
267
|
|
|
260
268
|
```
|
|
261
269
|
Core:
|
|
262
|
-
--profile {rag,mirror,quick,llm
|
|
270
|
+
--profile {rag,mirror,quick,llm}
|
|
263
271
|
--single Fetch one URL (no crawl)
|
|
264
272
|
--format {markdown,json,ndjson,sqlite}
|
|
265
273
|
--stream Stream NDJSON to stdout
|
|
@@ -278,27 +286,33 @@ Cache:
|
|
|
278
286
|
--cache Enable incremental updates
|
|
279
287
|
--cache-dir DIR
|
|
280
288
|
--cache-ttl DAYS
|
|
289
|
+
|
|
290
|
+
Crawl:
|
|
291
|
+
--max-concurrent N Global request concurrency
|
|
292
|
+
--per-host-concurrent N Per-host request concurrency
|
|
281
293
|
```
|
|
282
294
|
|
|
283
295
|
## Performance
|
|
284
296
|
|
|
285
297
|
End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
|
|
286
298
|
synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
|
|
287
|
-
HTTP keep-alive, 5% injected duplicate content)
|
|
299
|
+
`per_host_concurrent=50`, HTTP keep-alive, 5% injected duplicate content).
|
|
300
|
+
The benchmark emits progress every 1,000 pages plus a final JSON report for
|
|
301
|
+
trend tooling.
|
|
288
302
|
|
|
289
303
|
| Metric | Value |
|
|
290
304
|
|---|---|
|
|
291
|
-
| Total wall time | ~
|
|
292
|
-
|
|
|
293
|
-
|
|
|
294
|
-
| Per-page latency p50 / p95 / p99 | ~
|
|
295
|
-
| Peak RSS delta from baseline | ~
|
|
296
|
-
| Cache manifest size on disk | ~
|
|
305
|
+
| Total wall time | ~333 s |
|
|
306
|
+
| Pages fetched / skipped / failed | 9,501 / 499 / 0 |
|
|
307
|
+
| Time to first saved page | ~130 ms |
|
|
308
|
+
| Per-page latency p50 / p95 / p99 | ~0 / 166 / 232 ms |
|
|
309
|
+
| Peak RSS delta from baseline | ~94 MB |
|
|
310
|
+
| Cache manifest size on disk | ~8.9 MB |
|
|
297
311
|
| Duplicates detected (5% injected) | 499 / 500 |
|
|
298
312
|
|
|
299
313
|
Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
|
|
300
|
-
benchmark in `tests/benchmarks/` and prints a JSON line you can
|
|
301
|
-
into trend tooling).
|
|
314
|
+
benchmark in `tests/benchmarks/` and prints progress plus a JSON line you can
|
|
315
|
+
pipe into trend tooling).
|
|
302
316
|
|
|
303
317
|
## Troubleshooting
|
|
304
318
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "4.0.0"
|
|
7
|
+
version = "4.0.1"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -14,7 +14,7 @@ Usage:
|
|
|
14
14
|
print(event)
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
__version__ = "4.0.0"
|
|
17
|
+
__version__ = "4.0.1"
|
|
18
18
|
|
|
19
19
|
from .cache import CacheManager, StreamingDeduplicator
|
|
20
20
|
from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
|
|
@@ -34,12 +34,10 @@ from .pipeline.base import PageContext
|
|
|
34
34
|
|
|
35
35
|
__all__ = [
|
|
36
36
|
"__version__",
|
|
37
|
-
# Core
|
|
38
37
|
"Fetcher",
|
|
39
38
|
"fetch_blocking",
|
|
40
39
|
"fetch_one",
|
|
41
40
|
"PageContext",
|
|
42
|
-
# Config
|
|
43
41
|
"DocpullConfig",
|
|
44
42
|
"ProfileName",
|
|
45
43
|
"CrawlConfig",
|
|
@@ -48,14 +46,11 @@ __all__ = [
|
|
|
48
46
|
"NetworkConfig",
|
|
49
47
|
"PerformanceConfig",
|
|
50
48
|
"CacheConfig",
|
|
51
|
-
# Events
|
|
52
49
|
"EventType",
|
|
53
50
|
"FetchEvent",
|
|
54
51
|
"FetchStats",
|
|
55
|
-
# Cache
|
|
56
52
|
"CacheManager",
|
|
57
53
|
"StreamingDeduplicator",
|
|
58
|
-
# Chunking
|
|
59
54
|
"Chunk",
|
|
60
55
|
"TokenCounter",
|
|
61
56
|
"chunk_markdown",
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Caching and deduplication for docpull."""
|
|
2
2
|
|
|
3
|
+
from .frontier import FrontierEntry, FrontierState, FrontierStore
|
|
3
4
|
from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
|
|
4
5
|
from .streaming_dedup import StreamingDeduplicator
|
|
5
6
|
|
|
@@ -7,6 +8,9 @@ __all__ = [
|
|
|
7
8
|
"CacheManager",
|
|
8
9
|
"CacheState",
|
|
9
10
|
"ManifestEntry",
|
|
11
|
+
"FrontierEntry",
|
|
12
|
+
"FrontierState",
|
|
13
|
+
"FrontierStore",
|
|
10
14
|
"StreamingDeduplicator",
|
|
11
15
|
"DEFAULT_TTL_DAYS",
|
|
12
16
|
]
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""Durable crawl frontier state for pause/resume and provenance."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from ..models.run import FRONTIER_SCHEMA_VERSION
|
|
13
|
+
from ..time_utils import utc_now_iso
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FrontierState(str, Enum):
    """Lifecycle marker attached to each URL tracked by the crawl frontier.

    Members subclass ``str``, so a member compares equal to its raw value
    (``FrontierState.QUEUED == "queued"``) and ``FrontierState("queued")``
    resolves a persisted string back to the member.
    """

    # Not yet started.
    QUEUED = "queued"
    # Work on the URL has begun but no outcome has been recorded.
    PROCESSING = "processing"

    # Recorded outcomes.
    SUCCEEDED = "succeeded"
    SKIPPED = "skipped"
    FAILED = "failed"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class FrontierEntry:
    """One URL tracked by the frontier together with its bookkeeping fields.

    ``discovered_at`` and ``updated_at`` default to the value returned by
    ``utc_now_iso()`` at construction time.
    """

    url: str
    state: FrontierState = FrontierState.QUEUED
    depth: int | None = None
    source: str | None = None
    discovered_at: str = field(default_factory=utc_now_iso)
    updated_at: str = field(default_factory=utc_now_iso)
    attempts: int = 0
    last_error: str | None = None

    @staticmethod
    def _opt_str(data: dict[str, Any], key: str) -> str | None:
        """Return ``data[key]`` when it is a string, otherwise ``None``."""
        value = data.get(key)
        return value if isinstance(value, str) else None

    @staticmethod
    def _opt_int(data: dict[str, Any], key: str) -> int | None:
        """Return ``data[key]`` when it is a real integer, otherwise ``None``.

        ``bool`` is a subclass of ``int``; JSON ``true``/``false`` must not be
        accepted as a depth or an attempt counter, so booleans are rejected.
        """
        value = data.get(key)
        if isinstance(value, bool) or not isinstance(value, int):
            return None
        return value

    @classmethod
    def from_json(cls, data: dict[str, Any]) -> FrontierEntry | None:
        """Build an entry from a decoded JSON object, tolerating bad fields.

        Args:
            data: Mapping as produced by :meth:`to_json` (possibly edited or
                damaged on disk).

        Returns:
            The reconstructed entry, or ``None`` when ``url`` is missing or is
            not a string. Any other field with an unexpected type falls back
            to its default; an unknown ``state`` falls back to ``QUEUED``.
        """
        url = data.get("url")
        if not isinstance(url, str):
            return None

        try:
            state = FrontierState(str(data.get("state", FrontierState.QUEUED.value)))
        except ValueError:
            # Unknown or malformed state: treat the URL as not yet processed.
            state = FrontierState.QUEUED

        discovered_at = cls._opt_str(data, "discovered_at")
        updated_at = cls._opt_str(data, "updated_at")
        attempts = cls._opt_int(data, "attempts")

        return cls(
            url=url,
            state=state,
            depth=cls._opt_int(data, "depth"),
            source=cls._opt_str(data, "source"),
            discovered_at=discovered_at if discovered_at is not None else utc_now_iso(),
            updated_at=updated_at if updated_at is not None else utc_now_iso(),
            attempts=attempts if attempts is not None else 0,
            last_error=cls._opt_str(data, "last_error"),
        )

    def to_json(self) -> dict[str, Any]:
        """Return a JSON-serializable dict; ``state`` is stored as its string value."""
        return {
            "url": self.url,
            "state": self.state.value,
            "depth": self.depth,
            "source": self.source,
            "discovered_at": self.discovered_at,
            "updated_at": self.updated_at,
            "attempts": self.attempts,
            "last_error": self.last_error,
        }
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class FrontierStore:
    """Small JSON-backed frontier store.

    The store is intentionally simple because docpull is single-process today.
    It gives us explicit URL lifecycle state and a compatibility fingerprint
    without introducing a queue service or SQLite dependency for markdown users.

    Entries are kept in memory in ``self.entries`` (keyed by URL) and the whole
    store is rewritten to ``self.path`` by :meth:`save`.
    """

    def __init__(self, path: Path):
        """Bind the store to *path* and load any existing state from it."""
        self.path = Path(path)
        self.entries: dict[str, FrontierEntry] = {}
        self.start_url: str | None = None
        self.run_fingerprint: dict[str, object] | None = None
        self.created_at: str | None = None
        self.updated_at: str | None = None
        self._load()

    def _load(self) -> None:
        """Populate the store from disk, leaving it empty on any problem.

        A missing file is normal (first run). An unreadable, undecodable or
        malformed file is logged and ignored so a damaged frontier never
        prevents a crawl from starting.
        """
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return
        except (OSError, ValueError) as err:
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for files that are not valid UTF-8.
            logger.warning("Could not load frontier store %s: %s", self.path, err)
            return
        if not isinstance(data, dict) or data.get("schema_version") != FRONTIER_SCHEMA_VERSION:
            logger.debug("Ignoring frontier store %s: unsupported schema", self.path)
            return
        entries = data.get("entries")
        if not isinstance(entries, list):
            logger.debug("Ignoring frontier store %s: 'entries' is not a list", self.path)
            return

        start_url = data.get("start_url")
        fingerprint = data.get("run_fingerprint")
        created_at = data.get("created_at")
        updated_at = data.get("updated_at")
        self.start_url = start_url if isinstance(start_url, str) else None
        self.run_fingerprint = fingerprint if isinstance(fingerprint, dict) else None
        self.created_at = created_at if isinstance(created_at, str) else None
        self.updated_at = updated_at if isinstance(updated_at, str) else None

        for item in entries:
            if not isinstance(item, dict):
                continue
            entry = FrontierEntry.from_json(item)
            if entry:
                self.entries[entry.url] = entry

    def save(self) -> None:
        """Write the full store to ``self.path`` via a temp file and rename.

        Sets ``created_at`` on first save and refreshes ``updated_at`` on
        every save.

        Raises:
            Exception: Whatever the write or rename raised; the temp file is
                removed before re-raising.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        now = utc_now_iso()
        if self.created_at is None:
            self.created_at = now
        self.updated_at = now
        data = {
            "schema_version": FRONTIER_SCHEMA_VERSION,
            "start_url": self.start_url,
            "run_fingerprint": self.run_fingerprint,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "entries": [entry.to_json() for entry in self.entries.values()],
        }
        tmp = self.path.with_suffix(self.path.suffix + ".tmp")
        try:
            tmp.write_text(json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
            # Replace in one step so readers never observe a half-written file.
            tmp.replace(self.path)
        except Exception:
            tmp.unlink(missing_ok=True)
            raise

    def initialize(self, *, start_url: str, run_fingerprint: dict[str, object]) -> None:
        """Bind the store to a run and persist it.

        When the stored start URL or fingerprint differs from the arguments,
        the previously loaded entries are discarded and ``created_at`` is
        reset; otherwise existing entries are kept.
        """
        if not self.compatible(start_url=start_url, run_fingerprint=run_fingerprint):
            self.entries.clear()
            self.created_at = utc_now_iso()
        self.start_url = start_url
        self.run_fingerprint = run_fingerprint
        self.save()

    def compatible(self, *, start_url: str, run_fingerprint: dict[str, object]) -> bool:
        """Return True when the stored start URL and fingerprint both match."""
        return self.start_url == start_url and self.run_fingerprint == run_fingerprint

    def add(self, url: str, *, depth: int | None = None, source: str | None = None) -> None:
        """Register *url* with default state unless it is already tracked.

        Does not save; callers persist via a later ``mark_*`` call or
        :meth:`save`.
        """
        if url in self.entries:
            return
        self.entries[url] = FrontierEntry(url=url, depth=depth, source=source)

    def add_many(self, urls: list[str], *, source: str | None = None) -> None:
        """Register every URL in *urls* that is not already tracked."""
        for url in urls:
            self.add(url, source=source)

    def mark_processing(self, url: str) -> None:
        """Move *url* to PROCESSING, count the attempt, and save."""
        entry = self.entries.get(url)
        if not entry:
            self.add(url)
            entry = self.entries[url]
        entry.state = FrontierState.PROCESSING
        entry.attempts += 1
        entry.updated_at = utc_now_iso()
        self.save()

    def mark_succeeded(self, url: str) -> None:
        """Record *url* as SUCCEEDED and save."""
        self._mark_terminal(url, FrontierState.SUCCEEDED)

    def mark_skipped(self, url: str) -> None:
        """Record *url* as SKIPPED and save."""
        self._mark_terminal(url, FrontierState.SKIPPED)

    def mark_failed(self, url: str, error: str | None = None) -> None:
        """Record *url* as FAILED with an optional error message and save."""
        self._mark_terminal(url, FrontierState.FAILED, error=error)

    def _mark_terminal(self, url: str, state: FrontierState, error: str | None = None) -> None:
        """Set an outcome *state* on *url* (adding it if unknown) and save.

        ``last_error`` is overwritten with *error*, so a success or skip
        clears any error recorded by an earlier attempt.
        """
        entry = self.entries.get(url)
        if not entry:
            self.add(url)
            entry = self.entries[url]
        entry.state = state
        entry.last_error = error
        entry.updated_at = utc_now_iso()
        self.save()

    def pending_urls(self) -> list[str]:
        """Return URLs whose state is neither SUCCEEDED nor SKIPPED.

        FAILED, QUEUED and PROCESSING entries are all reported as pending.
        """
        done = {FrontierState.SUCCEEDED, FrontierState.SKIPPED}
        return [url for url, entry in self.entries.items() if entry.state not in done]

    def clear(self) -> None:
        """Delete the backing file (if any) and reset all in-memory state."""
        # missing_ok avoids the exists()/unlink() race and matches save().
        self.path.unlink(missing_ok=True)
        self.entries.clear()
        self.start_url = None
        self.run_fingerprint = None
        self.created_at = None
        self.updated_at = None
|