docpull 2.5.1__tar.gz → 3.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-2.5.1/src/docpull.egg-info → docpull-3.0.1}/PKG-INFO +20 -2
- {docpull-2.5.1 → docpull-3.0.1}/README.md +12 -0
- {docpull-2.5.1 → docpull-3.0.1}/pyproject.toml +24 -10
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/__init__.py +1 -1
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/cache/manager.py +10 -8
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/cli.py +5 -8
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/special_cases.py +13 -22
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/core/fetcher.py +10 -26
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/filters.py +8 -9
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/enhanced.py +2 -1
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/static.py +2 -1
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/client.py +20 -19
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/server.py +5 -3
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/sources.py +63 -6
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/tools.py +38 -56
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/models/config.py +10 -75
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/models/profiles.py +1 -5
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/convert.py +12 -12
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/fetch.py +2 -3
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/save.py +1 -1
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/save_json.py +4 -4
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/save_ndjson.py +2 -2
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/save_sqlite.py +2 -2
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/security/robots.py +17 -7
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/security/url_validator.py +7 -7
- docpull-3.0.1/src/docpull/time_utils.py +29 -0
- {docpull-2.5.1 → docpull-3.0.1/src/docpull.egg-info}/PKG-INFO +20 -2
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/SOURCES.txt +7 -5
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/requires.txt +7 -1
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_cache_conditional_get.py +5 -12
- docpull-2.5.1/tests/test_v2_conversion.py → docpull-3.0.1/tests/test_conversion.py +26 -42
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_convert_step_new.py +2 -6
- docpull-2.5.1/tests/test_v2_discovery.py → docpull-3.0.1/tests/test_discovery.py +1 -1
- docpull-2.5.1/tests/test_v2_integration.py → docpull-3.0.1/tests/test_integration.py +4 -5
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_mcp_tools.py +60 -25
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_naming.py +4 -23
- docpull-2.5.1/tests/test_v2_pipeline.py → docpull-3.0.1/tests/test_pipeline.py +2 -4
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_special_cases.py +5 -17
- docpull-3.0.1/tests/test_time_utils.py +23 -0
- {docpull-2.5.1 → docpull-3.0.1}/LICENSE +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/setup.cfg +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/__main__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/cache/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/cache/streaming_dedup.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/concurrency/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/concurrency/manager.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/chunking.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/extractor.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/markdown.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/protocols.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/trafilatura_extractor.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/core/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/composite.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/crawler.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/protocols.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/protocols.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/sitemap.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/doctor.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/protocols.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/rate_limiter.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/logging_config.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/metadata_extractor.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/models/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/models/events.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/base.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/chunk.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/dedup.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/validate.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/py.typed +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull/security/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/top_level.txt +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_chunking.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_cli.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_link_extractors.py +0 -0
- docpull-2.5.1/tests/test_fixes_v2_3_0.py → docpull-3.0.1/tests/test_real_site_regressions.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_save_ndjson.py +0 -0
- {docpull-2.5.1 → docpull-3.0.1}/tests/test_security_hardening.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 2.5.1
|
|
3
|
+
Version: 3.0.1
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
|
|
|
43
43
|
Requires-Dist: defusedxml>=0.7.1
|
|
44
44
|
Requires-Dist: extruct>=0.15.0
|
|
45
45
|
Requires-Dist: aiohttp>=3.9.0
|
|
46
|
+
Requires-Dist: idna>=3.15
|
|
47
|
+
Requires-Dist: regex>=2024.11.6
|
|
46
48
|
Requires-Dist: rich>=13.0.0
|
|
47
49
|
Requires-Dist: pyyaml>=6.0
|
|
48
50
|
Requires-Dist: pydantic>=2.0
|
|
51
|
+
Requires-Dist: urllib3>=2.7.0
|
|
49
52
|
Provides-Extra: proxy
|
|
50
53
|
Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
|
|
51
54
|
Provides-Extra: normalize
|
|
@@ -56,6 +59,8 @@ Provides-Extra: tokens
|
|
|
56
59
|
Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
|
|
57
60
|
Provides-Extra: mcp
|
|
58
61
|
Requires-Dist: mcp>=1.0.0; extra == "mcp"
|
|
62
|
+
Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
|
|
63
|
+
Requires-Dist: starlette>=1.0.1; extra == "mcp"
|
|
59
64
|
Provides-Extra: llm
|
|
60
65
|
Requires-Dist: tiktoken>=0.7.0; extra == "llm"
|
|
61
66
|
Provides-Extra: all
|
|
@@ -64,11 +69,12 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
|
64
69
|
Requires-Dist: trafilatura>=1.12.0; extra == "all"
|
|
65
70
|
Requires-Dist: tiktoken>=0.7.0; extra == "all"
|
|
66
71
|
Requires-Dist: mcp>=1.0.0; extra == "all"
|
|
72
|
+
Requires-Dist: python-multipart>=0.0.27; extra == "all"
|
|
73
|
+
Requires-Dist: starlette>=1.0.1; extra == "all"
|
|
67
74
|
Provides-Extra: dev
|
|
68
75
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
69
76
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
70
77
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
71
|
-
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
72
78
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
73
79
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
74
80
|
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
@@ -280,6 +286,17 @@ sources:
|
|
|
280
286
|
maxPages: 200
|
|
281
287
|
```
|
|
282
288
|
|
|
289
|
+
### About the `mcp/` directory in this repo
|
|
290
|
+
|
|
291
|
+
The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
|
|
292
|
+
server backed by PostgreSQL with pgvector for semantic search. It is not
|
|
293
|
+
the Python MCP server shipped in the `docpull` package described above
|
|
294
|
+
— that one is the right choice for almost every user and is installed
|
|
295
|
+
with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
|
|
296
|
+
own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
|
|
297
|
+
unless you specifically need pgvector-backed semantic search, ignore it
|
|
298
|
+
and use `docpull mcp`.
|
|
299
|
+
|
|
283
300
|
## Output
|
|
284
301
|
|
|
285
302
|
Markdown files with YAML frontmatter:
|
|
@@ -376,6 +393,7 @@ docpull URL --preview-urls # List URLs without fetching
|
|
|
376
393
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
377
394
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
378
395
|
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
396
|
+
- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
|
|
379
397
|
|
|
380
398
|
## License
|
|
381
399
|
|
|
@@ -198,6 +198,17 @@ sources:
|
|
|
198
198
|
maxPages: 200
|
|
199
199
|
```
|
|
200
200
|
|
|
201
|
+
### About the `mcp/` directory in this repo
|
|
202
|
+
|
|
203
|
+
The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
|
|
204
|
+
server backed by PostgreSQL with pgvector for semantic search. It is not
|
|
205
|
+
the Python MCP server shipped in the `docpull` package described above
|
|
206
|
+
— that one is the right choice for almost every user and is installed
|
|
207
|
+
with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
|
|
208
|
+
own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
|
|
209
|
+
unless you specifically need pgvector-backed semantic search, ignore it
|
|
210
|
+
and use `docpull mcp`.
|
|
211
|
+
|
|
201
212
|
## Output
|
|
202
213
|
|
|
203
214
|
Markdown files with YAML frontmatter:
|
|
@@ -294,6 +305,7 @@ docpull URL --preview-urls # List URLs without fetching
|
|
|
294
305
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
295
306
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
296
307
|
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
308
|
+
- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
|
|
297
309
|
|
|
298
310
|
## License
|
|
299
311
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "2.5.1"
|
|
7
|
+
version = "3.0.1"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -67,9 +67,12 @@ dependencies = [
|
|
|
67
67
|
"defusedxml>=0.7.1",
|
|
68
68
|
"extruct>=0.15.0",
|
|
69
69
|
"aiohttp>=3.9.0",
|
|
70
|
+
"idna>=3.15",
|
|
71
|
+
"regex>=2024.11.6",
|
|
70
72
|
"rich>=13.0.0",
|
|
71
73
|
"pyyaml>=6.0",
|
|
72
74
|
"pydantic>=2.0",
|
|
75
|
+
"urllib3>=2.7.0",
|
|
73
76
|
]
|
|
74
77
|
|
|
75
78
|
[project.optional-dependencies]
|
|
@@ -87,6 +90,8 @@ tokens = [
|
|
|
87
90
|
]
|
|
88
91
|
mcp = [
|
|
89
92
|
"mcp>=1.0.0",
|
|
93
|
+
"python-multipart>=0.0.27",
|
|
94
|
+
"starlette>=1.0.1",
|
|
90
95
|
]
|
|
91
96
|
llm = [
|
|
92
97
|
"tiktoken>=0.7.0",
|
|
@@ -97,12 +102,13 @@ all = [
|
|
|
97
102
|
"trafilatura>=1.12.0",
|
|
98
103
|
"tiktoken>=0.7.0",
|
|
99
104
|
"mcp>=1.0.0",
|
|
105
|
+
"python-multipart>=0.0.27",
|
|
106
|
+
"starlette>=1.0.1",
|
|
100
107
|
]
|
|
101
108
|
dev = [
|
|
102
109
|
"pytest>=7.0.0",
|
|
103
110
|
"pytest-cov>=4.0.0",
|
|
104
111
|
"pytest-asyncio>=0.21.0",
|
|
105
|
-
"black>=23.0.0",
|
|
106
112
|
"mypy>=1.0.0",
|
|
107
113
|
"ruff>=0.1.0",
|
|
108
114
|
"bandit>=1.7.0",
|
|
@@ -132,10 +138,6 @@ include = ["docpull*"]
|
|
|
132
138
|
[tool.setuptools.package-data]
|
|
133
139
|
docpull = ["py.typed"]
|
|
134
140
|
|
|
135
|
-
[tool.black]
|
|
136
|
-
line-length = 110
|
|
137
|
-
target-version = ["py310", "py311", "py312", "py313", "py314"]
|
|
138
|
-
|
|
139
141
|
[tool.ruff]
|
|
140
142
|
line-length = 110
|
|
141
143
|
target-version = "py310"
|
|
@@ -175,10 +177,22 @@ module = "docpull.models.*"
|
|
|
175
177
|
disallow_any_unimported = false
|
|
176
178
|
warn_return_any = false
|
|
177
179
|
|
|
178
|
-
[
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
180
|
+
[tool.bandit]
|
|
181
|
+
# Policy: every entry in `skips` MUST have a one-line justification
|
|
182
|
+
# above it explaining what bandit found, why it's a false positive
|
|
183
|
+
# *for this codebase*, and (if narrow) why a `# nosec BXXX # reason`
|
|
184
|
+
# annotation at the call site would have been worse. Bandit skips
|
|
185
|
+
# silence findings repo-wide, so the bar to add one is higher than
|
|
186
|
+
# silencing a single line. If a new skip is unavoidable, add it here
|
|
187
|
+
# in PR review, not as a drive-by.
|
|
188
|
+
#
|
|
189
|
+
# B101 (assert_used) — flags every `assert x is not None` we use for
|
|
190
|
+
# type narrowing. Bandit's concern is that assertions vanish under
|
|
191
|
+
# `python -O`. docpull is a CLI / SDK, never invoked with -O, and the
|
|
192
|
+
# narrowing asserts are not load-bearing safety checks. Skipping the
|
|
193
|
+
# rule globally keeps the existing idiom without 8+ inline `# nosec`
|
|
194
|
+
# annotations in fetcher.py / pipeline/steps/convert.py.
|
|
195
|
+
skips = ["B101"]
|
|
182
196
|
|
|
183
197
|
[tool.pytest.ini_options]
|
|
184
198
|
minversion = "7.0"
|
|
@@ -5,10 +5,12 @@ from __future__ import annotations
|
|
|
5
5
|
import hashlib
|
|
6
6
|
import json
|
|
7
7
|
import logging
|
|
8
|
-
from datetime import
|
|
8
|
+
from datetime import timedelta
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import TypedDict
|
|
11
11
|
|
|
12
|
+
from ..time_utils import parse_persisted_datetime, utc_now, utc_now_iso
|
|
13
|
+
|
|
12
14
|
logger = logging.getLogger(__name__)
|
|
13
15
|
|
|
14
16
|
# Default TTL for cache entries (30 days)
|
|
@@ -257,7 +259,7 @@ class CacheManager:
|
|
|
257
259
|
self.manifest[url] = {
|
|
258
260
|
"checksum": self.compute_checksum(content),
|
|
259
261
|
"file_path": str(file_path),
|
|
260
|
-
"fetched_at":
|
|
262
|
+
"fetched_at": utc_now_iso(),
|
|
261
263
|
"size": len(content),
|
|
262
264
|
}
|
|
263
265
|
|
|
@@ -314,7 +316,7 @@ class CacheManager:
|
|
|
314
316
|
Note:
|
|
315
317
|
Changes are batched. Call flush() to persist to disk.
|
|
316
318
|
"""
|
|
317
|
-
self._state.last_run =
|
|
319
|
+
self._state.last_run = utc_now_iso()
|
|
318
320
|
self._state_dirty = True
|
|
319
321
|
|
|
320
322
|
def clear_state(self) -> None:
|
|
@@ -354,18 +356,18 @@ class CacheManager:
|
|
|
354
356
|
if ttl is None:
|
|
355
357
|
return 0
|
|
356
358
|
|
|
357
|
-
cutoff =
|
|
359
|
+
cutoff = utc_now() - timedelta(days=ttl)
|
|
358
360
|
to_remove = []
|
|
359
361
|
|
|
360
362
|
for url, entry in self.manifest.items():
|
|
361
363
|
fetched_at = entry.get("fetched_at")
|
|
362
364
|
if fetched_at:
|
|
363
365
|
try:
|
|
364
|
-
entry_time =
|
|
366
|
+
entry_time = parse_persisted_datetime(fetched_at)
|
|
365
367
|
if entry_time < cutoff:
|
|
366
368
|
to_remove.append(url)
|
|
367
|
-
except ValueError:
|
|
368
|
-
|
|
369
|
+
except ValueError as err:
|
|
370
|
+
logger.warning("Invalid cache timestamp for %s: %s", url, err)
|
|
369
371
|
|
|
370
372
|
for url in to_remove:
|
|
371
373
|
del self.manifest[url]
|
|
@@ -413,7 +415,7 @@ class CacheManager:
|
|
|
413
415
|
"""
|
|
414
416
|
data: DiscoveredUrlsState = {
|
|
415
417
|
"start_url": start_url,
|
|
416
|
-
"discovered_at":
|
|
418
|
+
"discovered_at": utc_now_iso(),
|
|
417
419
|
"urls": urls,
|
|
418
420
|
}
|
|
419
421
|
try:
|
|
@@ -13,12 +13,10 @@ if "--doctor" in sys.argv:
|
|
|
13
13
|
|
|
14
14
|
output_dir = None
|
|
15
15
|
if "--output-dir" in sys.argv or "-o" in sys.argv:
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
except (ValueError, IndexError):
|
|
21
|
-
pass
|
|
16
|
+
flag = "--output-dir" if "--output-dir" in sys.argv else "-o"
|
|
17
|
+
flag_idx = sys.argv.index(flag)
|
|
18
|
+
if flag_idx + 1 < len(sys.argv):
|
|
19
|
+
output_dir = Path(sys.argv[flag_idx + 1])
|
|
22
20
|
sys.exit(run_doctor(output_dir=output_dir))
|
|
23
21
|
|
|
24
22
|
# Verify core dependencies
|
|
@@ -562,8 +560,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
562
560
|
n_chunks = len(ctx.chunks) if ctx.chunks else 0
|
|
563
561
|
extra = f" ({n_chunks} chunks)" if n_chunks else ""
|
|
564
562
|
console.print(
|
|
565
|
-
f"[green]Saved:[/green] {ctx.output_path} "
|
|
566
|
-
f"[{ctx.source_type or 'generic'}]{extra}"
|
|
563
|
+
f"[green]Saved:[/green] {ctx.output_path} [{ctx.source_type or 'generic'}]{extra}"
|
|
567
564
|
)
|
|
568
565
|
return 0
|
|
569
566
|
|
|
@@ -246,7 +246,8 @@ def _describe_type(schema: Any, spec: dict[str, Any]) -> str:
|
|
|
246
246
|
if not isinstance(schema, dict):
|
|
247
247
|
return "?"
|
|
248
248
|
if "$ref" in schema:
|
|
249
|
-
|
|
249
|
+
ref: str = schema["$ref"]
|
|
250
|
+
return ref.rsplit("/", 1)[-1]
|
|
250
251
|
for key in ("oneOf", "anyOf", "allOf"):
|
|
251
252
|
if isinstance(schema.get(key), list) and schema[key]:
|
|
252
253
|
seen: list[str] = []
|
|
@@ -319,7 +320,8 @@ class OpenApiExtractor:
|
|
|
319
320
|
return None
|
|
320
321
|
try:
|
|
321
322
|
data = json.loads(text)
|
|
322
|
-
except json.JSONDecodeError:
|
|
323
|
+
except json.JSONDecodeError as err:
|
|
324
|
+
logger.debug("OpenAPI extractor skipped %s: JSON parse failed: %s", url, err)
|
|
323
325
|
return None
|
|
324
326
|
if not isinstance(data, dict):
|
|
325
327
|
return None
|
|
@@ -349,9 +351,7 @@ class OpenApiExtractor:
|
|
|
349
351
|
for method, op in ops.items():
|
|
350
352
|
if method.lower() not in _HTTP_METHODS or not isinstance(op, dict):
|
|
351
353
|
continue
|
|
352
|
-
self._render_operation(
|
|
353
|
-
lines, path, method, op, shared_params, data
|
|
354
|
-
)
|
|
354
|
+
self._render_operation(lines, path, method, op, shared_params, data)
|
|
355
355
|
|
|
356
356
|
return SpecialCaseResult(
|
|
357
357
|
markdown="\n".join(lines).strip() + "\n",
|
|
@@ -410,9 +410,7 @@ class OpenApiExtractor:
|
|
|
410
410
|
lines.append(bullet)
|
|
411
411
|
lines.append("")
|
|
412
412
|
|
|
413
|
-
def _render_request_body(
|
|
414
|
-
self, lines: list[str], body: Any, spec: dict[str, Any]
|
|
415
|
-
) -> None:
|
|
413
|
+
def _render_request_body(self, lines: list[str], body: Any, spec: dict[str, Any]) -> None:
|
|
416
414
|
if not isinstance(body, dict):
|
|
417
415
|
return
|
|
418
416
|
if "$ref" in body:
|
|
@@ -455,9 +453,7 @@ class OpenApiExtractor:
|
|
|
455
453
|
lines.append(f"- body: {_describe_type(schema, spec)}")
|
|
456
454
|
lines.append("")
|
|
457
455
|
|
|
458
|
-
def _render_responses(
|
|
459
|
-
self, lines: list[str], responses: Any, spec: dict[str, Any]
|
|
460
|
-
) -> None:
|
|
456
|
+
def _render_responses(self, lines: list[str], responses: Any, spec: dict[str, Any]) -> None:
|
|
461
457
|
if not isinstance(responses, dict) or not responses:
|
|
462
458
|
return
|
|
463
459
|
lines.append("**Responses:**")
|
|
@@ -535,11 +531,7 @@ class MdxSourceExtractor:
|
|
|
535
531
|
for pattern in self._EDIT_PATTERNS:
|
|
536
532
|
match = pattern.search(text)
|
|
537
533
|
if match:
|
|
538
|
-
raw_url = (
|
|
539
|
-
match.group(1)
|
|
540
|
-
.replace("/blob/", "/raw/")
|
|
541
|
-
.replace("/edit/", "/raw/")
|
|
542
|
-
)
|
|
534
|
+
raw_url = match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
543
535
|
# Return None so downstream runs, but attach hint via a cache
|
|
544
536
|
# mechanism. Simpler: return None always; step reads the URL
|
|
545
537
|
# if needed by re-running the regex.
|
|
@@ -567,9 +559,7 @@ def find_mdx_source_url(html: bytes) -> str | None:
|
|
|
567
559
|
for pattern in MdxSourceExtractor._EDIT_PATTERNS:
|
|
568
560
|
match = pattern.search(text)
|
|
569
561
|
if match:
|
|
570
|
-
return (
|
|
571
|
-
match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
572
|
-
)
|
|
562
|
+
return match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
573
563
|
return None
|
|
574
564
|
|
|
575
565
|
|
|
@@ -580,8 +570,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
|
|
|
580
570
|
"""Heuristic: does this HTML appear to be a JS-only SPA?
|
|
581
571
|
|
|
582
572
|
True when the non-script body text is very small relative to the overall
|
|
583
|
-
page size and the page contains script tags.
|
|
584
|
-
|
|
573
|
+
page size and the page contains script tags. This is a conservative signal
|
|
574
|
+
for warning an agent before it consumes empty Markdown.
|
|
585
575
|
"""
|
|
586
576
|
if len(html) < 500:
|
|
587
577
|
return False
|
|
@@ -589,7 +579,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
|
|
|
589
579
|
return False
|
|
590
580
|
try:
|
|
591
581
|
soup = _soup(html)
|
|
592
|
-
except Exception: # noqa: BLE001
|
|
582
|
+
except Exception as err: # noqa: BLE001
|
|
583
|
+
logger.debug("SPA heuristic skipped malformed HTML: %s", err)
|
|
593
584
|
return False
|
|
594
585
|
# Remove scripts/styles before measuring.
|
|
595
586
|
for tag in soup(["script", "style", "noscript"]):
|
|
@@ -265,9 +265,7 @@ class Fetcher:
|
|
|
265
265
|
# built-in 50 MB ceiling.
|
|
266
266
|
max_content_size_kw: dict[str, int] = {}
|
|
267
267
|
if self.config.content_filter.max_file_size is not None:
|
|
268
|
-
max_content_size_kw["max_content_size"] = int(
|
|
269
|
-
self.config.content_filter.max_file_size
|
|
270
|
-
)
|
|
268
|
+
max_content_size_kw["max_content_size"] = int(self.config.content_filter.max_file_size)
|
|
271
269
|
self._http_client = AsyncHttpClient(
|
|
272
270
|
rate_limiter=self._rate_limiter,
|
|
273
271
|
max_retries=self.config.network.max_retries,
|
|
@@ -509,11 +507,7 @@ class Fetcher:
|
|
|
509
507
|
|
|
510
508
|
steps = self._pipeline.steps
|
|
511
509
|
if not save:
|
|
512
|
-
steps = [
|
|
513
|
-
s
|
|
514
|
-
for s in steps
|
|
515
|
-
if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}
|
|
516
|
-
]
|
|
510
|
+
steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}]
|
|
517
511
|
pipeline = type(self._pipeline)(steps=steps)
|
|
518
512
|
ctx = await pipeline.execute(url, output_path)
|
|
519
513
|
if ctx.error:
|
|
@@ -531,8 +525,8 @@ class Fetcher:
|
|
|
531
525
|
"""
|
|
532
526
|
Compute output path for a URL using the configured naming strategy.
|
|
533
527
|
|
|
534
|
-
- ``full
|
|
535
|
-
|
|
528
|
+
- ``full``: a single flattened filename (URL path joined with
|
|
529
|
+
underscores).
|
|
536
530
|
- ``hierarchical``: URL path preserved as nested directories,
|
|
537
531
|
terminating in ``<segment>.md`` or ``index.md`` for trailing
|
|
538
532
|
slashes. The leaf is `_validate_output_path`-safe — every segment
|
|
@@ -545,7 +539,6 @@ class Fetcher:
|
|
|
545
539
|
parts = _url_to_path_parts(url, self.config.url)
|
|
546
540
|
return output_dir.joinpath(*parts)
|
|
547
541
|
|
|
548
|
-
# full / flat / short: aliased to full until 3.0
|
|
549
542
|
filename = _url_to_filename(url, self.config.url)
|
|
550
543
|
return output_dir / filename
|
|
551
544
|
|
|
@@ -638,9 +631,7 @@ class Fetcher:
|
|
|
638
631
|
)
|
|
639
632
|
|
|
640
633
|
discovered: list[str] = []
|
|
641
|
-
async for url in self._discoverer.discover(
|
|
642
|
-
start_url, max_urls=self.config.crawl.max_pages
|
|
643
|
-
):
|
|
634
|
+
async for url in self._discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
|
|
644
635
|
discovered.append(url)
|
|
645
636
|
if self._cancelled:
|
|
646
637
|
yield FetchEvent(
|
|
@@ -756,9 +747,7 @@ class Fetcher:
|
|
|
756
747
|
)
|
|
757
748
|
)
|
|
758
749
|
try:
|
|
759
|
-
async for url in discoverer.discover(
|
|
760
|
-
start_url, max_urls=self.config.crawl.max_pages
|
|
761
|
-
):
|
|
750
|
+
async for url in discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
|
|
762
751
|
if self._cancelled:
|
|
763
752
|
break
|
|
764
753
|
await url_queue.put(url)
|
|
@@ -770,14 +759,10 @@ class Fetcher:
|
|
|
770
759
|
and self._cache_manager
|
|
771
760
|
and len(discovered_for_resume) % 200 == 0
|
|
772
761
|
):
|
|
773
|
-
self._cache_manager.save_discovered_urls(
|
|
774
|
-
list(discovered_for_resume), start_url
|
|
775
|
-
)
|
|
762
|
+
self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url)
|
|
776
763
|
finally:
|
|
777
764
|
if self.config.cache.enabled and self._cache_manager:
|
|
778
|
-
self._cache_manager.save_discovered_urls(
|
|
779
|
-
discovered_for_resume, start_url
|
|
780
|
-
)
|
|
765
|
+
self._cache_manager.save_discovered_urls(discovered_for_resume, start_url)
|
|
781
766
|
self._stats.urls_discovered = len(discovered_for_resume)
|
|
782
767
|
await event_queue.put(
|
|
783
768
|
FetchEvent(
|
|
@@ -810,6 +795,7 @@ class Fetcher:
|
|
|
810
795
|
continue
|
|
811
796
|
|
|
812
797
|
local_events: list[FetchEvent] = []
|
|
798
|
+
|
|
813
799
|
# Bind the per-iteration list as a default arg so ruff B023
|
|
814
800
|
# is happy. Closure is consumed synchronously by execute()
|
|
815
801
|
# before the next iteration anyway, so capture order is safe.
|
|
@@ -936,9 +922,7 @@ def fetch_one(url: str, **kwargs: object) -> PageContext:
|
|
|
936
922
|
"""
|
|
937
923
|
try:
|
|
938
924
|
asyncio.get_running_loop()
|
|
939
|
-
raise RuntimeError(
|
|
940
|
-
"fetch_one() called from async context. Use Fetcher.fetch_one() instead."
|
|
941
|
-
)
|
|
925
|
+
raise RuntimeError("fetch_one() called from async context. Use Fetcher.fetch_one() instead.")
|
|
942
926
|
except RuntimeError as exc:
|
|
943
927
|
if "no running event loop" not in str(exc).lower():
|
|
944
928
|
raise
|
|
@@ -29,19 +29,20 @@ def normalize_url(url: str) -> str:
|
|
|
29
29
|
Returns:
|
|
30
30
|
Normalized URL string
|
|
31
31
|
"""
|
|
32
|
-
# Use url_normalize library if available
|
|
32
|
+
# Use url_normalize library if available for case / percent-encoding
|
|
33
|
+
# cleanup. It does NOT strip fragments, so we always do that ourselves
|
|
34
|
+
# below — keeping behavior consistent whether the optional dep is
|
|
35
|
+
# installed or not.
|
|
33
36
|
if URL_NORMALIZE_AVAILABLE:
|
|
34
37
|
try:
|
|
35
|
-
|
|
36
|
-
|
|
38
|
+
normalized = url_normalize(url)
|
|
39
|
+
if normalized:
|
|
40
|
+
url = normalized
|
|
37
41
|
except ValueError:
|
|
38
42
|
logger.debug("url_normalize rejected URL during normalization", exc_info=True)
|
|
39
43
|
|
|
40
|
-
# Basic normalization
|
|
41
44
|
parsed = urlparse(url)
|
|
42
|
-
|
|
43
|
-
# Remove fragment
|
|
44
|
-
normalized = urlunparse(
|
|
45
|
+
return urlunparse(
|
|
45
46
|
(
|
|
46
47
|
parsed.scheme.lower(),
|
|
47
48
|
parsed.netloc.lower(),
|
|
@@ -52,8 +53,6 @@ def normalize_url(url: str) -> str:
|
|
|
52
53
|
)
|
|
53
54
|
)
|
|
54
55
|
|
|
55
|
-
return normalized
|
|
56
|
-
|
|
57
56
|
|
|
58
57
|
class PatternFilter:
|
|
59
58
|
"""
|
|
@@ -295,7 +295,8 @@ class EnhancedLinkExtractor:
|
|
|
295
295
|
|
|
296
296
|
try:
|
|
297
297
|
absolute_url = urljoin(base_url, href)
|
|
298
|
-
except Exception:
|
|
298
|
+
except Exception as err:
|
|
299
|
+
logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
|
|
299
300
|
return None
|
|
300
301
|
|
|
301
302
|
# Validate it's a proper URL
|
|
@@ -148,7 +148,8 @@ class StaticLinkExtractor:
|
|
|
148
148
|
"""
|
|
149
149
|
try:
|
|
150
150
|
absolute_url = urljoin(base_url, href)
|
|
151
|
-
except Exception:
|
|
151
|
+
except Exception as err:
|
|
152
|
+
logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
|
|
152
153
|
return None
|
|
153
154
|
|
|
154
155
|
# Remove fragment
|
|
@@ -12,7 +12,7 @@ from types import TracebackType
|
|
|
12
12
|
from urllib.parse import urljoin, urlparse
|
|
13
13
|
|
|
14
14
|
import aiohttp
|
|
15
|
-
from aiohttp.abc import AbstractResolver
|
|
15
|
+
from aiohttp.abc import AbstractResolver, ResolveResult
|
|
16
16
|
|
|
17
17
|
from ..security.url_validator import UrlValidator
|
|
18
18
|
from .protocols import HttpResponse
|
|
@@ -45,14 +45,14 @@ class _ValidatedResolver(AbstractResolver):
|
|
|
45
45
|
self,
|
|
46
46
|
host: str,
|
|
47
47
|
port: int = 0,
|
|
48
|
-
family:
|
|
49
|
-
) -> list[
|
|
48
|
+
family: socket.AddressFamily = socket.AF_UNSPEC,
|
|
49
|
+
) -> list[ResolveResult]:
|
|
50
50
|
try:
|
|
51
51
|
addresses = self._url_validator.resolve_allowed_addresses(host)
|
|
52
52
|
except ValueError as err:
|
|
53
53
|
raise OSError(str(err)) from err
|
|
54
54
|
|
|
55
|
-
results: list[
|
|
55
|
+
results: list[ResolveResult] = []
|
|
56
56
|
for address in addresses:
|
|
57
57
|
ip = ipaddress.ip_address(address)
|
|
58
58
|
entry_family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET
|
|
@@ -60,14 +60,14 @@ class _ValidatedResolver(AbstractResolver):
|
|
|
60
60
|
continue
|
|
61
61
|
|
|
62
62
|
results.append(
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
63
|
+
ResolveResult(
|
|
64
|
+
hostname=host,
|
|
65
|
+
host=address,
|
|
66
|
+
port=port,
|
|
67
|
+
family=entry_family,
|
|
68
|
+
proto=socket.IPPROTO_TCP,
|
|
69
|
+
flags=socket.AI_NUMERICHOST,
|
|
70
|
+
)
|
|
71
71
|
)
|
|
72
72
|
|
|
73
73
|
if not results:
|
|
@@ -236,20 +236,21 @@ class AsyncHttpClient:
|
|
|
236
236
|
|
|
237
237
|
async def __aenter__(self) -> AsyncHttpClient:
|
|
238
238
|
"""Enter async context and create session."""
|
|
239
|
-
|
|
240
|
-
"limit": 100, # Total connection limit
|
|
241
|
-
"limit_per_host": 10, # Per-host connection limit
|
|
242
|
-
"ttl_dns_cache": 300, # DNS cache TTL
|
|
243
|
-
}
|
|
239
|
+
resolver: AbstractResolver | None = None
|
|
244
240
|
if self._url_validator is not None and self._proxy is None:
|
|
245
|
-
|
|
241
|
+
resolver = _ValidatedResolver(self._url_validator)
|
|
246
242
|
elif self._proxy is not None and self._url_validator is not None:
|
|
247
243
|
logger.warning(
|
|
248
244
|
"Proxy mode: DNS-pinning resolver is not active. "
|
|
249
245
|
"URL validation still runs pre-flight, but the proxy resolves DNS independently."
|
|
250
246
|
)
|
|
251
247
|
|
|
252
|
-
connector = aiohttp.TCPConnector(
|
|
248
|
+
connector = aiohttp.TCPConnector(
|
|
249
|
+
limit=100,
|
|
250
|
+
limit_per_host=10,
|
|
251
|
+
ttl_dns_cache=300,
|
|
252
|
+
resolver=resolver,
|
|
253
|
+
)
|
|
253
254
|
self._session = aiohttp.ClientSession(
|
|
254
255
|
connector=connector,
|
|
255
256
|
headers={"User-Agent": self._user_agent},
|
|
@@ -215,8 +215,7 @@ async def _run_stdio() -> int:
|
|
|
215
215
|
from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
|
|
216
216
|
except ImportError:
|
|
217
217
|
print(
|
|
218
|
-
"docpull mcp requires the 'mcp' package. Install with: "
|
|
219
|
-
"pip install docpull[mcp]",
|
|
218
|
+
"docpull mcp requires the 'mcp' package. Install with: pip install docpull[mcp]",
|
|
220
219
|
file=sys.stderr,
|
|
221
220
|
)
|
|
222
221
|
return 1
|
|
@@ -590,7 +589,10 @@ async def _run_stdio() -> int:
|
|
|
590
589
|
# isError=False), and
|
|
591
590
|
# (b) errors on tools with an outputSchema don't fail the validator
|
|
592
591
|
# for "missing structured content."
|
|
593
|
-
content
|
|
592
|
+
# `content` is typed `list[TextContent | ImageContent | ...]` on the SDK
|
|
593
|
+
# side; list invariance means we have to widen the local annotation
|
|
594
|
+
# explicitly even though TextContent is one of the valid variants.
|
|
595
|
+
content: list[Any] = [TextContent(type="text", text=result.text)]
|
|
594
596
|
return CallToolResult(
|
|
595
597
|
content=content,
|
|
596
598
|
structuredContent=result.data if not result.is_error else None,
|