docpull 2.5.0__tar.gz → 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-2.5.0/src/docpull.egg-info → docpull-3.0.0}/PKG-INFO +45 -8
- {docpull-2.5.0 → docpull-3.0.0}/README.md +44 -6
- {docpull-2.5.0 → docpull-3.0.0}/pyproject.toml +1 -6
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/__init__.py +1 -1
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/cli.py +1 -2
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/special_cases.py +7 -18
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/core/fetcher.py +10 -26
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/filters.py +8 -9
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/http/client.py +20 -19
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/mcp/server.py +17 -9
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/mcp/tools.py +34 -54
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/models/config.py +6 -73
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/models/profiles.py +1 -5
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/convert.py +12 -12
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/fetch.py +2 -3
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/save.py +1 -1
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/security/robots.py +13 -5
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/security/url_validator.py +2 -2
- {docpull-2.5.0 → docpull-3.0.0/src/docpull.egg-info}/PKG-INFO +45 -8
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/SOURCES.txt +6 -6
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/requires.txt +0 -1
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_cache_conditional_get.py +5 -12
- docpull-2.5.0/tests/test_v2_conversion.py → docpull-3.0.0/tests/test_conversion.py +26 -42
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_convert_step_new.py +2 -6
- docpull-2.5.0/tests/test_v2_discovery.py → docpull-3.0.0/tests/test_discovery.py +1 -1
- docpull-2.5.0/tests/test_v2_integration.py → docpull-3.0.0/tests/test_integration.py +4 -5
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_mcp_tools.py +66 -20
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_naming.py +4 -23
- docpull-2.5.0/tests/test_v2_pipeline.py → docpull-3.0.0/tests/test_pipeline.py +2 -4
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_special_cases.py +5 -17
- {docpull-2.5.0 → docpull-3.0.0}/LICENSE +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/setup.cfg +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/__main__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/cache/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/cache/manager.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/cache/streaming_dedup.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/concurrency/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/concurrency/manager.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/chunking.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/extractor.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/markdown.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/protocols.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/core/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/composite.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/crawler.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/link_extractors/static.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/protocols.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/sitemap.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/doctor.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/http/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/http/protocols.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/http/rate_limiter.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/logging_config.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/mcp/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/mcp/sources.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/metadata_extractor.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/models/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/models/events.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/base.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/chunk.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/dedup.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/save_json.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/save_ndjson.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/validate.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/py.typed +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull/security/__init__.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/top_level.txt +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_chunking.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_cli.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_link_extractors.py +0 -0
- docpull-2.5.0/tests/test_fixes_v2_3_0.py → docpull-3.0.0/tests/test_real_site_regressions.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_save_ndjson.py +0 -0
- {docpull-2.5.0 → docpull-3.0.0}/tests/test_security_hardening.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 2.5.0
|
|
3
|
+
Version: 3.0.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -68,7 +68,6 @@ Provides-Extra: dev
|
|
|
68
68
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
69
69
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
70
70
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
71
|
-
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
72
71
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
73
72
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
74
73
|
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
@@ -222,7 +221,7 @@ pip install 'docpull[mcp]'
|
|
|
222
221
|
docpull mcp # starts the stdio server
|
|
223
222
|
```
|
|
224
223
|
|
|
225
|
-
Add to Claude Desktop or Claude Code:
|
|
224
|
+
Add to Claude Desktop or Claude Code manually:
|
|
226
225
|
|
|
227
226
|
```json
|
|
228
227
|
{
|
|
@@ -235,13 +234,39 @@ Add to Claude Desktop or Claude Code:
|
|
|
235
234
|
}
|
|
236
235
|
```
|
|
237
236
|
|
|
238
|
-
|
|
237
|
+
Or, if you use Claude Code, install the plugin instead — it bundles the MCP
|
|
238
|
+
server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`,
|
|
239
|
+
`/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude
|
|
240
|
+
when to reach for docpull automatically:
|
|
239
241
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
+
```bash
|
|
243
|
+
# 1. Install docpull with the MCP extra (required for the plugin)
|
|
244
|
+
pip install 'docpull[mcp]'
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
```
|
|
248
|
+
# 2. Then in Claude Code:
|
|
249
|
+
/plugin marketplace add raintree-technology/docpull
|
|
250
|
+
/plugin install docpull@docpull
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
See [plugin/README.md](plugin/README.md) for details.
|
|
254
|
+
|
|
255
|
+
Tools exposed (8 total — read tools advertise `readOnlyHint` so hosts that auto-approve safe tools won't prompt):
|
|
256
|
+
|
|
257
|
+
Read:
|
|
258
|
+
- `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl. HTTPS-only, SSRF-validated.
|
|
242
259
|
- `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
|
|
243
|
-
- `list_indexed()` — what has been fetched locally
|
|
244
|
-
- `grep_docs(pattern, library?)` — regex search across fetched Markdown
|
|
260
|
+
- `list_indexed()` — what has been fetched locally, with last-fetched age
|
|
261
|
+
- `grep_docs(pattern, library?, limit?, context?)` — regex search across fetched Markdown (length-capped + wall-clock budgeted to mitigate ReDoS)
|
|
262
|
+
- `read_doc(library, path, line_start?, line_end?)` — read a specific cached file, optionally line-sliced
|
|
263
|
+
|
|
264
|
+
Write:
|
|
265
|
+
- `ensure_docs(source, force?, profile?)` — fetch a named library (cached 7 days). Forwards progress to clients that supply a `progressToken`.
|
|
266
|
+
- `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
|
|
267
|
+
- `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
|
|
268
|
+
|
|
269
|
+
All tools that carry data also return `structuredContent` validated against an `outputSchema` for clients that prefer typed output.
|
|
245
270
|
|
|
246
271
|
User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
|
|
247
272
|
|
|
@@ -254,6 +279,17 @@ sources:
|
|
|
254
279
|
maxPages: 200
|
|
255
280
|
```
|
|
256
281
|
|
|
282
|
+
### About the `mcp/` directory in this repo
|
|
283
|
+
|
|
284
|
+
The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
|
|
285
|
+
server backed by PostgreSQL with pgvector for semantic search. It is not
|
|
286
|
+
the Python MCP server shipped in the `docpull` package described above
|
|
287
|
+
— that one is the right choice for almost every user and is installed
|
|
288
|
+
with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
|
|
289
|
+
own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
|
|
290
|
+
unless you specifically need pgvector-backed semantic search, ignore it
|
|
291
|
+
and use `docpull mcp`.
|
|
292
|
+
|
|
257
293
|
## Output
|
|
258
294
|
|
|
259
295
|
Markdown files with YAML frontmatter:
|
|
@@ -350,6 +386,7 @@ docpull URL --preview-urls # List URLs without fetching
|
|
|
350
386
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
351
387
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
352
388
|
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
389
|
+
- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
|
|
353
390
|
|
|
354
391
|
## License
|
|
355
392
|
|
|
@@ -140,7 +140,7 @@ pip install 'docpull[mcp]'
|
|
|
140
140
|
docpull mcp # starts the stdio server
|
|
141
141
|
```
|
|
142
142
|
|
|
143
|
-
Add to Claude Desktop or Claude Code:
|
|
143
|
+
Add to Claude Desktop or Claude Code manually:
|
|
144
144
|
|
|
145
145
|
```json
|
|
146
146
|
{
|
|
@@ -153,13 +153,39 @@ Add to Claude Desktop or Claude Code:
|
|
|
153
153
|
}
|
|
154
154
|
```
|
|
155
155
|
|
|
156
|
-
|
|
156
|
+
Or, if you use Claude Code, install the plugin instead — it bundles the MCP
|
|
157
|
+
server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`,
|
|
158
|
+
`/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude
|
|
159
|
+
when to reach for docpull automatically:
|
|
157
160
|
|
|
158
|
-
|
|
159
|
-
|
|
161
|
+
```bash
|
|
162
|
+
# 1. Install docpull with the MCP extra (required for the plugin)
|
|
163
|
+
pip install 'docpull[mcp]'
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
# 2. Then in Claude Code:
|
|
168
|
+
/plugin marketplace add raintree-technology/docpull
|
|
169
|
+
/plugin install docpull@docpull
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
See [plugin/README.md](plugin/README.md) for details.
|
|
173
|
+
|
|
174
|
+
Tools exposed (8 total — read tools advertise `readOnlyHint` so hosts that auto-approve safe tools won't prompt):
|
|
175
|
+
|
|
176
|
+
Read:
|
|
177
|
+
- `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl. HTTPS-only, SSRF-validated.
|
|
160
178
|
- `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
|
|
161
|
-
- `list_indexed()` — what has been fetched locally
|
|
162
|
-
- `grep_docs(pattern, library?)` — regex search across fetched Markdown
|
|
179
|
+
- `list_indexed()` — what has been fetched locally, with last-fetched age
|
|
180
|
+
- `grep_docs(pattern, library?, limit?, context?)` — regex search across fetched Markdown (length-capped + wall-clock budgeted to mitigate ReDoS)
|
|
181
|
+
- `read_doc(library, path, line_start?, line_end?)` — read a specific cached file, optionally line-sliced
|
|
182
|
+
|
|
183
|
+
Write:
|
|
184
|
+
- `ensure_docs(source, force?, profile?)` — fetch a named library (cached 7 days). Forwards progress to clients that supply a `progressToken`.
|
|
185
|
+
- `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
|
|
186
|
+
- `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
|
|
187
|
+
|
|
188
|
+
All tools that carry data also return `structuredContent` validated against an `outputSchema` for clients that prefer typed output.
|
|
163
189
|
|
|
164
190
|
User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
|
|
165
191
|
|
|
@@ -172,6 +198,17 @@ sources:
|
|
|
172
198
|
maxPages: 200
|
|
173
199
|
```
|
|
174
200
|
|
|
201
|
+
### About the `mcp/` directory in this repo
|
|
202
|
+
|
|
203
|
+
The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
|
|
204
|
+
server backed by PostgreSQL with pgvector for semantic search. It is not
|
|
205
|
+
the Python MCP server shipped in the `docpull` package described above
|
|
206
|
+
— that one is the right choice for almost every user and is installed
|
|
207
|
+
with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
|
|
208
|
+
own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
|
|
209
|
+
unless you specifically need pgvector-backed semantic search, ignore it
|
|
210
|
+
and use `docpull mcp`.
|
|
211
|
+
|
|
175
212
|
## Output
|
|
176
213
|
|
|
177
214
|
Markdown files with YAML frontmatter:
|
|
@@ -268,6 +305,7 @@ docpull URL --preview-urls # List URLs without fetching
|
|
|
268
305
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
269
306
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
270
307
|
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
308
|
+
- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
|
|
271
309
|
|
|
272
310
|
## License
|
|
273
311
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "2.5.0"
|
|
7
|
+
version = "3.0.0"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -102,7 +102,6 @@ dev = [
|
|
|
102
102
|
"pytest>=7.0.0",
|
|
103
103
|
"pytest-cov>=4.0.0",
|
|
104
104
|
"pytest-asyncio>=0.21.0",
|
|
105
|
-
"black>=23.0.0",
|
|
106
105
|
"mypy>=1.0.0",
|
|
107
106
|
"ruff>=0.1.0",
|
|
108
107
|
"bandit>=1.7.0",
|
|
@@ -132,10 +131,6 @@ include = ["docpull*"]
|
|
|
132
131
|
[tool.setuptools.package-data]
|
|
133
132
|
docpull = ["py.typed"]
|
|
134
133
|
|
|
135
|
-
[tool.black]
|
|
136
|
-
line-length = 110
|
|
137
|
-
target-version = ["py310", "py311", "py312", "py313", "py314"]
|
|
138
|
-
|
|
139
134
|
[tool.ruff]
|
|
140
135
|
line-length = 110
|
|
141
136
|
target-version = "py310"
|
|
@@ -562,8 +562,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
562
562
|
n_chunks = len(ctx.chunks) if ctx.chunks else 0
|
|
563
563
|
extra = f" ({n_chunks} chunks)" if n_chunks else ""
|
|
564
564
|
console.print(
|
|
565
|
-
f"[green]Saved:[/green] {ctx.output_path} "
|
|
566
|
-
f"[{ctx.source_type or 'generic'}]{extra}"
|
|
565
|
+
f"[green]Saved:[/green] {ctx.output_path} [{ctx.source_type or 'generic'}]{extra}"
|
|
567
566
|
)
|
|
568
567
|
return 0
|
|
569
568
|
|
|
@@ -246,7 +246,8 @@ def _describe_type(schema: Any, spec: dict[str, Any]) -> str:
|
|
|
246
246
|
if not isinstance(schema, dict):
|
|
247
247
|
return "?"
|
|
248
248
|
if "$ref" in schema:
|
|
249
|
-
|
|
249
|
+
ref: str = schema["$ref"]
|
|
250
|
+
return ref.rsplit("/", 1)[-1]
|
|
250
251
|
for key in ("oneOf", "anyOf", "allOf"):
|
|
251
252
|
if isinstance(schema.get(key), list) and schema[key]:
|
|
252
253
|
seen: list[str] = []
|
|
@@ -349,9 +350,7 @@ class OpenApiExtractor:
|
|
|
349
350
|
for method, op in ops.items():
|
|
350
351
|
if method.lower() not in _HTTP_METHODS or not isinstance(op, dict):
|
|
351
352
|
continue
|
|
352
|
-
self._render_operation(
|
|
353
|
-
lines, path, method, op, shared_params, data
|
|
354
|
-
)
|
|
353
|
+
self._render_operation(lines, path, method, op, shared_params, data)
|
|
355
354
|
|
|
356
355
|
return SpecialCaseResult(
|
|
357
356
|
markdown="\n".join(lines).strip() + "\n",
|
|
@@ -410,9 +409,7 @@ class OpenApiExtractor:
|
|
|
410
409
|
lines.append(bullet)
|
|
411
410
|
lines.append("")
|
|
412
411
|
|
|
413
|
-
def _render_request_body(
|
|
414
|
-
self, lines: list[str], body: Any, spec: dict[str, Any]
|
|
415
|
-
) -> None:
|
|
412
|
+
def _render_request_body(self, lines: list[str], body: Any, spec: dict[str, Any]) -> None:
|
|
416
413
|
if not isinstance(body, dict):
|
|
417
414
|
return
|
|
418
415
|
if "$ref" in body:
|
|
@@ -455,9 +452,7 @@ class OpenApiExtractor:
|
|
|
455
452
|
lines.append(f"- body: {_describe_type(schema, spec)}")
|
|
456
453
|
lines.append("")
|
|
457
454
|
|
|
458
|
-
def _render_responses(
|
|
459
|
-
self, lines: list[str], responses: Any, spec: dict[str, Any]
|
|
460
|
-
) -> None:
|
|
455
|
+
def _render_responses(self, lines: list[str], responses: Any, spec: dict[str, Any]) -> None:
|
|
461
456
|
if not isinstance(responses, dict) or not responses:
|
|
462
457
|
return
|
|
463
458
|
lines.append("**Responses:**")
|
|
@@ -535,11 +530,7 @@ class MdxSourceExtractor:
|
|
|
535
530
|
for pattern in self._EDIT_PATTERNS:
|
|
536
531
|
match = pattern.search(text)
|
|
537
532
|
if match:
|
|
538
|
-
raw_url = (
|
|
539
|
-
match.group(1)
|
|
540
|
-
.replace("/blob/", "/raw/")
|
|
541
|
-
.replace("/edit/", "/raw/")
|
|
542
|
-
)
|
|
533
|
+
raw_url = match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
543
534
|
# Return None so downstream runs, but attach hint via a cache
|
|
544
535
|
# mechanism. Simpler: return None always; step reads the URL
|
|
545
536
|
# if needed by re-running the regex.
|
|
@@ -567,9 +558,7 @@ def find_mdx_source_url(html: bytes) -> str | None:
|
|
|
567
558
|
for pattern in MdxSourceExtractor._EDIT_PATTERNS:
|
|
568
559
|
match = pattern.search(text)
|
|
569
560
|
if match:
|
|
570
|
-
return (
|
|
571
|
-
match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
572
|
-
)
|
|
561
|
+
return match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
573
562
|
return None
|
|
574
563
|
|
|
575
564
|
|
|
@@ -265,9 +265,7 @@ class Fetcher:
|
|
|
265
265
|
# built-in 50 MB ceiling.
|
|
266
266
|
max_content_size_kw: dict[str, int] = {}
|
|
267
267
|
if self.config.content_filter.max_file_size is not None:
|
|
268
|
-
max_content_size_kw["max_content_size"] = int(
|
|
269
|
-
self.config.content_filter.max_file_size
|
|
270
|
-
)
|
|
268
|
+
max_content_size_kw["max_content_size"] = int(self.config.content_filter.max_file_size)
|
|
271
269
|
self._http_client = AsyncHttpClient(
|
|
272
270
|
rate_limiter=self._rate_limiter,
|
|
273
271
|
max_retries=self.config.network.max_retries,
|
|
@@ -509,11 +507,7 @@ class Fetcher:
|
|
|
509
507
|
|
|
510
508
|
steps = self._pipeline.steps
|
|
511
509
|
if not save:
|
|
512
|
-
steps = [
|
|
513
|
-
s
|
|
514
|
-
for s in steps
|
|
515
|
-
if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}
|
|
516
|
-
]
|
|
510
|
+
steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}]
|
|
517
511
|
pipeline = type(self._pipeline)(steps=steps)
|
|
518
512
|
ctx = await pipeline.execute(url, output_path)
|
|
519
513
|
if ctx.error:
|
|
@@ -531,8 +525,8 @@ class Fetcher:
|
|
|
531
525
|
"""
|
|
532
526
|
Compute output path for a URL using the configured naming strategy.
|
|
533
527
|
|
|
534
|
-
- ``full
|
|
535
|
-
|
|
528
|
+
- ``full``: a single flattened filename (URL path joined with
|
|
529
|
+
underscores).
|
|
536
530
|
- ``hierarchical``: URL path preserved as nested directories,
|
|
537
531
|
terminating in ``<segment>.md`` or ``index.md`` for trailing
|
|
538
532
|
slashes. The leaf is `_validate_output_path`-safe — every segment
|
|
@@ -545,7 +539,6 @@ class Fetcher:
|
|
|
545
539
|
parts = _url_to_path_parts(url, self.config.url)
|
|
546
540
|
return output_dir.joinpath(*parts)
|
|
547
541
|
|
|
548
|
-
# full / flat / short: aliased to full until 3.0
|
|
549
542
|
filename = _url_to_filename(url, self.config.url)
|
|
550
543
|
return output_dir / filename
|
|
551
544
|
|
|
@@ -638,9 +631,7 @@ class Fetcher:
|
|
|
638
631
|
)
|
|
639
632
|
|
|
640
633
|
discovered: list[str] = []
|
|
641
|
-
async for url in self._discoverer.discover(
|
|
642
|
-
start_url, max_urls=self.config.crawl.max_pages
|
|
643
|
-
):
|
|
634
|
+
async for url in self._discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
|
|
644
635
|
discovered.append(url)
|
|
645
636
|
if self._cancelled:
|
|
646
637
|
yield FetchEvent(
|
|
@@ -756,9 +747,7 @@ class Fetcher:
|
|
|
756
747
|
)
|
|
757
748
|
)
|
|
758
749
|
try:
|
|
759
|
-
async for url in discoverer.discover(
|
|
760
|
-
start_url, max_urls=self.config.crawl.max_pages
|
|
761
|
-
):
|
|
750
|
+
async for url in discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
|
|
762
751
|
if self._cancelled:
|
|
763
752
|
break
|
|
764
753
|
await url_queue.put(url)
|
|
@@ -770,14 +759,10 @@ class Fetcher:
|
|
|
770
759
|
and self._cache_manager
|
|
771
760
|
and len(discovered_for_resume) % 200 == 0
|
|
772
761
|
):
|
|
773
|
-
self._cache_manager.save_discovered_urls(
|
|
774
|
-
list(discovered_for_resume), start_url
|
|
775
|
-
)
|
|
762
|
+
self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url)
|
|
776
763
|
finally:
|
|
777
764
|
if self.config.cache.enabled and self._cache_manager:
|
|
778
|
-
self._cache_manager.save_discovered_urls(
|
|
779
|
-
discovered_for_resume, start_url
|
|
780
|
-
)
|
|
765
|
+
self._cache_manager.save_discovered_urls(discovered_for_resume, start_url)
|
|
781
766
|
self._stats.urls_discovered = len(discovered_for_resume)
|
|
782
767
|
await event_queue.put(
|
|
783
768
|
FetchEvent(
|
|
@@ -810,6 +795,7 @@ class Fetcher:
|
|
|
810
795
|
continue
|
|
811
796
|
|
|
812
797
|
local_events: list[FetchEvent] = []
|
|
798
|
+
|
|
813
799
|
# Bind the per-iteration list as a default arg so ruff B023
|
|
814
800
|
# is happy. Closure is consumed synchronously by execute()
|
|
815
801
|
# before the next iteration anyway, so capture order is safe.
|
|
@@ -936,9 +922,7 @@ def fetch_one(url: str, **kwargs: object) -> PageContext:
|
|
|
936
922
|
"""
|
|
937
923
|
try:
|
|
938
924
|
asyncio.get_running_loop()
|
|
939
|
-
raise RuntimeError(
|
|
940
|
-
"fetch_one() called from async context. Use Fetcher.fetch_one() instead."
|
|
941
|
-
)
|
|
925
|
+
raise RuntimeError("fetch_one() called from async context. Use Fetcher.fetch_one() instead.")
|
|
942
926
|
except RuntimeError as exc:
|
|
943
927
|
if "no running event loop" not in str(exc).lower():
|
|
944
928
|
raise
|
|
@@ -29,19 +29,20 @@ def normalize_url(url: str) -> str:
|
|
|
29
29
|
Returns:
|
|
30
30
|
Normalized URL string
|
|
31
31
|
"""
|
|
32
|
-
# Use url_normalize library if available
|
|
32
|
+
# Use url_normalize library if available for case / percent-encoding
|
|
33
|
+
# cleanup. It does NOT strip fragments, so we always do that ourselves
|
|
34
|
+
# below — keeping behavior consistent whether the optional dep is
|
|
35
|
+
# installed or not.
|
|
33
36
|
if URL_NORMALIZE_AVAILABLE:
|
|
34
37
|
try:
|
|
35
|
-
|
|
36
|
-
|
|
38
|
+
normalized = url_normalize(url)
|
|
39
|
+
if normalized:
|
|
40
|
+
url = normalized
|
|
37
41
|
except ValueError:
|
|
38
42
|
logger.debug("url_normalize rejected URL during normalization", exc_info=True)
|
|
39
43
|
|
|
40
|
-
# Basic normalization
|
|
41
44
|
parsed = urlparse(url)
|
|
42
|
-
|
|
43
|
-
# Remove fragment
|
|
44
|
-
normalized = urlunparse(
|
|
45
|
+
return urlunparse(
|
|
45
46
|
(
|
|
46
47
|
parsed.scheme.lower(),
|
|
47
48
|
parsed.netloc.lower(),
|
|
@@ -52,8 +53,6 @@ def normalize_url(url: str) -> str:
|
|
|
52
53
|
)
|
|
53
54
|
)
|
|
54
55
|
|
|
55
|
-
return normalized
|
|
56
|
-
|
|
57
56
|
|
|
58
57
|
class PatternFilter:
|
|
59
58
|
"""
|
|
@@ -12,7 +12,7 @@ from types import TracebackType
|
|
|
12
12
|
from urllib.parse import urljoin, urlparse
|
|
13
13
|
|
|
14
14
|
import aiohttp
|
|
15
|
-
from aiohttp.abc import AbstractResolver
|
|
15
|
+
from aiohttp.abc import AbstractResolver, ResolveResult
|
|
16
16
|
|
|
17
17
|
from ..security.url_validator import UrlValidator
|
|
18
18
|
from .protocols import HttpResponse
|
|
@@ -45,14 +45,14 @@ class _ValidatedResolver(AbstractResolver):
|
|
|
45
45
|
self,
|
|
46
46
|
host: str,
|
|
47
47
|
port: int = 0,
|
|
48
|
-
family:
|
|
49
|
-
) -> list[
|
|
48
|
+
family: socket.AddressFamily = socket.AF_UNSPEC,
|
|
49
|
+
) -> list[ResolveResult]:
|
|
50
50
|
try:
|
|
51
51
|
addresses = self._url_validator.resolve_allowed_addresses(host)
|
|
52
52
|
except ValueError as err:
|
|
53
53
|
raise OSError(str(err)) from err
|
|
54
54
|
|
|
55
|
-
results: list[
|
|
55
|
+
results: list[ResolveResult] = []
|
|
56
56
|
for address in addresses:
|
|
57
57
|
ip = ipaddress.ip_address(address)
|
|
58
58
|
entry_family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET
|
|
@@ -60,14 +60,14 @@ class _ValidatedResolver(AbstractResolver):
|
|
|
60
60
|
continue
|
|
61
61
|
|
|
62
62
|
results.append(
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
63
|
+
ResolveResult(
|
|
64
|
+
hostname=host,
|
|
65
|
+
host=address,
|
|
66
|
+
port=port,
|
|
67
|
+
family=entry_family,
|
|
68
|
+
proto=socket.IPPROTO_TCP,
|
|
69
|
+
flags=socket.AI_NUMERICHOST,
|
|
70
|
+
)
|
|
71
71
|
)
|
|
72
72
|
|
|
73
73
|
if not results:
|
|
@@ -236,20 +236,21 @@ class AsyncHttpClient:
|
|
|
236
236
|
|
|
237
237
|
async def __aenter__(self) -> AsyncHttpClient:
|
|
238
238
|
"""Enter async context and create session."""
|
|
239
|
-
|
|
240
|
-
"limit": 100, # Total connection limit
|
|
241
|
-
"limit_per_host": 10, # Per-host connection limit
|
|
242
|
-
"ttl_dns_cache": 300, # DNS cache TTL
|
|
243
|
-
}
|
|
239
|
+
resolver: AbstractResolver | None = None
|
|
244
240
|
if self._url_validator is not None and self._proxy is None:
|
|
245
|
-
|
|
241
|
+
resolver = _ValidatedResolver(self._url_validator)
|
|
246
242
|
elif self._proxy is not None and self._url_validator is not None:
|
|
247
243
|
logger.warning(
|
|
248
244
|
"Proxy mode: DNS-pinning resolver is not active. "
|
|
249
245
|
"URL validation still runs pre-flight, but the proxy resolves DNS independently."
|
|
250
246
|
)
|
|
251
247
|
|
|
252
|
-
connector = aiohttp.TCPConnector(
|
|
248
|
+
connector = aiohttp.TCPConnector(
|
|
249
|
+
limit=100,
|
|
250
|
+
limit_per_host=10,
|
|
251
|
+
ttl_dns_cache=300,
|
|
252
|
+
resolver=resolver,
|
|
253
|
+
)
|
|
253
254
|
self._session = aiohttp.ClientSession(
|
|
254
255
|
connector=connector,
|
|
255
256
|
headers={"User-Agent": self._user_agent},
|
|
@@ -103,7 +103,11 @@ _GREP_DOCS_OUTPUT_SCHEMA = {
|
|
|
103
103
|
"items": {
|
|
104
104
|
"type": "object",
|
|
105
105
|
"properties": {
|
|
106
|
-
"
|
|
106
|
+
"library": {"type": "string"},
|
|
107
|
+
"path": {
|
|
108
|
+
"type": "string",
|
|
109
|
+
"description": "Relative to the library root; pass directly to read_doc",
|
|
110
|
+
},
|
|
107
111
|
"match_count": {"type": "integer"},
|
|
108
112
|
"matches": {
|
|
109
113
|
"type": "array",
|
|
@@ -119,7 +123,7 @@ _GREP_DOCS_OUTPUT_SCHEMA = {
|
|
|
119
123
|
},
|
|
120
124
|
},
|
|
121
125
|
},
|
|
122
|
-
"required": ["path", "match_count", "matches"],
|
|
126
|
+
"required": ["library", "path", "match_count", "matches"],
|
|
123
127
|
},
|
|
124
128
|
},
|
|
125
129
|
"truncated": {"type": "boolean"},
|
|
@@ -211,8 +215,7 @@ async def _run_stdio() -> int:
|
|
|
211
215
|
from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
|
|
212
216
|
except ImportError:
|
|
213
217
|
print(
|
|
214
|
-
"docpull mcp requires the 'mcp' package. Install with: "
|
|
215
|
-
"pip install docpull[mcp]",
|
|
218
|
+
"docpull mcp requires the 'mcp' package. Install with: pip install docpull[mcp]",
|
|
216
219
|
file=sys.stderr,
|
|
217
220
|
)
|
|
218
221
|
return 1
|
|
@@ -333,8 +336,9 @@ async def _run_stdio() -> int:
|
|
|
333
336
|
description=(
|
|
334
337
|
"Regex search through fetched Markdown. Results are ranked by "
|
|
335
338
|
"match density (most matches per file first) and rendered with "
|
|
336
|
-
"lines of surrounding context.
|
|
337
|
-
"
|
|
339
|
+
"lines of surrounding context. Each result returns the library "
|
|
340
|
+
"and a path relative to the library root, so you can feed both "
|
|
341
|
+
"fields straight into read_doc. Use ensure_docs first."
|
|
338
342
|
),
|
|
339
343
|
annotations=ToolAnnotations(
|
|
340
344
|
title="Regex-search cached docs",
|
|
@@ -370,8 +374,9 @@ async def _run_stdio() -> int:
|
|
|
370
374
|
name="read_doc",
|
|
371
375
|
description=(
|
|
372
376
|
"Read a Markdown file from a fetched library, optionally sliced "
|
|
373
|
-
"by line range. The natural follow-up to grep_docs: pass
|
|
374
|
-
"library
|
|
377
|
+
"by line range. The natural follow-up to grep_docs: pass each "
|
|
378
|
+
"result's library and path (path is already relative to the "
|
|
379
|
+
"library root) to pull more surrounding context."
|
|
375
380
|
),
|
|
376
381
|
annotations=ToolAnnotations(
|
|
377
382
|
title="Read a cached doc file",
|
|
@@ -584,7 +589,10 @@ async def _run_stdio() -> int:
|
|
|
584
589
|
# isError=False), and
|
|
585
590
|
# (b) errors on tools with an outputSchema don't fail the validator
|
|
586
591
|
# for "missing structured content."
|
|
587
|
-
content
|
|
592
|
+
# `content` is typed `list[TextContent | ImageContent | ...]` on the SDK
|
|
593
|
+
# side; list invariance means we have to widen the local annotation
|
|
594
|
+
# explicitly even though TextContent is one of the valid variants.
|
|
595
|
+
content: list[Any] = [TextContent(type="text", text=result.text)]
|
|
588
596
|
return CallToolResult(
|
|
589
597
|
content=content,
|
|
590
598
|
structuredContent=result.data if not result.is_error else None,
|