docpull 2.5.0__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {docpull-2.5.0/src/docpull.egg-info → docpull-3.0.0}/PKG-INFO +45 -8
  2. {docpull-2.5.0 → docpull-3.0.0}/README.md +44 -6
  3. {docpull-2.5.0 → docpull-3.0.0}/pyproject.toml +1 -6
  4. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/__init__.py +1 -1
  5. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/cli.py +1 -2
  6. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/special_cases.py +7 -18
  7. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/core/fetcher.py +10 -26
  8. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/filters.py +8 -9
  9. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/http/client.py +20 -19
  10. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/mcp/server.py +17 -9
  11. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/mcp/tools.py +34 -54
  12. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/models/config.py +6 -73
  13. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/models/profiles.py +1 -5
  14. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/convert.py +12 -12
  15. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/fetch.py +2 -3
  16. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/save.py +1 -1
  17. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/security/robots.py +13 -5
  18. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/security/url_validator.py +2 -2
  19. {docpull-2.5.0 → docpull-3.0.0/src/docpull.egg-info}/PKG-INFO +45 -8
  20. {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/SOURCES.txt +6 -6
  21. {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/requires.txt +0 -1
  22. {docpull-2.5.0 → docpull-3.0.0}/tests/test_cache_conditional_get.py +5 -12
  23. docpull-2.5.0/tests/test_v2_conversion.py → docpull-3.0.0/tests/test_conversion.py +26 -42
  24. {docpull-2.5.0 → docpull-3.0.0}/tests/test_convert_step_new.py +2 -6
  25. docpull-2.5.0/tests/test_v2_discovery.py → docpull-3.0.0/tests/test_discovery.py +1 -1
  26. docpull-2.5.0/tests/test_v2_integration.py → docpull-3.0.0/tests/test_integration.py +4 -5
  27. {docpull-2.5.0 → docpull-3.0.0}/tests/test_mcp_tools.py +66 -20
  28. {docpull-2.5.0 → docpull-3.0.0}/tests/test_naming.py +4 -23
  29. docpull-2.5.0/tests/test_v2_pipeline.py → docpull-3.0.0/tests/test_pipeline.py +2 -4
  30. {docpull-2.5.0 → docpull-3.0.0}/tests/test_special_cases.py +5 -17
  31. {docpull-2.5.0 → docpull-3.0.0}/LICENSE +0 -0
  32. {docpull-2.5.0 → docpull-3.0.0}/setup.cfg +0 -0
  33. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/__main__.py +0 -0
  34. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/cache/__init__.py +0 -0
  35. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/cache/manager.py +0 -0
  36. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/cache/streaming_dedup.py +0 -0
  37. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/concurrency/__init__.py +0 -0
  38. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/concurrency/manager.py +0 -0
  39. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/__init__.py +0 -0
  40. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/chunking.py +0 -0
  41. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/extractor.py +0 -0
  42. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/markdown.py +0 -0
  43. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/protocols.py +0 -0
  44. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  45. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/core/__init__.py +0 -0
  46. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/__init__.py +0 -0
  47. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/composite.py +0 -0
  48. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/crawler.py +0 -0
  49. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  50. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
  51. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  52. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/link_extractors/static.py +0 -0
  53. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/protocols.py +0 -0
  54. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/discovery/sitemap.py +0 -0
  55. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/doctor.py +0 -0
  56. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/http/__init__.py +0 -0
  57. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/http/protocols.py +0 -0
  58. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/http/rate_limiter.py +0 -0
  59. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/logging_config.py +0 -0
  60. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/mcp/__init__.py +0 -0
  61. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/mcp/sources.py +0 -0
  62. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/metadata_extractor.py +0 -0
  63. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/models/__init__.py +0 -0
  64. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/models/events.py +0 -0
  65. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/__init__.py +0 -0
  66. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/base.py +0 -0
  67. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/__init__.py +0 -0
  68. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/chunk.py +0 -0
  69. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/dedup.py +0 -0
  70. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/metadata.py +0 -0
  71. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/save_json.py +0 -0
  72. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/save_ndjson.py +0 -0
  73. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
  74. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/pipeline/steps/validate.py +0 -0
  75. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/py.typed +0 -0
  76. {docpull-2.5.0 → docpull-3.0.0}/src/docpull/security/__init__.py +0 -0
  77. {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  78. {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/entry_points.txt +0 -0
  79. {docpull-2.5.0 → docpull-3.0.0}/src/docpull.egg-info/top_level.txt +0 -0
  80. {docpull-2.5.0 → docpull-3.0.0}/tests/test_chunking.py +0 -0
  81. {docpull-2.5.0 → docpull-3.0.0}/tests/test_cli.py +0 -0
  82. {docpull-2.5.0 → docpull-3.0.0}/tests/test_link_extractors.py +0 -0
  83. docpull-2.5.0/tests/test_fixes_v2_3_0.py → docpull-3.0.0/tests/test_real_site_regressions.py +0 -0
  84. {docpull-2.5.0 → docpull-3.0.0}/tests/test_save_ndjson.py +0 -0
  85. {docpull-2.5.0 → docpull-3.0.0}/tests/test_security_hardening.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 2.5.0
3
+ Version: 3.0.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -68,7 +68,6 @@ Provides-Extra: dev
68
68
  Requires-Dist: pytest>=7.0.0; extra == "dev"
69
69
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
70
70
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
71
- Requires-Dist: black>=23.0.0; extra == "dev"
72
71
  Requires-Dist: mypy>=1.0.0; extra == "dev"
73
72
  Requires-Dist: ruff>=0.1.0; extra == "dev"
74
73
  Requires-Dist: bandit>=1.7.0; extra == "dev"
@@ -222,7 +221,7 @@ pip install 'docpull[mcp]'
222
221
  docpull mcp # starts the stdio server
223
222
  ```
224
223
 
225
- Add to Claude Desktop or Claude Code:
224
+ Add to Claude Desktop or Claude Code manually:
226
225
 
227
226
  ```json
228
227
  {
@@ -235,13 +234,39 @@ Add to Claude Desktop or Claude Code:
235
234
  }
236
235
  ```
237
236
 
238
- Tools exposed:
237
+ Or, if you use Claude Code, install the plugin instead — it bundles the MCP
238
+ server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`,
239
+ `/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude
240
+ when to reach for docpull automatically:
239
241
 
240
- - `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl
241
- - `ensure_docs(source, force?)` fetch a named library (cached 7 days)
242
+ ```bash
243
+ # 1. Install docpull with the MCP extra (required for the plugin)
244
+ pip install 'docpull[mcp]'
245
+ ```
246
+
247
+ ```
248
+ # 2. Then in Claude Code:
249
+ /plugin marketplace add raintree-technology/docpull
250
+ /plugin install docpull@docpull
251
+ ```
252
+
253
+ See [plugin/README.md](plugin/README.md) for details.
254
+
255
+ Tools exposed (8 total — read tools advertise `readOnlyHint` so hosts that auto-approve safe tools won't prompt):
256
+
257
+ Read:
258
+ - `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl. HTTPS-only, SSRF-validated.
242
259
  - `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
243
- - `list_indexed()` — what has been fetched locally
244
- - `grep_docs(pattern, library?)` — regex search across fetched Markdown
260
+ - `list_indexed()` — what has been fetched locally, with last-fetched age
261
+ - `grep_docs(pattern, library?, limit?, context?)` — regex search across fetched Markdown (length-capped + wall-clock budgeted to mitigate ReDoS)
262
+ - `read_doc(library, path, line_start?, line_end?)` — read a specific cached file, optionally line-sliced
263
+
264
+ Write:
265
+ - `ensure_docs(source, force?, profile?)` — fetch a named library (cached 7 days). Forwards progress to clients that supply a `progressToken`.
266
+ - `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
267
+ - `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
268
+
269
+ All tools that carry data also return `structuredContent` validated against an `outputSchema` for clients that prefer typed output.
245
270
 
246
271
  User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
247
272
 
@@ -254,6 +279,17 @@ sources:
254
279
  maxPages: 200
255
280
  ```
256
281
 
282
+ ### About the `mcp/` directory in this repo
283
+
284
+ The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
285
+ server backed by PostgreSQL with pgvector for semantic search. It is not
286
+ the Python MCP server shipped in the `docpull` package described above
287
+ — that one is the right choice for almost every user and is installed
288
+ with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
289
+ own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
290
+ unless you specifically need pgvector-backed semantic search, ignore it
291
+ and use `docpull mcp`.
292
+
257
293
  ## Output
258
294
 
259
295
  Markdown files with YAML frontmatter:
@@ -350,6 +386,7 @@ docpull URL --preview-urls # List URLs without fetching
350
386
  - [PyPI](https://pypi.org/project/docpull/)
351
387
  - [GitHub](https://github.com/raintree-technology/docpull)
352
388
  - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
389
+ - [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
353
390
 
354
391
  ## License
355
392
 
@@ -140,7 +140,7 @@ pip install 'docpull[mcp]'
140
140
  docpull mcp # starts the stdio server
141
141
  ```
142
142
 
143
- Add to Claude Desktop or Claude Code:
143
+ Add to Claude Desktop or Claude Code manually:
144
144
 
145
145
  ```json
146
146
  {
@@ -153,13 +153,39 @@ Add to Claude Desktop or Claude Code:
153
153
  }
154
154
  ```
155
155
 
156
- Tools exposed:
156
+ Or, if you use Claude Code, install the plugin instead — it bundles the MCP
157
+ server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`,
158
+ `/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude
159
+ when to reach for docpull automatically:
157
160
 
158
- - `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl
159
- - `ensure_docs(source, force?)` fetch a named library (cached 7 days)
161
+ ```bash
162
+ # 1. Install docpull with the MCP extra (required for the plugin)
163
+ pip install 'docpull[mcp]'
164
+ ```
165
+
166
+ ```
167
+ # 2. Then in Claude Code:
168
+ /plugin marketplace add raintree-technology/docpull
169
+ /plugin install docpull@docpull
170
+ ```
171
+
172
+ See [plugin/README.md](plugin/README.md) for details.
173
+
174
+ Tools exposed (8 total — read tools advertise `readOnlyHint` so hosts that auto-approve safe tools won't prompt):
175
+
176
+ Read:
177
+ - `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl. HTTPS-only, SSRF-validated.
160
178
  - `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
161
- - `list_indexed()` — what has been fetched locally
162
- - `grep_docs(pattern, library?)` — regex search across fetched Markdown
179
+ - `list_indexed()` — what has been fetched locally, with last-fetched age
180
+ - `grep_docs(pattern, library?, limit?, context?)` — regex search across fetched Markdown (length-capped + wall-clock budgeted to mitigate ReDoS)
181
+ - `read_doc(library, path, line_start?, line_end?)` — read a specific cached file, optionally line-sliced
182
+
183
+ Write:
184
+ - `ensure_docs(source, force?, profile?)` — fetch a named library (cached 7 days). Forwards progress to clients that supply a `progressToken`.
185
+ - `add_source(name, url, description?, category?, max_pages?, force?)` — register a user alias (HTTPS-only, atomic write to `sources.yaml`).
186
+ - `remove_source(name, delete_cache?)` — drop a user alias and (optionally) its cached docs.
187
+
188
+ All tools that carry data also return `structuredContent` validated against an `outputSchema` for clients that prefer typed output.
163
189
 
164
190
  User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
165
191
 
@@ -172,6 +198,17 @@ sources:
172
198
  maxPages: 200
173
199
  ```
174
200
 
201
+ ### About the `mcp/` directory in this repo
202
+
203
+ The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
204
+ server backed by PostgreSQL with pgvector for semantic search. It is not
205
+ the Python MCP server shipped in the `docpull` package described above
206
+ — that one is the right choice for almost every user and is installed
207
+ with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
208
+ own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
209
+ unless you specifically need pgvector-backed semantic search, ignore it
210
+ and use `docpull mcp`.
211
+
175
212
  ## Output
176
213
 
177
214
  Markdown files with YAML frontmatter:
@@ -268,6 +305,7 @@ docpull URL --preview-urls # List URLs without fetching
268
305
  - [PyPI](https://pypi.org/project/docpull/)
269
306
  - [GitHub](https://github.com/raintree-technology/docpull)
270
307
  - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
308
+ - [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
271
309
 
272
310
  ## License
273
311
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "2.5.0"
7
+ version = "3.0.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -102,7 +102,6 @@ dev = [
102
102
  "pytest>=7.0.0",
103
103
  "pytest-cov>=4.0.0",
104
104
  "pytest-asyncio>=0.21.0",
105
- "black>=23.0.0",
106
105
  "mypy>=1.0.0",
107
106
  "ruff>=0.1.0",
108
107
  "bandit>=1.7.0",
@@ -132,10 +131,6 @@ include = ["docpull*"]
132
131
  [tool.setuptools.package-data]
133
132
  docpull = ["py.typed"]
134
133
 
135
- [tool.black]
136
- line-length = 110
137
- target-version = ["py310", "py311", "py312", "py313", "py314"]
138
-
139
134
  [tool.ruff]
140
135
  line-length = 110
141
136
  target-version = "py310"
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "2.5.0"
17
+ __version__ = "3.0.0"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -562,8 +562,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
562
562
  n_chunks = len(ctx.chunks) if ctx.chunks else 0
563
563
  extra = f" ({n_chunks} chunks)" if n_chunks else ""
564
564
  console.print(
565
- f"[green]Saved:[/green] {ctx.output_path} "
566
- f"[{ctx.source_type or 'generic'}]{extra}"
565
+ f"[green]Saved:[/green] {ctx.output_path} [{ctx.source_type or 'generic'}]{extra}"
567
566
  )
568
567
  return 0
569
568
 
@@ -246,7 +246,8 @@ def _describe_type(schema: Any, spec: dict[str, Any]) -> str:
246
246
  if not isinstance(schema, dict):
247
247
  return "?"
248
248
  if "$ref" in schema:
249
- return schema["$ref"].rsplit("/", 1)[-1]
249
+ ref: str = schema["$ref"]
250
+ return ref.rsplit("/", 1)[-1]
250
251
  for key in ("oneOf", "anyOf", "allOf"):
251
252
  if isinstance(schema.get(key), list) and schema[key]:
252
253
  seen: list[str] = []
@@ -349,9 +350,7 @@ class OpenApiExtractor:
349
350
  for method, op in ops.items():
350
351
  if method.lower() not in _HTTP_METHODS or not isinstance(op, dict):
351
352
  continue
352
- self._render_operation(
353
- lines, path, method, op, shared_params, data
354
- )
353
+ self._render_operation(lines, path, method, op, shared_params, data)
355
354
 
356
355
  return SpecialCaseResult(
357
356
  markdown="\n".join(lines).strip() + "\n",
@@ -410,9 +409,7 @@ class OpenApiExtractor:
410
409
  lines.append(bullet)
411
410
  lines.append("")
412
411
 
413
- def _render_request_body(
414
- self, lines: list[str], body: Any, spec: dict[str, Any]
415
- ) -> None:
412
+ def _render_request_body(self, lines: list[str], body: Any, spec: dict[str, Any]) -> None:
416
413
  if not isinstance(body, dict):
417
414
  return
418
415
  if "$ref" in body:
@@ -455,9 +452,7 @@ class OpenApiExtractor:
455
452
  lines.append(f"- body: {_describe_type(schema, spec)}")
456
453
  lines.append("")
457
454
 
458
- def _render_responses(
459
- self, lines: list[str], responses: Any, spec: dict[str, Any]
460
- ) -> None:
455
+ def _render_responses(self, lines: list[str], responses: Any, spec: dict[str, Any]) -> None:
461
456
  if not isinstance(responses, dict) or not responses:
462
457
  return
463
458
  lines.append("**Responses:**")
@@ -535,11 +530,7 @@ class MdxSourceExtractor:
535
530
  for pattern in self._EDIT_PATTERNS:
536
531
  match = pattern.search(text)
537
532
  if match:
538
- raw_url = (
539
- match.group(1)
540
- .replace("/blob/", "/raw/")
541
- .replace("/edit/", "/raw/")
542
- )
533
+ raw_url = match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
543
534
  # Return None so downstream runs, but attach hint via a cache
544
535
  # mechanism. Simpler: return None always; step reads the URL
545
536
  # if needed by re-running the regex.
@@ -567,9 +558,7 @@ def find_mdx_source_url(html: bytes) -> str | None:
567
558
  for pattern in MdxSourceExtractor._EDIT_PATTERNS:
568
559
  match = pattern.search(text)
569
560
  if match:
570
- return (
571
- match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
572
- )
561
+ return match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
573
562
  return None
574
563
 
575
564
 
@@ -265,9 +265,7 @@ class Fetcher:
265
265
  # built-in 50 MB ceiling.
266
266
  max_content_size_kw: dict[str, int] = {}
267
267
  if self.config.content_filter.max_file_size is not None:
268
- max_content_size_kw["max_content_size"] = int(
269
- self.config.content_filter.max_file_size
270
- )
268
+ max_content_size_kw["max_content_size"] = int(self.config.content_filter.max_file_size)
271
269
  self._http_client = AsyncHttpClient(
272
270
  rate_limiter=self._rate_limiter,
273
271
  max_retries=self.config.network.max_retries,
@@ -509,11 +507,7 @@ class Fetcher:
509
507
 
510
508
  steps = self._pipeline.steps
511
509
  if not save:
512
- steps = [
513
- s
514
- for s in steps
515
- if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}
516
- ]
510
+ steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}]
517
511
  pipeline = type(self._pipeline)(steps=steps)
518
512
  ctx = await pipeline.execute(url, output_path)
519
513
  if ctx.error:
@@ -531,8 +525,8 @@ class Fetcher:
531
525
  """
532
526
  Compute output path for a URL using the configured naming strategy.
533
527
 
534
- - ``full`` / ``flat`` / ``short``: a single flattened filename
535
- (URL path joined with underscores).
528
+ - ``full``: a single flattened filename (URL path joined with
529
+ underscores).
536
530
  - ``hierarchical``: URL path preserved as nested directories,
537
531
  terminating in ``<segment>.md`` or ``index.md`` for trailing
538
532
  slashes. The leaf is `_validate_output_path`-safe — every segment
@@ -545,7 +539,6 @@ class Fetcher:
545
539
  parts = _url_to_path_parts(url, self.config.url)
546
540
  return output_dir.joinpath(*parts)
547
541
 
548
- # full / flat / short: aliased to full until 3.0
549
542
  filename = _url_to_filename(url, self.config.url)
550
543
  return output_dir / filename
551
544
 
@@ -638,9 +631,7 @@ class Fetcher:
638
631
  )
639
632
 
640
633
  discovered: list[str] = []
641
- async for url in self._discoverer.discover(
642
- start_url, max_urls=self.config.crawl.max_pages
643
- ):
634
+ async for url in self._discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
644
635
  discovered.append(url)
645
636
  if self._cancelled:
646
637
  yield FetchEvent(
@@ -756,9 +747,7 @@ class Fetcher:
756
747
  )
757
748
  )
758
749
  try:
759
- async for url in discoverer.discover(
760
- start_url, max_urls=self.config.crawl.max_pages
761
- ):
750
+ async for url in discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
762
751
  if self._cancelled:
763
752
  break
764
753
  await url_queue.put(url)
@@ -770,14 +759,10 @@ class Fetcher:
770
759
  and self._cache_manager
771
760
  and len(discovered_for_resume) % 200 == 0
772
761
  ):
773
- self._cache_manager.save_discovered_urls(
774
- list(discovered_for_resume), start_url
775
- )
762
+ self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url)
776
763
  finally:
777
764
  if self.config.cache.enabled and self._cache_manager:
778
- self._cache_manager.save_discovered_urls(
779
- discovered_for_resume, start_url
780
- )
765
+ self._cache_manager.save_discovered_urls(discovered_for_resume, start_url)
781
766
  self._stats.urls_discovered = len(discovered_for_resume)
782
767
  await event_queue.put(
783
768
  FetchEvent(
@@ -810,6 +795,7 @@ class Fetcher:
810
795
  continue
811
796
 
812
797
  local_events: list[FetchEvent] = []
798
+
813
799
  # Bind the per-iteration list as a default arg so ruff B023
814
800
  # is happy. Closure is consumed synchronously by execute()
815
801
  # before the next iteration anyway, so capture order is safe.
@@ -936,9 +922,7 @@ def fetch_one(url: str, **kwargs: object) -> PageContext:
936
922
  """
937
923
  try:
938
924
  asyncio.get_running_loop()
939
- raise RuntimeError(
940
- "fetch_one() called from async context. Use Fetcher.fetch_one() instead."
941
- )
925
+ raise RuntimeError("fetch_one() called from async context. Use Fetcher.fetch_one() instead.")
942
926
  except RuntimeError as exc:
943
927
  if "no running event loop" not in str(exc).lower():
944
928
  raise
@@ -29,19 +29,20 @@ def normalize_url(url: str) -> str:
29
29
  Returns:
30
30
  Normalized URL string
31
31
  """
32
- # Use url_normalize library if available
32
+ # Use url_normalize library if available for case / percent-encoding
33
+ # cleanup. It does NOT strip fragments, so we always do that ourselves
34
+ # below — keeping behavior consistent whether the optional dep is
35
+ # installed or not.
33
36
  if URL_NORMALIZE_AVAILABLE:
34
37
  try:
35
- result: str = url_normalize(url)
36
- return result
38
+ normalized = url_normalize(url)
39
+ if normalized:
40
+ url = normalized
37
41
  except ValueError:
38
42
  logger.debug("url_normalize rejected URL during normalization", exc_info=True)
39
43
 
40
- # Basic normalization
41
44
  parsed = urlparse(url)
42
-
43
- # Remove fragment
44
- normalized = urlunparse(
45
+ return urlunparse(
45
46
  (
46
47
  parsed.scheme.lower(),
47
48
  parsed.netloc.lower(),
@@ -52,8 +53,6 @@ def normalize_url(url: str) -> str:
52
53
  )
53
54
  )
54
55
 
55
- return normalized
56
-
57
56
 
58
57
  class PatternFilter:
59
58
  """
@@ -12,7 +12,7 @@ from types import TracebackType
12
12
  from urllib.parse import urljoin, urlparse
13
13
 
14
14
  import aiohttp
15
- from aiohttp.abc import AbstractResolver
15
+ from aiohttp.abc import AbstractResolver, ResolveResult
16
16
 
17
17
  from ..security.url_validator import UrlValidator
18
18
  from .protocols import HttpResponse
@@ -45,14 +45,14 @@ class _ValidatedResolver(AbstractResolver):
45
45
  self,
46
46
  host: str,
47
47
  port: int = 0,
48
- family: int = socket.AF_UNSPEC,
49
- ) -> list[dict[str, object]]:
48
+ family: socket.AddressFamily = socket.AF_UNSPEC,
49
+ ) -> list[ResolveResult]:
50
50
  try:
51
51
  addresses = self._url_validator.resolve_allowed_addresses(host)
52
52
  except ValueError as err:
53
53
  raise OSError(str(err)) from err
54
54
 
55
- results: list[dict[str, object]] = []
55
+ results: list[ResolveResult] = []
56
56
  for address in addresses:
57
57
  ip = ipaddress.ip_address(address)
58
58
  entry_family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET
@@ -60,14 +60,14 @@ class _ValidatedResolver(AbstractResolver):
60
60
  continue
61
61
 
62
62
  results.append(
63
- {
64
- "hostname": host,
65
- "host": address,
66
- "port": port,
67
- "family": entry_family,
68
- "proto": socket.IPPROTO_TCP,
69
- "flags": socket.AI_NUMERICHOST,
70
- }
63
+ ResolveResult(
64
+ hostname=host,
65
+ host=address,
66
+ port=port,
67
+ family=entry_family,
68
+ proto=socket.IPPROTO_TCP,
69
+ flags=socket.AI_NUMERICHOST,
70
+ )
71
71
  )
72
72
 
73
73
  if not results:
@@ -236,20 +236,21 @@ class AsyncHttpClient:
236
236
 
237
237
  async def __aenter__(self) -> AsyncHttpClient:
238
238
  """Enter async context and create session."""
239
- connector_kwargs: dict[str, object] = {
240
- "limit": 100, # Total connection limit
241
- "limit_per_host": 10, # Per-host connection limit
242
- "ttl_dns_cache": 300, # DNS cache TTL
243
- }
239
+ resolver: AbstractResolver | None = None
244
240
  if self._url_validator is not None and self._proxy is None:
245
- connector_kwargs["resolver"] = _ValidatedResolver(self._url_validator)
241
+ resolver = _ValidatedResolver(self._url_validator)
246
242
  elif self._proxy is not None and self._url_validator is not None:
247
243
  logger.warning(
248
244
  "Proxy mode: DNS-pinning resolver is not active. "
249
245
  "URL validation still runs pre-flight, but the proxy resolves DNS independently."
250
246
  )
251
247
 
252
- connector = aiohttp.TCPConnector(**connector_kwargs)
248
+ connector = aiohttp.TCPConnector(
249
+ limit=100,
250
+ limit_per_host=10,
251
+ ttl_dns_cache=300,
252
+ resolver=resolver,
253
+ )
253
254
  self._session = aiohttp.ClientSession(
254
255
  connector=connector,
255
256
  headers={"User-Agent": self._user_agent},
@@ -103,7 +103,11 @@ _GREP_DOCS_OUTPUT_SCHEMA = {
103
103
  "items": {
104
104
  "type": "object",
105
105
  "properties": {
106
- "path": {"type": "string"},
106
+ "library": {"type": "string"},
107
+ "path": {
108
+ "type": "string",
109
+ "description": "Relative to the library root; pass directly to read_doc",
110
+ },
107
111
  "match_count": {"type": "integer"},
108
112
  "matches": {
109
113
  "type": "array",
@@ -119,7 +123,7 @@ _GREP_DOCS_OUTPUT_SCHEMA = {
119
123
  },
120
124
  },
121
125
  },
122
- "required": ["path", "match_count", "matches"],
126
+ "required": ["library", "path", "match_count", "matches"],
123
127
  },
124
128
  },
125
129
  "truncated": {"type": "boolean"},
@@ -211,8 +215,7 @@ async def _run_stdio() -> int:
211
215
  from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
212
216
  except ImportError:
213
217
  print(
214
- "docpull mcp requires the 'mcp' package. Install with: "
215
- "pip install docpull[mcp]",
218
+ "docpull mcp requires the 'mcp' package. Install with: pip install docpull[mcp]",
216
219
  file=sys.stderr,
217
220
  )
218
221
  return 1
@@ -333,8 +336,9 @@ async def _run_stdio() -> int:
333
336
  description=(
334
337
  "Regex search through fetched Markdown. Results are ranked by "
335
338
  "match density (most matches per file first) and rendered with "
336
- "lines of surrounding context. Use ensure_docs first; then "
337
- "read_doc to pull more context around a hit."
339
+ "lines of surrounding context. Each result returns the library "
340
+ "and a path relative to the library root, so you can feed both "
341
+ "fields straight into read_doc. Use ensure_docs first."
338
342
  ),
339
343
  annotations=ToolAnnotations(
340
344
  title="Regex-search cached docs",
@@ -370,8 +374,9 @@ async def _run_stdio() -> int:
370
374
  name="read_doc",
371
375
  description=(
372
376
  "Read a Markdown file from a fetched library, optionally sliced "
373
- "by line range. The natural follow-up to grep_docs: pass the "
374
- "library + path it returned to pull more surrounding context."
377
+ "by line range. The natural follow-up to grep_docs: pass each "
378
+ "result's library and path (path is already relative to the "
379
+ "library root) to pull more surrounding context."
375
380
  ),
376
381
  annotations=ToolAnnotations(
377
382
  title="Read a cached doc file",
@@ -584,7 +589,10 @@ async def _run_stdio() -> int:
584
589
  # isError=False), and
585
590
  # (b) errors on tools with an outputSchema don't fail the validator
586
591
  # for "missing structured content."
587
- content = [TextContent(type="text", text=result.text)]
592
+ # `content` is typed `list[TextContent | ImageContent | ...]` on the SDK
593
+ # side; list invariance means we have to widen the local annotation
594
+ # explicitly even though TextContent is one of the valid variants.
595
+ content: list[Any] = [TextContent(type="text", text=result.text)]
588
596
  return CallToolResult(
589
597
  content=content,
590
598
  structuredContent=result.data if not result.is_error else None,