docpull 2.5.1__tar.gz → 3.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. {docpull-2.5.1/src/docpull.egg-info → docpull-3.0.1}/PKG-INFO +20 -2
  2. {docpull-2.5.1 → docpull-3.0.1}/README.md +12 -0
  3. {docpull-2.5.1 → docpull-3.0.1}/pyproject.toml +24 -10
  4. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/__init__.py +1 -1
  5. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/cache/manager.py +10 -8
  6. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/cli.py +5 -8
  7. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/special_cases.py +13 -22
  8. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/core/fetcher.py +10 -26
  9. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/filters.py +8 -9
  10. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/enhanced.py +2 -1
  11. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/static.py +2 -1
  12. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/client.py +20 -19
  13. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/server.py +5 -3
  14. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/sources.py +63 -6
  15. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/tools.py +38 -56
  16. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/models/config.py +10 -75
  17. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/models/profiles.py +1 -5
  18. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/convert.py +12 -12
  19. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/fetch.py +2 -3
  20. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/save.py +1 -1
  21. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/save_json.py +4 -4
  22. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/save_ndjson.py +2 -2
  23. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/save_sqlite.py +2 -2
  24. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/security/robots.py +17 -7
  25. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/security/url_validator.py +7 -7
  26. docpull-3.0.1/src/docpull/time_utils.py +29 -0
  27. {docpull-2.5.1 → docpull-3.0.1/src/docpull.egg-info}/PKG-INFO +20 -2
  28. {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/SOURCES.txt +7 -5
  29. {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/requires.txt +7 -1
  30. {docpull-2.5.1 → docpull-3.0.1}/tests/test_cache_conditional_get.py +5 -12
  31. docpull-2.5.1/tests/test_v2_conversion.py → docpull-3.0.1/tests/test_conversion.py +26 -42
  32. {docpull-2.5.1 → docpull-3.0.1}/tests/test_convert_step_new.py +2 -6
  33. docpull-2.5.1/tests/test_v2_discovery.py → docpull-3.0.1/tests/test_discovery.py +1 -1
  34. docpull-2.5.1/tests/test_v2_integration.py → docpull-3.0.1/tests/test_integration.py +4 -5
  35. {docpull-2.5.1 → docpull-3.0.1}/tests/test_mcp_tools.py +60 -25
  36. {docpull-2.5.1 → docpull-3.0.1}/tests/test_naming.py +4 -23
  37. docpull-2.5.1/tests/test_v2_pipeline.py → docpull-3.0.1/tests/test_pipeline.py +2 -4
  38. {docpull-2.5.1 → docpull-3.0.1}/tests/test_special_cases.py +5 -17
  39. docpull-3.0.1/tests/test_time_utils.py +23 -0
  40. {docpull-2.5.1 → docpull-3.0.1}/LICENSE +0 -0
  41. {docpull-2.5.1 → docpull-3.0.1}/setup.cfg +0 -0
  42. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/__main__.py +0 -0
  43. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/cache/__init__.py +0 -0
  44. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/cache/streaming_dedup.py +0 -0
  45. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/concurrency/__init__.py +0 -0
  46. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/concurrency/manager.py +0 -0
  47. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/__init__.py +0 -0
  48. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/chunking.py +0 -0
  49. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/extractor.py +0 -0
  50. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/markdown.py +0 -0
  51. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/protocols.py +0 -0
  52. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  53. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/core/__init__.py +0 -0
  54. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/__init__.py +0 -0
  55. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/composite.py +0 -0
  56. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/crawler.py +0 -0
  57. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  58. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  59. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/protocols.py +0 -0
  60. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/sitemap.py +0 -0
  61. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/doctor.py +0 -0
  62. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/__init__.py +0 -0
  63. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/protocols.py +0 -0
  64. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/rate_limiter.py +0 -0
  65. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/logging_config.py +0 -0
  66. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/__init__.py +0 -0
  67. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/metadata_extractor.py +0 -0
  68. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/models/__init__.py +0 -0
  69. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/models/events.py +0 -0
  70. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/__init__.py +0 -0
  71. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/base.py +0 -0
  72. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/__init__.py +0 -0
  73. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/chunk.py +0 -0
  74. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/dedup.py +0 -0
  75. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/metadata.py +0 -0
  76. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/pipeline/steps/validate.py +0 -0
  77. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/py.typed +0 -0
  78. {docpull-2.5.1 → docpull-3.0.1}/src/docpull/security/__init__.py +0 -0
  79. {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/dependency_links.txt +0 -0
  80. {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/entry_points.txt +0 -0
  81. {docpull-2.5.1 → docpull-3.0.1}/src/docpull.egg-info/top_level.txt +0 -0
  82. {docpull-2.5.1 → docpull-3.0.1}/tests/test_chunking.py +0 -0
  83. {docpull-2.5.1 → docpull-3.0.1}/tests/test_cli.py +0 -0
  84. {docpull-2.5.1 → docpull-3.0.1}/tests/test_link_extractors.py +0 -0
  85. docpull-2.5.1/tests/test_fixes_v2_3_0.py → docpull-3.0.1/tests/test_real_site_regressions.py +0 -0
  86. {docpull-2.5.1 → docpull-3.0.1}/tests/test_save_ndjson.py +0 -0
  87. {docpull-2.5.1 → docpull-3.0.1}/tests/test_security_hardening.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 2.5.1
3
+ Version: 3.0.1
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
43
43
  Requires-Dist: defusedxml>=0.7.1
44
44
  Requires-Dist: extruct>=0.15.0
45
45
  Requires-Dist: aiohttp>=3.9.0
46
+ Requires-Dist: idna>=3.15
47
+ Requires-Dist: regex>=2024.11.6
46
48
  Requires-Dist: rich>=13.0.0
47
49
  Requires-Dist: pyyaml>=6.0
48
50
  Requires-Dist: pydantic>=2.0
51
+ Requires-Dist: urllib3>=2.7.0
49
52
  Provides-Extra: proxy
50
53
  Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
51
54
  Provides-Extra: normalize
@@ -56,6 +59,8 @@ Provides-Extra: tokens
56
59
  Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
57
60
  Provides-Extra: mcp
58
61
  Requires-Dist: mcp>=1.0.0; extra == "mcp"
62
+ Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
63
+ Requires-Dist: starlette>=1.0.1; extra == "mcp"
59
64
  Provides-Extra: llm
60
65
  Requires-Dist: tiktoken>=0.7.0; extra == "llm"
61
66
  Provides-Extra: all
@@ -64,11 +69,12 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
64
69
  Requires-Dist: trafilatura>=1.12.0; extra == "all"
65
70
  Requires-Dist: tiktoken>=0.7.0; extra == "all"
66
71
  Requires-Dist: mcp>=1.0.0; extra == "all"
72
+ Requires-Dist: python-multipart>=0.0.27; extra == "all"
73
+ Requires-Dist: starlette>=1.0.1; extra == "all"
67
74
  Provides-Extra: dev
68
75
  Requires-Dist: pytest>=7.0.0; extra == "dev"
69
76
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
70
77
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
71
- Requires-Dist: black>=23.0.0; extra == "dev"
72
78
  Requires-Dist: mypy>=1.0.0; extra == "dev"
73
79
  Requires-Dist: ruff>=0.1.0; extra == "dev"
74
80
  Requires-Dist: bandit>=1.7.0; extra == "dev"
@@ -280,6 +286,17 @@ sources:
280
286
  maxPages: 200
281
287
  ```
282
288
 
289
+ ### About the `mcp/` directory in this repo
290
+
291
+ The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
292
+ server backed by PostgreSQL with pgvector for semantic search. It is not
293
+ the Python MCP server shipped in the `docpull` package described above
294
+ — that one is the right choice for almost every user and is installed
295
+ with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
296
+ own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
297
+ unless you specifically need pgvector-backed semantic search, ignore it
298
+ and use `docpull mcp`.
299
+
283
300
  ## Output
284
301
 
285
302
  Markdown files with YAML frontmatter:
@@ -376,6 +393,7 @@ docpull URL --preview-urls # List URLs without fetching
376
393
  - [PyPI](https://pypi.org/project/docpull/)
377
394
  - [GitHub](https://github.com/raintree-technology/docpull)
378
395
  - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
396
+ - [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
379
397
 
380
398
  ## License
381
399
 
@@ -198,6 +198,17 @@ sources:
198
198
  maxPages: 200
199
199
  ```
200
200
 
201
+ ### About the `mcp/` directory in this repo
202
+
203
+ The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
204
+ server backed by PostgreSQL with pgvector for semantic search. It is not
205
+ the Python MCP server shipped in the `docpull` package described above
206
+ — that one is the right choice for almost every user and is installed
207
+ with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
208
+ own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
209
+ unless you specifically need pgvector-backed semantic search, ignore it
210
+ and use `docpull mcp`.
211
+
201
212
  ## Output
202
213
 
203
214
  Markdown files with YAML frontmatter:
@@ -294,6 +305,7 @@ docpull URL --preview-urls # List URLs without fetching
294
305
  - [PyPI](https://pypi.org/project/docpull/)
295
306
  - [GitHub](https://github.com/raintree-technology/docpull)
296
307
  - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
308
+ - [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
297
309
 
298
310
  ## License
299
311
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "2.5.1"
7
+ version = "3.0.1"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -67,9 +67,12 @@ dependencies = [
67
67
  "defusedxml>=0.7.1",
68
68
  "extruct>=0.15.0",
69
69
  "aiohttp>=3.9.0",
70
+ "idna>=3.15",
71
+ "regex>=2024.11.6",
70
72
  "rich>=13.0.0",
71
73
  "pyyaml>=6.0",
72
74
  "pydantic>=2.0",
75
+ "urllib3>=2.7.0",
73
76
  ]
74
77
 
75
78
  [project.optional-dependencies]
@@ -87,6 +90,8 @@ tokens = [
87
90
  ]
88
91
  mcp = [
89
92
  "mcp>=1.0.0",
93
+ "python-multipart>=0.0.27",
94
+ "starlette>=1.0.1",
90
95
  ]
91
96
  llm = [
92
97
  "tiktoken>=0.7.0",
@@ -97,12 +102,13 @@ all = [
97
102
  "trafilatura>=1.12.0",
98
103
  "tiktoken>=0.7.0",
99
104
  "mcp>=1.0.0",
105
+ "python-multipart>=0.0.27",
106
+ "starlette>=1.0.1",
100
107
  ]
101
108
  dev = [
102
109
  "pytest>=7.0.0",
103
110
  "pytest-cov>=4.0.0",
104
111
  "pytest-asyncio>=0.21.0",
105
- "black>=23.0.0",
106
112
  "mypy>=1.0.0",
107
113
  "ruff>=0.1.0",
108
114
  "bandit>=1.7.0",
@@ -132,10 +138,6 @@ include = ["docpull*"]
132
138
  [tool.setuptools.package-data]
133
139
  docpull = ["py.typed"]
134
140
 
135
- [tool.black]
136
- line-length = 110
137
- target-version = ["py310", "py311", "py312", "py313", "py314"]
138
-
139
141
  [tool.ruff]
140
142
  line-length = 110
141
143
  target-version = "py310"
@@ -175,10 +177,22 @@ module = "docpull.models.*"
175
177
  disallow_any_unimported = false
176
178
  warn_return_any = false
177
179
 
178
- [[tool.mypy.overrides]]
179
- module = "tests.*"
180
- disallow_untyped_defs = false
181
- disallow_any_unimported = false
180
+ [tool.bandit]
181
+ # Policy: every entry in `skips` MUST have a one-line justification
182
+ # above it explaining what bandit found, why it's a false positive
183
+ # *for this codebase*, and (if narrow) why a `# nosec BXXX # reason`
184
+ # annotation at the call site would have been worse. Bandit skips
185
+ # silence findings repo-wide, so the bar to add one is higher than
186
+ # silencing a single line. If a new skip is unavoidable, add it here
187
+ # in PR review, not as a drive-by.
188
+ #
189
+ # B101 (assert_used) — flags every `assert x is not None` we use for
190
+ # type narrowing. Bandit's concern is that assertions vanish under
191
+ # `python -O`. docpull is a CLI / SDK, never invoked with -O, and the
192
+ # narrowing asserts are not load-bearing safety checks. Skipping the
193
+ # rule globally keeps the existing idiom without 8+ inline `# nosec`
194
+ # annotations in fetcher.py / pipeline/steps/convert.py.
195
+ skips = ["B101"]
182
196
 
183
197
  [tool.pytest.ini_options]
184
198
  minversion = "7.0"
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "2.5.1"
17
+ __version__ = "3.0.0"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -5,10 +5,12 @@ from __future__ import annotations
5
5
  import hashlib
6
6
  import json
7
7
  import logging
8
- from datetime import datetime, timedelta
8
+ from datetime import timedelta
9
9
  from pathlib import Path
10
10
  from typing import TypedDict
11
11
 
12
+ from ..time_utils import parse_persisted_datetime, utc_now, utc_now_iso
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
  # Default TTL for cache entries (30 days)
@@ -257,7 +259,7 @@ class CacheManager:
257
259
  self.manifest[url] = {
258
260
  "checksum": self.compute_checksum(content),
259
261
  "file_path": str(file_path),
260
- "fetched_at": datetime.now().isoformat(),
262
+ "fetched_at": utc_now_iso(),
261
263
  "size": len(content),
262
264
  }
263
265
 
@@ -314,7 +316,7 @@ class CacheManager:
314
316
  Note:
315
317
  Changes are batched. Call flush() to persist to disk.
316
318
  """
317
- self._state.last_run = datetime.now().isoformat()
319
+ self._state.last_run = utc_now_iso()
318
320
  self._state_dirty = True
319
321
 
320
322
  def clear_state(self) -> None:
@@ -354,18 +356,18 @@ class CacheManager:
354
356
  if ttl is None:
355
357
  return 0
356
358
 
357
- cutoff = datetime.now() - timedelta(days=ttl)
359
+ cutoff = utc_now() - timedelta(days=ttl)
358
360
  to_remove = []
359
361
 
360
362
  for url, entry in self.manifest.items():
361
363
  fetched_at = entry.get("fetched_at")
362
364
  if fetched_at:
363
365
  try:
364
- entry_time = datetime.fromisoformat(fetched_at)
366
+ entry_time = parse_persisted_datetime(fetched_at)
365
367
  if entry_time < cutoff:
366
368
  to_remove.append(url)
367
- except ValueError:
368
- pass # Invalid date format, skip
369
+ except ValueError as err:
370
+ logger.warning("Invalid cache timestamp for %s: %s", url, err)
369
371
 
370
372
  for url in to_remove:
371
373
  del self.manifest[url]
@@ -413,7 +415,7 @@ class CacheManager:
413
415
  """
414
416
  data: DiscoveredUrlsState = {
415
417
  "start_url": start_url,
416
- "discovered_at": datetime.now().isoformat(),
418
+ "discovered_at": utc_now_iso(),
417
419
  "urls": urls,
418
420
  }
419
421
  try:
@@ -13,12 +13,10 @@ if "--doctor" in sys.argv:
13
13
 
14
14
  output_dir = None
15
15
  if "--output-dir" in sys.argv or "-o" in sys.argv:
16
- try:
17
- flag_idx = sys.argv.index("--output-dir") if "--output-dir" in sys.argv else sys.argv.index("-o")
18
- if flag_idx + 1 < len(sys.argv):
19
- output_dir = Path(sys.argv[flag_idx + 1])
20
- except (ValueError, IndexError):
21
- pass
16
+ flag = "--output-dir" if "--output-dir" in sys.argv else "-o"
17
+ flag_idx = sys.argv.index(flag)
18
+ if flag_idx + 1 < len(sys.argv):
19
+ output_dir = Path(sys.argv[flag_idx + 1])
22
20
  sys.exit(run_doctor(output_dir=output_dir))
23
21
 
24
22
  # Verify core dependencies
@@ -562,8 +560,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
562
560
  n_chunks = len(ctx.chunks) if ctx.chunks else 0
563
561
  extra = f" ({n_chunks} chunks)" if n_chunks else ""
564
562
  console.print(
565
- f"[green]Saved:[/green] {ctx.output_path} "
566
- f"[{ctx.source_type or 'generic'}]{extra}"
563
+ f"[green]Saved:[/green] {ctx.output_path} [{ctx.source_type or 'generic'}]{extra}"
567
564
  )
568
565
  return 0
569
566
 
@@ -246,7 +246,8 @@ def _describe_type(schema: Any, spec: dict[str, Any]) -> str:
246
246
  if not isinstance(schema, dict):
247
247
  return "?"
248
248
  if "$ref" in schema:
249
- return schema["$ref"].rsplit("/", 1)[-1]
249
+ ref: str = schema["$ref"]
250
+ return ref.rsplit("/", 1)[-1]
250
251
  for key in ("oneOf", "anyOf", "allOf"):
251
252
  if isinstance(schema.get(key), list) and schema[key]:
252
253
  seen: list[str] = []
@@ -319,7 +320,8 @@ class OpenApiExtractor:
319
320
  return None
320
321
  try:
321
322
  data = json.loads(text)
322
- except json.JSONDecodeError:
323
+ except json.JSONDecodeError as err:
324
+ logger.debug("OpenAPI extractor skipped %s: JSON parse failed: %s", url, err)
323
325
  return None
324
326
  if not isinstance(data, dict):
325
327
  return None
@@ -349,9 +351,7 @@ class OpenApiExtractor:
349
351
  for method, op in ops.items():
350
352
  if method.lower() not in _HTTP_METHODS or not isinstance(op, dict):
351
353
  continue
352
- self._render_operation(
353
- lines, path, method, op, shared_params, data
354
- )
354
+ self._render_operation(lines, path, method, op, shared_params, data)
355
355
 
356
356
  return SpecialCaseResult(
357
357
  markdown="\n".join(lines).strip() + "\n",
@@ -410,9 +410,7 @@ class OpenApiExtractor:
410
410
  lines.append(bullet)
411
411
  lines.append("")
412
412
 
413
- def _render_request_body(
414
- self, lines: list[str], body: Any, spec: dict[str, Any]
415
- ) -> None:
413
+ def _render_request_body(self, lines: list[str], body: Any, spec: dict[str, Any]) -> None:
416
414
  if not isinstance(body, dict):
417
415
  return
418
416
  if "$ref" in body:
@@ -455,9 +453,7 @@ class OpenApiExtractor:
455
453
  lines.append(f"- body: {_describe_type(schema, spec)}")
456
454
  lines.append("")
457
455
 
458
- def _render_responses(
459
- self, lines: list[str], responses: Any, spec: dict[str, Any]
460
- ) -> None:
456
+ def _render_responses(self, lines: list[str], responses: Any, spec: dict[str, Any]) -> None:
461
457
  if not isinstance(responses, dict) or not responses:
462
458
  return
463
459
  lines.append("**Responses:**")
@@ -535,11 +531,7 @@ class MdxSourceExtractor:
535
531
  for pattern in self._EDIT_PATTERNS:
536
532
  match = pattern.search(text)
537
533
  if match:
538
- raw_url = (
539
- match.group(1)
540
- .replace("/blob/", "/raw/")
541
- .replace("/edit/", "/raw/")
542
- )
534
+ raw_url = match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
543
535
  # Return None so downstream runs, but attach hint via a cache
544
536
  # mechanism. Simpler: return None always; step reads the URL
545
537
  # if needed by re-running the regex.
@@ -567,9 +559,7 @@ def find_mdx_source_url(html: bytes) -> str | None:
567
559
  for pattern in MdxSourceExtractor._EDIT_PATTERNS:
568
560
  match = pattern.search(text)
569
561
  if match:
570
- return (
571
- match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
572
- )
562
+ return match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
573
563
  return None
574
564
 
575
565
 
@@ -580,8 +570,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
580
570
  """Heuristic: does this HTML appear to be a JS-only SPA?
581
571
 
582
572
  True when the non-script body text is very small relative to the overall
583
- page size and the page contains script tags. Not perfect, but good enough
584
- to warn an agent before it consumes empty Markdown.
573
+ page size and the page contains script tags. This is a conservative signal
574
+ for warning an agent before it consumes empty Markdown.
585
575
  """
586
576
  if len(html) < 500:
587
577
  return False
@@ -589,7 +579,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
589
579
  return False
590
580
  try:
591
581
  soup = _soup(html)
592
- except Exception: # noqa: BLE001
582
+ except Exception as err: # noqa: BLE001
583
+ logger.debug("SPA heuristic skipped malformed HTML: %s", err)
593
584
  return False
594
585
  # Remove scripts/styles before measuring.
595
586
  for tag in soup(["script", "style", "noscript"]):
@@ -265,9 +265,7 @@ class Fetcher:
265
265
  # built-in 50 MB ceiling.
266
266
  max_content_size_kw: dict[str, int] = {}
267
267
  if self.config.content_filter.max_file_size is not None:
268
- max_content_size_kw["max_content_size"] = int(
269
- self.config.content_filter.max_file_size
270
- )
268
+ max_content_size_kw["max_content_size"] = int(self.config.content_filter.max_file_size)
271
269
  self._http_client = AsyncHttpClient(
272
270
  rate_limiter=self._rate_limiter,
273
271
  max_retries=self.config.network.max_retries,
@@ -509,11 +507,7 @@ class Fetcher:
509
507
 
510
508
  steps = self._pipeline.steps
511
509
  if not save:
512
- steps = [
513
- s
514
- for s in steps
515
- if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}
516
- ]
510
+ steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}]
517
511
  pipeline = type(self._pipeline)(steps=steps)
518
512
  ctx = await pipeline.execute(url, output_path)
519
513
  if ctx.error:
@@ -531,8 +525,8 @@ class Fetcher:
531
525
  """
532
526
  Compute output path for a URL using the configured naming strategy.
533
527
 
534
- - ``full`` / ``flat`` / ``short``: a single flattened filename
535
- (URL path joined with underscores).
528
+ - ``full``: a single flattened filename (URL path joined with
529
+ underscores).
536
530
  - ``hierarchical``: URL path preserved as nested directories,
537
531
  terminating in ``<segment>.md`` or ``index.md`` for trailing
538
532
  slashes. The leaf is `_validate_output_path`-safe — every segment
@@ -545,7 +539,6 @@ class Fetcher:
545
539
  parts = _url_to_path_parts(url, self.config.url)
546
540
  return output_dir.joinpath(*parts)
547
541
 
548
- # full / flat / short: aliased to full until 3.0
549
542
  filename = _url_to_filename(url, self.config.url)
550
543
  return output_dir / filename
551
544
 
@@ -638,9 +631,7 @@ class Fetcher:
638
631
  )
639
632
 
640
633
  discovered: list[str] = []
641
- async for url in self._discoverer.discover(
642
- start_url, max_urls=self.config.crawl.max_pages
643
- ):
634
+ async for url in self._discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
644
635
  discovered.append(url)
645
636
  if self._cancelled:
646
637
  yield FetchEvent(
@@ -756,9 +747,7 @@ class Fetcher:
756
747
  )
757
748
  )
758
749
  try:
759
- async for url in discoverer.discover(
760
- start_url, max_urls=self.config.crawl.max_pages
761
- ):
750
+ async for url in discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
762
751
  if self._cancelled:
763
752
  break
764
753
  await url_queue.put(url)
@@ -770,14 +759,10 @@ class Fetcher:
770
759
  and self._cache_manager
771
760
  and len(discovered_for_resume) % 200 == 0
772
761
  ):
773
- self._cache_manager.save_discovered_urls(
774
- list(discovered_for_resume), start_url
775
- )
762
+ self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url)
776
763
  finally:
777
764
  if self.config.cache.enabled and self._cache_manager:
778
- self._cache_manager.save_discovered_urls(
779
- discovered_for_resume, start_url
780
- )
765
+ self._cache_manager.save_discovered_urls(discovered_for_resume, start_url)
781
766
  self._stats.urls_discovered = len(discovered_for_resume)
782
767
  await event_queue.put(
783
768
  FetchEvent(
@@ -810,6 +795,7 @@ class Fetcher:
810
795
  continue
811
796
 
812
797
  local_events: list[FetchEvent] = []
798
+
813
799
  # Bind the per-iteration list as a default arg so ruff B023
814
800
  # is happy. Closure is consumed synchronously by execute()
815
801
  # before the next iteration anyway, so capture order is safe.
@@ -936,9 +922,7 @@ def fetch_one(url: str, **kwargs: object) -> PageContext:
936
922
  """
937
923
  try:
938
924
  asyncio.get_running_loop()
939
- raise RuntimeError(
940
- "fetch_one() called from async context. Use Fetcher.fetch_one() instead."
941
- )
925
+ raise RuntimeError("fetch_one() called from async context. Use Fetcher.fetch_one() instead.")
942
926
  except RuntimeError as exc:
943
927
  if "no running event loop" not in str(exc).lower():
944
928
  raise
@@ -29,19 +29,20 @@ def normalize_url(url: str) -> str:
29
29
  Returns:
30
30
  Normalized URL string
31
31
  """
32
- # Use url_normalize library if available
32
+ # Use url_normalize library if available for case / percent-encoding
33
+ # cleanup. It does NOT strip fragments, so we always do that ourselves
34
+ # below — keeping behavior consistent whether the optional dep is
35
+ # installed or not.
33
36
  if URL_NORMALIZE_AVAILABLE:
34
37
  try:
35
- result: str = url_normalize(url)
36
- return result
38
+ normalized = url_normalize(url)
39
+ if normalized:
40
+ url = normalized
37
41
  except ValueError:
38
42
  logger.debug("url_normalize rejected URL during normalization", exc_info=True)
39
43
 
40
- # Basic normalization
41
44
  parsed = urlparse(url)
42
-
43
- # Remove fragment
44
- normalized = urlunparse(
45
+ return urlunparse(
45
46
  (
46
47
  parsed.scheme.lower(),
47
48
  parsed.netloc.lower(),
@@ -52,8 +53,6 @@ def normalize_url(url: str) -> str:
52
53
  )
53
54
  )
54
55
 
55
- return normalized
56
-
57
56
 
58
57
  class PatternFilter:
59
58
  """
@@ -295,7 +295,8 @@ class EnhancedLinkExtractor:
295
295
 
296
296
  try:
297
297
  absolute_url = urljoin(base_url, href)
298
- except Exception:
298
+ except Exception as err:
299
+ logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
299
300
  return None
300
301
 
301
302
  # Validate it's a proper URL
@@ -148,7 +148,8 @@ class StaticLinkExtractor:
148
148
  """
149
149
  try:
150
150
  absolute_url = urljoin(base_url, href)
151
- except Exception:
151
+ except Exception as err:
152
+ logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
152
153
  return None
153
154
 
154
155
  # Remove fragment
@@ -12,7 +12,7 @@ from types import TracebackType
12
12
  from urllib.parse import urljoin, urlparse
13
13
 
14
14
  import aiohttp
15
- from aiohttp.abc import AbstractResolver
15
+ from aiohttp.abc import AbstractResolver, ResolveResult
16
16
 
17
17
  from ..security.url_validator import UrlValidator
18
18
  from .protocols import HttpResponse
@@ -45,14 +45,14 @@ class _ValidatedResolver(AbstractResolver):
45
45
  self,
46
46
  host: str,
47
47
  port: int = 0,
48
- family: int = socket.AF_UNSPEC,
49
- ) -> list[dict[str, object]]:
48
+ family: socket.AddressFamily = socket.AF_UNSPEC,
49
+ ) -> list[ResolveResult]:
50
50
  try:
51
51
  addresses = self._url_validator.resolve_allowed_addresses(host)
52
52
  except ValueError as err:
53
53
  raise OSError(str(err)) from err
54
54
 
55
- results: list[dict[str, object]] = []
55
+ results: list[ResolveResult] = []
56
56
  for address in addresses:
57
57
  ip = ipaddress.ip_address(address)
58
58
  entry_family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET
@@ -60,14 +60,14 @@ class _ValidatedResolver(AbstractResolver):
60
60
  continue
61
61
 
62
62
  results.append(
63
- {
64
- "hostname": host,
65
- "host": address,
66
- "port": port,
67
- "family": entry_family,
68
- "proto": socket.IPPROTO_TCP,
69
- "flags": socket.AI_NUMERICHOST,
70
- }
63
+ ResolveResult(
64
+ hostname=host,
65
+ host=address,
66
+ port=port,
67
+ family=entry_family,
68
+ proto=socket.IPPROTO_TCP,
69
+ flags=socket.AI_NUMERICHOST,
70
+ )
71
71
  )
72
72
 
73
73
  if not results:
@@ -236,20 +236,21 @@ class AsyncHttpClient:
236
236
 
237
237
  async def __aenter__(self) -> AsyncHttpClient:
238
238
  """Enter async context and create session."""
239
- connector_kwargs: dict[str, object] = {
240
- "limit": 100, # Total connection limit
241
- "limit_per_host": 10, # Per-host connection limit
242
- "ttl_dns_cache": 300, # DNS cache TTL
243
- }
239
+ resolver: AbstractResolver | None = None
244
240
  if self._url_validator is not None and self._proxy is None:
245
- connector_kwargs["resolver"] = _ValidatedResolver(self._url_validator)
241
+ resolver = _ValidatedResolver(self._url_validator)
246
242
  elif self._proxy is not None and self._url_validator is not None:
247
243
  logger.warning(
248
244
  "Proxy mode: DNS-pinning resolver is not active. "
249
245
  "URL validation still runs pre-flight, but the proxy resolves DNS independently."
250
246
  )
251
247
 
252
- connector = aiohttp.TCPConnector(**connector_kwargs)
248
+ connector = aiohttp.TCPConnector(
249
+ limit=100,
250
+ limit_per_host=10,
251
+ ttl_dns_cache=300,
252
+ resolver=resolver,
253
+ )
253
254
  self._session = aiohttp.ClientSession(
254
255
  connector=connector,
255
256
  headers={"User-Agent": self._user_agent},
@@ -215,8 +215,7 @@ async def _run_stdio() -> int:
215
215
  from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
216
216
  except ImportError:
217
217
  print(
218
- "docpull mcp requires the 'mcp' package. Install with: "
219
- "pip install docpull[mcp]",
218
+ "docpull mcp requires the 'mcp' package. Install with: pip install docpull[mcp]",
220
219
  file=sys.stderr,
221
220
  )
222
221
  return 1
@@ -590,7 +589,10 @@ async def _run_stdio() -> int:
590
589
  # isError=False), and
591
590
  # (b) errors on tools with an outputSchema don't fail the validator
592
591
  # for "missing structured content."
593
- content = [TextContent(type="text", text=result.text)]
592
+ # `content` is typed `list[TextContent | ImageContent | ...]` on the SDK
593
+ # side; list invariance means we have to widen the local annotation
594
+ # explicitly even though TextContent is one of the valid variants.
595
+ content: list[Any] = [TextContent(type="text", text=result.text)]
594
596
  return CallToolResult(
595
597
  content=content,
596
598
  structuredContent=result.data if not result.is_error else None,