docpull 2.5.1__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {docpull-2.5.1/src/docpull.egg-info → docpull-3.0.0}/PKG-INFO +13 -2
  2. {docpull-2.5.1 → docpull-3.0.0}/README.md +12 -0
  3. {docpull-2.5.1 → docpull-3.0.0}/pyproject.toml +1 -6
  4. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/__init__.py +1 -1
  5. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/cli.py +1 -2
  6. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/special_cases.py +7 -18
  7. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/core/fetcher.py +10 -26
  8. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/filters.py +8 -9
  9. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/client.py +20 -19
  10. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/server.py +5 -3
  11. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/tools.py +16 -45
  12. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/models/config.py +6 -73
  13. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/models/profiles.py +1 -5
  14. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/convert.py +12 -12
  15. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/fetch.py +2 -3
  16. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/save.py +1 -1
  17. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/security/robots.py +13 -5
  18. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/security/url_validator.py +2 -2
  19. {docpull-2.5.1 → docpull-3.0.0/src/docpull.egg-info}/PKG-INFO +13 -2
  20. {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/SOURCES.txt +6 -6
  21. {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/requires.txt +0 -1
  22. {docpull-2.5.1 → docpull-3.0.0}/tests/test_cache_conditional_get.py +5 -12
  23. docpull-2.5.1/tests/test_v2_conversion.py → docpull-3.0.0/tests/test_conversion.py +26 -42
  24. {docpull-2.5.1 → docpull-3.0.0}/tests/test_convert_step_new.py +2 -6
  25. docpull-2.5.1/tests/test_v2_discovery.py → docpull-3.0.0/tests/test_discovery.py +1 -1
  26. docpull-2.5.1/tests/test_v2_integration.py → docpull-3.0.0/tests/test_integration.py +4 -5
  27. {docpull-2.5.1 → docpull-3.0.0}/tests/test_mcp_tools.py +9 -25
  28. {docpull-2.5.1 → docpull-3.0.0}/tests/test_naming.py +4 -23
  29. docpull-2.5.1/tests/test_v2_pipeline.py → docpull-3.0.0/tests/test_pipeline.py +2 -4
  30. {docpull-2.5.1 → docpull-3.0.0}/tests/test_special_cases.py +5 -17
  31. {docpull-2.5.1 → docpull-3.0.0}/LICENSE +0 -0
  32. {docpull-2.5.1 → docpull-3.0.0}/setup.cfg +0 -0
  33. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/__main__.py +0 -0
  34. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/cache/__init__.py +0 -0
  35. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/cache/manager.py +0 -0
  36. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/cache/streaming_dedup.py +0 -0
  37. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/concurrency/__init__.py +0 -0
  38. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/concurrency/manager.py +0 -0
  39. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/__init__.py +0 -0
  40. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/chunking.py +0 -0
  41. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/extractor.py +0 -0
  42. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/markdown.py +0 -0
  43. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/protocols.py +0 -0
  44. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  45. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/core/__init__.py +0 -0
  46. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/__init__.py +0 -0
  47. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/composite.py +0 -0
  48. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/crawler.py +0 -0
  49. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  50. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
  51. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  52. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/link_extractors/static.py +0 -0
  53. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/protocols.py +0 -0
  54. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/sitemap.py +0 -0
  55. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/doctor.py +0 -0
  56. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/__init__.py +0 -0
  57. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/protocols.py +0 -0
  58. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/rate_limiter.py +0 -0
  59. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/logging_config.py +0 -0
  60. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/__init__.py +0 -0
  61. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/sources.py +0 -0
  62. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/metadata_extractor.py +0 -0
  63. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/models/__init__.py +0 -0
  64. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/models/events.py +0 -0
  65. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/__init__.py +0 -0
  66. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/base.py +0 -0
  67. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/__init__.py +0 -0
  68. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/chunk.py +0 -0
  69. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/dedup.py +0 -0
  70. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/metadata.py +0 -0
  71. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/save_json.py +0 -0
  72. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/save_ndjson.py +0 -0
  73. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
  74. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/validate.py +0 -0
  75. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/py.typed +0 -0
  76. {docpull-2.5.1 → docpull-3.0.0}/src/docpull/security/__init__.py +0 -0
  77. {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  78. {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/entry_points.txt +0 -0
  79. {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/top_level.txt +0 -0
  80. {docpull-2.5.1 → docpull-3.0.0}/tests/test_chunking.py +0 -0
  81. {docpull-2.5.1 → docpull-3.0.0}/tests/test_cli.py +0 -0
  82. {docpull-2.5.1 → docpull-3.0.0}/tests/test_link_extractors.py +0 -0
  83. docpull-2.5.1/tests/test_fixes_v2_3_0.py → docpull-3.0.0/tests/test_real_site_regressions.py +0 -0
  84. {docpull-2.5.1 → docpull-3.0.0}/tests/test_save_ndjson.py +0 -0
  85. {docpull-2.5.1 → docpull-3.0.0}/tests/test_security_hardening.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 2.5.1
3
+ Version: 3.0.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -68,7 +68,6 @@ Provides-Extra: dev
68
68
  Requires-Dist: pytest>=7.0.0; extra == "dev"
69
69
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
70
70
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
71
- Requires-Dist: black>=23.0.0; extra == "dev"
72
71
  Requires-Dist: mypy>=1.0.0; extra == "dev"
73
72
  Requires-Dist: ruff>=0.1.0; extra == "dev"
74
73
  Requires-Dist: bandit>=1.7.0; extra == "dev"
@@ -280,6 +279,17 @@ sources:
280
279
  maxPages: 200
281
280
  ```
282
281
 
282
+ ### About the `mcp/` directory in this repo
283
+
284
+ The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
285
+ server backed by PostgreSQL with pgvector for semantic search. It is not
286
+ the Python MCP server shipped in the `docpull` package described above
287
+ — that one is the right choice for almost every user and is installed
288
+ with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
289
+ own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
290
+ unless you specifically need pgvector-backed semantic search, ignore it
291
+ and use `docpull mcp`.
292
+
283
293
  ## Output
284
294
 
285
295
  Markdown files with YAML frontmatter:
@@ -376,6 +386,7 @@ docpull URL --preview-urls # List URLs without fetching
376
386
  - [PyPI](https://pypi.org/project/docpull/)
377
387
  - [GitHub](https://github.com/raintree-technology/docpull)
378
388
  - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
389
+ - [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
379
390
 
380
391
  ## License
381
392
 
@@ -198,6 +198,17 @@ sources:
198
198
  maxPages: 200
199
199
  ```
200
200
 
201
+ ### About the `mcp/` directory in this repo
202
+
203
+ The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
204
+ server backed by PostgreSQL with pgvector for semantic search. It is not
205
+ the Python MCP server shipped in the `docpull` package described above
206
+ — that one is the right choice for almost every user and is installed
207
+ with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
208
+ own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
209
+ unless you specifically need pgvector-backed semantic search, ignore it
210
+ and use `docpull mcp`.
211
+
201
212
  ## Output
202
213
 
203
214
  Markdown files with YAML frontmatter:
@@ -294,6 +305,7 @@ docpull URL --preview-urls # List URLs without fetching
294
305
  - [PyPI](https://pypi.org/project/docpull/)
295
306
  - [GitHub](https://github.com/raintree-technology/docpull)
296
307
  - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
308
+ - [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
297
309
 
298
310
  ## License
299
311
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "2.5.1"
7
+ version = "3.0.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -102,7 +102,6 @@ dev = [
102
102
  "pytest>=7.0.0",
103
103
  "pytest-cov>=4.0.0",
104
104
  "pytest-asyncio>=0.21.0",
105
- "black>=23.0.0",
106
105
  "mypy>=1.0.0",
107
106
  "ruff>=0.1.0",
108
107
  "bandit>=1.7.0",
@@ -132,10 +131,6 @@ include = ["docpull*"]
132
131
  [tool.setuptools.package-data]
133
132
  docpull = ["py.typed"]
134
133
 
135
- [tool.black]
136
- line-length = 110
137
- target-version = ["py310", "py311", "py312", "py313", "py314"]
138
-
139
134
  [tool.ruff]
140
135
  line-length = 110
141
136
  target-version = "py310"
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "2.5.1"
17
+ __version__ = "3.0.0"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -562,8 +562,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
562
562
  n_chunks = len(ctx.chunks) if ctx.chunks else 0
563
563
  extra = f" ({n_chunks} chunks)" if n_chunks else ""
564
564
  console.print(
565
- f"[green]Saved:[/green] {ctx.output_path} "
566
- f"[{ctx.source_type or 'generic'}]{extra}"
565
+ f"[green]Saved:[/green] {ctx.output_path} [{ctx.source_type or 'generic'}]{extra}"
567
566
  )
568
567
  return 0
569
568
 
@@ -246,7 +246,8 @@ def _describe_type(schema: Any, spec: dict[str, Any]) -> str:
246
246
  if not isinstance(schema, dict):
247
247
  return "?"
248
248
  if "$ref" in schema:
249
- return schema["$ref"].rsplit("/", 1)[-1]
249
+ ref: str = schema["$ref"]
250
+ return ref.rsplit("/", 1)[-1]
250
251
  for key in ("oneOf", "anyOf", "allOf"):
251
252
  if isinstance(schema.get(key), list) and schema[key]:
252
253
  seen: list[str] = []
@@ -349,9 +350,7 @@ class OpenApiExtractor:
349
350
  for method, op in ops.items():
350
351
  if method.lower() not in _HTTP_METHODS or not isinstance(op, dict):
351
352
  continue
352
- self._render_operation(
353
- lines, path, method, op, shared_params, data
354
- )
353
+ self._render_operation(lines, path, method, op, shared_params, data)
355
354
 
356
355
  return SpecialCaseResult(
357
356
  markdown="\n".join(lines).strip() + "\n",
@@ -410,9 +409,7 @@ class OpenApiExtractor:
410
409
  lines.append(bullet)
411
410
  lines.append("")
412
411
 
413
- def _render_request_body(
414
- self, lines: list[str], body: Any, spec: dict[str, Any]
415
- ) -> None:
412
+ def _render_request_body(self, lines: list[str], body: Any, spec: dict[str, Any]) -> None:
416
413
  if not isinstance(body, dict):
417
414
  return
418
415
  if "$ref" in body:
@@ -455,9 +452,7 @@ class OpenApiExtractor:
455
452
  lines.append(f"- body: {_describe_type(schema, spec)}")
456
453
  lines.append("")
457
454
 
458
- def _render_responses(
459
- self, lines: list[str], responses: Any, spec: dict[str, Any]
460
- ) -> None:
455
+ def _render_responses(self, lines: list[str], responses: Any, spec: dict[str, Any]) -> None:
461
456
  if not isinstance(responses, dict) or not responses:
462
457
  return
463
458
  lines.append("**Responses:**")
@@ -535,11 +530,7 @@ class MdxSourceExtractor:
535
530
  for pattern in self._EDIT_PATTERNS:
536
531
  match = pattern.search(text)
537
532
  if match:
538
- raw_url = (
539
- match.group(1)
540
- .replace("/blob/", "/raw/")
541
- .replace("/edit/", "/raw/")
542
- )
533
+ raw_url = match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
543
534
  # Return None so downstream runs, but attach hint via a cache
544
535
  # mechanism. Simpler: return None always; step reads the URL
545
536
  # if needed by re-running the regex.
@@ -567,9 +558,7 @@ def find_mdx_source_url(html: bytes) -> str | None:
567
558
  for pattern in MdxSourceExtractor._EDIT_PATTERNS:
568
559
  match = pattern.search(text)
569
560
  if match:
570
- return (
571
- match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
572
- )
561
+ return match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
573
562
  return None
574
563
 
575
564
 
@@ -265,9 +265,7 @@ class Fetcher:
265
265
  # built-in 50 MB ceiling.
266
266
  max_content_size_kw: dict[str, int] = {}
267
267
  if self.config.content_filter.max_file_size is not None:
268
- max_content_size_kw["max_content_size"] = int(
269
- self.config.content_filter.max_file_size
270
- )
268
+ max_content_size_kw["max_content_size"] = int(self.config.content_filter.max_file_size)
271
269
  self._http_client = AsyncHttpClient(
272
270
  rate_limiter=self._rate_limiter,
273
271
  max_retries=self.config.network.max_retries,
@@ -509,11 +507,7 @@ class Fetcher:
509
507
 
510
508
  steps = self._pipeline.steps
511
509
  if not save:
512
- steps = [
513
- s
514
- for s in steps
515
- if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}
516
- ]
510
+ steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}]
517
511
  pipeline = type(self._pipeline)(steps=steps)
518
512
  ctx = await pipeline.execute(url, output_path)
519
513
  if ctx.error:
@@ -531,8 +525,8 @@ class Fetcher:
531
525
  """
532
526
  Compute output path for a URL using the configured naming strategy.
533
527
 
534
- - ``full`` / ``flat`` / ``short``: a single flattened filename
535
- (URL path joined with underscores).
528
+ - ``full``: a single flattened filename (URL path joined with
529
+ underscores).
536
530
  - ``hierarchical``: URL path preserved as nested directories,
537
531
  terminating in ``<segment>.md`` or ``index.md`` for trailing
538
532
  slashes. The leaf is `_validate_output_path`-safe — every segment
@@ -545,7 +539,6 @@ class Fetcher:
545
539
  parts = _url_to_path_parts(url, self.config.url)
546
540
  return output_dir.joinpath(*parts)
547
541
 
548
- # full / flat / short: aliased to full until 3.0
549
542
  filename = _url_to_filename(url, self.config.url)
550
543
  return output_dir / filename
551
544
 
@@ -638,9 +631,7 @@ class Fetcher:
638
631
  )
639
632
 
640
633
  discovered: list[str] = []
641
- async for url in self._discoverer.discover(
642
- start_url, max_urls=self.config.crawl.max_pages
643
- ):
634
+ async for url in self._discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
644
635
  discovered.append(url)
645
636
  if self._cancelled:
646
637
  yield FetchEvent(
@@ -756,9 +747,7 @@ class Fetcher:
756
747
  )
757
748
  )
758
749
  try:
759
- async for url in discoverer.discover(
760
- start_url, max_urls=self.config.crawl.max_pages
761
- ):
750
+ async for url in discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
762
751
  if self._cancelled:
763
752
  break
764
753
  await url_queue.put(url)
@@ -770,14 +759,10 @@ class Fetcher:
770
759
  and self._cache_manager
771
760
  and len(discovered_for_resume) % 200 == 0
772
761
  ):
773
- self._cache_manager.save_discovered_urls(
774
- list(discovered_for_resume), start_url
775
- )
762
+ self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url)
776
763
  finally:
777
764
  if self.config.cache.enabled and self._cache_manager:
778
- self._cache_manager.save_discovered_urls(
779
- discovered_for_resume, start_url
780
- )
765
+ self._cache_manager.save_discovered_urls(discovered_for_resume, start_url)
781
766
  self._stats.urls_discovered = len(discovered_for_resume)
782
767
  await event_queue.put(
783
768
  FetchEvent(
@@ -810,6 +795,7 @@ class Fetcher:
810
795
  continue
811
796
 
812
797
  local_events: list[FetchEvent] = []
798
+
813
799
  # Bind the per-iteration list as a default arg so ruff B023
814
800
  # is happy. Closure is consumed synchronously by execute()
815
801
  # before the next iteration anyway, so capture order is safe.
@@ -936,9 +922,7 @@ def fetch_one(url: str, **kwargs: object) -> PageContext:
936
922
  """
937
923
  try:
938
924
  asyncio.get_running_loop()
939
- raise RuntimeError(
940
- "fetch_one() called from async context. Use Fetcher.fetch_one() instead."
941
- )
925
+ raise RuntimeError("fetch_one() called from async context. Use Fetcher.fetch_one() instead.")
942
926
  except RuntimeError as exc:
943
927
  if "no running event loop" not in str(exc).lower():
944
928
  raise
@@ -29,19 +29,20 @@ def normalize_url(url: str) -> str:
29
29
  Returns:
30
30
  Normalized URL string
31
31
  """
32
- # Use url_normalize library if available
32
+ # Use url_normalize library if available for case / percent-encoding
33
+ # cleanup. It does NOT strip fragments, so we always do that ourselves
34
+ # below — keeping behavior consistent whether the optional dep is
35
+ # installed or not.
33
36
  if URL_NORMALIZE_AVAILABLE:
34
37
  try:
35
- result: str = url_normalize(url)
36
- return result
38
+ normalized = url_normalize(url)
39
+ if normalized:
40
+ url = normalized
37
41
  except ValueError:
38
42
  logger.debug("url_normalize rejected URL during normalization", exc_info=True)
39
43
 
40
- # Basic normalization
41
44
  parsed = urlparse(url)
42
-
43
- # Remove fragment
44
- normalized = urlunparse(
45
+ return urlunparse(
45
46
  (
46
47
  parsed.scheme.lower(),
47
48
  parsed.netloc.lower(),
@@ -52,8 +53,6 @@ def normalize_url(url: str) -> str:
52
53
  )
53
54
  )
54
55
 
55
- return normalized
56
-
57
56
 
58
57
  class PatternFilter:
59
58
  """
@@ -12,7 +12,7 @@ from types import TracebackType
12
12
  from urllib.parse import urljoin, urlparse
13
13
 
14
14
  import aiohttp
15
- from aiohttp.abc import AbstractResolver
15
+ from aiohttp.abc import AbstractResolver, ResolveResult
16
16
 
17
17
  from ..security.url_validator import UrlValidator
18
18
  from .protocols import HttpResponse
@@ -45,14 +45,14 @@ class _ValidatedResolver(AbstractResolver):
45
45
  self,
46
46
  host: str,
47
47
  port: int = 0,
48
- family: int = socket.AF_UNSPEC,
49
- ) -> list[dict[str, object]]:
48
+ family: socket.AddressFamily = socket.AF_UNSPEC,
49
+ ) -> list[ResolveResult]:
50
50
  try:
51
51
  addresses = self._url_validator.resolve_allowed_addresses(host)
52
52
  except ValueError as err:
53
53
  raise OSError(str(err)) from err
54
54
 
55
- results: list[dict[str, object]] = []
55
+ results: list[ResolveResult] = []
56
56
  for address in addresses:
57
57
  ip = ipaddress.ip_address(address)
58
58
  entry_family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET
@@ -60,14 +60,14 @@ class _ValidatedResolver(AbstractResolver):
60
60
  continue
61
61
 
62
62
  results.append(
63
- {
64
- "hostname": host,
65
- "host": address,
66
- "port": port,
67
- "family": entry_family,
68
- "proto": socket.IPPROTO_TCP,
69
- "flags": socket.AI_NUMERICHOST,
70
- }
63
+ ResolveResult(
64
+ hostname=host,
65
+ host=address,
66
+ port=port,
67
+ family=entry_family,
68
+ proto=socket.IPPROTO_TCP,
69
+ flags=socket.AI_NUMERICHOST,
70
+ )
71
71
  )
72
72
 
73
73
  if not results:
@@ -236,20 +236,21 @@ class AsyncHttpClient:
236
236
 
237
237
  async def __aenter__(self) -> AsyncHttpClient:
238
238
  """Enter async context and create session."""
239
- connector_kwargs: dict[str, object] = {
240
- "limit": 100, # Total connection limit
241
- "limit_per_host": 10, # Per-host connection limit
242
- "ttl_dns_cache": 300, # DNS cache TTL
243
- }
239
+ resolver: AbstractResolver | None = None
244
240
  if self._url_validator is not None and self._proxy is None:
245
- connector_kwargs["resolver"] = _ValidatedResolver(self._url_validator)
241
+ resolver = _ValidatedResolver(self._url_validator)
246
242
  elif self._proxy is not None and self._url_validator is not None:
247
243
  logger.warning(
248
244
  "Proxy mode: DNS-pinning resolver is not active. "
249
245
  "URL validation still runs pre-flight, but the proxy resolves DNS independently."
250
246
  )
251
247
 
252
- connector = aiohttp.TCPConnector(**connector_kwargs)
248
+ connector = aiohttp.TCPConnector(
249
+ limit=100,
250
+ limit_per_host=10,
251
+ ttl_dns_cache=300,
252
+ resolver=resolver,
253
+ )
253
254
  self._session = aiohttp.ClientSession(
254
255
  connector=connector,
255
256
  headers={"User-Agent": self._user_agent},
@@ -215,8 +215,7 @@ async def _run_stdio() -> int:
215
215
  from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
216
216
  except ImportError:
217
217
  print(
218
- "docpull mcp requires the 'mcp' package. Install with: "
219
- "pip install docpull[mcp]",
218
+ "docpull mcp requires the 'mcp' package. Install with: pip install docpull[mcp]",
220
219
  file=sys.stderr,
221
220
  )
222
221
  return 1
@@ -590,7 +589,10 @@ async def _run_stdio() -> int:
590
589
  # isError=False), and
591
590
  # (b) errors on tools with an outputSchema don't fail the validator
592
591
  # for "missing structured content."
593
- content = [TextContent(type="text", text=result.text)]
592
+ # `content` is typed `list[TextContent | ImageContent | ...]` on the SDK
593
+ # side; list invariance means we have to widen the local annotation
594
+ # explicitly even though TextContent is one of the valid variants.
595
+ content: list[Any] = [TextContent(type="text", text=result.text)]
594
596
  return CallToolResult(
595
597
  content=content,
596
598
  structuredContent=result.data if not result.is_error else None,
@@ -26,7 +26,7 @@ from typing import Any
26
26
  import yaml
27
27
 
28
28
  from ..core.fetcher import Fetcher
29
- from ..models.config import DocpullConfig, ProfileName
29
+ from ..models.config import CrawlConfig, DocpullConfig, OutputConfig, ProfileName
30
30
  from ..security.url_validator import UrlValidator
31
31
  from .sources import (
32
32
  _URL_SCHEME_RE,
@@ -195,16 +195,10 @@ async def ensure_docs(
195
195
  target_dir = _source_dir(docs_dir, source)
196
196
  meta_path = _meta_path(docs_dir, source)
197
197
 
198
- if (
199
- not force
200
- and _cache_fresh(meta_path)
201
- and target_dir.exists()
202
- and any(target_dir.rglob("*.md"))
203
- ):
198
+ if not force and _cache_fresh(meta_path) and target_dir.exists() and any(target_dir.rglob("*.md")):
204
199
  files = list(target_dir.rglob("*.md"))
205
200
  return ToolResult(
206
- f"Cached: {source} ({len(files)} files at {target_dir}). "
207
- "Call with force=true to refresh.",
201
+ f"Cached: {source} ({len(files)} files at {target_dir}). Call with force=true to refresh.",
208
202
  data={
209
203
  "source": source,
210
204
  "cached": True,
@@ -216,8 +210,8 @@ async def ensure_docs(
216
210
  config = DocpullConfig(
217
211
  url=resolved.url,
218
212
  profile=profile_enum,
219
- crawl={"max_pages": resolved.max_pages} if resolved.max_pages else {},
220
- output={"directory": target_dir},
213
+ crawl=CrawlConfig(max_pages=resolved.max_pages) if resolved.max_pages else CrawlConfig(),
214
+ output=OutputConfig(directory=target_dir),
221
215
  )
222
216
  fetched = 0
223
217
  crashed = False
@@ -264,13 +258,11 @@ async def fetch_url(url: str, *, max_tokens: int | None = None) -> ToolResult:
264
258
  if not validation.is_valid:
265
259
  return ToolResult(f"URL rejected: {validation.rejection_reason}", is_error=True)
266
260
 
267
- output_kwargs: dict[str, Any] = {}
268
- if max_tokens:
269
- output_kwargs["max_tokens_per_file"] = max_tokens
261
+ output_cfg = OutputConfig(max_tokens_per_file=max_tokens) if max_tokens else OutputConfig()
270
262
  config = DocpullConfig(
271
263
  url=url,
272
264
  profile=ProfileName.CUSTOM,
273
- output=output_kwargs or None,
265
+ output=output_cfg,
274
266
  )
275
267
  async with Fetcher(config) as fetcher:
276
268
  ctx = await fetcher.fetch_one(url, save=False)
@@ -288,10 +280,7 @@ async def fetch_url(url: str, *, max_tokens: int | None = None) -> ToolResult:
288
280
  ]
289
281
  body = "\n\n".join(parts)
290
282
  chunks_meta = f" _chunks: {len(ctx.chunks)}_" if ctx.chunks else ""
291
- header = (
292
- f"# {ctx.title or url}\n"
293
- f"_source: {url}_ _type: {ctx.source_type or 'generic'}_{chunks_meta}\n\n"
294
- )
283
+ header = f"# {ctx.title or url}\n_source: {url}_ _type: {ctx.source_type or 'generic'}_{chunks_meta}\n\n"
295
284
  return ToolResult(header + body)
296
285
 
297
286
 
@@ -471,16 +460,9 @@ def grep_docs(
471
460
  matches: list[tuple[int, list[str], str, list[str]]] = []
472
461
  for idx, line in enumerate(lines):
473
462
  if regex.search(line):
474
- before = (
475
- [lines[i].rstrip() for i in range(max(0, idx - context), idx)]
476
- if context
477
- else []
478
- )
463
+ before = [lines[i].rstrip() for i in range(max(0, idx - context), idx)] if context else []
479
464
  after = (
480
- [
481
- lines[i].rstrip()
482
- for i in range(idx + 1, min(len(lines), idx + 1 + context))
483
- ]
465
+ [lines[i].rstrip() for i in range(idx + 1, min(len(lines), idx + 1 + context))]
484
466
  if context
485
467
  else []
486
468
  )
@@ -532,9 +514,7 @@ def grep_docs(
532
514
  for off, line in enumerate(after, start=1):
533
515
  chunk.append(f" {lineno + off:>4}- {line}")
534
516
  block_lines.append("\n".join(chunk))
535
- rendered_matches.append(
536
- {"lineno": lineno, "before": before, "line": hit, "after": after}
537
- )
517
+ rendered_matches.append({"lineno": lineno, "before": before, "line": hit, "after": after})
538
518
  rendered += 1
539
519
  blocks.append("\n\n".join(block_lines))
540
520
  files_payload.append(
@@ -710,28 +690,19 @@ def add_source(
710
690
  )
711
691
  validation = _ADD_SOURCE_VALIDATOR.validate(url)
712
692
  if not validation.is_valid:
713
- return ToolResult(
714
- f"URL rejected: {validation.rejection_reason}", is_error=True
715
- )
693
+ return ToolResult(f"URL rejected: {validation.rejection_reason}", is_error=True)
716
694
  if description is not None and len(description) > MAX_DESCRIPTION_LEN:
717
- return ToolResult(
718
- f"Description too long (>{MAX_DESCRIPTION_LEN} chars).", is_error=True
719
- )
695
+ return ToolResult(f"Description too long (>{MAX_DESCRIPTION_LEN} chars).", is_error=True)
720
696
  if category is not None and category not in ALLOWED_USER_CATEGORIES:
721
697
  valid = ", ".join(sorted(ALLOWED_USER_CATEGORIES))
722
- return ToolResult(
723
- f"Unknown category '{category}'. Valid: {valid}", is_error=True
724
- )
698
+ return ToolResult(f"Unknown category '{category}'. Valid: {valid}", is_error=True)
725
699
  if max_pages is not None and (max_pages < 1 or max_pages > 100_000):
726
- return ToolResult(
727
- "max_pages must be between 1 and 100000.", is_error=True
728
- )
700
+ return ToolResult("max_pages must be between 1 and 100000.", is_error=True)
729
701
 
730
702
  is_builtin = name in BUILTIN_SOURCES
731
703
  if is_builtin and not force:
732
704
  return ToolResult(
733
- f"'{name}' is a builtin source. Pass force=true to shadow it with a "
734
- "user override.",
705
+ f"'{name}' is a builtin source. Pass force=true to shadow it with a user override.",
735
706
  is_error=True,
736
707
  )
737
708