docpull 2.4.0__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. {docpull-2.4.0/src/docpull.egg-info → docpull-2.5.0}/PKG-INFO +1 -1
  2. {docpull-2.4.0 → docpull-2.5.0}/pyproject.toml +1 -1
  3. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/__init__.py +1 -1
  4. docpull-2.5.0/src/docpull/mcp/server.py +615 -0
  5. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/mcp/sources.py +18 -1
  6. docpull-2.5.0/src/docpull/mcp/tools.py +830 -0
  7. {docpull-2.4.0 → docpull-2.5.0/src/docpull.egg-info}/PKG-INFO +1 -1
  8. docpull-2.5.0/tests/test_mcp_tools.py +617 -0
  9. docpull-2.4.0/src/docpull/mcp/server.py +0 -200
  10. docpull-2.4.0/src/docpull/mcp/tools.py +0 -360
  11. docpull-2.4.0/tests/test_mcp_tools.py +0 -189
  12. {docpull-2.4.0 → docpull-2.5.0}/LICENSE +0 -0
  13. {docpull-2.4.0 → docpull-2.5.0}/README.md +0 -0
  14. {docpull-2.4.0 → docpull-2.5.0}/setup.cfg +0 -0
  15. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/__main__.py +0 -0
  16. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/cache/__init__.py +0 -0
  17. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/cache/manager.py +0 -0
  18. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/cache/streaming_dedup.py +0 -0
  19. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/cli.py +0 -0
  20. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/concurrency/__init__.py +0 -0
  21. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/concurrency/manager.py +0 -0
  22. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/conversion/__init__.py +0 -0
  23. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/conversion/chunking.py +0 -0
  24. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/conversion/extractor.py +0 -0
  25. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/conversion/markdown.py +0 -0
  26. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/conversion/protocols.py +0 -0
  27. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/conversion/special_cases.py +0 -0
  28. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  29. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/core/__init__.py +0 -0
  30. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/core/fetcher.py +0 -0
  31. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/__init__.py +0 -0
  32. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/composite.py +0 -0
  33. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/crawler.py +0 -0
  34. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/filters.py +0 -0
  35. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  36. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
  37. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  38. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/link_extractors/static.py +0 -0
  39. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/protocols.py +0 -0
  40. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/discovery/sitemap.py +0 -0
  41. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/doctor.py +0 -0
  42. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/http/__init__.py +0 -0
  43. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/http/client.py +0 -0
  44. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/http/protocols.py +0 -0
  45. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/http/rate_limiter.py +0 -0
  46. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/logging_config.py +0 -0
  47. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/mcp/__init__.py +0 -0
  48. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/metadata_extractor.py +0 -0
  49. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/models/__init__.py +0 -0
  50. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/models/config.py +0 -0
  51. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/models/events.py +0 -0
  52. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/models/profiles.py +0 -0
  53. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/__init__.py +0 -0
  54. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/base.py +0 -0
  55. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/__init__.py +0 -0
  56. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/chunk.py +0 -0
  57. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/convert.py +0 -0
  58. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/dedup.py +0 -0
  59. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/fetch.py +0 -0
  60. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/metadata.py +0 -0
  61. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/save.py +0 -0
  62. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/save_json.py +0 -0
  63. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/save_ndjson.py +0 -0
  64. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
  65. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/pipeline/steps/validate.py +0 -0
  66. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/py.typed +0 -0
  67. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/security/__init__.py +0 -0
  68. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/security/robots.py +0 -0
  69. {docpull-2.4.0 → docpull-2.5.0}/src/docpull/security/url_validator.py +0 -0
  70. {docpull-2.4.0 → docpull-2.5.0}/src/docpull.egg-info/SOURCES.txt +0 -0
  71. {docpull-2.4.0 → docpull-2.5.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  72. {docpull-2.4.0 → docpull-2.5.0}/src/docpull.egg-info/entry_points.txt +0 -0
  73. {docpull-2.4.0 → docpull-2.5.0}/src/docpull.egg-info/requires.txt +0 -0
  74. {docpull-2.4.0 → docpull-2.5.0}/src/docpull.egg-info/top_level.txt +0 -0
  75. {docpull-2.4.0 → docpull-2.5.0}/tests/test_cache_conditional_get.py +0 -0
  76. {docpull-2.4.0 → docpull-2.5.0}/tests/test_chunking.py +0 -0
  77. {docpull-2.4.0 → docpull-2.5.0}/tests/test_cli.py +0 -0
  78. {docpull-2.4.0 → docpull-2.5.0}/tests/test_convert_step_new.py +0 -0
  79. {docpull-2.4.0 → docpull-2.5.0}/tests/test_fixes_v2_3_0.py +0 -0
  80. {docpull-2.4.0 → docpull-2.5.0}/tests/test_link_extractors.py +0 -0
  81. {docpull-2.4.0 → docpull-2.5.0}/tests/test_naming.py +0 -0
  82. {docpull-2.4.0 → docpull-2.5.0}/tests/test_save_ndjson.py +0 -0
  83. {docpull-2.4.0 → docpull-2.5.0}/tests/test_security_hardening.py +0 -0
  84. {docpull-2.4.0 → docpull-2.5.0}/tests/test_special_cases.py +0 -0
  85. {docpull-2.4.0 → docpull-2.5.0}/tests/test_v2_conversion.py +0 -0
  86. {docpull-2.4.0 → docpull-2.5.0}/tests/test_v2_discovery.py +0 -0
  87. {docpull-2.4.0 → docpull-2.5.0}/tests/test_v2_integration.py +0 -0
  88. {docpull-2.4.0 → docpull-2.5.0}/tests/test_v2_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 2.4.0
3
+ Version: 2.5.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "2.4.0"
7
+ version = "2.5.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "2.4.0"
17
+ __version__ = "2.5.0"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -0,0 +1,615 @@
1
+ """stdio MCP server exposing docpull tools to AI agents.
2
+
3
+ Requires the optional ``mcp`` Python package (install with
4
+ ``pip install docpull[mcp]``). The server registers eight tools:
5
+
6
+ Read-only:
7
+ - ``fetch_url(url, max_tokens?)`` — one-shot fetch, no discovery. Agent-oriented fast path.
8
+ - ``list_sources(category?)`` — show available aliases.
9
+ - ``list_indexed()`` — show what has been fetched.
10
+ - ``grep_docs(pattern, library?, limit?, case_sensitive?, context?)`` — regex search through cached docs.
11
+ - ``read_doc(library, path, line_start?, line_end?)`` — read a fetched file.
12
+
13
+ Write:
14
+ - ``ensure_docs(source, force?, profile?)`` — fetch (or refresh) a named library.
15
+ - ``add_source(name, url, ...)`` — add or update a user source alias.
16
+ - ``remove_source(name, delete_cache?)`` — remove a user source alias.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import asyncio
23
+ import logging
24
+ import sys
25
+ from typing import Any
26
+
27
+ from .tools import (
28
+ ToolResult,
29
+ add_source,
30
+ ensure_docs,
31
+ fetch_url,
32
+ grep_docs,
33
+ list_indexed,
34
+ list_sources,
35
+ read_doc,
36
+ remove_source,
37
+ )
38
+
39
logger = logging.getLogger(__name__)

# Usage guidance for connecting agents; handed to the MCP ``Server``
# constructor as its ``instructions`` text. One sentence per tuple entry.
SERVER_INSTRUCTIONS = " ".join(
    (
        "Call list_sources to discover aliases before ensure_docs.",
        "Use ensure_docs for a whole library (cached 7 days), fetch_url for one ad-hoc HTTPS page.",
        "After ensure_docs, use grep_docs to find passages and read_doc to pull the surrounding lines.",
        "Use add_source / remove_source to manage the user-defined registry.",
    )
)
48
+
49
+
50
+ # Output schemas — keep these next to the tool list so they stay in sync.
51
+ # Tools that return free-form Markdown (fetch_url) intentionally omit a
52
+ # schema; the rest expose structured payloads alongside the rendered text.
53
+
54
+ _LIST_SOURCES_OUTPUT_SCHEMA = {
55
+ "type": "object",
56
+ "properties": {
57
+ "sources": {
58
+ "type": "array",
59
+ "items": {
60
+ "type": "object",
61
+ "properties": {
62
+ "name": {"type": "string"},
63
+ "url": {"type": "string"},
64
+ "description": {"type": "string"},
65
+ "category": {"type": "string"},
66
+ "max_pages": {"type": "integer"},
67
+ },
68
+ "required": ["name", "url", "description", "category"],
69
+ },
70
+ },
71
+ },
72
+ "required": ["sources"],
73
+ }
74
+
75
+ _LIST_INDEXED_OUTPUT_SCHEMA = {
76
+ "type": "object",
77
+ "properties": {
78
+ "libraries": {
79
+ "type": "array",
80
+ "items": {
81
+ "type": "object",
82
+ "properties": {
83
+ "name": {"type": "string"},
84
+ "file_count": {"type": "integer"},
85
+ "fresh": {"type": "boolean"},
86
+ "fetched_at": {"type": "string"},
87
+ "age_seconds": {"type": "integer"},
88
+ },
89
+ "required": ["name", "file_count", "fresh"],
90
+ },
91
+ },
92
+ },
93
+ "required": ["libraries"],
94
+ }
95
+
96
+ _GREP_DOCS_OUTPUT_SCHEMA = {
97
+ "type": "object",
98
+ "properties": {
99
+ "pattern": {"type": "string"},
100
+ "total_matches": {"type": "integer"},
101
+ "files": {
102
+ "type": "array",
103
+ "items": {
104
+ "type": "object",
105
+ "properties": {
106
+ "path": {"type": "string"},
107
+ "match_count": {"type": "integer"},
108
+ "matches": {
109
+ "type": "array",
110
+ "items": {
111
+ "type": "object",
112
+ "properties": {
113
+ "lineno": {"type": "integer"},
114
+ "before": {"type": "array", "items": {"type": "string"}},
115
+ "line": {"type": "string"},
116
+ "after": {"type": "array", "items": {"type": "string"}},
117
+ },
118
+ "required": ["lineno", "before", "line", "after"],
119
+ },
120
+ },
121
+ },
122
+ "required": ["path", "match_count", "matches"],
123
+ },
124
+ },
125
+ "truncated": {"type": "boolean"},
126
+ "timed_out": {"type": "boolean"},
127
+ },
128
+ "required": ["pattern", "total_matches", "files", "truncated", "timed_out"],
129
+ }
130
+
131
+ _READ_DOC_OUTPUT_SCHEMA = {
132
+ "type": "object",
133
+ "properties": {
134
+ "library": {"type": "string"},
135
+ "path": {"type": "string"},
136
+ "line_start": {"type": "integer"},
137
+ "line_end": {"type": "integer"},
138
+ "total_lines": {"type": "integer"},
139
+ "text": {"type": "string"},
140
+ },
141
+ "required": ["library", "path", "line_start", "line_end", "total_lines", "text"],
142
+ }
143
+
144
+ _ENSURE_DOCS_OUTPUT_SCHEMA = {
145
+ "type": "object",
146
+ "properties": {
147
+ "source": {"type": "string"},
148
+ "cached": {"type": "boolean"},
149
+ "file_count": {"type": "integer"},
150
+ "pages_fetched": {"type": "integer"},
151
+ "pages_skipped": {"type": "integer"},
152
+ "pages_failed": {"type": "integer"},
153
+ "target_dir": {"type": "string"},
154
+ },
155
+ "required": ["source", "cached", "target_dir"],
156
+ }
157
+
158
+ _ADD_SOURCE_OUTPUT_SCHEMA = {
159
+ "type": "object",
160
+ "properties": {
161
+ "name": {"type": "string"},
162
+ "url": {"type": "string"},
163
+ "replaced": {"type": "boolean"},
164
+ "shadowed_builtin": {"type": "boolean"},
165
+ "config_path": {"type": "string"},
166
+ },
167
+ "required": ["name", "url", "replaced", "shadowed_builtin", "config_path"],
168
+ }
169
+
170
+ _REMOVE_SOURCE_OUTPUT_SCHEMA = {
171
+ "type": "object",
172
+ "properties": {
173
+ "name": {"type": "string"},
174
+ "removed": {"type": "boolean"},
175
+ "cache_deleted": {"type": "boolean"},
176
+ "config_path": {"type": "string"},
177
+ },
178
+ "required": ["name", "removed", "cache_deleted"],
179
+ }
180
+
181
+
182
+ def _coerce_int(value: Any, *, name: str, default: int) -> int:
183
+ """Accept int or numeric string; reject anything else with a clear error."""
184
+ if value is None:
185
+ return default
186
+ if isinstance(value, bool): # bool is a subclass of int — exclude
187
+ raise ValueError(f"'{name}' must be an integer, got bool")
188
+ if isinstance(value, int):
189
+ return value
190
+ if isinstance(value, str):
191
+ try:
192
+ return int(value)
193
+ except ValueError as err:
194
+ raise ValueError(f"'{name}' must be an integer: {err}") from None
195
+ raise ValueError(f"'{name}' must be an integer, got {type(value).__name__}")
196
+
197
+
198
+ def _require_str(arguments: dict[str, Any], key: str) -> str:
199
+ if key not in arguments:
200
+ raise ValueError(f"Missing required argument: '{key}'")
201
+ value = arguments[key]
202
+ if not isinstance(value, str) or not value:
203
+ raise ValueError(f"'{key}' must be a non-empty string")
204
+ return value
205
+
206
+
207
+ async def _run_stdio() -> int:
208
+ try:
209
+ from mcp.server import Server
210
+ from mcp.server.stdio import stdio_server
211
+ from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
212
+ except ImportError:
213
+ print(
214
+ "docpull mcp requires the 'mcp' package. Install with: "
215
+ "pip install docpull[mcp]",
216
+ file=sys.stderr,
217
+ )
218
+ return 1
219
+
220
+ server: Server = Server("docpull", instructions=SERVER_INSTRUCTIONS)
221
+
222
+ @server.list_tools() # type: ignore[misc,no-untyped-call]
223
+ async def _list_tools() -> list[Tool]:
224
+ return [
225
+ Tool(
226
+ name="fetch_url",
227
+ description=(
228
+ "Fetch a single HTTPS URL and return clean Markdown. No discovery "
229
+ "or crawl — the agent-friendly fast path. Returns the page's "
230
+ "Markdown with source and detected framework in the header. "
231
+ "Optionally chunk the output with max_tokens. Rejects non-HTTPS "
232
+ "URLs, localhost, and private IPs. For whole libraries use "
233
+ "ensure_docs instead."
234
+ ),
235
+ annotations=ToolAnnotations(
236
+ title="Fetch one HTTPS page",
237
+ readOnlyHint=True,
238
+ openWorldHint=True,
239
+ idempotentHint=True,
240
+ ),
241
+ inputSchema={
242
+ "type": "object",
243
+ "properties": {
244
+ "url": {
245
+ "type": "string",
246
+ "description": "HTTPS URL to fetch",
247
+ "pattern": "^https://",
248
+ },
249
+ "max_tokens": {
250
+ "type": "integer",
251
+ "minimum": 100,
252
+ "maximum": 200000,
253
+ "description": "If set, split into chunks of this many tokens",
254
+ },
255
+ },
256
+ "required": ["url"],
257
+ },
258
+ ),
259
+ Tool(
260
+ name="ensure_docs",
261
+ description=(
262
+ "Fetch documentation for a named source alias (e.g. 'react', "
263
+ "'nextjs'). Uses a 7-day cache; pass force=true to refresh. "
264
+ "Optional profile selects fetch behavior: rag (default, "
265
+ "balanced for retrieval), mirror (full archive), quick "
266
+ "(fast/shallow), llm (NDJSON chunks). Use list_sources to "
267
+ "discover aliases first."
268
+ ),
269
+ annotations=ToolAnnotations(
270
+ title="Fetch a documentation library",
271
+ readOnlyHint=False,
272
+ destructiveHint=False,
273
+ idempotentHint=True,
274
+ openWorldHint=True,
275
+ ),
276
+ inputSchema={
277
+ "type": "object",
278
+ "properties": {
279
+ "source": {"type": "string"},
280
+ "force": {"type": "boolean", "default": False},
281
+ "profile": {
282
+ "type": "string",
283
+ "enum": ["rag", "mirror", "quick", "llm"],
284
+ "default": "rag",
285
+ },
286
+ },
287
+ "required": ["source"],
288
+ },
289
+ outputSchema=_ENSURE_DOCS_OUTPUT_SCHEMA,
290
+ ),
291
+ Tool(
292
+ name="list_sources",
293
+ description=(
294
+ "List configured documentation source aliases, optionally "
295
+ "filtered by category. Use this to discover what ensure_docs "
296
+ "can fetch."
297
+ ),
298
+ annotations=ToolAnnotations(
299
+ title="List configured source aliases",
300
+ readOnlyHint=True,
301
+ openWorldHint=False,
302
+ idempotentHint=True,
303
+ ),
304
+ inputSchema={
305
+ "type": "object",
306
+ "properties": {
307
+ "category": {
308
+ "type": "string",
309
+ "enum": ["frontend", "backend", "ai", "database", "user"],
310
+ "description": "Filter by category",
311
+ }
312
+ },
313
+ },
314
+ outputSchema=_LIST_SOURCES_OUTPUT_SCHEMA,
315
+ ),
316
+ Tool(
317
+ name="list_indexed",
318
+ description=(
319
+ "List sources that have been fetched to the local docs "
320
+ "directory, with last-fetched age. Sorted alphabetically."
321
+ ),
322
+ annotations=ToolAnnotations(
323
+ title="List locally cached libraries",
324
+ readOnlyHint=True,
325
+ openWorldHint=False,
326
+ idempotentHint=True,
327
+ ),
328
+ inputSchema={"type": "object", "properties": {}},
329
+ outputSchema=_LIST_INDEXED_OUTPUT_SCHEMA,
330
+ ),
331
+ Tool(
332
+ name="grep_docs",
333
+ description=(
334
+ "Regex search through fetched Markdown. Results are ranked by "
335
+ "match density (most matches per file first) and rendered with "
336
+ "lines of surrounding context. Use ensure_docs first; then "
337
+ "read_doc to pull more context around a hit."
338
+ ),
339
+ annotations=ToolAnnotations(
340
+ title="Regex-search cached docs",
341
+ readOnlyHint=True,
342
+ openWorldHint=False,
343
+ idempotentHint=True,
344
+ ),
345
+ inputSchema={
346
+ "type": "object",
347
+ "properties": {
348
+ "pattern": {"type": "string", "maxLength": 1000},
349
+ "library": {
350
+ "type": "string",
351
+ "pattern": "^[a-zA-Z0-9_.-]+$",
352
+ "maxLength": 128,
353
+ "description": "Restrict to one library (name from list_indexed)",
354
+ },
355
+ "limit": {"type": "integer", "default": 20, "minimum": 1, "maximum": 200},
356
+ "case_sensitive": {"type": "boolean", "default": False},
357
+ "context": {
358
+ "type": "integer",
359
+ "default": 1,
360
+ "minimum": 0,
361
+ "maximum": 3,
362
+ "description": "Lines of context per match (0 = none)",
363
+ },
364
+ },
365
+ "required": ["pattern"],
366
+ },
367
+ outputSchema=_GREP_DOCS_OUTPUT_SCHEMA,
368
+ ),
369
+ Tool(
370
+ name="read_doc",
371
+ description=(
372
+ "Read a Markdown file from a fetched library, optionally sliced "
373
+ "by line range. The natural follow-up to grep_docs: pass the "
374
+ "library + path it returned to pull more surrounding context."
375
+ ),
376
+ annotations=ToolAnnotations(
377
+ title="Read a cached doc file",
378
+ readOnlyHint=True,
379
+ openWorldHint=False,
380
+ idempotentHint=True,
381
+ ),
382
+ inputSchema={
383
+ "type": "object",
384
+ "properties": {
385
+ "library": {
386
+ "type": "string",
387
+ "pattern": "^[a-zA-Z0-9_.-]+$",
388
+ "maxLength": 128,
389
+ },
390
+ "path": {"type": "string", "description": "Relative path under the library"},
391
+ "line_start": {"type": "integer", "minimum": 1},
392
+ "line_end": {"type": "integer", "minimum": 1},
393
+ },
394
+ "required": ["library", "path"],
395
+ },
396
+ outputSchema=_READ_DOC_OUTPUT_SCHEMA,
397
+ ),
398
+ Tool(
399
+ name="add_source",
400
+ description=(
401
+ "Add or update a user source alias in the writable "
402
+ "sources.yaml. Refuses to shadow a builtin alias unless "
403
+ "force=true. URL is HTTPS-only and validated against the "
404
+ "same SSRF rules as fetch_url. Use list_sources to confirm "
405
+ "the change."
406
+ ),
407
+ annotations=ToolAnnotations(
408
+ title="Add or update a user source",
409
+ readOnlyHint=False,
410
+ destructiveHint=False,
411
+ idempotentHint=True,
412
+ openWorldHint=False,
413
+ ),
414
+ inputSchema={
415
+ "type": "object",
416
+ "properties": {
417
+ "name": {
418
+ "type": "string",
419
+ "pattern": "^[a-zA-Z0-9_.-]+$",
420
+ "maxLength": 128,
421
+ "description": "Alias name (alnum + _ . -)",
422
+ },
423
+ "url": {
424
+ "type": "string",
425
+ "pattern": "^https://",
426
+ "description": "HTTPS URL to crawl",
427
+ },
428
+ "description": {"type": "string", "maxLength": 500},
429
+ "category": {
430
+ "type": "string",
431
+ "enum": ["frontend", "backend", "ai", "database", "user"],
432
+ },
433
+ "max_pages": {
434
+ "type": "integer",
435
+ "minimum": 1,
436
+ "maximum": 100000,
437
+ },
438
+ "force": {
439
+ "type": "boolean",
440
+ "default": False,
441
+ "description": "Override a builtin alias of the same name",
442
+ },
443
+ },
444
+ "required": ["name", "url"],
445
+ },
446
+ outputSchema=_ADD_SOURCE_OUTPUT_SCHEMA,
447
+ ),
448
+ Tool(
449
+ name="remove_source",
450
+ description=(
451
+ "Remove a user source alias. Optionally delete its cached "
452
+ "docs (delete_cache=true). Cannot remove a builtin source — "
453
+ "to stop using one, just don't call ensure_docs on it."
454
+ ),
455
+ annotations=ToolAnnotations(
456
+ title="Remove a user source",
457
+ readOnlyHint=False,
458
+ destructiveHint=True,
459
+ idempotentHint=True,
460
+ openWorldHint=False,
461
+ ),
462
+ inputSchema={
463
+ "type": "object",
464
+ "properties": {
465
+ "name": {
466
+ "type": "string",
467
+ "pattern": "^[a-zA-Z0-9_.-]+$",
468
+ "maxLength": 128,
469
+ },
470
+ "delete_cache": {
471
+ "type": "boolean",
472
+ "default": False,
473
+ "description": "Also delete the cached docs directory",
474
+ },
475
+ },
476
+ "required": ["name"],
477
+ },
478
+ outputSchema=_REMOVE_SOURCE_OUTPUT_SCHEMA,
479
+ ),
480
+ ]
481
+
482
+ async def _make_progress_callback() -> Any:
483
+ """Return ``(pages_done, total_or_none) -> awaitable`` bound to the
484
+ current request's progressToken, or ``None`` if the client did not
485
+ request progress."""
486
+ ctx = server.request_context
487
+ if ctx.meta is None or ctx.meta.progressToken is None:
488
+ return None
489
+ token = ctx.meta.progressToken
490
+ session = ctx.session
491
+
492
+ async def _cb(done: int, total: int | None) -> None:
493
+ try:
494
+ await session.send_progress_notification(
495
+ progress_token=token,
496
+ progress=float(done),
497
+ total=float(total) if total is not None else None,
498
+ )
499
+ except Exception: # noqa: BLE001
500
+ logger.debug("progress notification failed", exc_info=True)
501
+
502
+ return _cb
503
+
504
+ @server.call_tool() # type: ignore[misc,no-untyped-call]
505
+ async def _call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult:
506
+ try:
507
+ if name == "fetch_url":
508
+ url = _require_str(arguments, "url")
509
+ max_tokens = _coerce_int(arguments.get("max_tokens"), name="max_tokens", default=0)
510
+ result = await fetch_url(url, max_tokens=max_tokens or None)
511
+ elif name == "ensure_docs":
512
+ source = _require_str(arguments, "source")
513
+ on_progress = await _make_progress_callback()
514
+ result = await ensure_docs(
515
+ source,
516
+ force=bool(arguments.get("force", False)),
517
+ profile=arguments.get("profile"),
518
+ on_progress=on_progress,
519
+ )
520
+ elif name == "list_sources":
521
+ category = arguments.get("category")
522
+ if category is not None and not isinstance(category, str):
523
+ raise ValueError("'category' must be a string")
524
+ result = list_sources(category)
525
+ elif name == "list_indexed":
526
+ result = list_indexed()
527
+ elif name == "grep_docs":
528
+ pattern = _require_str(arguments, "pattern")
529
+ library = arguments.get("library")
530
+ if library is not None and not isinstance(library, str):
531
+ raise ValueError("'library' must be a string")
532
+ result = grep_docs(
533
+ pattern,
534
+ library=library,
535
+ limit=_coerce_int(arguments.get("limit"), name="limit", default=20),
536
+ case_sensitive=bool(arguments.get("case_sensitive", False)),
537
+ context=_coerce_int(arguments.get("context"), name="context", default=1),
538
+ )
539
+ elif name == "read_doc":
540
+ library = _require_str(arguments, "library")
541
+ path = _require_str(arguments, "path")
542
+ line_start = arguments.get("line_start")
543
+ line_end = arguments.get("line_end")
544
+ result = read_doc(
545
+ library,
546
+ path,
547
+ line_start=_coerce_int(line_start, name="line_start", default=0) or None,
548
+ line_end=_coerce_int(line_end, name="line_end", default=0) or None,
549
+ )
550
+ elif name == "add_source":
551
+ add_name = _require_str(arguments, "name")
552
+ add_url = _require_str(arguments, "url")
553
+ description = arguments.get("description")
554
+ if description is not None and not isinstance(description, str):
555
+ raise ValueError("'description' must be a string")
556
+ category = arguments.get("category")
557
+ if category is not None and not isinstance(category, str):
558
+ raise ValueError("'category' must be a string")
559
+ max_pages = arguments.get("max_pages")
560
+ result = add_source(
561
+ add_name,
562
+ add_url,
563
+ description=description,
564
+ category=category,
565
+ max_pages=_coerce_int(max_pages, name="max_pages", default=0) or None,
566
+ force=bool(arguments.get("force", False)),
567
+ )
568
+ elif name == "remove_source":
569
+ rm_name = _require_str(arguments, "name")
570
+ result = remove_source(
571
+ rm_name,
572
+ delete_cache=bool(arguments.get("delete_cache", False)),
573
+ )
574
+ else:
575
+ result = ToolResult(f"Unknown tool: {name}", is_error=True)
576
+ except ValueError as err:
577
+ result = ToolResult(str(err), is_error=True)
578
+ except Exception as err: # noqa: BLE001
579
+ logger.exception("Tool %s raised", name)
580
+ result = ToolResult(f"Tool error: {err}", is_error=True)
581
+
582
+ # Return CallToolResult directly so:
583
+ # (a) ``is_error`` propagates (the SDK's tuple/list paths hardcode
584
+ # isError=False), and
585
+ # (b) errors on tools with an outputSchema don't fail the validator
586
+ # for "missing structured content."
587
+ content = [TextContent(type="text", text=result.text)]
588
+ return CallToolResult(
589
+ content=content,
590
+ structuredContent=result.data if not result.is_error else None,
591
+ isError=result.is_error,
592
+ )
593
+
594
+ async with stdio_server() as (read, write):
595
+ await server.run(read, write, server.create_initialization_options())
596
+ return 0
597
+
598
+
599
+ def run_mcp_server(argv: list[str]) -> int:
600
+ """Entry point for ``docpull mcp``."""
601
+ parser = argparse.ArgumentParser(prog="docpull mcp", description="Run the docpull MCP server over stdio.")
602
+ parser.add_argument("--verbose", "-v", action="store_true", help="Enable debug logging")
603
+ args = parser.parse_args(argv)
604
+ logging.basicConfig(
605
+ level=logging.DEBUG if args.verbose else logging.WARNING,
606
+ format="%(asctime)s %(levelname)s %(name)s %(message)s",
607
+ stream=sys.stderr,
608
+ )
609
+ try:
610
+ return asyncio.run(_run_stdio())
611
+ except KeyboardInterrupt:
612
+ return 0
613
+
614
+
615
+ __all__ = ["run_mcp_server"]