sin-code-bundle 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sin_code_bundle/__init__.py +6 -0
- sin_code_bundle/agents_md.py +245 -0
- sin_code_bundle/ast_edit.py +323 -0
- sin_code_bundle/bench.py +506 -0
- sin_code_bundle/budget.py +51 -0
- sin_code_bundle/cache.py +131 -0
- sin_code_bundle/checkpoint.py +230 -0
- sin_code_bundle/cli.py +1943 -0
- sin_code_bundle/codocs.py +328 -0
- sin_code_bundle/dap_bridge.py +135 -0
- sin_code_bundle/data/codocs/SKILL.md +280 -0
- sin_code_bundle/gitnexus.py +368 -0
- sin_code_bundle/hashline.py +216 -0
- sin_code_bundle/hooks.py +249 -0
- sin_code_bundle/immortal_commit.py +288 -0
- sin_code_bundle/interceptor.py +119 -0
- sin_code_bundle/lsp_backend.py +303 -0
- sin_code_bundle/lsp_bootstrap.py +85 -0
- sin_code_bundle/markitdown.py +254 -0
- sin_code_bundle/mcp_config.py +455 -0
- sin_code_bundle/mcp_server.py +963 -0
- sin_code_bundle/memory.py +208 -0
- sin_code_bundle/merge_safety.py +313 -0
- sin_code_bundle/orchestration_worktrees.py +102 -0
- sin_code_bundle/policy.py +224 -0
- sin_code_bundle/preflight.py +152 -0
- sin_code_bundle/programming_workflow.py +541 -0
- sin_code_bundle/rtk.py +154 -0
- sin_code_bundle/safety.py +52 -0
- sin_code_bundle/session_warmup.py +247 -0
- sin_code_bundle/skills.py +188 -0
- sin_code_bundle/symbol_resolve.py +166 -0
- sin_code_bundle/tools/__init__.py +4 -0
- sin_code_bundle/tools/pypi_setup.py +289 -0
- sin_code_bundle/vfs.py +264 -0
- sin_code_bundle-0.9.2.dist-info/METADATA +470 -0
- sin_code_bundle-0.9.2.dist-info/RECORD +41 -0
- sin_code_bundle-0.9.2.dist-info/WHEEL +5 -0
- sin_code_bundle-0.9.2.dist-info/entry_points.txt +4 -0
- sin_code_bundle-0.9.2.dist-info/licenses/LICENSE +21 -0
- sin_code_bundle-0.9.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
"""LSP-backed symbol resolution for the SCKG.
|
|
3
|
+
|
|
4
|
+
This makes `impact()` structural and type-accurate instead of textual:
|
|
5
|
+
- "what calls this symbol?" -> LSP references
|
|
6
|
+
- "where is it defined?" -> LSP definition
|
|
7
|
+
- blast-radius scoring -> ranked caller set + fan-in
|
|
8
|
+
|
|
9
|
+
Primary backend: multilspy (drives real language servers: pyright, gopls,
|
|
10
|
+
typescript-language-server, rust-analyzer, jdtls, …).
|
|
11
|
+
Fallback backend: tree-sitter symbol scan (cheap, language-agnostic, no server).
|
|
12
|
+
|
|
13
|
+
The module degrades gracefully: if no LSP is available it returns tree-sitter
|
|
14
|
+
results and flags `source="treesitter"`, so the agent still gets a useful signal
|
|
15
|
+
and the bundle keeps working (consistent with `sin status`).
|
|
16
|
+
|
|
17
|
+
Docs: lsp_backend.doc.md
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import asyncio
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Literal, Optional
|
|
26
|
+
|
|
27
|
+
# Label for the backend that produced a result; ImpactResult defaults to "none".
Source = Literal["lsp", "treesitter", "none"]

# Lower-cased file suffix (with the dot) -> language identifier. The identifier
# is what `_lsp_impact` passes to multilspy as "code_language".
_LANG_BY_EXT = {
    suffix: language
    for language, suffixes in (
        ("python", (".py",)),
        ("typescript", (".ts", ".tsx")),
        ("javascript", (".js", ".jsx")),
        ("go", (".go",)),
        ("rust", (".rs",)),
        ("java", (".java",)),
        ("ruby", (".rb",)),
        ("php", (".php",)),
        ("csharp", (".cs",)),
        ("c", (".c",)),
        ("cpp", (".cpp", ".h")),
    )
    for suffix in suffixes
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True)
class Location:
    """Immutable position inside a source file, with an optional line excerpt."""

    # Path of the file as reported by the backend that produced the location.
    file: str
    # 1-based line number (both backends convert before constructing this).
    line: int
    # 1-based column number.
    column: int
    # Text excerpt of the line; left empty when the backend provides none.
    snippet: str = ""
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ── ImpactResult: Blast-Radius Payload ─────────────────────────────────
|
|
58
|
+
@dataclass
class ImpactResult:
    """Blast-radius summary for a single symbol, as returned to the agent.

    ``callers`` may be a truncated view of the reference list, while
    ``fan_in`` is meant to carry the full reference count.
    """

    symbol: str
    defined_at: Optional[Location]
    callers: list[Location] = field(default_factory=list)
    fan_in: int = 0
    touches_tests: bool = False
    touches_public_api: bool = False
    risk: Literal["low", "medium", "high"] = "low"
    source: Source = "none"
    notes: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the result as a plain dict of JSON-compatible values.

        Every ``Location`` is flattened to ``{file, line, column, snippet}``
        (``defined_at`` becomes ``None`` when unset), so the payload can be
        serialized without a custom encoder. This method only builds the
        dict; storing it is left to the caller.
        """
        payload: dict = {"symbol": self.symbol}
        payload["defined_at"] = _loc_to_dict(self.defined_at)
        payload["callers"] = list(map(_loc_to_dict, self.callers))
        for name in (
            "fan_in",
            "touches_tests",
            "touches_public_api",
            "risk",
            "source",
            "notes",
        ):
            payload[name] = getattr(self, name)
        return payload
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _loc_to_dict(loc: Optional[Location]) -> Optional[dict]:
    """Flatten a ``Location`` into a plain dict; ``None`` is passed through."""
    if loc is not None:
        return {
            "file": loc.file,
            "line": loc.line,
            "column": loc.column,
            "snippet": loc.snippet,
        }
    return None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _lang_for(path: Path) -> Optional[str]:
    """Return the language mapped to ``path``'s suffix, or ``None`` if unmapped.

    The suffix is lower-cased before the lookup in ``_LANG_BY_EXT``.
    """
    suffix = path.suffix.lower()
    return _LANG_BY_EXT.get(suffix)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _score_risk(
    callers: int, touches_tests: bool, touches_api: bool
) -> Literal["low", "medium", "high"]:
    """Bucket a change into a coarse risk tier.

    * ``"high"``   - a caller sits in a public-API file, or more than 10 callers.
    * ``"medium"`` - a caller sits in a test file, or more than 3 callers.
    * ``"low"``    - everything else (at most 3 callers, no test/API callers).

    The thresholds are intentionally simple and conservative.
    """
    broad_radius = callers > 10
    notable_radius = callers > 3
    if not (touches_api or broad_radius):
        return "medium" if (touches_tests or notable_radius) else "low"
    return "high"
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _is_test_path(p: str) -> bool:
    """Heuristically decide whether ``p`` points at test code.

    A path counts as a test when its final component contains ``"test"``
    (``test_x.py``, ``x_test.go``, ``conftest.py``, ``x.test.ts`` ...) or when
    any directory component is ``test``, ``tests`` or ``__tests__``.

    Matching is case-insensitive and treats ``\\`` like ``/``. Directory
    components are compared individually, so relative paths such as
    ``tests/helpers.py`` are recognised even without a leading separator.
    """
    normalized = p.lower().replace("\\", "/").rstrip("/")
    *directories, name = normalized.split("/")
    if "test" in name:
        return True
    return any(part in {"test", "tests", "__tests__"} for part in directories)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _is_public_api_path(p: str) -> bool:
    """Return True when the file name of ``p`` is a conventional entry-point module.

    Only the final path component is compared (case-insensitively) against a
    fixed list of file names.
    """
    entry_point_names = (
        "__init__.py",
        "api.py",
        "index.ts",
        "index.js",
        "mod.rs",
        "lib.rs",
    )
    basename = Path(p).name.lower()
    return any(basename == candidate for candidate in entry_point_names)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# ── LSP Backend: Definition / Reference Lookup ─────────────────────────
|
|
127
|
+
# --------------------------------------------------------------------------- #
|
|
128
|
+
# LSP backend (multilspy)
|
|
129
|
+
# --------------------------------------------------------------------------- #
|
|
130
|
+
async def _lsp_impact(
    root: Path, file: Path, symbol: str, line: int, column: int
) -> Optional[ImpactResult]:
    """Resolve definition and references for a position through multilspy.

    ``line`` and ``column`` are 1-based on the way in and are shifted to
    0-based for the server requests. Positions coming back are shifted to
    1-based again.

    Returns ``None`` when multilspy cannot be imported or when the file's
    suffix has no known language. Any other failure propagates to the caller.
    """
    try:
        from multilspy import LanguageServer  # type: ignore
        from multilspy.multilspy_config import MultilspyConfig  # type: ignore
        from multilspy.multilspy_logger import MultilspyLogger  # type: ignore
    except ImportError:
        return None

    language = _lang_for(file)
    if not language:
        return None

    server = LanguageServer.create(
        MultilspyConfig.from_dict({"code_language": language}),
        MultilspyLogger(),
        str(root),
    )

    # The requests are issued with a root-relative path when `file` is absolute.
    target = str(file.relative_to(root)) if file.is_absolute() else str(file)
    zero_line, zero_col = line - 1, column - 1
    async with server.start_server():
        definition = await server.request_definition(target, zero_line, zero_col)
        references = await server.request_references(target, zero_line, zero_col)

    def as_location(entry) -> Location:
        # Prefer "relativePath"; fall back to "uri", then to an empty string.
        where = entry.get("relativePath", entry.get("uri", ""))
        start = entry["range"]["start"]
        return Location(
            file=where,
            line=start["line"] + 1,
            column=start["character"] + 1,
        )

    # Only the first reported definition is kept.
    def_loc: Optional[Location] = as_location(definition[0]) if definition else None
    callers: list[Location] = [as_location(ref) for ref in references or []]

    total = len(callers)
    hits_tests = any(_is_test_path(c.file) for c in callers)
    hits_api = any(_is_public_api_path(c.file) for c in callers)

    # The caller list is capped at 25 entries; `fan_in` keeps the full count
    # and a note reports the truncation.
    truncation_note = (
        [f"{total} callers total; showing first 25"] if total > 25 else []
    )
    return ImpactResult(
        symbol=symbol,
        defined_at=def_loc,
        callers=callers[:25],
        fan_in=total,
        touches_tests=hits_tests,
        touches_public_api=hits_api,
        risk=_score_risk(total, hits_tests, hits_api),
        source="lsp",
        notes=truncation_note,
    )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# --------------------------------------------------------------------------- #
|
|
193
|
+
# tree-sitter fallback (textual but symbol-aware)
|
|
194
|
+
# --------------------------------------------------------------------------- #
|
|
195
|
+
def _treesitter_impact(root: Path, symbol: str) -> ImpactResult:
    """Approximate the blast radius of ``symbol`` with a plain text scan.

    Only the last segment of a dotted / ``::``-qualified name is searched.
    Every file under ``root`` with a known language suffix is read, and each
    line mentioning the name as a whole identifier is recorded. The first
    line that looks like a definition (``def``/``class``/``func``/``fn``/
    ``function`` followed by the name) becomes ``defined_at``; every other
    matching line is counted as a caller.

    Args:
        root: Directory to scan recursively.
        symbol: Possibly qualified symbol name.

    Returns:
        An ``ImpactResult`` with ``source="treesitter"``, at most 25 callers
        listed and the full count in ``fan_in``.
    """
    import re  # function-scope import: only this fallback needs it

    skipped_dirs = {".git", "node_modules", ".venv", "__pycache__", ".sin"}
    definition_keywords = ("def ", "class ", "func ", "fn ", "function ")

    bare = symbol.split(".")[-1].split("::")[-1]
    callers: list[Location] = []
    defined_at: Optional[Location] = None
    notes = ["LSP unavailable — textual approximation. Install 'sin[lsp]' for accuracy."]

    pattern = None
    if bare:
        # Match the name as a whole identifier so that `get` does not hit
        # `get_user` or `target`. Boundaries are only asserted next to word
        # characters, which keeps operator-like names searchable.
        def _is_word(ch: str) -> bool:
            return ch.isalnum() or ch == "_"

        head = r"(?<!\w)" if _is_word(bare[0]) else ""
        tail = r"(?!\w)" if _is_word(bare[-1]) else ""
        pattern = re.compile(head + re.escape(bare) + tail)
    else:
        # An empty name would otherwise match every line of every file.
        notes.append("Empty symbol name; nothing to search for.")

    for path in root.rglob("*") if pattern is not None else ():
        rel = path.relative_to(root)
        # Judge ignored directories relative to `root`; the absolute path may
        # legitimately contain e.g. a `.venv` ancestor above the repository.
        if any(part in skipped_dirs for part in rel.parts):
            continue
        if _lang_for(path) is None or not path.is_file():
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        for lineno, raw in enumerate(text.splitlines(), start=1):
            hit = pattern.search(raw)
            if hit is None:
                continue
            loc = Location(
                file=str(rel),
                line=lineno,
                column=hit.start() + 1,
                snippet=raw.strip()[:120],
            )
            stripped = raw.lstrip()
            looks_like_definition = (
                stripped.startswith(definition_keywords)
                and pattern.search(stripped.split("(")[0]) is not None
            )
            if defined_at is None and looks_like_definition:
                defined_at = loc
            else:
                callers.append(loc)

    touches_tests = any(_is_test_path(c.file) for c in callers)
    touches_api = any(_is_public_api_path(c.file) for c in callers)
    fan_in = len(callers)
    # Same 25-entry cap and truncation note as the LSP path, so both backends
    # produce structurally identical payloads.
    if fan_in > 25:
        notes.append(f"{fan_in} callers total; showing first 25")
    return ImpactResult(
        symbol=symbol,
        defined_at=defined_at,
        callers=callers[:25],
        fan_in=fan_in,
        touches_tests=touches_tests,
        touches_public_api=touches_api,
        risk=_score_risk(fan_in, touches_tests, touches_api),
        source="treesitter",
        notes=notes,
    )
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# ── Public API: compute_impact ─────────────────────────────────────────
|
|
247
|
+
# --------------------------------------------------------------------------- #
|
|
248
|
+
# Public entry point
|
|
249
|
+
# --------------------------------------------------------------------------- #
|
|
250
|
+
def _impact_from_cache(cached: dict) -> Optional[ImpactResult]:
    """Rebuild an ``ImpactResult`` from a cached ``to_dict()`` payload.

    Returns ``None`` when the payload does not have the expected shape, so the
    caller can recompute instead of failing on a stale or damaged entry.
    """
    try:
        defined = cached.get("defined_at")
        return ImpactResult(
            symbol=cached["symbol"],
            defined_at=Location(**defined) if defined else None,
            callers=[Location(**c) for c in cached.get("callers", [])],
            fan_in=cached.get("fan_in", 0),
            touches_tests=cached.get("touches_tests", False),
            touches_public_api=cached.get("touches_public_api", False),
            risk=cached.get("risk", "low"),
            source=cached.get("source", "none"),
            notes=list(cached.get("notes", [])),
        )
    except (AttributeError, KeyError, TypeError):
        return None


def compute_impact(
    root: str | Path,
    symbol: str,
    file: Optional[str | Path] = None,
    line: Optional[int] = None,
    column: Optional[int] = None,
) -> ImpactResult:
    """Resolve the blast radius of `symbol`.

    If (file, line, column) are all given and the LSP backend is usable, the
    result comes from LSP definition/reference requests. If the LSP backend
    reports itself unavailable, or the position is incomplete, a textual scan
    of the repository is used instead. If the LSP attempt raises, the textual
    scan is used and the error is recorded in the result's ``notes``.

    Args:
        root: Repository root; resolved to an absolute path.
        symbol: Symbol name, optionally qualified.
        file: File containing the symbol, absolute or relative to ``root``.
        line: 1-based line of the symbol.
        column: 1-based column of the symbol.

    Returns:
        The computed (or cached) ``ImpactResult``. Every freshly computed
        result is written to the cache under a key built from
        ``symbol``, ``file``, ``line`` and ``column``.
    """
    root_path = Path(root).resolve()

    # Cache layer (imported lazily, as in the rest of this function's history).
    from sin_code_bundle.cache import GraphCache

    cache = GraphCache(root_path)
    cache_key = f"impact:{symbol}:{file}:{line}:{column}"
    cached = cache.get(cache_key)
    if cached is not None:
        restored = _impact_from_cache(cached)
        if restored is not None:
            return restored
        # Unusable cache entry: fall through, recompute and overwrite it.

    if file and line and column:
        # Joining with an absolute path yields that absolute path unchanged.
        file_path = root_path / file
        coro = _lsp_impact(root_path, file_path, symbol, line, column)
        try:
            result = asyncio.run(coro)
        except Exception as exc:  # noqa: BLE001
            # If asyncio.run() refused to start (e.g. a loop is already
            # running) the coroutine was never awaited; closing it avoids the
            # "never awaited" warning and is a no-op otherwise.
            coro.close()
            ts = _treesitter_impact(root_path, symbol)
            ts.notes.append(f"LSP error, used fallback: {exc}")
            cache.set(cache_key, ts.to_dict())
            return ts
        else:
            # Kept outside the `try` so a cache-write failure is not reported
            # as an LSP error.
            if result is not None:
                cache.set(cache_key, result.to_dict())
                return result

    result = _treesitter_impact(root_path, symbol)
    cache.set(cache_key, result.to_dict())
    return result
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Detect repo languages and ensure the matching language servers are present.
|
|
2
|
+
|
|
3
|
+
`sin doctor` uses this to tell users exactly what to install for accurate
|
|
4
|
+
impact analysis. We never silently install global tooling; we report and offer
|
|
5
|
+
the exact install command.
|
|
6
|
+
|
|
7
|
+
Docs: lsp_bootstrap.doc.md
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import shutil
|
|
13
|
+
from collections import Counter
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
# language -> (server binary, install hint)
|
|
17
|
+
# Per language: (executable looked up on PATH, human-readable install hint).
SERVERS: dict[str, tuple[str, str]] = dict(
    python=(
        "pyright-langserver",
        "npm i -g pyright  (or: pip install pyright)",
    ),
    typescript=(
        "typescript-language-server",
        "npm i -g typescript typescript-language-server",
    ),
    javascript=(
        "typescript-language-server",
        "npm i -g typescript typescript-language-server",
    ),
    go=(
        "gopls",
        "go install golang.org/x/tools/gopls@latest",
    ),
    rust=(
        "rust-analyzer",
        "rustup component add rust-analyzer",
    ),
    java=(
        "jdtls",
        "see: https://github.com/eclipse-jdtls/eclipse.jdt.ls",
    ),
)
|
|
43
|
+
|
|
44
|
+
# Lower-cased file suffix -> language name used as key into SERVERS.
_EXT_LANG: dict[str, str] = {
    ".py": "python",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".js": "javascript",
    ".jsx": "javascript",
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
}
# Directory names whose contents are never counted.
_IGNORE = {".git", "node_modules", ".venv", "__pycache__", ".sin"}


def detect_languages(root: Path) -> list[tuple[str, int]]:
    """Return (language, file_count) pairs, most frequent first.

    Walks ``root`` recursively and counts regular files whose suffix is listed
    in ``_EXT_LANG``. A file is skipped when any path component *below*
    ``root`` is in ``_IGNORE``. Components of ``root`` itself are not
    considered, so a repository that lives under e.g. ``~/.venv/project`` is
    still scanned.
    """
    counter: Counter[str] = Counter()
    for p in root.rglob("*"):
        if any(part in _IGNORE for part in p.relative_to(root).parts):
            continue
        lang = _EXT_LANG.get(p.suffix.lower())
        # Suffix lookup first: it is cheaper than the filesystem stat.
        if lang and p.is_file():
            counter[lang] += 1
    return counter.most_common()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def server_status(root: Path) -> list[dict]:
    """Describe language-server availability for each language found under ``root``.

    One dict per detected language, in the order produced by
    ``detect_languages``, with the keys ``language``, ``files``, ``server``,
    ``installed`` and ``install_hint``. Languages without an entry in
    ``SERVERS`` get ``server=None`` and ``installed=False``.
    """

    def describe(lang: str, count: int) -> dict:
        binary, hint = SERVERS.get(lang) or (None, "no LSP integration yet")
        return {
            "language": lang,
            "files": count,
            "server": binary,
            # Installed means the binary name resolves on PATH.
            "installed": bool(binary and shutil.which(binary)),
            "install_hint": hint,
        }

    return [describe(lang, count) for lang, count in detect_languages(root)]
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
"""MarkItDown bridge.
|
|
3
|
+
|
|
4
|
+
MarkItDown (https://github.com/microsoft/markitdown) is an *upstream* tool by
|
|
5
|
+
Microsoft, distributed as the MIT-licensed PyPI packages ``markitdown`` (CLI /
|
|
6
|
+
library) and ``markitdown-mcp`` (an MCP server). We never vendor or copy its
|
|
7
|
+
source; we only invoke the published packages. This keeps the bundle
|
|
8
|
+
MIT-licensed while giving coder agents a first-class way to turn binary and
|
|
9
|
+
office documents (PDF, DOCX, PPTX, XLSX, images, audio, HTML, ...) into
|
|
10
|
+
LLM-friendly Markdown.
|
|
11
|
+
|
|
12
|
+
The bridge provides:
|
|
13
|
+
* discovery / health checks for the ``markitdown-mcp`` runner and the
|
|
14
|
+
``markitdown`` CLI,
|
|
15
|
+
* a thin ``convert`` wrapper over the ``markitdown`` CLI,
|
|
16
|
+
* MCP wiring so OpenCode / Codex / Hermes each get the MarkItDown MCP server,
|
|
17
|
+
mirroring upstream's recommended ``uvx markitdown-mcp`` invocation.
|
|
18
|
+
|
|
19
|
+
Docs: markitdown.doc.md
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import json
|
|
25
|
+
import shutil
|
|
26
|
+
import subprocess
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Any
|
|
30
|
+
|
|
31
|
+
# ── MarkItDown Bridge: Document → Markdown ────────────────────────────
|
|
32
|
+
# Microsoft MarkItDown is the upstream package. We never vendor it; the
|
|
33
|
+
# bridge only discovers the published `markitdown-mcp` server and the
|
|
34
|
+
# `markitdown` CLI and shells out to them. This keeps the bundle MIT and
|
|
35
|
+
# lets us pick up upstream format support (PDF, DOCX, PPTX, XLSX, images
|
|
36
|
+
# with OCR, audio transcription, HTML, CSV/JSON/XML, ZIP, EPUB, etc.)
|
|
37
|
+
# without re-implementing any of it.
|
|
38
|
+
|
|
39
|
+
# MarkItDown exposes its MCP server through the ``markitdown-mcp`` package.
|
|
40
|
+
# Upstream recommends running it via ``uvx`` so it is fetched/cached on demand;
|
|
41
|
+
# we fall back to a directly-installed ``markitdown-mcp`` executable.
|
|
42
|
+
# Name of the MCP server package; also used as the executable name on PATH.
MARKITDOWN_MCP_PACKAGE = "markitdown-mcp"
# Executable name of the converter CLI.
MARKITDOWN_CLI = "markitdown"


class MarkItDownError(RuntimeError):
    """Signals that MarkItDown could not be located or that invoking it failed."""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class MarkItDownEnv:
    """Locations of the executables that can run MarkItDown (any may be missing)."""

    uvx: str | None
    mcp_exe: str | None
    cli: str | None

    @property
    def mcp_available(self) -> bool:
        """Whether the MCP server can be launched, via ``uvx`` or ``markitdown-mcp``."""
        return any((self.uvx, self.mcp_exe))

    @property
    def cli_available(self) -> bool:
        """Whether the ``markitdown`` converter CLI was found."""
        return True if self.cli else False

    def mcp_command(self) -> dict[str, Any]:
        """Build the ``{"command", "args"}`` mapping that launches the MCP server.

        ``uvx`` takes precedence over a directly installed ``markitdown-mcp``.

        Raises:
            MarkItDownError: if neither runner is available.
        """
        if not (self.uvx or self.mcp_exe):
            raise MarkItDownError(
                "MarkItDown MCP server not found. Install it with "
                "`pip install markitdown-mcp` (or `uv tool install markitdown-mcp`). "
                "The bundle does not vendor MarkItDown."
            )
        if self.uvx:
            return {"command": "uvx", "args": [MARKITDOWN_MCP_PACKAGE]}
        return {"command": MARKITDOWN_MCP_PACKAGE, "args": []}

    def cli_cmd(self) -> str:
        """Return the stored location of the ``markitdown`` CLI.

        ``convert()`` uses this for one-shot document-to-markdown conversion
        instead of going through the MCP server.

        Raises:
            MarkItDownError: if the CLI was not found.
        """
        if self.cli:
            return self.cli
        raise MarkItDownError(
            "`markitdown` CLI not found. Install with `pip install 'markitdown[all]'`."
        )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def detect_env() -> MarkItDownEnv:
    """Look up ``uvx``, ``markitdown-mcp`` and ``markitdown`` on PATH.

    Each field of the returned environment is whatever ``shutil.which``
    reports for that executable (``None`` when it is not found).
    """
    uvx_path = shutil.which("uvx")
    server_path = shutil.which(MARKITDOWN_MCP_PACKAGE)
    cli_path = shutil.which(MARKITDOWN_CLI)
    return MarkItDownEnv(uvx=uvx_path, mcp_exe=server_path, cli=cli_path)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def mcp_server_command(env: MarkItDownEnv | None = None) -> dict[str, Any]:
    """Return the MCP launch command for ``env``, probing PATH when none is given."""
    resolved = env if env else detect_env()
    return resolved.mcp_command()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ── Local-Only Safety: File Access Guard ─────────────────────────────
|
|
109
|
+
# `convert()` is the only public surface that touches a file path. It
|
|
110
|
+
# deliberately refuses anything that is not a regular file on the local
|
|
111
|
+
# filesystem — we never want an MCP client (potentially remote / hostile)
|
|
112
|
+
# to coerce us into passing an http:// or pipe:// URL into MarkItDown's
|
|
113
|
+
# CLI, which would expand the attack surface considerably.
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def convert(
    path: str, env: MarkItDownEnv | None = None, timeout: int = 300
) -> str:
    """Convert a document to Markdown using the upstream ``markitdown`` CLI.

    Args:
        path: Path of a regular file on the local filesystem.
        env: Pre-resolved environment; PATH is probed when omitted.
        timeout: Seconds to wait for the CLI. The 300 s (5 min) default
            leaves room for large PDFs / PPTX files with embedded media.

    Returns:
        The CLI's standard output, i.e. the generated Markdown.

    Raises:
        MarkItDownError: if the CLI is missing, ``path`` is not a regular
            file, the process cannot be started, it times out, or it exits
            with a non-zero status.
    """
    env = env or detect_env()
    cli = env.cli_cmd()
    src = Path(path)
    # `is_file()` (not `exists()`) — guards against directories and broken
    # symlinks, both of which the CLI would otherwise try to read as content.
    if not src.is_file():
        raise MarkItDownError(f"File not found: {path}")
    # Hand the CLI an absolute path: a relative name such as "-o" or
    # "--help" would otherwise be parsed as an option instead of a file.
    # `absolute()` (not `resolve()`) keeps the file's own name and suffix.
    target = str(src.absolute())
    try:
        proc = subprocess.run(
            [cli, target],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:  # pragma: no cover - timing dependent
        raise MarkItDownError(f"markitdown timed out after {timeout}s") from exc
    except OSError as exc:
        # E.g. the executable vanished or is not runnable after discovery.
        raise MarkItDownError(f"markitdown could not be started: {exc}") from exc
    if proc.returncode != 0:
        raise MarkItDownError(f"markitdown failed ({proc.returncode}): {proc.stderr.strip()}")
    return proc.stdout
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def doctor() -> dict[str, Any]:
    """Summarize MarkItDown availability for diagnostics.

    The ``runner`` entry is ``"uvx"`` when uvx was found, otherwise the MCP
    package name when its executable was found, otherwise ``None``.
    """
    env = detect_env()
    if env.uvx:
        runner = "uvx"
    elif env.mcp_exe:
        runner = MARKITDOWN_MCP_PACKAGE
    else:
        runner = None
    report: dict[str, Any] = {}
    report["mcp_available"] = env.mcp_available
    report["cli_available"] = env.cli_available
    report["runner"] = runner
    report["mcp_package"] = MARKITDOWN_MCP_PACKAGE
    return report
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ── Agent Integration: MCP Config Writers ─────────────────────────────
|
|
153
|
+
# Below: per-agent MCP config writers. These mutate well-known files
|
|
154
|
+
# under the user's home directory:
|
|
155
|
+
# * OpenCode: ~/.config/opencode/opencode.json (JSON, mcp.<name>)
|
|
156
|
+
# * Codex: ~/.codex/config.toml (TOML, [mcp_servers.<name>])
|
|
157
|
+
# * Hermes: ~/.hermes/mcp.json (JSON, mcpServers.<name>)
|
|
158
|
+
# We DO NOT touch plugin/hook files for the agents — MarkItDown integrates
|
|
159
|
+
# through MCP, the same surface as GitNexus, so behaviour is uniform.
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# ── MCP Wiring (mirrors the GitNexus bridge) ──────────────────────────────
|
|
163
|
+
def _opencode_config_path() -> Path:
    """Return the OpenCode config location, ``~/.config/opencode/opencode.json``."""
    return Path.home().joinpath(".config", "opencode", "opencode.json")
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _codex_config_path() -> Path:
    """Return the Codex config location, ``~/.codex/config.toml``."""
    return Path.home().joinpath(".codex", "config.toml")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _hermes_config_path() -> Path:
    """Return the Hermes MCP config location, ``~/.hermes/mcp.json``."""
    return Path.home().joinpath(".hermes", "mcp.json")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# Agents whose MCP configuration this module knows how to write.
AGENTS = ("opencode", "codex", "hermes")


def _launch(env: MarkItDownEnv | None) -> tuple[str, list[str]]:
    """Return the MCP launch command split into ``(executable, arguments)``."""
    launch = mcp_server_command(env)
    executable = launch["command"]
    arguments = launch["args"]
    return executable, arguments
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _read_json_config(path: Path) -> dict[str, Any]:
    """Load a JSON object from ``path`` without ever losing the user's data.

    Returns ``{}`` when the file is missing or empty. When the file holds
    something that is not a JSON object (invalid JSON, JSON with comments, a
    top-level list, ...), its text is first copied to ``<name>.bak`` next to
    it and ``{}`` is returned, so the subsequent rewrite cannot destroy the
    only copy of the original content.
    """
    if not path.is_file():
        return {}
    raw = path.read_text(encoding="utf-8")
    if not raw.strip():
        return {}
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        data = None
    if isinstance(data, dict):
        return data
    path.with_name(path.name + ".bak").write_text(raw, encoding="utf-8")
    return {}


def _wire_opencode(env: MarkItDownEnv | None) -> str:
    """Register MarkItDown under ``mcp.markitdown`` in the OpenCode config.

    Other keys of the config are preserved. Returns the config path written.
    """
    command, args = _launch(env)
    path = _opencode_config_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    data = _read_json_config(path)
    mcp = data.get("mcp")
    if not isinstance(mcp, dict):
        # Missing or malformed section: start a fresh one.
        mcp = data["mcp"] = {}
    mcp["markitdown"] = {
        "type": "local",
        "command": [command, *args],
        "enabled": True,
    }
    path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
    return str(path)


def _wire_codex(env: MarkItDownEnv | None) -> str:
    """Append a ``[mcp_servers.markitdown]`` table to the Codex TOML config.

    The file is left untouched when that table header is already present.
    Returns the config path.
    """
    command, args = _launch(env)
    path = _codex_config_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    existing = path.read_text(encoding="utf-8") if path.is_file() else ""
    if "[mcp_servers.markitdown]" in existing:
        return str(path)  # already wired; leave user edits intact
    # json.dumps yields a double-quoted, escaped string, which is also a valid
    # TOML basic string; for plain names the output is simply "name".
    args_repr = ", ".join(json.dumps(a) for a in args)
    block = (
        "\n[mcp_servers.markitdown]\n"
        f"command = {json.dumps(command)}\n"
        f"args = [{args_repr}]\n"
    )
    path.write_text(existing + block, encoding="utf-8")
    return str(path)


def _wire_hermes(env: MarkItDownEnv | None) -> str:
    """Register MarkItDown under ``mcpServers.markitdown`` in the Hermes config.

    Other keys of the config are preserved. Returns the config path written.
    """
    command, args = _launch(env)
    path = _hermes_config_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    data = _read_json_config(path)
    servers = data.get("mcpServers")
    if not isinstance(servers, dict):
        # Missing or malformed section: start a fresh one.
        servers = data["mcpServers"] = {}
    servers["markitdown"] = {"command": command, "args": args}
    path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
    return str(path)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# Agent name -> function that writes that agent's MCP configuration.
_WIRERS = dict(
    opencode=_wire_opencode,
    codex=_wire_codex,
    hermes=_wire_hermes,
)


def setup_agents(
    agents: list[str] | None = None,
    env: MarkItDownEnv | None = None,
) -> dict[str, str]:
    """Write the MarkItDown MCP server entry into the config of each agent.

    Args:
        agents: Agent names to wire; every known agent when ``None`` or empty.
        env: Environment forwarded unchanged to each writer.

    Returns:
        Mapping of agent name to the config file path reported by its writer.

    Raises:
        MarkItDownError: on the first unknown agent name. Agents listed before
            it have already been processed at that point.
    """
    selected = agents if agents else list(AGENTS)
    results: dict[str, str] = {}
    for agent in selected:
        wire = _WIRERS.get(agent)
        if wire:
            results[agent] = wire(env)
            continue
        raise MarkItDownError(f"Unknown agent: {agent!r}. Known: {', '.join(AGENTS)}")
    return results
|