mdengine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- md_generator/__init__.py +5 -0
- md_generator/archive/__init__.py +1 -0
- md_generator/archive/api/__init__.py +1 -0
- md_generator/archive/api/convert_runner.py +29 -0
- md_generator/archive/api/jobs.py +93 -0
- md_generator/archive/api/main.py +204 -0
- md_generator/archive/api/mcp_server.py +38 -0
- md_generator/archive/api/mcp_setup.py +84 -0
- md_generator/archive/api/query_options.py +36 -0
- md_generator/archive/api/settings.py +32 -0
- md_generator/archive/convert_impl.py +729 -0
- md_generator/archive/converter.py +123 -0
- md_generator/archive/options.py +26 -0
- md_generator/image/__init__.py +5 -0
- md_generator/image/api/__init__.py +1 -0
- md_generator/image/api/main.py +314 -0
- md_generator/image/api/mcp_server.py +169 -0
- md_generator/image/api/query_options.py +42 -0
- md_generator/image/api/settings.py +44 -0
- md_generator/image/api/staging.py +46 -0
- md_generator/image/api/zip_bundle.py +17 -0
- md_generator/image/backends/__init__.py +10 -0
- md_generator/image/backends/base.py +17 -0
- md_generator/image/backends/easy.py +60 -0
- md_generator/image/backends/paddle.py +88 -0
- md_generator/image/backends/tesseract.py +45 -0
- md_generator/image/convert_impl.py +117 -0
- md_generator/image/converter.py +111 -0
- md_generator/image/emit.py +67 -0
- md_generator/image/io_util.py +53 -0
- md_generator/image/utils.py +18 -0
- md_generator/pdf/__init__.py +1 -0
- md_generator/pdf/api/__init__.py +1 -0
- md_generator/pdf/api/main.py +261 -0
- md_generator/pdf/api/mcp_server.py +95 -0
- md_generator/pdf/api/settings.py +40 -0
- md_generator/pdf/api/zip_bundle.py +17 -0
- md_generator/pdf/converter.py +70 -0
- md_generator/pdf/md_emit.py +42 -0
- md_generator/pdf/pdf_extract.py +227 -0
- md_generator/pdf/utils.py +55 -0
- md_generator/ppt/__init__.py +1 -0
- md_generator/ppt/api/__init__.py +1 -0
- md_generator/ppt/api/convert_runner.py +30 -0
- md_generator/ppt/api/jobs.py +93 -0
- md_generator/ppt/api/main.py +197 -0
- md_generator/ppt/api/mcp_server.py +38 -0
- md_generator/ppt/api/mcp_setup.py +81 -0
- md_generator/ppt/api/query_options.py +35 -0
- md_generator/ppt/api/settings.py +24 -0
- md_generator/ppt/convert_impl.py +231 -0
- md_generator/ppt/converter.py +81 -0
- md_generator/ppt/embedded_extract.py +190 -0
- md_generator/ppt/ooxml_media.py +94 -0
- md_generator/ppt/options.py +39 -0
- md_generator/ppt/post_assets.py +172 -0
- md_generator/ppt/text_formatting.py +97 -0
- md_generator/ppt/vendor_pdf_md/__init__.py +5 -0
- md_generator/ppt/vendor_pdf_md/convert.py +76 -0
- md_generator/ppt/vendor_word_md/__init__.py +5 -0
- md_generator/ppt/vendor_word_md/convert.py +43 -0
- md_generator/ppt/zip_deep.py +31 -0
- md_generator/text/__init__.py +1 -0
- md_generator/text/api/__init__.py +1 -0
- md_generator/text/api/convert_runner.py +24 -0
- md_generator/text/api/jobs.py +93 -0
- md_generator/text/api/main.py +183 -0
- md_generator/text/api/mcp_server.py +38 -0
- md_generator/text/api/mcp_setup.py +78 -0
- md_generator/text/api/query_options.py +17 -0
- md_generator/text/api/settings.py +24 -0
- md_generator/text/convert_impl.py +72 -0
- md_generator/text/converter.py +73 -0
- md_generator/text/format_detect.py +53 -0
- md_generator/text/md_emit_json.py +144 -0
- md_generator/text/md_emit_txt.py +115 -0
- md_generator/text/md_emit_xml.py +127 -0
- md_generator/text/options.py +27 -0
- md_generator/word/__init__.py +3 -0
- md_generator/word/api/__init__.py +1 -0
- md_generator/word/api/convert_util.py +41 -0
- md_generator/word/api/jobs.py +86 -0
- md_generator/word/api/main.py +172 -0
- md_generator/word/api/mcp_server.py +98 -0
- md_generator/word/artifact.py +30 -0
- md_generator/word/converter.py +192 -0
- md_generator/word/settings.py +58 -0
- md_generator/xlsx/__init__.py +8 -0
- md_generator/xlsx/api/__init__.py +1 -0
- md_generator/xlsx/api/app.py +293 -0
- md_generator/xlsx/convert_config.py +58 -0
- md_generator/xlsx/converter.py +67 -0
- md_generator/xlsx/converter_core.py +135 -0
- md_generator/xlsx/excel_reader.py +170 -0
- md_generator/xlsx/markdown_emitter.py +143 -0
- md_generator/xlsx/mcp_server.py +50 -0
- mdengine-0.1.0.dist-info/METADATA +509 -0
- mdengine-0.1.0.dist-info/RECORD +102 -0
- mdengine-0.1.0.dist-info/WHEEL +5 -0
- mdengine-0.1.0.dist-info/entry_points.txt +8 -0
- mdengine-0.1.0.dist-info/licenses/LICENSE +21 -0
- mdengine-0.1.0.dist-info/top_level.txt +1 -0
md_generator/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""zip-to-md: ZIP archive to Markdown + assets."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""HTTP API and MCP entrypoints for zip-to-md."""
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import zipfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from md_generator.archive.convert_impl import convert_zip
|
|
8
|
+
from md_generator.archive.options import ConvertOptions
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def build_artifact_zip_bytes(zip_path: Path, options: ConvertOptions) -> bytes:
    """Convert *zip_path* inside a scratch directory and return the result zipped.

    The returned archive holds ``document.md`` (when the conversion produced
    one) plus every regular file found below ``assets/``, stored under
    POSIX-style ``assets/...`` member names.
    """
    import tempfile

    payload = io.BytesIO()
    with tempfile.TemporaryDirectory() as scratch:
        artifact_dir = Path(scratch) / "artifact"
        convert_zip(zip_path, artifact_dir, options)

        markdown = artifact_dir / "document.md"
        assets_root = artifact_dir / "assets"
        with zipfile.ZipFile(payload, "w", zipfile.ZIP_DEFLATED) as bundle:
            if markdown.is_file():
                bundle.write(markdown, "document.md")
            if assets_root.is_dir():
                # Sorted traversal keeps the member order deterministic.
                asset_files = (f for f in sorted(assets_root.rglob("*")) if f.is_file())
                for asset in asset_files:
                    member = Path("assets") / asset.relative_to(assets_root)
                    bundle.write(asset, member.as_posix())
    return payload.getvalue()
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
import uuid
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Callable
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class Job:
    """State record for one asynchronous conversion job tracked by ``JobStore``."""

    # Unique identifier; JobStore.create_job assigns a UUID4 string.
    job_id: str
    # Per-job directory; JobStore deletes it when the job is swept or downloaded.
    workspace: Path
    # Lifecycle marker; JobStore.run_async sets "running", then "done" or "failed".
    status: str = "queued"
    # str() of the exception that failed the job; None otherwise.
    error: str | None = None
    # Creation time in epoch seconds (time.time()); JobStore.sweep compares it to the TTL.
    created_at: float = field(default_factory=time.time)
    # Location of the finished artifact ZIP; stays None unless the code running the job assigns it.
    zip_path: Path | None = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class JobStore:
    """In-memory registry of conversion jobs, each owning an on-disk workspace.

    All access to the job dictionary is serialised by an internal lock.
    Filesystem cleanup is always performed after the lock has been released
    so that slow directory removal cannot stall other callers.
    """

    def __init__(self, base_temp: Path | None, ttl_seconds: int) -> None:
        """Create a store.

        Args:
            base_temp: Directory that holds job workspaces, or ``None`` to use
                ``<system temp>/zip-to-md-jobs``.
            ttl_seconds: Age (measured from job creation) after which finished
                jobs become eligible for removal by :meth:`sweep`.
        """
        self._base = base_temp
        self._ttl = ttl_seconds
        self._jobs: dict[str, Job] = {}
        self._lock = threading.Lock()
        self._sweeper_started = False

    def _root(self) -> Path:
        """Return the directory under which job workspaces are created."""
        import tempfile

        if self._base:
            p = Path(self._base)
            p.mkdir(parents=True, exist_ok=True)
            return p
        # Not created here; create_job() makes the workspace with parents=True.
        return Path(tempfile.gettempdir()) / "zip-to-md-jobs"

    def start_sweeper(self) -> None:
        """Start the background sweeper thread once; later calls are no-ops."""
        # Check-and-set under the lock so concurrent callers cannot both start a thread.
        with self._lock:
            if self._sweeper_started:
                return
            self._sweeper_started = True

        def loop() -> None:
            while True:
                time.sleep(60)
                self.sweep()

        threading.Thread(target=loop, daemon=True).start()

    def sweep(self) -> None:
        """Forget finished jobs older than the TTL and delete their workspaces.

        Only jobs whose status is ``"done"`` or ``"failed"`` are considered.
        """
        now = time.time()
        with self._lock:
            expired_ids = [
                jid
                for jid, job in self._jobs.items()
                if job.status in ("done", "failed") and now - job.created_at > self._ttl
            ]
            expired = [self._jobs.pop(jid) for jid in expired_ids]
        # Delete outside the lock: removing a large tree can take a while.
        for job in expired:
            if job.workspace.exists():
                shutil.rmtree(job.workspace, ignore_errors=True)

    def create_job(self) -> Job:
        """Register a new queued job with a freshly created workspace directory."""
        jid = str(uuid.uuid4())
        ws = self._root() / jid
        ws.mkdir(parents=True, exist_ok=True)
        job = Job(job_id=jid, workspace=ws)
        with self._lock:
            self._jobs[jid] = job
        return job

    def get(self, job_id: str) -> Job | None:
        """Return the job registered under *job_id*, or ``None`` if unknown."""
        with self._lock:
            return self._jobs.get(job_id)

    def run_async(self, job: Job, fn: Callable[[], None]) -> None:
        """Run *fn* on a daemon thread, mirroring its outcome onto *job*.

        The job is marked ``"running"``, then ``"done"`` on success; on any
        ``Exception`` it is marked ``"failed"`` with ``job.error`` set to the
        exception text.
        """

        def target() -> None:
            try:
                job.status = "running"
                fn()
                job.status = "done"
            except Exception as e:
                # Boundary of the worker thread: record the failure on the job
                # instead of letting the thread die silently.
                job.status = "failed"
                job.error = str(e)

        threading.Thread(target=target, daemon=True).start()

    def remove_after_download(self, job: Job) -> None:
        """Unregister *job* and delete its workspace directory."""
        with self._lock:
            self._jobs.pop(job.job_id, None)
        if job.workspace.exists():
            shutil.rmtree(job.workspace, ignore_errors=True)
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import asynccontextmanager
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from fastapi import FastAPI, File, HTTPException, Request, UploadFile
|
|
7
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
8
|
+
from fastapi.responses import FileResponse, Response
|
|
9
|
+
from starlette.background import BackgroundTask
|
|
10
|
+
|
|
11
|
+
from md_generator.archive.api.convert_runner import build_artifact_zip_bytes
|
|
12
|
+
from md_generator.archive.api.jobs import JobStore
|
|
13
|
+
from md_generator.archive.api.mcp_setup import build_mcp_stack
|
|
14
|
+
from md_generator.archive.api.query_options import convert_options_from_query
|
|
15
|
+
from md_generator.archive.api.settings import ApiSettings, cors_list
|
|
16
|
+
from md_generator.archive.options import DEFAULT_IMAGE_TO_MD_ENGINES
|
|
17
|
+
|
|
18
|
+
# Build the MCP server and its streamable-HTTP ASGI app once at import time;
# the app is mounted under /mcp further down and lifespan() runs its session manager.
_mcp, _mcp_http = build_mcp_stack(mount_under_fastapi=True)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Set up application state, then keep the MCP session manager running.

    Loads the API settings, creates the job store (with its sweeper thread
    started) and publishes both on ``app.state`` before yielding inside the
    MCP session manager's ``run()`` context.
    """
    cfg = ApiSettings()
    jobs_root = None
    if cfg.temp_dir:
        jobs_root = Path(cfg.temp_dir)

    jobs = JobStore(jobs_root, cfg.job_ttl_seconds)
    jobs.start_sweeper()

    app.state.settings = cfg
    app.state.job_store = jobs

    async with _mcp.session_manager.run():
        yield
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ASGI application; lifespan() fills app.state.settings and app.state.job_store.
app = FastAPI(title="zip-to-md", lifespan=lifespan)
# Expose the MCP streamable-HTTP sub-application under /mcp.
app.mount("/mcp", _mcp_http)

# Middleware is registered at import time, so the CORS origins come from a
# settings instance separate from the one lifespan() stores on app.state.
_bootstrap_settings = ApiSettings()
_origins = cors_list(_bootstrap_settings)
app.add_middleware(
    CORSMiddleware,
    allow_origins=_origins,
    # Credentials are enabled only when no wildcard origin is configured.
    allow_credentials="*" not in _origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _merge_repo_root(settings: ApiSettings, query_repo: str | None) -> str | None:
|
|
48
|
+
if query_repo and query_repo.strip():
|
|
49
|
+
return query_repo.strip()
|
|
50
|
+
return settings.repo_root
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
async def _read_upload_limited(upload: UploadFile, max_bytes: int) -> bytes:
|
|
54
|
+
data = bytearray()
|
|
55
|
+
chunk_size = 1024 * 1024
|
|
56
|
+
while True:
|
|
57
|
+
chunk = await upload.read(chunk_size)
|
|
58
|
+
if not chunk:
|
|
59
|
+
break
|
|
60
|
+
data += chunk
|
|
61
|
+
if len(data) > max_bytes:
|
|
62
|
+
raise HTTPException(
|
|
63
|
+
status_code=413,
|
|
64
|
+
detail="Upload exceeds ZIP_TO_MD_MAX_UPLOAD_MB",
|
|
65
|
+
)
|
|
66
|
+
return bytes(data)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# POST /convert/sync: convert an uploaded .zip and return the artifact ZIP in
# the response body.
#   400 - upload is missing a filename or is not named *.zip
#   413 - upload exceeds settings.max_upload_mb (raised while reading)
#   409 - upload exceeds settings.max_sync_upload_mb; use POST /convert/jobs
# The conversion itself runs in the threadpool so the event loop stays free
# to serve other requests while it is in progress.
@app.post("/convert/sync")
async def convert_sync(
    request: Request,
    file: UploadFile = File(...),
    enable_office: bool = True,
    image_ocr: bool = False,
    pdf_ocr: bool = False,
    max_bytes: int = 512_000,
    expand_nested_zips: bool = True,
    max_nested_zip_depth: int = 16,
    repo_root: str | None = None,
    use_image_to_md: bool = True,
    image_to_md_engines: str = DEFAULT_IMAGE_TO_MD_ENGINES,
    image_to_md_strategy: str = "best",
    image_to_md_title: str = "",
) -> Response:
    import tempfile

    from starlette.concurrency import run_in_threadpool

    settings: ApiSettings = request.app.state.settings
    if not file.filename or not file.filename.lower().endswith(".zip"):
        raise HTTPException(400, detail="Expected a .zip file upload (multipart field 'file')")
    max_u = settings.max_upload_mb * 1024 * 1024
    max_sync = settings.max_sync_upload_mb * 1024 * 1024
    body = await _read_upload_limited(file, max_u)
    if len(body) > max_sync:
        raise HTTPException(
            status_code=409,
            detail="File too large for synchronous conversion; use POST /convert/jobs",
        )
    rr = _merge_repo_root(settings, repo_root)
    opts = convert_options_from_query(
        enable_office=enable_office,
        image_ocr=image_ocr,
        pdf_ocr=pdf_ocr,
        max_bytes=max_bytes,
        expand_nested_zips=expand_nested_zips,
        max_nested_zip_depth=max_nested_zip_depth,
        repo_root=rr,
        use_image_to_md=use_image_to_md,
        image_to_md_engines=image_to_md_engines,
        image_to_md_strategy=image_to_md_strategy,
        image_to_md_title=image_to_md_title,
    )

    def _convert() -> bytes:
        # Blocking work (disk I/O + conversion); executed off the event loop.
        tf = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
        tmp_in = Path(tf.name)
        try:
            # Close the handle before converting so the file can be reopened by path.
            with tf:
                tf.write(body)
            return build_artifact_zip_bytes(tmp_in, opts)
        finally:
            # Also covers a failed write, so the temp input never lingers.
            tmp_in.unlink(missing_ok=True)

    zbytes = await run_in_threadpool(_convert)
    return Response(
        content=zbytes,
        media_type="application/zip",
        headers={"Content-Disposition": 'attachment; filename="artifact.zip"'},
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# POST /convert/jobs: store the uploaded .zip in a new job workspace, start
# the conversion on a background thread and return the job id with its
# current status. Rejects non-.zip uploads with 400; the size cap is enforced
# while the upload is read.
@app.post("/convert/jobs")
async def convert_jobs(
    request: Request,
    file: UploadFile = File(...),
    enable_office: bool = True,
    image_ocr: bool = False,
    pdf_ocr: bool = False,
    max_bytes: int = 512_000,
    expand_nested_zips: bool = True,
    max_nested_zip_depth: int = 16,
    repo_root: str | None = None,
    use_image_to_md: bool = True,
    image_to_md_engines: str = DEFAULT_IMAGE_TO_MD_ENGINES,
    image_to_md_strategy: str = "best",
    image_to_md_title: str = "",
) -> dict:
    app_state = request.app.state
    settings: ApiSettings = app_state.settings
    store: JobStore = app_state.job_store

    upload_name = file.filename
    if not (upload_name and upload_name.lower().endswith(".zip")):
        raise HTTPException(400, detail="Expected a .zip file upload (multipart field 'file')")

    upload_cap = settings.max_upload_mb * 1024 * 1024
    payload = await _read_upload_limited(file, upload_cap)

    options = convert_options_from_query(
        enable_office=enable_office,
        image_ocr=image_ocr,
        pdf_ocr=pdf_ocr,
        max_bytes=max_bytes,
        expand_nested_zips=expand_nested_zips,
        max_nested_zip_depth=max_nested_zip_depth,
        repo_root=_merge_repo_root(settings, repo_root),
        use_image_to_md=use_image_to_md,
        image_to_md_engines=image_to_md_engines,
        image_to_md_strategy=image_to_md_strategy,
        image_to_md_title=image_to_md_title,
    )

    job = store.create_job()
    source_zip = job.workspace / "upload.zip"
    source_zip.write_bytes(payload)

    def work() -> None:
        # Runs on the job thread: convert, persist the artifact, publish its path.
        artifact = job.workspace / "artifact.zip"
        artifact.write_bytes(build_artifact_zip_bytes(source_zip, options))
        job.zip_path = artifact

    store.run_async(job, work)
    return {"job_id": job.job_id, "status": job.status}
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# GET /convert/jobs/{job_id}: report status, error text and creation time of
# a job; responds 404 when the id is not registered.
@app.get("/convert/jobs/{job_id}")
async def job_status(request: Request, job_id: str) -> dict:
    jobs: JobStore = request.app.state.job_store
    found = jobs.get(job_id)
    if not found:
        raise HTTPException(404, detail="Unknown job_id")
    return dict(
        status=found.status,
        error=found.error,
        created_at=found.created_at,
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# GET /convert/jobs/{job_id}/download: stream the finished artifact ZIP.
# 404 for an unknown id, 400 while the job has no downloadable artifact.
# The job and its workspace are removed by a background task attached to
# the response.
@app.get("/convert/jobs/{job_id}/download", response_class=FileResponse)
async def job_download(request: Request, job_id: str) -> FileResponse:
    jobs: JobStore = request.app.state.job_store
    found = jobs.get(job_id)
    if not found:
        raise HTTPException(404, detail="Unknown job_id")

    ready = found.status == "done" and found.zip_path and found.zip_path.is_file()
    if not ready:
        raise HTTPException(400, detail="Job is not ready for download")

    artifact = found.zip_path
    cleanup = BackgroundTask(jobs.remove_after_download, found)
    return FileResponse(
        artifact,
        media_type="application/zip",
        filename="artifact.zip",
        background=cleanup,
    )
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Standalone MCP server (stdio, SSE, or streamable-http).
|
|
3
|
+
|
|
4
|
+
Examples:
|
|
5
|
+
python -m md_generator.archive.api.mcp_server
|
|
6
|
+
python -m md_generator.archive.api.mcp_server --transport sse
|
|
7
|
+
python -m md_generator.archive.api.mcp_server --transport streamable-http
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import asyncio
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def main() -> None:
    """Parse ``--transport`` and run the zip-to-md MCP server over it.

    Supported transports are ``stdio`` (default), ``sse`` and
    ``streamable-http``.
    """
    cli = argparse.ArgumentParser(description="zip-to-md MCP server")
    cli.add_argument(
        "--transport",
        choices=("stdio", "sse", "streamable-http"),
        default="stdio",
    )
    transport = cli.parse_args().transport

    # Imported after argument parsing, as in the original module layout.
    from md_generator.archive.api.mcp_setup import build_mcp_stack

    server, _ = build_mcp_stack(mount_under_fastapi=False)

    runners = {
        "stdio": server.run_stdio_async,
        "sse": server.run_sse_async,
    }
    # argparse restricts the choices, so anything else is streamable-http.
    run = runners.get(transport, server.run_streamable_http_async)
    asyncio.run(run())


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import re
|
|
5
|
+
import tempfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from mcp.server.fastmcp import FastMCP
|
|
9
|
+
|
|
10
|
+
from md_generator.archive.api.convert_runner import build_artifact_zip_bytes
|
|
11
|
+
from md_generator.archive.api.query_options import convert_options_from_query
|
|
12
|
+
from md_generator.archive.api.settings import ApiSettings
|
|
13
|
+
from md_generator.archive.options import ConvertOptions
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _decode_base64_zip(data: str) -> bytes:
|
|
17
|
+
s = data.strip()
|
|
18
|
+
if s.startswith("data:"):
|
|
19
|
+
parts = s.split(",", 1)
|
|
20
|
+
if len(parts) == 2:
|
|
21
|
+
s = parts[1]
|
|
22
|
+
return base64.b64decode(s, validate=False)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _write_artifact_tempfile(data: bytes) -> str:
    """Write *data* to a new ``zip-to-md-artifact-*.zip`` temp file; return its path."""
    import os

    fd, name = tempfile.mkstemp(suffix=".zip", prefix="zip-to-md-artifact-")
    out = Path(name)
    try:
        # Write through the descriptor mkstemp already opened instead of
        # closing it and reopening the file by name.
        with os.fdopen(fd, "wb") as fh:
            fh.write(data)
    except OSError:
        # Do not leave a truncated artifact behind.
        out.unlink(missing_ok=True)
        raise
    return str(out)


def build_mcp_stack(*, mount_under_fastapi: bool = False) -> tuple[FastMCP, object]:
    """Create the zip-to-md MCP server and its streamable-HTTP ASGI app.

    Args:
        mount_under_fastapi: When true the streamable-HTTP route is served at
            ``/`` (for mounting under a prefix by a parent app); otherwise it
            is served at ``/mcp``.

    Returns:
        ``(mcp, app)`` where ``app`` is ``mcp.streamable_http_app()``.
    """
    path = "/" if mount_under_fastapi else "/mcp"
    mcp = FastMCP(
        "zip-to-md",
        instructions="Convert .zip archives to Markdown artifact ZIP bundles (document.md + assets/).",
        streamable_http_path=path,
    )
    settings = ApiSettings()

    def _opts() -> ConvertOptions:
        # MCP tools take no per-call options; everything comes from settings.
        return convert_options_from_query(
            repo_root=settings.repo_root,
            use_image_to_md=settings.use_image_to_md,
            image_to_md_engines=settings.image_to_md_engines,
            image_to_md_strategy=settings.image_to_md_strategy,
            image_to_md_title=settings.image_to_md_title,
        )

    @mcp.tool()
    def convert_zip_to_artifact_zip(zip_path: str) -> str:
        """Convert a local .zip path on the server to a temporary artifact.zip path."""
        src = Path(zip_path).expanduser().resolve()
        if not src.is_file() or src.suffix.lower() != ".zip":
            raise ValueError("zip_path must be an existing .zip file")
        data = build_artifact_zip_bytes(src, _opts())
        return _write_artifact_tempfile(data)

    @mcp.tool()
    def convert_zip_base64_to_artifact_zip(
        zip_base64: str,
        filename: str = "upload.zip",
    ) -> str:
        """Decode base64 .zip (optional data:...;base64, prefix) and write artifact.zip path."""
        raw = _decode_base64_zip(zip_base64)
        max_b = settings.max_upload_mb * 1024 * 1024
        if len(raw) > max_b:
            raise ValueError(f"Decoded file exceeds ZIP_TO_MD_MAX_UPLOAD_MB ({settings.max_upload_mb})")
        # Reduce the caller-supplied name to a safe single path component.
        safe = re.sub(r"[^\w.\-]+", "_", filename) or "upload.zip"
        if not safe.lower().endswith(".zip"):
            safe += ".zip"
        with tempfile.TemporaryDirectory() as td:
            p = Path(td) / safe
            p.write_bytes(raw)
            data = build_artifact_zip_bytes(p, _opts())
        return _write_artifact_tempfile(data)

    sub = mcp.streamable_http_app()
    return mcp, sub
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from md_generator.archive.options import DEFAULT_IMAGE_TO_MD_ENGINES, ConvertOptions
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def convert_options_from_query(
    *,
    enable_office: bool = True,
    image_ocr: bool = False,
    pdf_ocr: bool = False,
    max_bytes: int = 512_000,
    expand_nested_zips: bool = True,
    max_nested_zip_depth: int = 16,
    repo_root: str | None = None,
    use_image_to_md: bool = True,
    image_to_md_engines: str = DEFAULT_IMAGE_TO_MD_ENGINES,
    image_to_md_strategy: str = "best",
    image_to_md_title: str = "",
) -> ConvertOptions:
    """Build ``ConvertOptions`` from raw API values.

    ``artifact_layout`` is always on and ``verbose`` always off. The
    image-to-md fields are normalised: a blank engine list falls back to the
    default, an unknown strategy becomes ``"best"`` and the title is stripped.
    """
    engines = (image_to_md_engines or DEFAULT_IMAGE_TO_MD_ENGINES).strip()
    if not engines:
        engines = DEFAULT_IMAGE_TO_MD_ENGINES

    strategy = image_to_md_strategy
    if strategy not in ("best", "compare"):
        strategy = "best"

    title = (image_to_md_title or "").strip()

    return ConvertOptions(
        artifact_layout=True,
        verbose=False,
        enable_office=enable_office,
        image_ocr=image_ocr,
        pdf_ocr=pdf_ocr,
        max_bytes=max_bytes,
        repo_root=repo_root,
        expand_nested_zips=expand_nested_zips,
        max_nested_zip_depth=max_nested_zip_depth,
        use_image_to_md=use_image_to_md,
        image_to_md_engines=engines,
        image_to_md_strategy=strategy,
        image_to_md_title=title,
    )
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
4
|
+
|
|
5
|
+
from md_generator.archive.options import DEFAULT_IMAGE_TO_MD_ENGINES
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ApiSettings(BaseSettings):
    """Settings for the zip-to-md HTTP API and MCP server.

    Configured with the ``ZIP_TO_MD_`` environment prefix and a ``.env``
    file; unrecognised keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="ZIP_TO_MD_",
        env_file=".env",
        extra="ignore",
    )

    # Upper bound for any upload, in MiB (callers multiply by 1024 * 1024).
    max_upload_mb: int = 200
    # Uploads above this size (MiB) are refused by the synchronous endpoint.
    max_sync_upload_mb: int = 40
    # Age in seconds after which finished jobs may be swept.
    job_ttl_seconds: int = 3600
    # Base directory for job workspaces; None selects the system temp dir.
    temp_dir: str | None = None
    # Comma-separated allowed CORS origins, or "*" for any origin.
    cors_origins: str = "*"
    # Deprecated: ignored; converters are imported from the md_generator package.
    repo_root: str | None = None
    use_image_to_md: bool = True
    image_to_md_engines: str = DEFAULT_IMAGE_TO_MD_ENGINES
    image_to_md_strategy: str = "best"
    image_to_md_title: str = ""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def cors_list(settings: ApiSettings) -> list[str]:
    """Turn the comma-separated ``cors_origins`` setting into a list of origins.

    A missing/empty value or a lone ``*`` yields ``["*"]``; otherwise each
    comma-separated entry is stripped and blank entries are dropped.
    """
    configured = (settings.cors_origins or "*").strip()
    if configured == "*":
        return ["*"]
    stripped = (piece.strip() for piece in configured.split(","))
    return [origin for origin in stripped if origin]
|