mdengine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. md_generator/__init__.py +5 -0
  2. md_generator/archive/__init__.py +1 -0
  3. md_generator/archive/api/__init__.py +1 -0
  4. md_generator/archive/api/convert_runner.py +29 -0
  5. md_generator/archive/api/jobs.py +93 -0
  6. md_generator/archive/api/main.py +204 -0
  7. md_generator/archive/api/mcp_server.py +38 -0
  8. md_generator/archive/api/mcp_setup.py +84 -0
  9. md_generator/archive/api/query_options.py +36 -0
  10. md_generator/archive/api/settings.py +32 -0
  11. md_generator/archive/convert_impl.py +729 -0
  12. md_generator/archive/converter.py +123 -0
  13. md_generator/archive/options.py +26 -0
  14. md_generator/image/__init__.py +5 -0
  15. md_generator/image/api/__init__.py +1 -0
  16. md_generator/image/api/main.py +314 -0
  17. md_generator/image/api/mcp_server.py +169 -0
  18. md_generator/image/api/query_options.py +42 -0
  19. md_generator/image/api/settings.py +44 -0
  20. md_generator/image/api/staging.py +46 -0
  21. md_generator/image/api/zip_bundle.py +17 -0
  22. md_generator/image/backends/__init__.py +10 -0
  23. md_generator/image/backends/base.py +17 -0
  24. md_generator/image/backends/easy.py +60 -0
  25. md_generator/image/backends/paddle.py +88 -0
  26. md_generator/image/backends/tesseract.py +45 -0
  27. md_generator/image/convert_impl.py +117 -0
  28. md_generator/image/converter.py +111 -0
  29. md_generator/image/emit.py +67 -0
  30. md_generator/image/io_util.py +53 -0
  31. md_generator/image/utils.py +18 -0
  32. md_generator/pdf/__init__.py +1 -0
  33. md_generator/pdf/api/__init__.py +1 -0
  34. md_generator/pdf/api/main.py +261 -0
  35. md_generator/pdf/api/mcp_server.py +95 -0
  36. md_generator/pdf/api/settings.py +40 -0
  37. md_generator/pdf/api/zip_bundle.py +17 -0
  38. md_generator/pdf/converter.py +70 -0
  39. md_generator/pdf/md_emit.py +42 -0
  40. md_generator/pdf/pdf_extract.py +227 -0
  41. md_generator/pdf/utils.py +55 -0
  42. md_generator/ppt/__init__.py +1 -0
  43. md_generator/ppt/api/__init__.py +1 -0
  44. md_generator/ppt/api/convert_runner.py +30 -0
  45. md_generator/ppt/api/jobs.py +93 -0
  46. md_generator/ppt/api/main.py +197 -0
  47. md_generator/ppt/api/mcp_server.py +38 -0
  48. md_generator/ppt/api/mcp_setup.py +81 -0
  49. md_generator/ppt/api/query_options.py +35 -0
  50. md_generator/ppt/api/settings.py +24 -0
  51. md_generator/ppt/convert_impl.py +231 -0
  52. md_generator/ppt/converter.py +81 -0
  53. md_generator/ppt/embedded_extract.py +190 -0
  54. md_generator/ppt/ooxml_media.py +94 -0
  55. md_generator/ppt/options.py +39 -0
  56. md_generator/ppt/post_assets.py +172 -0
  57. md_generator/ppt/text_formatting.py +97 -0
  58. md_generator/ppt/vendor_pdf_md/__init__.py +5 -0
  59. md_generator/ppt/vendor_pdf_md/convert.py +76 -0
  60. md_generator/ppt/vendor_word_md/__init__.py +5 -0
  61. md_generator/ppt/vendor_word_md/convert.py +43 -0
  62. md_generator/ppt/zip_deep.py +31 -0
  63. md_generator/text/__init__.py +1 -0
  64. md_generator/text/api/__init__.py +1 -0
  65. md_generator/text/api/convert_runner.py +24 -0
  66. md_generator/text/api/jobs.py +93 -0
  67. md_generator/text/api/main.py +183 -0
  68. md_generator/text/api/mcp_server.py +38 -0
  69. md_generator/text/api/mcp_setup.py +78 -0
  70. md_generator/text/api/query_options.py +17 -0
  71. md_generator/text/api/settings.py +24 -0
  72. md_generator/text/convert_impl.py +72 -0
  73. md_generator/text/converter.py +73 -0
  74. md_generator/text/format_detect.py +53 -0
  75. md_generator/text/md_emit_json.py +144 -0
  76. md_generator/text/md_emit_txt.py +115 -0
  77. md_generator/text/md_emit_xml.py +127 -0
  78. md_generator/text/options.py +27 -0
  79. md_generator/word/__init__.py +3 -0
  80. md_generator/word/api/__init__.py +1 -0
  81. md_generator/word/api/convert_util.py +41 -0
  82. md_generator/word/api/jobs.py +86 -0
  83. md_generator/word/api/main.py +172 -0
  84. md_generator/word/api/mcp_server.py +98 -0
  85. md_generator/word/artifact.py +30 -0
  86. md_generator/word/converter.py +192 -0
  87. md_generator/word/settings.py +58 -0
  88. md_generator/xlsx/__init__.py +8 -0
  89. md_generator/xlsx/api/__init__.py +1 -0
  90. md_generator/xlsx/api/app.py +293 -0
  91. md_generator/xlsx/convert_config.py +58 -0
  92. md_generator/xlsx/converter.py +67 -0
  93. md_generator/xlsx/converter_core.py +135 -0
  94. md_generator/xlsx/excel_reader.py +170 -0
  95. md_generator/xlsx/markdown_emitter.py +143 -0
  96. md_generator/xlsx/mcp_server.py +50 -0
  97. mdengine-0.1.0.dist-info/METADATA +509 -0
  98. mdengine-0.1.0.dist-info/RECORD +102 -0
  99. mdengine-0.1.0.dist-info/WHEEL +5 -0
  100. mdengine-0.1.0.dist-info/entry_points.txt +8 -0
  101. mdengine-0.1.0.dist-info/licenses/LICENSE +21 -0
  102. mdengine-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,5 @@
1
+ """md_generator: PDF, Office, image, text, and ZIP to Markdown conversion."""
2
+
3
# Version string of the md_generator package.
__version__ = "0.1.0"

# Only the version string is exported by a star-import of this package.
__all__ = ["__version__"]
@@ -0,0 +1 @@
1
+ """zip-to-md: ZIP archive to Markdown + assets."""
@@ -0,0 +1 @@
1
+ """HTTP API and MCP entrypoints for zip-to-md."""
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import zipfile
5
+ from pathlib import Path
6
+
7
+ from md_generator.archive.convert_impl import convert_zip
8
+ from md_generator.archive.options import ConvertOptions
9
+
10
+
11
def build_artifact_zip_bytes(zip_path: Path, options: ConvertOptions) -> bytes:
    """Convert ``zip_path`` in a scratch directory and return the result as ZIP bytes.

    ``convert_zip`` writes into a temporary ``artifact`` directory.  The
    returned archive holds ``document.md`` (when that file was produced) plus
    every regular file below ``assets/``, stored under POSIX-style names.
    """
    import tempfile

    payload = io.BytesIO()
    with tempfile.TemporaryDirectory() as scratch:
        artifact_dir = Path(scratch) / "artifact"
        convert_zip(zip_path, artifact_dir, options)

        with zipfile.ZipFile(payload, "w", zipfile.ZIP_DEFLATED) as archive:
            markdown = artifact_dir / "document.md"
            if markdown.is_file():
                archive.write(markdown, "document.md")

            assets_dir = artifact_dir / "assets"
            if assets_dir.is_dir():
                # Sorted traversal keeps the member order deterministic.
                asset_files = (f for f in sorted(assets_dir.rglob("*")) if f.is_file())
                for asset in asset_files:
                    member = Path("assets") / asset.relative_to(assets_dir)
                    archive.write(asset, member.as_posix())
    return payload.getvalue()
@@ -0,0 +1,93 @@
1
+ from __future__ import annotations
2
+
3
+ import shutil
4
+ import threading
5
+ import time
6
+ import uuid
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Callable
10
+
11
+
12
@dataclass
class Job:
    """Mutable record describing one background conversion job."""

    # Identifier handed back to the client (a UUID4 string when made by JobStore.create_job).
    job_id: str
    # Per-job directory; removed when the job is swept or downloaded.
    workspace: Path
    # Starts as "queued"; JobStore.run_async moves it to "running", then "done" or "failed".
    status: str = "queued"
    # Text of the exception that made the job fail, if any.
    error: str | None = None
    # Wall-clock creation time (time.time()).
    created_at: float = field(default_factory=time.time)
    # Location of the finished artifact ZIP, set by the worker once it exists.
    zip_path: Path | None = None
20
+
21
+
22
class JobStore:
    """In-memory registry of conversion jobs, each with its own workspace directory.

    Finished jobs ("done" or "failed") are discarded by ``sweep`` once they
    have been finished for longer than ``ttl_seconds``.  The TTL is measured
    from the moment ``run_async`` saw the job finish, so a conversion that
    itself runs longer than the TTL is not deleted before the client has had
    a chance to download it.  Jobs whose terminal status was set by other
    means fall back to ``created_at``.
    """

    def __init__(self, base_temp: Path | None, ttl_seconds: int) -> None:
        self._base = base_temp
        self._ttl = ttl_seconds
        self._jobs: dict[str, Job] = {}
        # job_id -> time.time() at which run_async observed the job finishing.
        self._finished_at: dict[str, float] = {}
        self._lock = threading.Lock()
        self._sweeper_started = False

    def _root(self) -> Path:
        """Return the directory under which job workspaces are created."""
        import tempfile

        if self._base:
            p = Path(self._base)
            p.mkdir(parents=True, exist_ok=True)
            return p
        # Not created here; create_job() makes it together with the workspace.
        return Path(tempfile.gettempdir()) / "zip-to-md-jobs"

    def start_sweeper(self) -> None:
        """Start the background sweeper thread (idempotent)."""
        # Check-and-set under the lock so two concurrent callers cannot both start a thread.
        with self._lock:
            if self._sweeper_started:
                return
            self._sweeper_started = True

        def loop() -> None:
            while True:
                time.sleep(60)
                self.sweep()

        threading.Thread(target=loop, daemon=True).start()

    def sweep(self) -> None:
        """Forget finished jobs older than the TTL and delete their workspaces."""
        now = time.time()
        expired: list[Job] = []
        with self._lock:
            for jid, job in list(self._jobs.items()):
                if job.status not in ("done", "failed"):
                    continue
                finished = self._finished_at.get(jid, job.created_at)
                if now - finished > self._ttl:
                    expired.append(job)
                    del self._jobs[jid]
                    self._finished_at.pop(jid, None)
        # Delete outside the lock: removing a large tree must not block create_job()/get().
        for job in expired:
            if job.workspace.exists():
                shutil.rmtree(job.workspace, ignore_errors=True)

    def create_job(self) -> Job:
        """Register a new queued job with a fresh workspace directory."""
        jid = str(uuid.uuid4())
        ws = self._root() / jid
        ws.mkdir(parents=True, exist_ok=True)
        job = Job(job_id=jid, workspace=ws)
        with self._lock:
            self._jobs[jid] = job
        return job

    def get(self, job_id: str) -> Job | None:
        """Return the job registered under ``job_id``, or ``None``."""
        with self._lock:
            return self._jobs.get(job_id)

    def run_async(self, job: Job, fn: Callable[[], None]) -> None:
        """Run ``fn`` on a daemon thread, reflecting progress in ``job.status``.

        Any ``Exception`` raised by ``fn`` is captured in ``job.error`` and the
        job is marked "failed"; otherwise it is marked "done".
        """

        def target() -> None:
            job.status = "running"
            try:
                fn()
            except Exception as e:
                # Exceptions without a message would otherwise leave an empty error string.
                job.error = str(e) or type(e).__name__
                outcome = "failed"
            else:
                outcome = "done"
            with self._lock:
                if job.job_id in self._jobs:
                    self._finished_at[job.job_id] = time.time()
            # Publish the terminal status last so a poller that sees it also
            # sees the error text and the finish timestamp.
            job.status = outcome

        threading.Thread(target=target, daemon=True).start()

    def remove_after_download(self, job: Job) -> None:
        """Forget ``job`` and delete its workspace."""
        with self._lock:
            self._jobs.pop(job.job_id, None)
            self._finished_at.pop(job.job_id, None)
        if job.workspace.exists():
            shutil.rmtree(job.workspace, ignore_errors=True)
@@ -0,0 +1,204 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import asynccontextmanager
4
+ from pathlib import Path
5
+
6
+ from fastapi import FastAPI, File, HTTPException, Request, UploadFile
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.responses import FileResponse, Response
9
+ from starlette.background import BackgroundTask
10
+
11
+ from md_generator.archive.api.convert_runner import build_artifact_zip_bytes
12
+ from md_generator.archive.api.jobs import JobStore
13
+ from md_generator.archive.api.mcp_setup import build_mcp_stack
14
+ from md_generator.archive.api.query_options import convert_options_from_query
15
+ from md_generator.archive.api.settings import ApiSettings, cors_list
16
+ from md_generator.archive.options import DEFAULT_IMAGE_TO_MD_ENGINES
17
+
18
# Built once at import time: the FastMCP server and its streamable-HTTP ASGI app
# (configured for mounting inside the FastAPI application).
_mcp, _mcp_http = build_mcp_stack(mount_under_fastapi=True)
19
+
20
+
21
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Populate ``app.state`` and keep the MCP session manager running while serving.

    Stores the loaded ``ApiSettings`` and a ``JobStore`` (with its sweeper
    started) on ``app.state`` before yielding.
    """
    cfg = ApiSettings()
    job_root = Path(cfg.temp_dir) if cfg.temp_dir else None

    jobs = JobStore(job_root, cfg.job_ttl_seconds)
    jobs.start_sweeper()

    app.state.settings = cfg
    app.state.job_store = jobs

    async with _mcp.session_manager.run():
        yield
31
+
32
+
33
# The ASGI application; the MCP streamable-HTTP sub-app is served under /mcp.
app = FastAPI(title="zip-to-md", lifespan=lifespan)
app.mount("/mcp", _mcp_http)

# Settings are read here as well because middleware is registered at import
# time, before the lifespan handler runs.
_bootstrap_settings = ApiSettings()
_origins = cors_list(_bootstrap_settings)
app.add_middleware(
    CORSMiddleware,
    allow_origins=_origins,
    # Credentials are only allowed when the origin list has no "*" wildcard.
    allow_credentials="*" not in _origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
45
+
46
+
47
def _merge_repo_root(settings: ApiSettings, query_repo: str | None) -> str | None:
    """Return the stripped per-request ``query_repo`` if non-blank, else ``settings.repo_root``."""
    override = (query_repo or "").strip()
    return override if override else settings.repo_root
51
+
52
+
53
async def _read_upload_limited(upload: UploadFile, max_bytes: int) -> bytes:
    """Read ``upload`` fully in 1 MiB chunks, aborting once it exceeds ``max_bytes``.

    Raises:
        HTTPException: status 413 as soon as more than ``max_bytes`` bytes
            have been received.
    """
    one_mib = 1024 * 1024
    received = bytearray()
    while chunk := await upload.read(one_mib):
        received.extend(chunk)
        if len(received) > max_bytes:
            raise HTTPException(
                status_code=413,
                detail="Upload exceeds ZIP_TO_MD_MAX_UPLOAD_MB",
            )
    return bytes(received)
67
+
68
+
69
# Synchronous conversion endpoint: upload a .zip, receive artifact.zip in the response.
# Rejects non-.zip filenames (400), uploads above the global limit (413) and
# uploads above the synchronous limit (409, pointing at POST /convert/jobs).
@app.post("/convert/sync")
async def convert_sync(
    request: Request,
    file: UploadFile = File(...),
    enable_office: bool = True,
    image_ocr: bool = False,
    pdf_ocr: bool = False,
    max_bytes: int = 512_000,
    expand_nested_zips: bool = True,
    max_nested_zip_depth: int = 16,
    repo_root: str | None = None,
    use_image_to_md: bool = True,
    image_to_md_engines: str = DEFAULT_IMAGE_TO_MD_ENGINES,
    image_to_md_strategy: str = "best",
    image_to_md_title: str = "",
) -> Response:
    import asyncio
    import tempfile

    settings: ApiSettings = request.app.state.settings
    if not file.filename or not file.filename.lower().endswith(".zip"):
        raise HTTPException(400, detail="Expected a .zip file upload (multipart field 'file')")
    max_u = settings.max_upload_mb * 1024 * 1024
    max_sync = settings.max_sync_upload_mb * 1024 * 1024
    body = await _read_upload_limited(file, max_u)
    if len(body) > max_sync:
        raise HTTPException(
            status_code=409,
            detail="File too large for synchronous conversion; use POST /convert/jobs",
        )
    rr = _merge_repo_root(settings, repo_root)
    opts = convert_options_from_query(
        enable_office=enable_office,
        image_ocr=image_ocr,
        pdf_ocr=pdf_ocr,
        max_bytes=max_bytes,
        expand_nested_zips=expand_nested_zips,
        max_nested_zip_depth=max_nested_zip_depth,
        repo_root=rr,
        use_image_to_md=use_image_to_md,
        image_to_md_engines=image_to_md_engines,
        image_to_md_strategy=image_to_md_strategy,
        image_to_md_title=image_to_md_title,
    )

    tf = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
    tmp_in = Path(tf.name)
    try:
        # Writing happens inside the try so the temp file is removed even if
        # the write itself fails (e.g. disk full).
        with tf:
            tf.write(body)
        # The conversion is blocking work; run it on a worker thread so this
        # coroutine does not stall the event loop for every other request.
        zbytes = await asyncio.to_thread(build_artifact_zip_bytes, tmp_in, opts)
    finally:
        tmp_in.unlink(missing_ok=True)
    return Response(
        content=zbytes,
        media_type="application/zip",
        headers={"Content-Disposition": 'attachment; filename="artifact.zip"'},
    )
124
+
125
+
126
# Asynchronous conversion endpoint: stage the uploaded .zip in a new job
# workspace, start the conversion on a background thread and return the job id.
@app.post("/convert/jobs")
async def convert_jobs(
    request: Request,
    file: UploadFile = File(...),
    enable_office: bool = True,
    image_ocr: bool = False,
    pdf_ocr: bool = False,
    max_bytes: int = 512_000,
    expand_nested_zips: bool = True,
    max_nested_zip_depth: int = 16,
    repo_root: str | None = None,
    use_image_to_md: bool = True,
    image_to_md_engines: str = DEFAULT_IMAGE_TO_MD_ENGINES,
    image_to_md_strategy: str = "best",
    image_to_md_title: str = "",
) -> dict:
    settings: ApiSettings = request.app.state.settings
    store: JobStore = request.app.state.job_store
    if not file.filename or not file.filename.lower().endswith(".zip"):
        raise HTTPException(400, detail="Expected a .zip file upload (multipart field 'file')")
    max_u = settings.max_upload_mb * 1024 * 1024
    body = await _read_upload_limited(file, max_u)
    rr = _merge_repo_root(settings, repo_root)
    opts = convert_options_from_query(
        enable_office=enable_office,
        image_ocr=image_ocr,
        pdf_ocr=pdf_ocr,
        max_bytes=max_bytes,
        expand_nested_zips=expand_nested_zips,
        max_nested_zip_depth=max_nested_zip_depth,
        repo_root=rr,
        use_image_to_md=use_image_to_md,
        image_to_md_engines=image_to_md_engines,
        image_to_md_strategy=image_to_md_strategy,
        image_to_md_title=image_to_md_title,
    )
    job = store.create_job()
    inp = job.workspace / "upload.zip"
    try:
        inp.write_bytes(body)
    except OSError:
        # A job that never starts stays "queued", and the sweeper only removes
        # done/failed jobs, so drop it (and its workspace) here.
        store.remove_after_download(job)
        raise

    def work() -> None:
        # Runs on the job thread: convert, persist the artifact, then publish its path.
        zpath = job.workspace / "artifact.zip"
        data = build_artifact_zip_bytes(inp, opts)
        zpath.write_bytes(data)
        job.zip_path = zpath

    store.run_async(job, work)
    return {"job_id": job.job_id, "status": job.status}
174
+
175
+
176
# Report status, error text and creation time of a job; 404 for unknown ids.
@app.get("/convert/jobs/{job_id}")
async def job_status(request: Request, job_id: str) -> dict:
    jobs: JobStore = request.app.state.job_store
    found = jobs.get(job_id)
    if found is None:
        raise HTTPException(404, detail="Unknown job_id")
    return dict(
        status=found.status,
        error=found.error,
        created_at=found.created_at,
    )
187
+
188
+
189
# Stream a finished job's artifact.zip; the job and its workspace are removed
# by a background task after the response has been sent.
@app.get("/convert/jobs/{job_id}/download", response_class=FileResponse)
async def job_download(request: Request, job_id: str) -> FileResponse:
    jobs: JobStore = request.app.state.job_store
    found = jobs.get(job_id)
    if found is None:
        raise HTTPException(404, detail="Unknown job_id")

    artifact = found.zip_path
    ready = found.status == "done" and bool(artifact) and artifact.is_file()
    if not ready:
        raise HTTPException(400, detail="Job is not ready for download")

    cleanup = BackgroundTask(jobs.remove_after_download, found)
    return FileResponse(
        artifact,
        media_type="application/zip",
        filename="artifact.zip",
        background=cleanup,
    )
@@ -0,0 +1,38 @@
1
+ """
2
+ Standalone MCP server (stdio, SSE, or streamable-http).
3
+
4
+ Examples:
5
+ python -m md_generator.archive.api.mcp_server
6
+ python -m md_generator.archive.api.mcp_server --transport sse
7
+ python -m md_generator.archive.api.mcp_server --transport streamable-http
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import asyncio
14
+
15
+
16
def main() -> None:
    """Parse ``--transport`` and run the zip-to-md MCP server on that transport."""
    cli = argparse.ArgumentParser(description="zip-to-md MCP server")
    cli.add_argument(
        "--transport",
        choices=("stdio", "sse", "streamable-http"),
        default="stdio",
    )
    chosen = cli.parse_args().transport

    # Imported after argument parsing, as in the original layout.
    from md_generator.archive.api.mcp_setup import build_mcp_stack

    mcp, _ = build_mcp_stack(mount_under_fastapi=False)

    # Anything that is not stdio or sse is streamable-http.
    runner_names = {"stdio": "run_stdio_async", "sse": "run_sse_async"}
    runner = getattr(mcp, runner_names.get(chosen, "run_streamable_http_async"))
    asyncio.run(runner())


if __name__ == "__main__":
    main()
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import re
5
+ import tempfile
6
+ from pathlib import Path
7
+
8
+ from mcp.server.fastmcp import FastMCP
9
+
10
+ from md_generator.archive.api.convert_runner import build_artifact_zip_bytes
11
+ from md_generator.archive.api.query_options import convert_options_from_query
12
+ from md_generator.archive.api.settings import ApiSettings
13
+ from md_generator.archive.options import ConvertOptions
14
+
15
+
16
def _decode_base64_zip(data: str) -> bytes:
    """Decode base64 text, dropping a leading ``data:...,`` URL prefix when present.

    Decoding is lenient (``validate=False``).
    """
    text = data.strip()
    if text.startswith("data:"):
        _, comma, remainder = text.partition(",")
        if comma:
            text = remainder
    return base64.b64decode(text, validate=False)
23
+
24
+
25
def build_mcp_stack(*, mount_under_fastapi: bool = False) -> tuple[FastMCP, object]:
    """Create the zip-to-md FastMCP server and its streamable-HTTP app.

    Args:
        mount_under_fastapi: When true the streamable-HTTP path is ``/``
            instead of ``/mcp``.

    Returns:
        ``(mcp, app)`` where ``app`` is ``mcp.streamable_http_app()``.
    """
    import os

    mcp = FastMCP(
        "zip-to-md",
        instructions="Convert .zip archives to Markdown artifact ZIP bundles (document.md + assets/).",
        streamable_http_path="/" if mount_under_fastapi else "/mcp",
    )
    settings = ApiSettings()

    def _opts() -> ConvertOptions:
        # Conversion options for MCP calls come from the environment-backed settings.
        return convert_options_from_query(
            repo_root=settings.repo_root,
            use_image_to_md=settings.use_image_to_md,
            image_to_md_engines=settings.image_to_md_engines,
            image_to_md_strategy=settings.image_to_md_strategy,
            image_to_md_title=settings.image_to_md_title,
        )

    def _save_artifact(payload: bytes) -> str:
        # Write the artifact to a fresh temp file and return its path.
        fd, name = tempfile.mkstemp(suffix=".zip", prefix="zip-to-md-artifact-")
        os.close(fd)
        target = Path(name)
        target.write_bytes(payload)
        return str(target)

    # Tool names, parameter names and docstrings below are kept unchanged.

    @mcp.tool()
    def convert_zip_to_artifact_zip(zip_path: str) -> str:
        """Convert a local .zip path on the server to a temporary artifact.zip path."""
        src = Path(zip_path).expanduser().resolve()
        usable = src.is_file() and src.suffix.lower() == ".zip"
        if not usable:
            raise ValueError("zip_path must be an existing .zip file")
        return _save_artifact(build_artifact_zip_bytes(src, _opts()))

    @mcp.tool()
    def convert_zip_base64_to_artifact_zip(
        zip_base64: str,
        filename: str = "upload.zip",
    ) -> str:
        """Decode base64 .zip (optional data:...;base64, prefix) and write artifact.zip path."""
        raw = _decode_base64_zip(zip_base64)
        limit = settings.max_upload_mb * 1024 * 1024
        if len(raw) > limit:
            raise ValueError(f"Decoded file exceeds ZIP_TO_MD_MAX_UPLOAD_MB ({settings.max_upload_mb})")
        # Collapse anything outside word chars, dot and dash so the name is a plain file name.
        safe = re.sub(r"[^\w.\-]+", "_", filename) or "upload.zip"
        if not safe.lower().endswith(".zip"):
            safe += ".zip"
        with tempfile.TemporaryDirectory() as staging:
            staged = Path(staging) / safe
            staged.write_bytes(raw)
            data = build_artifact_zip_bytes(staged, _opts())
        return _save_artifact(data)

    return mcp, mcp.streamable_http_app()
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+ from md_generator.archive.options import DEFAULT_IMAGE_TO_MD_ENGINES, ConvertOptions
4
+
5
+
6
def convert_options_from_query(
    *,
    enable_office: bool = True,
    image_ocr: bool = False,
    pdf_ocr: bool = False,
    max_bytes: int = 512_000,
    expand_nested_zips: bool = True,
    max_nested_zip_depth: int = 16,
    repo_root: str | None = None,
    use_image_to_md: bool = True,
    image_to_md_engines: str = DEFAULT_IMAGE_TO_MD_ENGINES,
    image_to_md_strategy: str = "best",
    image_to_md_title: str = "",
) -> ConvertOptions:
    """Build ``ConvertOptions`` from API query parameters.

    ``artifact_layout`` is always on and ``verbose`` always off.  A blank
    engine list falls back to ``DEFAULT_IMAGE_TO_MD_ENGINES``, a strategy
    other than "best"/"compare" falls back to "best", and the title is
    stripped of surrounding whitespace.
    """
    engines = (image_to_md_engines or DEFAULT_IMAGE_TO_MD_ENGINES).strip()
    if not engines:
        engines = DEFAULT_IMAGE_TO_MD_ENGINES

    strategy = "best"
    if image_to_md_strategy in ("best", "compare"):
        strategy = image_to_md_strategy

    title = (image_to_md_title or "").strip()

    return ConvertOptions(
        artifact_layout=True,
        verbose=False,
        enable_office=enable_office,
        image_ocr=image_ocr,
        pdf_ocr=pdf_ocr,
        max_bytes=max_bytes,
        repo_root=repo_root,
        expand_nested_zips=expand_nested_zips,
        max_nested_zip_depth=max_nested_zip_depth,
        use_image_to_md=use_image_to_md,
        image_to_md_engines=engines,
        image_to_md_strategy=strategy,
        image_to_md_title=title,
    )
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ from pydantic_settings import BaseSettings, SettingsConfigDict
4
+
5
+ from md_generator.archive.options import DEFAULT_IMAGE_TO_MD_ENGINES
6
+
7
+
8
class ApiSettings(BaseSettings):
    """Settings for the zip-to-md API, read from ``ZIP_TO_MD_*`` variables and ``.env``."""

    model_config = SettingsConfigDict(
        env_prefix="ZIP_TO_MD_",
        env_file=".env",
        extra="ignore",
    )

    # Hard cap on any upload, in MiB.
    max_upload_mb: int = 200
    # Larger uploads are refused by the synchronous endpoint, in MiB.
    max_sync_upload_mb: int = 40
    # How long finished jobs are kept before the sweeper removes them.
    job_ttl_seconds: int = 3600
    # Base directory for job workspaces; unset means the system temp directory.
    temp_dir: str | None = None
    # Comma-separated allowed origins, or "*" for any origin.
    cors_origins: str = "*"
    # Deprecated: ignored; converters are imported from the md_generator package.
    # (Written as a comment directly above the field it describes; as a bare
    # string placed after cors_origins it read as that field's docstring.)
    repo_root: str | None = None
    use_image_to_md: bool = True
    image_to_md_engines: str = DEFAULT_IMAGE_TO_MD_ENGINES
    image_to_md_strategy: str = "best"
    image_to_md_title: str = ""
26
+
27
+
28
def cors_list(settings: ApiSettings) -> list[str]:
    """Split ``settings.cors_origins`` into a list of origins.

    An unset, empty or ``"*"`` value yields ``["*"]``; otherwise the value is
    split on commas and blank entries are dropped.
    """
    configured = (settings.cors_origins or "*").strip()
    if configured == "*":
        return ["*"]
    stripped = map(str.strip, configured.split(","))
    return list(filter(None, stripped))