onetool-mcp 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
ot_tools/convert.py
ADDED
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
"""Document conversion tools for OneTool.
|
|
2
|
+
|
|
3
|
+
Converts PDF, Word, PowerPoint, and Excel documents to Markdown
|
|
4
|
+
with LLM-optimised output including YAML frontmatter and TOC.
|
|
5
|
+
|
|
6
|
+
Supports glob patterns for batch conversion with async parallel processing.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
# Pack for dot notation: convert.pdf(), convert.word(), etc.
|
|
12
|
+
pack = "convert"
|
|
13
|
+
|
|
14
|
+
__all__ = ["auto", "excel", "pdf", "powerpoint", "word"]
|
|
15
|
+
|
|
16
|
+
# Dependency declarations for CLI validation
|
|
17
|
+
# Use dict format for packages where import_name differs from package name
|
|
18
|
+
__ot_requires__ = {
|
|
19
|
+
"lib": [
|
|
20
|
+
{"name": "pymupdf", "import_name": "fitz", "install": "pip install pymupdf"},
|
|
21
|
+
{"name": "python-docx", "import_name": "docx", "install": "pip install python-docx"},
|
|
22
|
+
{"name": "python-pptx", "import_name": "pptx", "install": "pip install python-pptx"},
|
|
23
|
+
("openpyxl", "pip install openpyxl"),
|
|
24
|
+
{"name": "Pillow", "import_name": "PIL", "install": "pip install Pillow"},
|
|
25
|
+
{"name": "formulas", "import_name": "formulas", "install": "pip install formulas", "optional": True},
|
|
26
|
+
],
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
import asyncio
|
|
30
|
+
import atexit
|
|
31
|
+
import os
|
|
32
|
+
from collections.abc import Callable
|
|
33
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Any
|
|
36
|
+
|
|
37
|
+
from ot.logging import LogSpan
|
|
38
|
+
from ot.paths import resolve_cwd_path
|
|
39
|
+
from ot_tools._convert import (
|
|
40
|
+
convert_excel,
|
|
41
|
+
convert_pdf,
|
|
42
|
+
convert_powerpoint,
|
|
43
|
+
convert_word,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# Type alias for converter functions
# Signature: (input_path, output_dir, source_rel) -> result dict
ConverterFunc = Callable[[Path, Path, str], dict[str, Any]]

# Shared thread pool for file conversions (created lazily, sized for parallelism)
# Accessed only via _get_conversion_executor(); torn down by _shutdown_executor().
_conversion_executor: ThreadPoolExecutor | None = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _get_conversion_executor() -> ThreadPoolExecutor:
    """Return the process-wide conversion thread pool, creating it on first use."""
    global _conversion_executor
    if _conversion_executor is not None:
        return _conversion_executor
    # Conversions are largely I/O-bound: scale with CPUs but cap the pool size.
    worker_count = min(os.cpu_count() or 4, 8)
    _conversion_executor = ThreadPoolExecutor(
        max_workers=worker_count,
        thread_name_prefix="convert",
    )
    return _conversion_executor
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _shutdown_executor() -> None:
    """Tear down the shared conversion pool when the interpreter exits."""
    global _conversion_executor
    if _conversion_executor is None:
        return
    # wait=False: never block interpreter shutdown on in-flight conversions.
    _conversion_executor.shutdown(wait=False)
    _conversion_executor = None


atexit.register(_shutdown_executor)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _resolve_glob(pattern: str) -> list[Path]:
    """Resolve glob pattern to list of files.

    Uses SDK resolve_cwd_path() for consistent path resolution.

    Args:
        pattern: Glob pattern (can include ~, relative, or absolute paths)

    Returns:
        List of matching file paths
    """
    cwd = resolve_cwd_path(".")
    # Expand ~ and resolve relative to project dir
    path = Path(pattern).expanduser()
    if not path.is_absolute():
        path = cwd / pattern

    # If pattern has no glob chars and exists, return it directly
    if path.exists() and path.is_file():
        return [path]

    # Otherwise glob from parent
    parent = path.parent
    glob_pattern = path.name

    # Handle recursive globs in parent
    if "**" in str(path):
        # Find the base directory before **: split the ORIGINAL (un-anchored)
        # pattern into literal leading parts and the glob tail.
        parts = Path(pattern).expanduser().parts
        base_parts: list[str] = []
        glob_parts: list[str] = []
        found_glob = False
        for part in parts:
            # Once any wildcard component is seen, everything after it
            # (including that component) belongs to the glob tail.
            if "**" in part or "*" in part or "?" in part:
                found_glob = True
            if found_glob:
                glob_parts.append(part)
            else:
                base_parts.append(part)

        if base_parts:
            base = Path(*base_parts)
            if not base.is_absolute():
                base = cwd / base
        else:
            base = cwd

        glob_pattern = str(Path(*glob_parts)) if glob_parts else "*"
        return list(base.glob(glob_pattern))

    # Simple glob in directory
    # NOTE(review): `path` was anchored to cwd above, so `parent` should
    # already be absolute here and this branch looks unreachable — confirm
    # before relying on it.
    if not parent.is_absolute():
        parent = cwd / parent.relative_to(".") if str(parent) != "." else cwd

    if parent.exists():
        return list(parent.glob(glob_pattern))

    # No matching directory: nothing to glob.
    return []
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _get_source_rel(path: Path) -> str:
    """Return *path* relative to the project cwd for the frontmatter ``source`` field.

    Falls back to the path as-is when it does not live under the project cwd.
    """
    base = resolve_cwd_path(".")
    try:
        rel = path.relative_to(base)
    except ValueError:
        # Outside the project tree: keep the original path.
        return str(path)
    return str(rel)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _resolve_output_dir(output_dir: str) -> Path:
    """Resolve output directory path.

    Uses SDK resolve_cwd_path() for consistent path resolution.

    Args:
        output_dir: Output directory as given by the caller (may be relative).

    Returns:
        Resolved output directory path.
    """
    return resolve_cwd_path(output_dir)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
async def _convert_file_async(
    converter: Any,
    input_path: Path,
    output_dir: Path,
    source_rel: str,
    **kwargs: Any,
) -> dict[str, Any]:
    """Run a synchronous converter in the shared thread pool.

    Args:
        converter: Synchronous conversion callable.
        input_path: File to convert.
        output_dir: Directory receiving the converted output.
        source_rel: Relative source path recorded in output frontmatter.
        **kwargs: Extra keyword arguments forwarded to the converter.

    Returns:
        The converter's result dict.
    """
    # Fix: use get_running_loop() instead of the deprecated get_event_loop().
    # Inside a coroutine a loop is guaranteed to be running, and
    # get_event_loop() is deprecated (and errors in recent Python) when no
    # loop is set on the thread.
    loop = asyncio.get_running_loop()
    executor = _get_conversion_executor()
    return await loop.run_in_executor(
        executor,
        lambda: converter(input_path, output_dir, source_rel, **kwargs),
    )
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
async def _convert_batch_async(
    files: list[Path],
    output_dir: Path,
    converter: Any,
    **kwargs: Any,
) -> dict[str, Any]:
    """Convert multiple files in parallel and aggregate the outcomes.

    Args:
        files: Input files to convert.
        output_dir: Directory receiving the converted outputs.
        converter: Synchronous converter invoked once per file.
        **kwargs: Extra keyword arguments forwarded to the converter.

    Returns:
        Dict with ``converted``/``failed`` counts plus ``outputs`` and
        ``errors`` lists.
    """
    jobs = [
        _convert_file_async(converter, path, output_dir, _get_source_rel(path), **kwargs)
        for path in files
    ]

    # return_exceptions=True: one failing file must not abort the batch.
    results = await asyncio.gather(*jobs, return_exceptions=True)

    outputs: list[str] = []
    errors: list[str] = []
    for path, outcome in zip(files, results, strict=True):
        if isinstance(outcome, BaseException):
            errors.append(f"{path.name}: {outcome}")
        else:
            outputs.append(outcome["output"])

    return {
        "converted": len(outputs),
        "failed": len(errors),
        "outputs": outputs,
        "errors": errors,
    }
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
async def _convert_auto_batch_async(
    files: list[Path],
    output_dir: Path,
    converters: dict[str, ConverterFunc],
) -> dict[str, Any]:
    """Convert files in parallel, choosing a converter per file extension.

    Files whose (lowercased) extension has no registered converter are
    counted as skipped rather than failed.

    Args:
        files: Candidate input files.
        output_dir: Directory receiving the converted outputs.
        converters: Extension -> converter mapping (keys include the dot).

    Returns:
        Dict with ``converted``/``failed``/``skipped`` counts plus
        ``outputs`` and ``errors`` lists.
    """
    jobs = []
    matched: list[Path] = []
    skipped = 0

    for path in files:
        converter = converters.get(path.suffix.lower())
        if converter is None:
            skipped += 1
            continue
        jobs.append(_convert_file_async(converter, path, output_dir, _get_source_rel(path)))
        matched.append(path)

    if not jobs:
        # Nothing convertible at all.
        return {
            "converted": 0,
            "failed": 0,
            "skipped": skipped,
            "outputs": [],
            "errors": [],
        }

    # return_exceptions=True: one failing file must not abort the batch.
    results = await asyncio.gather(*jobs, return_exceptions=True)

    outputs: list[str] = []
    errors: list[str] = []
    for path, outcome in zip(matched, results, strict=True):
        if isinstance(outcome, BaseException):
            errors.append(f"{path.name}: {outcome}")
        else:
            outputs.append(outcome["output"])

    return {
        "converted": len(outputs),
        "failed": len(errors),
        "skipped": skipped,
        "outputs": outputs,
        "errors": errors,
    }
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def pdf(
    *,
    pattern: str,
    output_dir: str,
) -> str:
    """Convert PDF documents to Markdown.

    Each PDF matched by *pattern* is rendered as Markdown with page-by-page
    text extraction, embedded image export, and outline-based heading
    structure.

    Args:
        pattern: Glob pattern for input files (e.g., "docs/*.pdf", "report.pdf")
        output_dir: Directory for output files

    Returns:
        Conversion summary with output paths, or error message

    Example:
        convert.pdf(pattern="docs/report.pdf", output_dir="docs/md")
        convert.pdf(pattern="input/*.pdf", output_dir="output")
    """
    with LogSpan(span="convert.pdf", pattern=pattern, output_dir=output_dir) as s:
        files = _resolve_glob(pattern)
        if not files:
            s.add(error="no_match")
            return f"No files matched pattern: {pattern}"

        out_path = _resolve_output_dir(output_dir)

        # Exactly one match: run the converter inline for per-file detail.
        if len(files) == 1:
            single = files[0]
            try:
                result = convert_pdf(single, out_path, _get_source_rel(single))
                s.add(converted=1, pages=result["pages"], images=result["images"])
                return f"Converted {single.name}: {result['pages']} pages, {result['images']} images\nOutput: {result['output']}"
            except Exception as e:
                s.add(error=str(e))
                return f"Error converting {single.name}: {e}"

        # Several matches: convert in parallel via the batch helper.
        try:
            result = asyncio.run(_convert_batch_async(files, out_path, convert_pdf))
            s.add(converted=result["converted"], failed=result["failed"])

            summary = [f"Converted {result['converted']} files, {result['failed']} failed"]
            if result["outputs"]:
                summary.append("\nOutputs:")
                summary.extend(f" {output}" for output in result["outputs"])
            if result["errors"]:
                summary.append("\nErrors:")
                summary.extend(f" {error}" for error in result["errors"])
            return "\n".join(summary)
        except Exception as e:
            s.add(error=str(e))
            return f"Error: {e}"
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def word(
    *,
    pattern: str,
    output_dir: str,
) -> str:
    """Convert Word documents to Markdown.

    Each DOCX matched by *pattern* is rendered as Markdown with heading
    style detection, table conversion, and embedded image export.

    Args:
        pattern: Glob pattern for input files (e.g., "docs/*.docx", "spec.docx")
        output_dir: Directory for output files

    Returns:
        Conversion summary with output paths, or error message

    Example:
        convert.word(pattern="specs/design.docx", output_dir="specs/md")
        convert.word(pattern="docs/**/*.docx", output_dir="output")
    """
    with LogSpan(span="convert.word", pattern=pattern, output_dir=output_dir) as s:
        files = _resolve_glob(pattern)
        if not files:
            s.add(error="no_match")
            return f"No files matched pattern: {pattern}"

        out_path = _resolve_output_dir(output_dir)

        # Exactly one match: run the converter inline for per-file detail.
        if len(files) == 1:
            single = files[0]
            try:
                result = convert_word(single, out_path, _get_source_rel(single))
                s.add(
                    converted=1,
                    paragraphs=result["paragraphs"],
                    tables=result["tables"],
                    images=result["images"],
                )
                return f"Converted {single.name}: {result['paragraphs']} paragraphs, {result['tables']} tables, {result['images']} images\nOutput: {result['output']}"
            except Exception as e:
                s.add(error=str(e))
                return f"Error converting {single.name}: {e}"

        # Several matches: convert in parallel via the batch helper.
        try:
            result = asyncio.run(_convert_batch_async(files, out_path, convert_word))
            s.add(converted=result["converted"], failed=result["failed"])

            summary = [f"Converted {result['converted']} files, {result['failed']} failed"]
            if result["outputs"]:
                summary.append("\nOutputs:")
                summary.extend(f" {output}" for output in result["outputs"])
            if result["errors"]:
                summary.append("\nErrors:")
                summary.extend(f" {error}" for error in result["errors"])
            return "\n".join(summary)
        except Exception as e:
            s.add(error=str(e))
            return f"Error: {e}"
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def powerpoint(
    *,
    pattern: str,
    output_dir: str,
    include_notes: bool = False,
) -> str:
    """Convert PowerPoint presentations to Markdown.

    Each PPTX matched by *pattern* is rendered as Markdown preserving
    slide structure, converting tables, and exporting embedded images.

    Args:
        pattern: Glob pattern for input files (e.g., "slides/*.pptx")
        output_dir: Directory for output files
        include_notes: Include speaker notes after slide content

    Returns:
        Conversion summary with output paths, or error message

    Example:
        convert.powerpoint(pattern="slides/deck.pptx", output_dir="slides/md")
        convert.powerpoint(pattern="presentations/*.pptx", output_dir="output", include_notes=True)
    """
    with LogSpan(
        span="convert.powerpoint",
        pattern=pattern,
        output_dir=output_dir,
        include_notes=include_notes,
    ) as s:
        files = _resolve_glob(pattern)
        if not files:
            s.add(error="no_match")
            return f"No files matched pattern: {pattern}"

        out_path = _resolve_output_dir(output_dir)

        # Exactly one match: run the converter inline for per-file detail.
        if len(files) == 1:
            single = files[0]
            try:
                result = convert_powerpoint(
                    single, out_path, _get_source_rel(single), include_notes=include_notes
                )
                s.add(converted=1, slides=result["slides"], images=result["images"])
                return f"Converted {single.name}: {result['slides']} slides, {result['images']} images\nOutput: {result['output']}"
            except Exception as e:
                s.add(error=str(e))
                return f"Error converting {single.name}: {e}"

        # Several matches: convert in parallel via the batch helper.
        try:
            result = asyncio.run(
                _convert_batch_async(
                    files, out_path, convert_powerpoint, include_notes=include_notes
                )
            )
            s.add(converted=result["converted"], failed=result["failed"])

            summary = [f"Converted {result['converted']} files, {result['failed']} failed"]
            if result["outputs"]:
                summary.append("\nOutputs:")
                summary.extend(f" {output}" for output in result["outputs"])
            if result["errors"]:
                summary.append("\nErrors:")
                summary.extend(f" {error}" for error in result["errors"])
            return "\n".join(summary)
        except Exception as e:
            s.add(error=str(e))
            return f"Error: {e}"
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def excel(
    *,
    pattern: str,
    output_dir: str,
    include_formulas: bool = False,
    compute_formulas: bool = False,
) -> str:
    """Convert Excel spreadsheets to Markdown.

    Each XLSX matched by *pattern* is rendered as Markdown tables with
    sheet-based sections, streaming rows for memory-efficient handling of
    large workbooks.

    Args:
        pattern: Glob pattern for input files (e.g., "data/*.xlsx")
        output_dir: Directory for output files
        include_formulas: Include cell formulas as comments
        compute_formulas: Evaluate formulas when cached values are missing
            (requires 'formulas' library: pip install formulas)

    Returns:
        Conversion summary with output paths, or error message

    Example:
        convert.excel(pattern="data/report.xlsx", output_dir="data/md")
        convert.excel(pattern="spreadsheets/*.xlsx", output_dir="output", include_formulas=True)
        convert.excel(pattern="data/*.xlsx", output_dir="out", compute_formulas=True)
    """
    with LogSpan(
        span="convert.excel",
        pattern=pattern,
        output_dir=output_dir,
        include_formulas=include_formulas,
        compute_formulas=compute_formulas,
    ) as s:
        files = _resolve_glob(pattern)
        if not files:
            s.add(error="no_match")
            return f"No files matched pattern: {pattern}"

        out_path = _resolve_output_dir(output_dir)

        # Exactly one match: run the converter inline for per-file detail.
        if len(files) == 1:
            single = files[0]
            try:
                result = convert_excel(
                    single,
                    out_path,
                    _get_source_rel(single),
                    include_formulas=include_formulas,
                    compute_formulas=compute_formulas,
                )
                s.add(converted=1, sheets=result["sheets"], rows=result["rows"])
                return f"Converted {single.name}: {result['sheets']} sheets, {result['rows']} rows\nOutput: {result['output']}"
            except Exception as e:
                s.add(error=str(e))
                return f"Error converting {single.name}: {e}"

        # Several matches: convert in parallel via the batch helper.
        try:
            result = asyncio.run(
                _convert_batch_async(
                    files,
                    out_path,
                    convert_excel,
                    include_formulas=include_formulas,
                    compute_formulas=compute_formulas,
                )
            )
            s.add(converted=result["converted"], failed=result["failed"])

            summary = [f"Converted {result['converted']} files, {result['failed']} failed"]
            if result["outputs"]:
                summary.append("\nOutputs:")
                summary.extend(f" {output}" for output in result["outputs"])
            if result["errors"]:
                summary.append("\nErrors:")
                summary.extend(f" {error}" for error in result["errors"])
            return "\n".join(summary)
        except Exception as e:
            s.add(error=str(e))
            return f"Error: {e}"
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def auto(
    *,
    pattern: str,
    output_dir: str,
) -> str:
    """Auto-detect format and convert documents to Markdown.

    Detects file format from extension and uses the appropriate converter.
    Supports PDF, DOCX, PPTX, and XLSX formats.

    Args:
        pattern: Glob pattern for input files (e.g., "docs/*", "input/**/*")
        output_dir: Directory for output files

    Returns:
        Conversion summary with output paths, or error message

    Example:
        convert.auto(pattern="docs/*", output_dir="output")
        convert.auto(pattern="input/**/*.{pdf,docx}", output_dir="converted")
    """
    with LogSpan(span="convert.auto", pattern=pattern, output_dir=output_dir) as s:
        files = _resolve_glob(pattern)
        if not files:
            s.add(error="no_match")
            return f"No files matched pattern: {pattern}"

        out_path = _resolve_output_dir(output_dir)

        # Converters by extension
        converters: dict[str, ConverterFunc] = {
            ".pdf": convert_pdf,
            ".docx": convert_word,
            ".pptx": convert_powerpoint,
            ".xlsx": convert_excel,
        }

        # Single supported file - convert directly
        supported_files = [f for f in files if f.suffix.lower() in converters]
        skipped = len(files) - len(supported_files)

        if len(supported_files) == 1:
            path = supported_files[0]
            try:
                source_rel = _get_source_rel(path)
                # Dispatch on the (lowercased) extension; key is guaranteed
                # present because path came from the supported_files filter.
                result = converters[path.suffix.lower()](path, out_path, source_rel)
                s.add(converted=1, failed=0, skipped=skipped)
                msg = f"Converted {path.name}\nOutput: {result['output']}"
                if skipped:
                    msg += f"\n{skipped} skipped (unsupported format)"
                return msg
            except Exception as e:
                s.add(converted=0, failed=1, skipped=skipped, error=str(e))
                return f"Error converting {path.name}: {e}"

        if not supported_files:
            s.add(converted=0, failed=0, skipped=skipped)
            return f"No supported files found. {skipped} skipped (unsupported format)"

        # Batch conversion with async parallel processing.
        # The FULL file list is passed; the batch helper re-derives the
        # skipped count itself from unsupported extensions.
        try:
            result = asyncio.run(_convert_auto_batch_async(files, out_path, converters))
            s.add(converted=result["converted"], failed=result["failed"], skipped=result["skipped"])

            lines = [f"Converted {result['converted']} files, {result['failed']} failed, {result['skipped']} skipped (unsupported format)"]
            if result["outputs"]:
                lines.append("\nOutputs:")
                for output in result["outputs"]:
                    lines.append(f" {output}")
            if result["errors"]:
                lines.append("\nErrors:")
                for error in result["errors"]:
                    lines.append(f" {error}")

            return "\n".join(lines)
        except Exception as e:
            s.add(error=str(e))
            return f"Error: {e}"
|