onetool-mcp 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Document conversion utilities for OneTool.
|
|
2
|
+
|
|
3
|
+
Provides PDF, Word, PowerPoint, and Excel to Markdown conversion
|
|
4
|
+
with LLM-optimised output including YAML frontmatter and TOC.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ot_tools._convert.excel import convert_excel
|
|
8
|
+
from ot_tools._convert.pdf import convert_pdf
|
|
9
|
+
from ot_tools._convert.powerpoint import convert_powerpoint
|
|
10
|
+
from ot_tools._convert.word import convert_word
|
|
11
|
+
|
|
12
|
+
__all__ = ["convert_excel", "convert_pdf", "convert_powerpoint", "convert_word"]
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""Excel workbook to Markdown converter.
|
|
2
|
+
|
|
3
|
+
Converts XLSX spreadsheets to Markdown with:
|
|
4
|
+
- Streaming row processing via openpyxl read_only mode
|
|
5
|
+
- Sheet-based sections
|
|
6
|
+
- Optional formula extraction
|
|
7
|
+
- YAML frontmatter and TOC generation
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pathlib import Path # noqa: TC003 (used at runtime)
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from ot_tools._convert.utils import (
|
|
16
|
+
IncrementalWriter,
|
|
17
|
+
compute_file_checksum,
|
|
18
|
+
get_mtime_iso,
|
|
19
|
+
normalise_whitespace,
|
|
20
|
+
write_toc_file,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def convert_excel(
    input_path: Path,
    output_dir: Path,
    source_rel: str,
    *,
    include_formulas: bool = False,
    compute_formulas: bool = False,
) -> dict[str, Any]:
    """Convert an Excel workbook to a Markdown file plus a separate TOC file.

    Args:
        input_path: Path to XLSX file
        output_dir: Directory for output files
        source_rel: Relative path to source for frontmatter
        include_formulas: Include cell formulas as comments
        compute_formulas: Evaluate formulas when cached values are missing
            (requires 'formulas' library: pip install formulas)

    Returns:
        Dict with 'output', 'toc', 'sheets', 'rows' keys
    """
    try:
        from openpyxl import load_workbook  # type: ignore[import-untyped]
    except ImportError as e:
        raise ImportError(
            "openpyxl is required for convert. Install with: pip install openpyxl"
        ) from e

    # Optionally build a formula-evaluation model for cells lacking cached values.
    model: Any = None
    if compute_formulas:
        try:
            import formulas  # type: ignore[import-untyped]

            model = formulas.ExcelModel().loads(str(input_path)).finish()
        except ImportError:
            raise ImportError(
                "formulas library is required for compute_formulas. "
                "Install with: pip install formulas"
            ) from None
        except Exception:
            # Best effort: a model that fails to load simply disables computation.
            model = None

    output_dir.mkdir(parents=True, exist_ok=True)

    # read_only streaming is incompatible with formula computation (needs full
    # workbook access). data_only=True yields cached computed values;
    # data_only=False keeps formula strings (used when include_formulas=True).
    workbook = load_workbook(
        input_path,
        read_only=not compute_formulas,
        data_only=not include_formulas,
    )

    # Metadata destined for the TOC file's frontmatter.
    checksum = compute_file_checksum(input_path)
    mtime = get_mtime_iso(input_path)
    sheet_names = workbook.sheetnames

    writer = IncrementalWriter()
    row_total = 0
    # Single workbook pass - each sheet streams straight into the writer.
    for name in sheet_names:
        row_total += _process_sheet(writer, name, workbook[name], include_formulas, model)
    workbook.close()

    # Main output is pure content (no frontmatter) so line numbers start at 1.
    output_path = output_dir / f"{input_path.stem}.md"
    output_path.write_text(normalise_whitespace(writer.get_content()), encoding="utf-8")

    # The separate TOC file carries the frontmatter (source, checksum, timestamp).
    toc_path = write_toc_file(
        headings=writer.get_headings(),
        output_dir=output_dir,
        stem=input_path.stem,
        source=source_rel,
        converted=mtime,
        pages=len(sheet_names),
        checksum=checksum,
    )

    return {
        "output": str(output_path),
        "toc": str(toc_path),
        "sheets": len(sheet_names),
        "rows": row_total,
    }
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _process_sheet(
    writer: IncrementalWriter,
    sheet_name: str,
    ws: Any,
    include_formulas: bool,
    formula_model: Any = None,
) -> int:
    """Process a single worksheet with streaming (O(1) memory for row data).

    Emits a level-2 heading, a Markdown table of all rows (first row used as
    the header), and an optional fenced "Formulas" section listing every
    formula cell as ``A1: =...``.

    When include_formulas=True, the workbook was loaded with data_only=False,
    so formula cells contain the formula string as their value.

    Args:
        writer: IncrementalWriter for output
        sheet_name: Name of the worksheet
        ws: Worksheet object
        include_formulas: Whether to include formulas in output
        formula_model: Optional formulas.ExcelModel for computing formula values

    Returns:
        Number of rows processed
    """
    writer.write_heading(2, f"Sheet: {sheet_name}")

    # First pass: count max columns (streaming, no data storage).
    # NOTE(review): assumes ws.iter_rows() can be consumed twice (true for
    # read-only openpyxl worksheets, which re-parse on each call) — confirm
    # for the read_only=False path as well.
    max_cols = 0
    row_count = 0
    for row in ws.iter_rows():
        max_cols = max(max_cols, len(row))
        row_count += 1

    if row_count == 0:
        writer.write("(empty sheet)\n\n")
        return 0

    # Second pass: stream rows directly to writer
    rows_iter = iter(ws.iter_rows())

    # Get header (first row)
    first_row = next(rows_iter)
    header = [
        _get_cell_value(cell, sheet_name, 1, j + 1, formula_model)
        for j, cell in enumerate(first_row)
    ]
    # Pad header to max_cols so every table row has the same width
    while len(header) < max_cols:
        header.append("")

    # Write header row and the Markdown separator row
    writer.write("| " + " | ".join(_escape_pipe(c) for c in header) + " |\n")
    writer.write("| " + " | ".join("---" for _ in header) + " |\n")

    # Collect formulas as we go (just formula tuples, not full row data)
    # Format: (col_letter, row_num, formula_string)
    formulas: list[tuple[str, int, str]] = []

    # Check first row for formulas (cell values are formulas when include_formulas=True)
    if include_formulas:
        for j, cell in enumerate(first_row):
            try:
                value = cell.value
                if isinstance(value, str) and value.startswith("="):
                    formulas.append((_col_letter(j + 1), 1, value))
            except Exception:
                # Best effort: a cell that cannot be read is simply skipped
                pass

    # Stream remaining rows directly to writer
    current_row = 2  # 1-indexed, header was row 1
    for row in rows_iter:
        row_values = [
            _get_cell_value(cell, sheet_name, current_row, j + 1, formula_model)
            for j, cell in enumerate(row)
        ]
        # Pad row to max_cols
        while len(row_values) < max_cols:
            row_values.append("")

        # Truncate to header width so the table stays rectangular
        writer.write("| " + " | ".join(_escape_pipe(c) for c in row_values[:len(header)]) + " |\n")

        # Track formulas for this row (cell values are formulas when include_formulas=True)
        if include_formulas:
            for j, cell in enumerate(row):
                try:
                    value = cell.value
                    if isinstance(value, str) and value.startswith("="):
                        formulas.append((_col_letter(j + 1), current_row, value))
                except Exception:
                    pass

        current_row += 1

    writer.write("\n")

    # Add formulas section if any formulas found
    if formulas:
        writer.write("**Formulas:**\n\n")
        writer.write("```\n")
        for col_letter, row_num, formula in formulas:
            writer.write(f"{col_letter}{row_num}: {formula}\n")
        writer.write("```\n\n")

    return row_count
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _get_cell_value(
|
|
223
|
+
cell: Any,
|
|
224
|
+
sheet_name: str,
|
|
225
|
+
row_num: int,
|
|
226
|
+
col_num: int,
|
|
227
|
+
formula_model: Any,
|
|
228
|
+
) -> str:
|
|
229
|
+
"""Get cell value, optionally computing from formula model.
|
|
230
|
+
|
|
231
|
+
Args:
|
|
232
|
+
cell: openpyxl cell object
|
|
233
|
+
sheet_name: Name of the worksheet (for formula lookup)
|
|
234
|
+
row_num: 1-indexed row number
|
|
235
|
+
col_num: 1-indexed column number
|
|
236
|
+
formula_model: Optional formulas.ExcelModel for computing values
|
|
237
|
+
|
|
238
|
+
Returns:
|
|
239
|
+
String representation of cell value
|
|
240
|
+
"""
|
|
241
|
+
value = cell.value
|
|
242
|
+
|
|
243
|
+
# If we have a value, use it
|
|
244
|
+
if value is not None:
|
|
245
|
+
return str(value)
|
|
246
|
+
|
|
247
|
+
# If no formula model, return empty
|
|
248
|
+
if formula_model is None:
|
|
249
|
+
return ""
|
|
250
|
+
|
|
251
|
+
# Try to compute value from formula model
|
|
252
|
+
try:
|
|
253
|
+
# Build cell reference like "'Sheet1'!A1"
|
|
254
|
+
col_letter = _col_letter(col_num)
|
|
255
|
+
cell_ref = f"'{sheet_name}'!{col_letter}{row_num}"
|
|
256
|
+
computed = formula_model.calculate(cell_ref)
|
|
257
|
+
if computed is not None and computed != cell_ref:
|
|
258
|
+
# Handle numpy arrays and other types
|
|
259
|
+
if hasattr(computed, "item"):
|
|
260
|
+
computed = computed.item()
|
|
261
|
+
return str(computed)
|
|
262
|
+
except Exception:
|
|
263
|
+
pass
|
|
264
|
+
|
|
265
|
+
return ""
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _escape_pipe(text: str) -> str:
|
|
269
|
+
"""Escape pipe characters for Markdown tables."""
|
|
270
|
+
return text.replace("|", "\\|").replace("\n", " ")
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _col_letter(n: int) -> str:
|
|
274
|
+
"""Convert column number to letter (1=A, 2=B, ..., 27=AA)."""
|
|
275
|
+
result = ""
|
|
276
|
+
while n > 0:
|
|
277
|
+
n, remainder = divmod(n - 1, 26)
|
|
278
|
+
result = chr(65 + remainder) + result
|
|
279
|
+
return result
|
ot_tools/_convert/pdf.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""PDF to Markdown converter.
|
|
2
|
+
|
|
3
|
+
Converts PDF documents to Markdown with:
|
|
4
|
+
- Lazy page loading via PyMuPDF
|
|
5
|
+
- Outline-based heading extraction
|
|
6
|
+
- Hash-based image naming for diff stability
|
|
7
|
+
- YAML frontmatter and TOC generation
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import io
|
|
13
|
+
from pathlib import Path # noqa: TC003 (used at runtime)
|
|
14
|
+
from typing import TYPE_CHECKING, Any
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
import fitz # type: ignore[import-untyped] # PyMuPDF
|
|
18
|
+
except ImportError as e:
|
|
19
|
+
raise ImportError(
|
|
20
|
+
"pymupdf is required for convert. Install with: pip install pymupdf"
|
|
21
|
+
) from e
|
|
22
|
+
|
|
23
|
+
from PIL import Image
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from PIL.Image import Image as PILImage
|
|
27
|
+
|
|
28
|
+
from ot_tools._convert.utils import (
|
|
29
|
+
IncrementalWriter,
|
|
30
|
+
compute_file_checksum,
|
|
31
|
+
compute_image_hash,
|
|
32
|
+
get_mtime_iso,
|
|
33
|
+
normalise_whitespace,
|
|
34
|
+
write_toc_file,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _merge_smask(image_bytes: bytes, sm_bytes: bytes) -> bytes:
    """Apply a PDF soft-mask to an image, producing PNG bytes with alpha.

    Args:
        image_bytes: Base image bytes
        sm_bytes: Soft-mask bytes

    Returns:
        PNG bytes with transparency
    """
    with (
        Image.open(io.BytesIO(image_bytes)) as base_file,
        Image.open(io.BytesIO(sm_bytes)) as smask_file,
    ):
        alpha: PILImage = smask_file.convert("L")
        rgba: PILImage = base_file.convert("RGBA")
        # The mask may be stored at a different resolution; scale it to the
        # base image before applying.
        if alpha.size != rgba.size:
            alpha = alpha.resize(rgba.size)
        rgba.putalpha(alpha)
        out = io.BytesIO()
        rgba.save(out, format="PNG")
        return out.getvalue()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _detect_image_format(image_bytes: bytes) -> str:
    """Detect image format from raw bytes by probing with Pillow.

    Args:
        image_bytes: Image data

    Returns:
        File extension (e.g., 'png', 'jpg'); 'png' when detection fails
    """
    known = {
        "JPEG": "jpg",
        "PNG": "png",
        "GIF": "gif",
        "BMP": "bmp",
        "TIFF": "tiff",
        "WEBP": "webp",
    }
    try:
        with Image.open(io.BytesIO(image_bytes)) as probe:
            # Unknown/undetected formats fall back to png.
            return known.get(probe.format or "", "png")
    except Exception:
        return "png"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _get_outline_headings(doc: fitz.Document) -> list[tuple[int, str, int]]:
|
|
87
|
+
"""Extract outline/bookmarks from PDF.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
doc: PyMuPDF document
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
List of (level, title, page_number) tuples
|
|
94
|
+
"""
|
|
95
|
+
try:
|
|
96
|
+
toc = doc.get_toc()
|
|
97
|
+
return [(level, title, page) for level, title, page in toc]
|
|
98
|
+
except Exception:
|
|
99
|
+
return []
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _extract_and_save_image(
    doc: fitz.Document,
    xref: int,
    images_dir: Path,
    writer: IncrementalWriter,
) -> bool:
    """Extract a single image, save it to disk, and reference it in the output.

    This function encapsulates image processing so that memory (image_bytes)
    is freed when the function returns, preventing accumulation.

    Args:
        doc: PyMuPDF document
        xref: Image xref in the document
        images_dir: Directory for saving images
        writer: Incremental writer for markdown output

    Returns:
        True if image was successfully extracted, False otherwise
    """
    base_image = doc.extract_image(xref)
    image_bytes = base_image.get("image")
    smask = base_image.get("smask")

    if not image_bytes:
        return False

    # Handle soft-mask (transparency): merge into an RGBA PNG when present.
    if smask:
        try:
            sm_base = doc.extract_image(smask)
            sm_bytes = sm_base.get("image")
            if sm_bytes:
                image_bytes = _merge_smask(image_bytes, sm_bytes)
                extension = "png"
            else:
                extension = _detect_image_format(image_bytes)
        except Exception:
            # Fall back to the unmasked image if the mask cannot be applied.
            extension = _detect_image_format(image_bytes)
    else:
        extension = _detect_image_format(image_bytes)

    # Hash-based naming for diff stability (and dedup of identical images)
    img_hash = compute_image_hash(image_bytes)
    img_name = f"img_{img_hash}.{extension}"
    img_path = images_dir / img_name

    # Only write if not already extracted (dedup by hash)
    if not img_path.exists():
        images_dir.mkdir(parents=True, exist_ok=True)
        img_path.write_bytes(image_bytes)

    # Fix: rel_path was computed but never emitted - the write produced only
    # a blank line, so extracted images were never referenced from the
    # markdown. Emit a proper image link relative to the output file.
    rel_path = f"{images_dir.name}/{img_name}"
    writer.write(f"![]({rel_path})\n\n")

    return True
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def convert_pdf(
    input_path: Path,
    output_dir: Path,
    source_rel: str,
) -> dict[str, Any]:
    """Convert PDF to Markdown.

    Produces ``<stem>.md`` (pure content), a TOC file with frontmatter, and
    an ``<stem>_images/`` directory of hash-named extracted images.

    Args:
        input_path: Path to PDF file
        output_dir: Directory for output files
        source_rel: Relative path to source for frontmatter

    Returns:
        Dict with 'output', 'toc', 'pages', 'images' keys
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(input_path)
    try:
        total_pages = len(doc)

        # Get metadata for frontmatter
        checksum = compute_file_checksum(input_path)
        mtime = get_mtime_iso(input_path)

        # Get outline for heading insertion; group entries by (1-indexed) page
        outline = _get_outline_headings(doc)
        outline_by_page: dict[int, list[tuple[int, str]]] = {}
        for level, title, page in outline:
            if page not in outline_by_page:
                outline_by_page[page] = []
            outline_by_page[page].append((level, title))

        # Set up images directory (created lazily on first image write)
        images_dir = output_dir / f"{input_path.stem}_images"
        writer = IncrementalWriter()
        images_extracted = 0

        # Process pages with lazy loading (doc[pageno] loads one page at a time)
        for pageno in range(total_pages):
            page = doc[pageno]
            page_num = pageno + 1

            # Insert outline headings for this page (level clamped to h6)
            if page_num in outline_by_page:
                for level, title in outline_by_page[page_num]:
                    writer.write_heading(min(level, 6), title)
            elif not outline:
                # No outline - use page numbers as structure
                writer.write_heading(1, f"Page {page_num}")

            # Extract text; skip pages that are whitespace-only
            text = page.get_text("text")
            if text.strip():
                writer.write(text.rstrip() + "\n\n")

            # Extract images - process one at a time to minimize memory
            image_list = page.get_images(full=True)
            for img in image_list:
                # First tuple element is the image xref
                xref = img[0]
                try:
                    result = _extract_and_save_image(
                        doc, xref, images_dir, writer
                    )
                    if result:
                        images_extracted += 1
                except Exception:
                    # Skip failed image extraction (best effort)
                    continue
    finally:
        # Always release the document, even if a page fails mid-conversion
        doc.close()

    # Write main output (pure content, no frontmatter - line numbers start at 1)
    content = normalise_whitespace(writer.get_content())
    output_path = output_dir / f"{input_path.stem}.md"
    output_path.write_text(content, encoding="utf-8")

    # Write separate TOC file (includes frontmatter)
    headings = writer.get_headings()
    toc_path = write_toc_file(
        headings=headings,
        output_dir=output_dir,
        stem=input_path.stem,
        source=source_rel,
        converted=mtime,
        pages=total_pages,
        checksum=checksum,
    )

    return {
        "output": str(output_path),
        "toc": str(toc_path),
        "pages": total_pages,
        "images": images_extracted,
    }
|