onetool-mcp 1.0.0b1__py3-none-any.whl → 1.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onetool/cli.py +63 -4
- onetool_mcp-1.0.0rc2.dist-info/METADATA +266 -0
- onetool_mcp-1.0.0rc2.dist-info/RECORD +129 -0
- {onetool_mcp-1.0.0b1.dist-info → onetool_mcp-1.0.0rc2.dist-info}/licenses/LICENSE.txt +1 -1
- {onetool_mcp-1.0.0b1.dist-info → onetool_mcp-1.0.0rc2.dist-info}/licenses/NOTICE.txt +54 -64
- ot/__main__.py +6 -6
- ot/config/__init__.py +48 -46
- ot/config/global_templates/__init__.py +2 -2
- ot/config/{defaults → global_templates}/diagram-templates/api-flow.mmd +33 -33
- ot/config/{defaults → global_templates}/diagram-templates/c4-context.puml +30 -30
- ot/config/{defaults → global_templates}/diagram-templates/class-diagram.mmd +87 -87
- ot/config/{defaults → global_templates}/diagram-templates/feature-mindmap.mmd +70 -70
- ot/config/{defaults → global_templates}/diagram-templates/microservices.d2 +81 -81
- ot/config/{defaults → global_templates}/diagram-templates/project-gantt.mmd +37 -37
- ot/config/{defaults → global_templates}/diagram-templates/state-machine.mmd +42 -42
- ot/config/global_templates/diagram.yaml +167 -0
- ot/config/global_templates/onetool.yaml +3 -1
- ot/config/{defaults → global_templates}/prompts.yaml +102 -97
- ot/config/global_templates/security.yaml +31 -0
- ot/config/global_templates/servers.yaml +93 -12
- ot/config/global_templates/snippets.yaml +5 -26
- ot/config/{defaults → global_templates}/tool_templates/__init__.py +7 -7
- ot/config/loader.py +221 -105
- ot/config/mcp.py +5 -1
- ot/config/secrets.py +192 -190
- ot/decorators.py +116 -116
- ot/executor/__init__.py +35 -35
- ot/executor/base.py +16 -16
- ot/executor/fence_processor.py +83 -83
- ot/executor/linter.py +142 -142
- ot/executor/pep723.py +288 -288
- ot/executor/runner.py +20 -6
- ot/executor/simple.py +163 -163
- ot/executor/validator.py +603 -164
- ot/http_client.py +145 -145
- ot/logging/__init__.py +37 -37
- ot/logging/entry.py +213 -213
- ot/logging/format.py +191 -188
- ot/logging/span.py +349 -349
- ot/meta.py +236 -14
- ot/paths.py +32 -49
- ot/prompts.py +218 -218
- ot/proxy/manager.py +14 -2
- ot/registry/__init__.py +189 -189
- ot/registry/parser.py +269 -269
- ot/server.py +330 -315
- ot/shortcuts/__init__.py +15 -15
- ot/shortcuts/aliases.py +87 -87
- ot/shortcuts/snippets.py +258 -258
- ot/stats/__init__.py +35 -35
- ot/stats/html.py +2 -2
- ot/stats/reader.py +354 -354
- ot/stats/timing.py +57 -57
- ot/support.py +63 -63
- ot/tools.py +1 -1
- ot/utils/batch.py +161 -161
- ot/utils/cache.py +120 -120
- ot/utils/exceptions.py +23 -23
- ot/utils/factory.py +178 -179
- ot/utils/format.py +65 -65
- ot/utils/http.py +202 -202
- ot/utils/platform.py +45 -45
- ot/utils/truncate.py +69 -69
- ot_tools/__init__.py +4 -4
- ot_tools/_convert/__init__.py +12 -12
- ot_tools/_convert/pdf.py +254 -254
- ot_tools/diagram.yaml +167 -167
- ot_tools/scaffold.py +2 -2
- ot_tools/transform.py +124 -19
- ot_tools/web_fetch.py +94 -43
- onetool_mcp-1.0.0b1.dist-info/METADATA +0 -163
- onetool_mcp-1.0.0b1.dist-info/RECORD +0 -132
- ot/config/defaults/bench.yaml +0 -4
- ot/config/defaults/onetool.yaml +0 -25
- ot/config/defaults/servers.yaml +0 -7
- ot/config/defaults/snippets.yaml +0 -4
- ot_tools/firecrawl.py +0 -732
- {onetool_mcp-1.0.0b1.dist-info → onetool_mcp-1.0.0rc2.dist-info}/WHEEL +0 -0
- {onetool_mcp-1.0.0b1.dist-info → onetool_mcp-1.0.0rc2.dist-info}/entry_points.txt +0 -0
- /ot/config/{defaults → global_templates}/tool_templates/extension.py +0 -0
- /ot/config/{defaults → global_templates}/tool_templates/isolated.py +0 -0
ot_tools/_convert/pdf.py
CHANGED
|
@@ -1,254 +1,254 @@
|
|
|
1
|
-
"""PDF to Markdown converter.
|
|
2
|
-
|
|
3
|
-
Converts PDF documents to Markdown with:
|
|
4
|
-
- Lazy page loading via PyMuPDF
|
|
5
|
-
- Outline-based heading extraction
|
|
6
|
-
- Hash-based image naming for diff stability
|
|
7
|
-
- YAML frontmatter and TOC generation
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
from __future__ import annotations
|
|
11
|
-
|
|
12
|
-
import io
|
|
13
|
-
from pathlib import Path # noqa: TC003 (used at runtime)
|
|
14
|
-
from typing import TYPE_CHECKING, Any
|
|
15
|
-
|
|
16
|
-
try:
|
|
17
|
-
import fitz # type: ignore[import-untyped] # PyMuPDF
|
|
18
|
-
except ImportError as e:
|
|
19
|
-
raise ImportError(
|
|
20
|
-
"pymupdf is required for convert. Install with: pip install pymupdf"
|
|
21
|
-
) from e
|
|
22
|
-
|
|
23
|
-
from PIL import Image
|
|
24
|
-
|
|
25
|
-
if TYPE_CHECKING:
|
|
26
|
-
from PIL.Image import Image as PILImage
|
|
27
|
-
|
|
28
|
-
from ot_tools._convert.utils import (
|
|
29
|
-
IncrementalWriter,
|
|
30
|
-
compute_file_checksum,
|
|
31
|
-
compute_image_hash,
|
|
32
|
-
get_mtime_iso,
|
|
33
|
-
normalise_whitespace,
|
|
34
|
-
write_toc_file,
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def _merge_smask(image_bytes: bytes, sm_bytes: bytes) -> bytes:
    """Apply a soft-mask to an image as its alpha channel.

    Args:
        image_bytes: Encoded bytes of the base image
        sm_bytes: Encoded bytes of the soft-mask image

    Returns:
        PNG-encoded bytes of the base image with the mask as alpha
    """
    base_stream = io.BytesIO(image_bytes)
    mask_stream = io.BytesIO(sm_bytes)
    with Image.open(base_stream) as base_file, Image.open(mask_stream) as alpha_file:
        alpha: PILImage = alpha_file.convert("L")
        rgba: PILImage = base_file.convert("RGBA")
        # The mask is resized to the image's dimensions before being applied
        if alpha.size != rgba.size:
            alpha = alpha.resize(rgba.size)
        rgba.putalpha(alpha)
        encoded = io.BytesIO()
        rgba.save(encoded, format="PNG")
        return encoded.getvalue()
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def _detect_image_format(image_bytes: bytes) -> str:
    """Work out a file extension for encoded image data.

    Args:
        image_bytes: Image data

    Returns:
        File extension (e.g., 'png', 'jpg'); 'png' when the format is
        unknown or the data cannot be opened
    """
    fallback = "png"
    try:
        with Image.open(io.BytesIO(image_bytes)) as probe:
            detected = probe.format or ""
            extensions = {
                "JPEG": "jpg",
                "PNG": "png",
                "GIF": "gif",
                "BMP": "bmp",
                "TIFF": "tiff",
                "WEBP": "webp",
            }
            return extensions.get(detected, fallback)
    except Exception:
        # Best-effort detection: any failure falls back to the default
        return fallback
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def _get_outline_headings(doc: fitz.Document) -> list[tuple[int, str, int]]:
    """Read the PDF outline (bookmarks) as heading tuples.

    Args:
        doc: PyMuPDF document

    Returns:
        List of (level, title, page_number) tuples; empty if the outline
        cannot be read or an entry does not have exactly three fields
    """
    headings: list[tuple[int, str, int]] = []
    try:
        for depth, label, page_no in doc.get_toc():
            headings.append((depth, label, page_no))
    except Exception:
        # Any failure discards the whole outline rather than a partial one
        return []
    return headings
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def _extract_and_save_image(
    doc: fitz.Document,
    xref: int,
    images_dir: Path,
    writer: IncrementalWriter,
) -> bool:
    """Pull one embedded image out of the PDF, store it, and link it.

    Keeping the work in its own function means the image bytes go out of
    scope as soon as the image has been handled.

    Args:
        doc: PyMuPDF document
        xref: Image xref in the document
        images_dir: Directory for saving images
        writer: Incremental writer for markdown output

    Returns:
        True if an image link was written, False if the xref held no image data
    """
    extracted = doc.extract_image(xref)
    image_bytes = extracted.get("image")
    smask = extracted.get("smask")

    if not image_bytes:
        return False

    # extension stays None unless a soft-mask was merged successfully
    extension = None
    if smask:
        try:
            mask_bytes = doc.extract_image(smask).get("image")
            if mask_bytes:
                image_bytes = _merge_smask(image_bytes, mask_bytes)
                extension = "png"
        except Exception:
            # Merge failed: keep the unmasked image and detect its format
            extension = None
    if extension is None:
        extension = _detect_image_format(image_bytes)

    # File name derives from the content hash, so identical images share a file
    digest = compute_image_hash(image_bytes)
    img_name = f"img_{digest}.{extension}"
    target = images_dir / img_name

    if not target.exists():
        images_dir.mkdir(parents=True, exist_ok=True)
        target.write_bytes(image_bytes)

    rel_path = f"{images_dir.name}/{img_name}"
    writer.write(f"\n\n")
    return True
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
def convert_pdf(
    input_path: Path,
    output_dir: Path,
    source_rel: str,
) -> dict[str, Any]:
    """Convert a PDF into a Markdown file plus a separate TOC file.

    Outline (bookmark) entries become headings on the page they point to.
    When the PDF has no outline at all, every page gets a "Page N" heading
    instead. Images that fail to extract are skipped.

    Args:
        input_path: Path to PDF file
        output_dir: Directory for output files (created if missing)
        source_rel: Relative path to source, passed on to the TOC file writer

    Returns:
        Dict with 'output', 'toc', 'pages' and 'images' keys
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(input_path)
    try:
        total_pages = len(doc)

        # Values handed to the TOC file writer below
        checksum = compute_file_checksum(input_path)
        mtime = get_mtime_iso(input_path)

        # Group outline entries by the page they target
        outline = _get_outline_headings(doc)
        outline_by_page: dict[int, list[tuple[int, str]]] = {}
        for level, title, target_page in outline:
            outline_by_page.setdefault(target_page, []).append((level, title))

        stem = input_path.stem
        images_dir = output_dir / f"{stem}_images"
        writer = IncrementalWriter()
        images_extracted = 0

        for page_num in range(1, total_pages + 1):
            page = doc[page_num - 1]

            page_headings = outline_by_page.get(page_num)
            if page_headings is not None:
                for level, title in page_headings:
                    writer.write_heading(min(level, 6), title)
            elif not outline:
                # No outline anywhere in the document: structure by page
                writer.write_heading(1, f"Page {page_num}")

            text = page.get_text("text")
            if text.strip():
                writer.write(text.rstrip() + "\n\n")

            # Images are handled one at a time to keep memory use low
            for image_info in page.get_images(full=True):
                xref = image_info[0]
                try:
                    saved = _extract_and_save_image(doc, xref, images_dir, writer)
                except Exception:
                    # A broken image must not abort the whole conversion
                    continue
                if saved:
                    images_extracted += 1
    finally:
        doc.close()

    # Main output is pure content, so its line numbers start at 1
    output_path = output_dir / f"{stem}.md"
    output_path.write_text(
        normalise_whitespace(writer.get_content()), encoding="utf-8"
    )

    # Separate TOC file
    toc_path = write_toc_file(
        headings=writer.get_headings(),
        output_dir=output_dir,
        stem=stem,
        source=source_rel,
        converted=mtime,
        pages=total_pages,
        checksum=checksum,
    )

    return {
        "output": str(output_path),
        "toc": str(toc_path),
        "pages": total_pages,
        "images": images_extracted,
    }
|
|
1
|
+
"""PDF to Markdown converter.
|
|
2
|
+
|
|
3
|
+
Converts PDF documents to Markdown with:
|
|
4
|
+
- Lazy page loading via PyMuPDF
|
|
5
|
+
- Outline-based heading extraction
|
|
6
|
+
- Hash-based image naming for diff stability
|
|
7
|
+
- YAML frontmatter and TOC generation
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import io
|
|
13
|
+
from pathlib import Path # noqa: TC003 (used at runtime)
|
|
14
|
+
from typing import TYPE_CHECKING, Any
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
import fitz # type: ignore[import-untyped] # PyMuPDF
|
|
18
|
+
except ImportError as e:
|
|
19
|
+
raise ImportError(
|
|
20
|
+
"pymupdf is required for convert. Install with: pip install pymupdf"
|
|
21
|
+
) from e
|
|
22
|
+
|
|
23
|
+
from PIL import Image
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from PIL.Image import Image as PILImage
|
|
27
|
+
|
|
28
|
+
from ot_tools._convert.utils import (
|
|
29
|
+
IncrementalWriter,
|
|
30
|
+
compute_file_checksum,
|
|
31
|
+
compute_image_hash,
|
|
32
|
+
get_mtime_iso,
|
|
33
|
+
normalise_whitespace,
|
|
34
|
+
write_toc_file,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _merge_smask(image_bytes: bytes, sm_bytes: bytes) -> bytes:
    """Apply a soft-mask to an image as its alpha channel.

    Args:
        image_bytes: Encoded bytes of the base image
        sm_bytes: Encoded bytes of the soft-mask image

    Returns:
        PNG-encoded bytes of the base image with the mask as alpha
    """
    base_stream = io.BytesIO(image_bytes)
    mask_stream = io.BytesIO(sm_bytes)
    with Image.open(base_stream) as base_file, Image.open(mask_stream) as alpha_file:
        alpha: PILImage = alpha_file.convert("L")
        rgba: PILImage = base_file.convert("RGBA")
        # The mask is resized to the image's dimensions before being applied
        if alpha.size != rgba.size:
            alpha = alpha.resize(rgba.size)
        rgba.putalpha(alpha)
        encoded = io.BytesIO()
        rgba.save(encoded, format="PNG")
        return encoded.getvalue()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _detect_image_format(image_bytes: bytes) -> str:
    """Work out a file extension for encoded image data.

    Args:
        image_bytes: Image data

    Returns:
        File extension (e.g., 'png', 'jpg'); 'png' when the format is
        unknown or the data cannot be opened
    """
    fallback = "png"
    try:
        with Image.open(io.BytesIO(image_bytes)) as probe:
            detected = probe.format or ""
            extensions = {
                "JPEG": "jpg",
                "PNG": "png",
                "GIF": "gif",
                "BMP": "bmp",
                "TIFF": "tiff",
                "WEBP": "webp",
            }
            return extensions.get(detected, fallback)
    except Exception:
        # Best-effort detection: any failure falls back to the default
        return fallback
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _get_outline_headings(doc: fitz.Document) -> list[tuple[int, str, int]]:
    """Read the PDF outline (bookmarks) as heading tuples.

    Args:
        doc: PyMuPDF document

    Returns:
        List of (level, title, page_number) tuples; empty if the outline
        cannot be read or an entry does not have exactly three fields
    """
    headings: list[tuple[int, str, int]] = []
    try:
        for depth, label, page_no in doc.get_toc():
            headings.append((depth, label, page_no))
    except Exception:
        # Any failure discards the whole outline rather than a partial one
        return []
    return headings
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _extract_and_save_image(
    doc: fitz.Document,
    xref: int,
    images_dir: Path,
    writer: IncrementalWriter,
) -> bool:
    """Pull one embedded image out of the PDF, store it, and link it.

    Keeping the work in its own function means the image bytes go out of
    scope as soon as the image has been handled.

    Args:
        doc: PyMuPDF document
        xref: Image xref in the document
        images_dir: Directory for saving images
        writer: Incremental writer for markdown output

    Returns:
        True if an image link was written, False if the xref held no image data
    """
    extracted = doc.extract_image(xref)
    image_bytes = extracted.get("image")
    smask = extracted.get("smask")

    if not image_bytes:
        return False

    # extension stays None unless a soft-mask was merged successfully
    extension = None
    if smask:
        try:
            mask_bytes = doc.extract_image(smask).get("image")
            if mask_bytes:
                image_bytes = _merge_smask(image_bytes, mask_bytes)
                extension = "png"
        except Exception:
            # Merge failed: keep the unmasked image and detect its format
            extension = None
    if extension is None:
        extension = _detect_image_format(image_bytes)

    # File name derives from the content hash, so identical images share a file
    digest = compute_image_hash(image_bytes)
    img_name = f"img_{digest}.{extension}"
    target = images_dir / img_name

    if not target.exists():
        images_dir.mkdir(parents=True, exist_ok=True)
        target.write_bytes(image_bytes)

    rel_path = f"{images_dir.name}/{img_name}"
    writer.write(f"\n\n")
    return True
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def convert_pdf(
    input_path: Path,
    output_dir: Path,
    source_rel: str,
) -> dict[str, Any]:
    """Convert a PDF into a Markdown file plus a separate TOC file.

    Outline (bookmark) entries become headings on the page they point to.
    When the PDF has no outline at all, every page gets a "Page N" heading
    instead. Images that fail to extract are skipped.

    Args:
        input_path: Path to PDF file
        output_dir: Directory for output files (created if missing)
        source_rel: Relative path to source, passed on to the TOC file writer

    Returns:
        Dict with 'output', 'toc', 'pages' and 'images' keys
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(input_path)
    try:
        total_pages = len(doc)

        # Values handed to the TOC file writer below
        checksum = compute_file_checksum(input_path)
        mtime = get_mtime_iso(input_path)

        # Group outline entries by the page they target
        outline = _get_outline_headings(doc)
        outline_by_page: dict[int, list[tuple[int, str]]] = {}
        for level, title, target_page in outline:
            outline_by_page.setdefault(target_page, []).append((level, title))

        stem = input_path.stem
        images_dir = output_dir / f"{stem}_images"
        writer = IncrementalWriter()
        images_extracted = 0

        for page_num in range(1, total_pages + 1):
            page = doc[page_num - 1]

            page_headings = outline_by_page.get(page_num)
            if page_headings is not None:
                for level, title in page_headings:
                    writer.write_heading(min(level, 6), title)
            elif not outline:
                # No outline anywhere in the document: structure by page
                writer.write_heading(1, f"Page {page_num}")

            text = page.get_text("text")
            if text.strip():
                writer.write(text.rstrip() + "\n\n")

            # Images are handled one at a time to keep memory use low
            for image_info in page.get_images(full=True):
                xref = image_info[0]
                try:
                    saved = _extract_and_save_image(doc, xref, images_dir, writer)
                except Exception:
                    # A broken image must not abort the whole conversion
                    continue
                if saved:
                    images_extracted += 1
    finally:
        doc.close()

    # Main output is pure content, so its line numbers start at 1
    output_path = output_dir / f"{stem}.md"
    output_path.write_text(
        normalise_whitespace(writer.get_content()), encoding="utf-8"
    )

    # Separate TOC file
    toc_path = write_toc_file(
        headings=writer.get_headings(),
        output_dir=output_dir,
        stem=stem,
        source=source_rel,
        converted=mtime,
        pages=total_pages,
        checksum=checksum,
    )

    return {
        "output": str(output_path),
        "toc": str(toc_path),
        "pages": total_pages,
        "images": images_extracted,
    }
|