onetool_mcp-1.0.0b1-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
ot_tools/_convert/powerpoint.py (new file)

@@ -0,0 +1,268 @@

```python
"""PowerPoint to Markdown converter.

Converts PPTX presentations to Markdown with:
- Slide title extraction
- Table conversion
- Hash-based image naming for diff stability
- YAML frontmatter and TOC generation
- Optional speaker notes
"""

from __future__ import annotations

from pathlib import Path  # noqa: TC003 (used at runtime)
from typing import TYPE_CHECKING, Any

try:
    from pptx import Presentation
    from pptx.enum.shapes import MSO_SHAPE_TYPE
    from pptx.shapes.picture import Picture
except ImportError as e:
    raise ImportError(
        "python-pptx is required for convert. Install with: pip install python-pptx"
    ) from e

from ot_tools._convert.utils import (
    IncrementalWriter,
    compute_file_checksum,
    get_mtime_iso,
    normalise_whitespace,
    save_image,
    write_toc_file,
)

if TYPE_CHECKING:
    from pptx.presentation import Presentation as PresentationType
    from pptx.shapes.base import BaseShape
    from pptx.slide import Slide


def convert_powerpoint(
    input_path: Path,
    output_dir: Path,
    source_rel: str,
    *,
    include_notes: bool = False,
) -> dict[str, Any]:
    """Convert PowerPoint presentation to Markdown.

    Args:
        input_path: Path to PPTX file
        output_dir: Directory for output files
        source_rel: Relative path to source for frontmatter
        include_notes: Include speaker notes after slide content

    Returns:
        Dict with 'output', 'toc', 'slides', 'images' keys
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    prs: PresentationType = Presentation(str(input_path))
    try:
        # Get metadata for frontmatter
        checksum = compute_file_checksum(input_path)
        mtime = get_mtime_iso(input_path)
        total_slides = len(prs.slides)

        # Set up images directory
        images_dir = output_dir / f"{input_path.stem}_images"
        writer = IncrementalWriter()
        images_extracted = 0

        # Process slides
        for slide_idx, slide in enumerate(prs.slides, 1):
            imgs = _process_slide(
                slide, slide_idx, writer, images_dir, include_notes
            )
            images_extracted += imgs
    finally:
        # Ensure presentation resources are released
        # python-pptx Presentation doesn't have explicit close, but we can
        # help garbage collection by clearing references
        del prs

    # Write main output (pure content, no frontmatter - line numbers start at 1)
    content = normalise_whitespace(writer.get_content())
    output_path = output_dir / f"{input_path.stem}.md"
    output_path.write_text(content, encoding="utf-8")

    # Write separate TOC file (includes frontmatter)
    headings = writer.get_headings()
    toc_path = write_toc_file(
        headings=headings,
        output_dir=output_dir,
        stem=input_path.stem,
        source=source_rel,
        converted=mtime,
        pages=total_slides,
        checksum=checksum,
    )

    return {
        "output": str(output_path),
        "toc": str(toc_path),
        "slides": total_slides,
        "images": images_extracted,
    }
```
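A minimal invocation sketch; the input path, output directory, and result values below are illustrative, not taken from the package:

```python
from pathlib import Path

from ot_tools._convert.powerpoint import convert_powerpoint

result = convert_powerpoint(
    Path("decks/quarterly.pptx"),   # hypothetical input file
    Path("out"),
    source_rel="decks/quarterly.pptx",
    include_notes=True,
)
print(result)
# e.g. {'output': 'out/quarterly.md', 'toc': 'out/quarterly.toc.md',
#       'slides': 24, 'images': 7}
```

The per-slide helpers it delegates to follow.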
```python
def _process_slide(
    slide: Slide,
    slide_number: int,
    writer: IncrementalWriter,
    images_dir: Path,
    include_notes: bool,
) -> int:
    """Process a single slide.

    Returns:
        Number of images extracted
    """
    images_extracted = 0

    # Get slide title
    title = f"Slide {slide_number}"
    try:
        if (
            hasattr(slide, "shapes")
            and hasattr(slide.shapes, "title")
            and slide.shapes.title
            and slide.shapes.title.text.strip()
        ):
            title = slide.shapes.title.text.strip()
    except AttributeError:
        pass

    writer.write_heading(2, title)

    # Process shapes
    text_content: list[str] = []
    tables_content: list[str] = []

    for shape in slide.shapes:
        shape_type = getattr(shape, "shape_type", None)

        # Tables (check FIRST - tables also have .text attribute)
        if shape_type == MSO_SHAPE_TYPE.TABLE:
            try:
                if hasattr(shape, "table"):
                    table_md = _process_table(shape.table)
                    if table_md:
                        tables_content.append(table_md)
            except (AttributeError, ValueError):
                pass

        # Images
        elif shape_type == MSO_SHAPE_TYPE.PICTURE:
            try:
                if isinstance(shape, Picture):
                    img_ref = _process_image(shape, images_dir)
                    if img_ref:
                        writer.write(img_ref + "\n\n")
                        images_extracted += 1
            except Exception:
                pass

        # Text shapes (check LAST - after tables and images)
        elif hasattr(shape, "text") and shape.text and shape.text.strip():
            # Skip title shape (already processed)
            if shape == getattr(slide.shapes, "title", None):
                continue
            text_content.append(_process_text_shape(shape))

    # Write text content
    for text in text_content:
        if text:
            writer.write(text + "\n\n")

    # Write tables
    for table in tables_content:
        if table:
            writer.write(table + "\n\n")

    # Add speaker notes if requested
    if include_notes and hasattr(slide, "notes_slide"):
        try:
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame and notes_frame.text.strip():
                writer.write("**Speaker Notes:**\n\n")
                writer.write(f"> {notes_frame.text.strip()}\n\n")
        except Exception:
            pass

    # Add slide separator
    writer.write("---\n\n")

    return images_extracted


def _process_text_shape(shape: BaseShape) -> str:
    """Process text from a shape."""
    if not hasattr(shape, "text") or not shape.text.strip():
        return ""

    text: str = str(shape.text).strip()
    lines = text.split("\n")

    if len(lines) > 1:
        # Format as bullet list
        processed: list[str] = []
        for line in lines:
            line = line.strip()
            if line:
                # Remove existing bullet markers
                for marker in ("•", "-", "*", "○", "▪", "▫"):
                    if line.startswith(marker):
                        line = line[1:].strip()
                        break
                processed.append(f"- {line}")
        return "\n".join(processed)

    return text


def _process_table(table: Any) -> str:
    """Convert table to Markdown."""
    if not hasattr(table, "rows") or not table.rows:
        return ""

    lines: list[str] = []

    # Header row
    header_cells: list[str] = []
    for cell in table.rows[0].cells:
        header_cells.append(cell.text.strip() if hasattr(cell, "text") else "")

    if not header_cells:
        return ""

    lines.append("| " + " | ".join(header_cells) + " |")
    lines.append("| " + " | ".join("---" for _ in header_cells) + " |")

    # Data rows
    for i in range(1, len(table.rows)):
        row = table.rows[i]
        cells: list[str] = []
        for cell in row.cells:
            cells.append(cell.text.strip() if hasattr(cell, "text") else "")

        while len(cells) < len(header_cells):
            cells.append("")

        lines.append("| " + " | ".join(cells[: len(header_cells)]) + " |")

    return "\n".join(lines)
```
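Note that `_process_text_shape` only duck-types on a `.text` attribute, so any object carrying text works for a quick check of the bullet normalisation. A sketch (the stand-in class is purely illustrative, and the private helper is called directly for the demo; python-pptx must be installed for the module to import):

```python
from ot_tools._convert.powerpoint import _process_text_shape

class FakeShape:  # stand-in for a python-pptx text shape
    text = "• First point\n- Second point\nThird point"

print(_process_text_shape(FakeShape()))
# - First point
# - Second point
# - Third point
```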
```python
def _process_image(shape: Picture, images_dir: Path) -> str:
    """Extract and save image with hash-based naming."""
    try:
        image_data = shape.image.blob
        content_type = shape.image.content_type

        img_path = save_image(image_data, images_dir, content_type)
        rel_path = f"{images_dir.name}/{img_path.name}"
        return f"![]({rel_path})"  # markdown image reference

    except Exception:
        return ""
```
ot_tools/_convert/utils.py (new file)

@@ -0,0 +1,358 @@

```python
"""Shared utilities for document converters.

Provides diff-stable output formatting including:
- YAML frontmatter generation
- TOC generation with line ranges
- Hash-based image naming
- Whitespace normalisation
"""

from __future__ import annotations

import hashlib
import io
import re
from datetime import UTC, datetime
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Sequence

_CHECKSUM_CACHE_MAX_SIZE = 100


@lru_cache(maxsize=_CHECKSUM_CACHE_MAX_SIZE)
def _compute_checksum_cached(
    path_str: str,
    mtime: float,  # noqa: ARG001 - used as cache key
    size: int,  # noqa: ARG001 - used as cache key
) -> str:
    """Cached checksum computation (thread-safe via lru_cache).

    Args:
        path_str: Resolved path string
        mtime: File modification time (for cache invalidation)
        size: File size in bytes (for cache invalidation)

    Returns:
        Checksum in format 'sha256:abc123...'
    """
    path = Path(path_str)
    sha256 = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            sha256.update(chunk)
    return f"sha256:{sha256.hexdigest()}"


def compute_file_checksum(path: Path) -> str:
    """Compute SHA256 checksum of a file (with thread-safe caching).

    Results are cached based on path+mtime+size to avoid redundant reads
    when the same file is processed multiple times.

    Args:
        path: Path to file

    Returns:
        Checksum in format 'sha256:abc123...'
    """
    stat = path.stat()
    return _compute_checksum_cached(str(path.resolve()), stat.st_mtime, stat.st_size)
```
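The `mtime` and `size` arguments are never read in the function body; they exist so `lru_cache` keys on them, and a changed file therefore misses the cache. A small sketch of that behaviour (temporary paths only):

```python
import tempfile
from pathlib import Path

from ot_tools._convert.utils import compute_file_checksum

with tempfile.TemporaryDirectory() as tmp:
    f = Path(tmp) / "doc.bin"
    f.write_bytes(b"v1")
    first = compute_file_checksum(f)          # reads the file once
    assert compute_file_checksum(f) == first  # served from the cache

    f.write_bytes(b"v2-longer")               # new size -> new cache key
    assert compute_file_checksum(f) != first  # digest recomputed
```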
```python
def compute_image_hash(data: bytes) -> str:
    """Compute hash for image naming (first 8 chars of SHA256).

    Args:
        data: Image bytes

    Returns:
        8-character hex hash
    """
    return hashlib.sha256(data).hexdigest()[:8]


def save_image(data: bytes, images_dir: Path, content_type: str) -> Path:
    """Save image with hash-based naming for diff stability.

    Args:
        data: Image bytes
        images_dir: Directory to save image
        content_type: MIME content type (e.g., "image/png", "image/jpeg")

    Returns:
        Path to saved image file
    """
    # Determine extension from content type
    if "jpeg" in content_type or "jpg" in content_type:
        extension = "jpg"
    elif "png" in content_type:
        extension = "png"
    elif "gif" in content_type:
        extension = "gif"
    else:
        extension = "png"

    # Hash-based naming for diff stability
    img_hash = compute_image_hash(data)
    img_name = f"img_{img_hash}.{extension}"
    img_path = images_dir / img_name

    # Only write if not already extracted (dedup by hash)
    if not img_path.exists():
        images_dir.mkdir(parents=True, exist_ok=True)
        img_path.write_bytes(data)

    return img_path
```
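Because filenames derive from the content hash, re-saving identical bytes is a no-op, which is what keeps output diff-stable across conversions. A sketch (the byte string is a placeholder; `save_image` does not validate image content):

```python
import tempfile
from pathlib import Path

from ot_tools._convert.utils import save_image

with tempfile.TemporaryDirectory() as tmp:
    images = Path(tmp) / "imgs"
    data = b"not-a-real-png"
    p1 = save_image(data, images, "image/png")
    p2 = save_image(data, images, "image/png")
    assert p1 == p2                           # same hash, same name
    assert len(list(images.iterdir())) == 1   # second call skipped the write
```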
```python
def get_mtime_iso(path: Path) -> str:
    """Get file modification time as ISO 8601 string.

    Args:
        path: Path to file

    Returns:
        ISO 8601 timestamp with Z suffix
    """
    mtime = path.stat().st_mtime
    return datetime.fromtimestamp(mtime, tz=UTC).strftime("%Y-%m-%dT%H:%M:%SZ")


def generate_frontmatter(
    *,
    source: str,
    converted: str,
    pages: int | str,
    checksum: str,
) -> str:
    """Generate YAML frontmatter for converted document.

    Args:
        source: Relative path to source file
        converted: ISO 8601 timestamp (source file mtime)
        pages: Page/slide/sheet count (may be prefixed with ~ for estimates)
        checksum: SHA256 hash of source file

    Returns:
        YAML frontmatter block including delimiters
    """
    return f"""---
source: {source}
converted: {converted}
pages: {pages}
checksum: {checksum}
---
"""
```
```python
def generate_toc(
    headings: Sequence[tuple[int, str, int, int]],
    main_file: str,
    source: str,
    converted: str,
    pages: int | str,
    checksum: str,
) -> str:
    """Generate table of contents as a separate file with line ranges.

    Creates a TOC document with frontmatter and instructions for LLMs
    on how to use the line numbers to navigate the main document.

    Args:
        headings: List of (level, title, start_line, end_line) tuples
            Level 1 = H1, Level 2 = H2, etc.
        main_file: Filename of the main markdown file (for linking)
        source: Original source file path (for reference)
        converted: ISO 8601 timestamp (source file mtime)
        pages: Page/slide/sheet count (may be prefixed with ~ for estimates)
        checksum: SHA256 hash of source file

    Returns:
        Complete markdown TOC document with frontmatter
    """
    lines = [
        "---",
        f"source: {source}",
        f"converted: {converted}",
        f"pages: {pages}",
        f"checksum: {checksum}",
        "---",
        "",
        "# Table of Contents",
        "",
        f"**Document:** [{main_file}]({main_file})",
        "",
        "## How to Use This TOC",
        "",
        "Each entry shows `(lines <start>-<end>)` for the main document.",
        "To read a section efficiently:",
        "",
        "1. Find the section you need below",
        f"2. Use the line range to read only that portion of [{main_file}]({main_file})",
        "3. Line numbers are exact - no offset needed",
        "",
        "---",
        "",
        "## Contents",
        "",
    ]

    if not headings:
        lines.append("*No headings found in document.*")
    else:
        for level, title, start_line, end_line in headings:
            indent = " " * (level - 1)
            # Create anchor linking to section in main file
            anchor = _slugify(title)
            lines.append(
                f"{indent}- [{title}]({main_file}#{anchor}) (lines {start_line}-{end_line})"
            )

    lines.append("")
    return "\n".join(lines)


def _slugify(text: str) -> str:
    """Convert text to URL-safe anchor.

    Args:
        text: Text to slugify

    Returns:
        Lowercase slug with hyphens
    """
    # Remove non-alphanumeric chars, replace spaces with hyphens
    slug = re.sub(r"[^\w\s-]", "", text.lower())
    return re.sub(r"[\s_]+", "-", slug).strip("-")
```
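The anchors `_slugify` produces follow the common lowercase-hyphen convention, though Markdown renderers slug punctuation slightly differently, so the links are best-effort. Expected values, traced from the two regexes above (the private helper is imported directly for the demo):

```python
from ot_tools._convert.utils import _slugify

_slugify("How to Use This TOC")  # -> "how-to-use-this-toc"
_slugify("API & CLI (v2)")       # -> "api-cli-v2"
```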
```python
def write_toc_file(
    headings: list[tuple[int, str, int, int]],
    output_dir: Path,
    stem: str,
    source: str,
    converted: str,
    pages: int | str,
    checksum: str,
) -> Path:
    """Write TOC to a separate file with frontmatter.

    Args:
        headings: List of (level, title, start_line, end_line) tuples
        output_dir: Directory for output files
        stem: Base filename (without extension)
        source: Original source file path
        converted: ISO 8601 timestamp (source file mtime)
        pages: Page/slide/sheet count (may be prefixed with ~ for estimates)
        checksum: SHA256 hash of source file

    Returns:
        Path to the written TOC file
    """
    main_file = f"{stem}.md"
    toc_content = generate_toc(headings, main_file, source, converted, pages, checksum)
    toc_path = output_dir / f"{stem}.toc.md"
    toc_path.write_text(toc_content, encoding="utf-8")
    return toc_path


def normalise_whitespace(content: str) -> str:
    """Normalise whitespace for diff-stable output.

    - Converts CRLF to LF
    - Removes trailing whitespace
    - Ensures consistent blank line spacing (max 2 consecutive)
    - Ensures single trailing newline

    Args:
        content: Raw content

    Returns:
        Normalised content
    """
    # Normalise line endings
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    # Remove trailing whitespace from each line
    lines = [line.rstrip() for line in content.split("\n")]

    # Collapse multiple blank lines to max 2
    result_lines: list[str] = []
    blank_count = 0
    for line in lines:
        if line == "":
            blank_count += 1
            if blank_count <= 2:
                result_lines.append(line)
        else:
            blank_count = 0
            result_lines.append(line)

    # Join and ensure single trailing newline
    result = "\n".join(result_lines).rstrip("\n") + "\n"
    return result
```
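A quick before/after for `normalise_whitespace`:

```python
from ot_tools._convert.utils import normalise_whitespace

raw = "Title   \r\n\r\n\r\n\r\nBody\r\n"
print(repr(normalise_whitespace(raw)))
# 'Title\n\n\nBody\n' -- CRLF converted, trailing spaces stripped,
# three blank lines collapsed to two, single trailing newline
```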
```python
class IncrementalWriter:
    """Write content incrementally to track line numbers.

    Buffers content and tracks line numbers for TOC generation.
    """

    def __init__(self) -> None:
        self._buffer = io.StringIO()
        self._line_count = 0
        self._headings: list[tuple[int, str, int, int]] = []
        self._current_heading: tuple[int, str, int] | None = None

    def write(self, text: str) -> None:
        """Write text to buffer."""
        self._buffer.write(text)
        self._line_count += text.count("\n")

    def write_heading(self, level: int, title: str) -> None:
        """Write a heading and track it for TOC.

        Args:
            level: Heading level (1-6)
            title: Heading text
        """
        # Close previous heading
        if self._current_heading:
            prev_level, prev_title, prev_start = self._current_heading
            self._headings.append((prev_level, prev_title, prev_start, self._line_count))

        # Start new heading
        heading_line = self._line_count + 1
        self._current_heading = (level, title, heading_line)

        # Write the heading
        prefix = "#" * level
        self.write(f"{prefix} {title}\n\n")

    def close_heading(self) -> None:
        """Close the current heading section."""
        if self._current_heading:
            prev_level, prev_title, prev_start = self._current_heading
            self._headings.append((prev_level, prev_title, prev_start, self._line_count))
            self._current_heading = None

    def get_content(self) -> str:
        """Get buffered content."""
        return self._buffer.getvalue()

    def get_headings(self) -> list[tuple[int, str, int, int]]:
        """Get collected headings for TOC generation."""
        # Close any open heading
        self.close_heading()
        return self._headings

    @property
    def line_count(self) -> int:
        """Current line count."""
        return self._line_count
```