onetool-mcp 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. bench/__init__.py +5 -0
  2. bench/cli.py +69 -0
  3. bench/harness/__init__.py +66 -0
  4. bench/harness/client.py +692 -0
  5. bench/harness/config.py +397 -0
  6. bench/harness/csv_writer.py +109 -0
  7. bench/harness/evaluate.py +512 -0
  8. bench/harness/metrics.py +283 -0
  9. bench/harness/runner.py +899 -0
  10. bench/py.typed +0 -0
  11. bench/reporter.py +629 -0
  12. bench/run.py +487 -0
  13. bench/secrets.py +101 -0
  14. bench/utils.py +16 -0
  15. onetool/__init__.py +4 -0
  16. onetool/cli.py +391 -0
  17. onetool/py.typed +0 -0
  18. onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
  19. onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
  20. onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
  21. onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
  22. onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
  23. onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
  24. ot/__init__.py +37 -0
  25. ot/__main__.py +6 -0
  26. ot/_cli.py +107 -0
  27. ot/_tui.py +53 -0
  28. ot/config/__init__.py +46 -0
  29. ot/config/defaults/bench.yaml +4 -0
  30. ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
  31. ot/config/defaults/diagram-templates/c4-context.puml +30 -0
  32. ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
  33. ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
  34. ot/config/defaults/diagram-templates/microservices.d2 +81 -0
  35. ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
  36. ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
  37. ot/config/defaults/onetool.yaml +25 -0
  38. ot/config/defaults/prompts.yaml +97 -0
  39. ot/config/defaults/servers.yaml +7 -0
  40. ot/config/defaults/snippets.yaml +4 -0
  41. ot/config/defaults/tool_templates/__init__.py +7 -0
  42. ot/config/defaults/tool_templates/extension.py +52 -0
  43. ot/config/defaults/tool_templates/isolated.py +61 -0
  44. ot/config/dynamic.py +121 -0
  45. ot/config/global_templates/__init__.py +2 -0
  46. ot/config/global_templates/bench-secrets-template.yaml +6 -0
  47. ot/config/global_templates/bench.yaml +9 -0
  48. ot/config/global_templates/onetool.yaml +27 -0
  49. ot/config/global_templates/secrets-template.yaml +44 -0
  50. ot/config/global_templates/servers.yaml +18 -0
  51. ot/config/global_templates/snippets.yaml +235 -0
  52. ot/config/loader.py +1087 -0
  53. ot/config/mcp.py +145 -0
  54. ot/config/secrets.py +190 -0
  55. ot/config/tool_config.py +125 -0
  56. ot/decorators.py +116 -0
  57. ot/executor/__init__.py +35 -0
  58. ot/executor/base.py +16 -0
  59. ot/executor/fence_processor.py +83 -0
  60. ot/executor/linter.py +142 -0
  61. ot/executor/pack_proxy.py +260 -0
  62. ot/executor/param_resolver.py +140 -0
  63. ot/executor/pep723.py +288 -0
  64. ot/executor/result_store.py +369 -0
  65. ot/executor/runner.py +496 -0
  66. ot/executor/simple.py +163 -0
  67. ot/executor/tool_loader.py +396 -0
  68. ot/executor/validator.py +398 -0
  69. ot/executor/worker_pool.py +388 -0
  70. ot/executor/worker_proxy.py +189 -0
  71. ot/http_client.py +145 -0
  72. ot/logging/__init__.py +37 -0
  73. ot/logging/config.py +315 -0
  74. ot/logging/entry.py +213 -0
  75. ot/logging/format.py +188 -0
  76. ot/logging/span.py +349 -0
  77. ot/meta.py +1555 -0
  78. ot/paths.py +453 -0
  79. ot/prompts.py +218 -0
  80. ot/proxy/__init__.py +21 -0
  81. ot/proxy/manager.py +396 -0
  82. ot/py.typed +0 -0
  83. ot/registry/__init__.py +189 -0
  84. ot/registry/models.py +57 -0
  85. ot/registry/parser.py +269 -0
  86. ot/registry/registry.py +413 -0
  87. ot/server.py +315 -0
  88. ot/shortcuts/__init__.py +15 -0
  89. ot/shortcuts/aliases.py +87 -0
  90. ot/shortcuts/snippets.py +258 -0
  91. ot/stats/__init__.py +35 -0
  92. ot/stats/html.py +250 -0
  93. ot/stats/jsonl_writer.py +283 -0
  94. ot/stats/reader.py +354 -0
  95. ot/stats/timing.py +57 -0
  96. ot/support.py +63 -0
  97. ot/tools.py +114 -0
  98. ot/utils/__init__.py +81 -0
  99. ot/utils/batch.py +161 -0
  100. ot/utils/cache.py +120 -0
  101. ot/utils/deps.py +403 -0
  102. ot/utils/exceptions.py +23 -0
  103. ot/utils/factory.py +179 -0
  104. ot/utils/format.py +65 -0
  105. ot/utils/http.py +202 -0
  106. ot/utils/platform.py +45 -0
  107. ot/utils/sanitize.py +130 -0
  108. ot/utils/truncate.py +69 -0
  109. ot_tools/__init__.py +4 -0
  110. ot_tools/_convert/__init__.py +12 -0
  111. ot_tools/_convert/excel.py +279 -0
  112. ot_tools/_convert/pdf.py +254 -0
  113. ot_tools/_convert/powerpoint.py +268 -0
  114. ot_tools/_convert/utils.py +358 -0
  115. ot_tools/_convert/word.py +283 -0
  116. ot_tools/brave_search.py +604 -0
  117. ot_tools/code_search.py +736 -0
  118. ot_tools/context7.py +495 -0
  119. ot_tools/convert.py +614 -0
  120. ot_tools/db.py +415 -0
  121. ot_tools/diagram.py +1604 -0
  122. ot_tools/diagram.yaml +167 -0
  123. ot_tools/excel.py +1372 -0
  124. ot_tools/file.py +1348 -0
  125. ot_tools/firecrawl.py +732 -0
  126. ot_tools/grounding_search.py +646 -0
  127. ot_tools/package.py +604 -0
  128. ot_tools/py.typed +0 -0
  129. ot_tools/ripgrep.py +544 -0
  130. ot_tools/scaffold.py +471 -0
  131. ot_tools/transform.py +213 -0
  132. ot_tools/web_fetch.py +384 -0
@@ -0,0 +1,268 @@
1
+ """PowerPoint to Markdown converter.
2
+
3
+ Converts PPTX presentations to Markdown with:
4
+ - Slide title extraction
5
+ - Table conversion
6
+ - Hash-based image naming for diff stability
7
+ - YAML frontmatter and TOC generation
8
+ - Optional speaker notes
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from pathlib import Path # noqa: TC003 (used at runtime)
14
+ from typing import TYPE_CHECKING, Any
15
+
16
+ try:
17
+ from pptx import Presentation
18
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
19
+ from pptx.shapes.picture import Picture
20
+ except ImportError as e:
21
+ raise ImportError(
22
+ "python-pptx is required for convert. Install with: pip install python-pptx"
23
+ ) from e
24
+
25
+ from ot_tools._convert.utils import (
26
+ IncrementalWriter,
27
+ compute_file_checksum,
28
+ get_mtime_iso,
29
+ normalise_whitespace,
30
+ save_image,
31
+ write_toc_file,
32
+ )
33
+
34
+ if TYPE_CHECKING:
35
+ from pptx.presentation import Presentation as PresentationType
36
+ from pptx.shapes.base import BaseShape
37
+ from pptx.slide import Slide
38
+
39
+
40
def convert_powerpoint(
    input_path: Path,
    output_dir: Path,
    source_rel: str,
    *,
    include_notes: bool = False,
) -> dict[str, Any]:
    """Convert a PPTX presentation to Markdown plus a companion TOC file.

    Args:
        input_path: Path to the PPTX file
        output_dir: Directory that receives the generated files
        source_rel: Relative source path recorded in the TOC frontmatter
        include_notes: When True, append speaker notes after slide content

    Returns:
        Dict with 'output', 'toc', 'slides', 'images' keys
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    presentation: PresentationType = Presentation(str(input_path))
    try:
        # Metadata for the TOC frontmatter.
        checksum = compute_file_checksum(input_path)
        mtime = get_mtime_iso(input_path)
        slide_total = len(presentation.slides)

        # Extracted images live in a sibling "<stem>_images" directory.
        images_dir = output_dir / f"{input_path.stem}_images"
        writer = IncrementalWriter()
        image_total = 0

        for index, slide in enumerate(presentation.slides, 1):
            image_total += _process_slide(
                slide, index, writer, images_dir, include_notes
            )
    finally:
        # python-pptx has no explicit close(); dropping the reference lets
        # garbage collection reclaim the presentation's resources.
        del presentation

    # Main output is pure content (no frontmatter) so line numbers start at 1.
    markdown = normalise_whitespace(writer.get_content())
    md_path = output_dir / f"{input_path.stem}.md"
    md_path.write_text(markdown, encoding="utf-8")

    # The TOC file carries the frontmatter and line-range navigation info.
    toc_path = write_toc_file(
        headings=writer.get_headings(),
        output_dir=output_dir,
        stem=input_path.stem,
        source=source_rel,
        converted=mtime,
        pages=slide_total,
        checksum=checksum,
    )

    return {
        "output": str(md_path),
        "toc": str(toc_path),
        "slides": slide_total,
        "images": image_total,
    }
107
+
108
+
109
def _process_slide(
    slide: Slide,
    slide_number: int,
    writer: IncrementalWriter,
    images_dir: Path,
    include_notes: bool,
) -> int:
    """Render one slide into the writer as a Markdown section.

    Emits a level-2 heading (slide title or "Slide N"), then shape content.
    Note the output ordering: image references are written inline as shapes
    are visited, while text and tables are buffered and written afterwards —
    so a slide's images appear before its text and tables.

    Args:
        slide: python-pptx slide object
        slide_number: 1-based slide index (used for the fallback title)
        writer: IncrementalWriter receiving the Markdown output
        images_dir: Directory where extracted images are saved
        include_notes: Append speaker notes as a blockquote when True

    Returns:
        Number of images extracted
    """
    images_extracted = 0

    # Get slide title; fall back to a positional name when absent/empty.
    title = f"Slide {slide_number}"
    try:
        if (
            hasattr(slide, "shapes")
            and hasattr(slide.shapes, "title")
            and slide.shapes.title
            and slide.shapes.title.text.strip()
        ):
            title = slide.shapes.title.text.strip()
    except AttributeError:
        pass

    writer.write_heading(2, title)

    # Process shapes; text and tables are buffered so their relative order
    # within the slide section is text-then-tables regardless of shape order.
    text_content: list[str] = []
    tables_content: list[str] = []

    for shape in slide.shapes:
        shape_type = getattr(shape, "shape_type", None)

        # Tables (check FIRST - tables also have .text attribute)
        if shape_type == MSO_SHAPE_TYPE.TABLE:
            try:
                if hasattr(shape, "table"):
                    table_md = _process_table(shape.table)
                    if table_md:
                        tables_content.append(table_md)
            except (AttributeError, ValueError):
                pass

        # Images: written immediately; extraction failures are skipped
        # (best-effort — a bad image should not abort the conversion).
        elif shape_type == MSO_SHAPE_TYPE.PICTURE:
            try:
                if isinstance(shape, Picture):
                    img_ref = _process_image(shape, images_dir)
                    if img_ref:
                        writer.write(img_ref + "\n\n")
                        images_extracted += 1
            except Exception:
                pass

        # Text shapes (check LAST - after tables and images)
        elif hasattr(shape, "text") and shape.text and shape.text.strip():
            # Skip title shape (already emitted as the section heading)
            if shape == getattr(slide.shapes, "title", None):
                continue
            text_content.append(_process_text_shape(shape))

    # Write buffered text content
    for text in text_content:
        if text:
            writer.write(text + "\n\n")

    # Write buffered tables
    for table in tables_content:
        if table:
            writer.write(table + "\n\n")

    # Add speaker notes if requested; missing/broken notes are ignored.
    if include_notes and hasattr(slide, "notes_slide"):
        try:
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame and notes_frame.text.strip():
                writer.write("**Speaker Notes:**\n\n")
                writer.write(f"> {notes_frame.text.strip()}\n\n")
        except Exception:
            pass

    # Horizontal rule separates slides in the combined document.
    writer.write("---\n\n")

    return images_extracted
197
+
198
+
199
+ def _process_text_shape(shape: BaseShape) -> str:
200
+ """Process text from a shape."""
201
+ if not hasattr(shape, "text") or not shape.text.strip():
202
+ return ""
203
+
204
+ text: str = str(shape.text).strip()
205
+ lines = text.split("\n")
206
+
207
+ if len(lines) > 1:
208
+ # Format as bullet list
209
+ processed: list[str] = []
210
+ for line in lines:
211
+ line = line.strip()
212
+ if line:
213
+ # Remove existing bullet markers
214
+ for marker in ("•", "-", "*", "○", "▪", "▫"):
215
+ if line.startswith(marker):
216
+ line = line[1:].strip()
217
+ break
218
+ processed.append(f"- {line}")
219
+ return "\n".join(processed)
220
+
221
+ return text
222
+
223
+
224
+ def _process_table(table: Any) -> str:
225
+ """Convert table to Markdown."""
226
+ if not hasattr(table, "rows") or not table.rows:
227
+ return ""
228
+
229
+ lines: list[str] = []
230
+
231
+ # Header row
232
+ header_cells: list[str] = []
233
+ for cell in table.rows[0].cells:
234
+ header_cells.append(cell.text.strip() if hasattr(cell, "text") else "")
235
+
236
+ if not header_cells:
237
+ return ""
238
+
239
+ lines.append("| " + " | ".join(header_cells) + " |")
240
+ lines.append("| " + " | ".join("---" for _ in header_cells) + " |")
241
+
242
+ # Data rows
243
+ for i in range(1, len(table.rows)):
244
+ row = table.rows[i]
245
+ cells: list[str] = []
246
+ for cell in row.cells:
247
+ cells.append(cell.text.strip() if hasattr(cell, "text") else "")
248
+
249
+ while len(cells) < len(header_cells):
250
+ cells.append("")
251
+
252
+ lines.append("| " + " | ".join(cells[: len(header_cells)]) + " |")
253
+
254
+ return "\n".join(lines)
255
+
256
+
257
def _process_image(shape: Picture, images_dir: Path) -> str:
    """Save a picture shape's bytes and return its Markdown image reference.

    Returns "" when the image cannot be extracted (best-effort: a broken
    image must not abort the slide conversion).
    """
    try:
        blob = shape.image.blob
        mime = shape.image.content_type
        saved = save_image(blob, images_dir, mime)
        # Reference is relative to the output directory.
        return f"![{saved.name}]({images_dir.name}/{saved.name})"
    except Exception:
        return ""
@@ -0,0 +1,358 @@
1
+ """Shared utilities for document converters.
2
+
3
+ Provides diff-stable output formatting including:
4
+ - YAML frontmatter generation
5
+ - TOC generation with line ranges
6
+ - Hash-based image naming
7
+ - Whitespace normalisation
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import io
14
+ import re
15
+ from datetime import UTC, datetime
16
+ from pathlib import Path
17
+ from typing import TYPE_CHECKING
18
+
19
+ if TYPE_CHECKING:
20
+ from collections.abc import Sequence
21
+
22
+ from functools import lru_cache
23
+
24
_CHECKSUM_CACHE_MAX_SIZE = 100


@lru_cache(maxsize=_CHECKSUM_CACHE_MAX_SIZE)
def _compute_checksum_cached(
    path_str: str,
    mtime: float,  # noqa: ARG001 - used as cache key
    size: int,  # noqa: ARG001 - used as cache key
) -> str:
    """Hash a file's bytes; memoised via lru_cache (thread-safe).

    Args:
        path_str: Resolved path string
        mtime: File modification time (cache key only — invalidates on change)
        size: File size in bytes (cache key only — invalidates on change)

    Returns:
        Checksum in format 'sha256:abc123...'
    """
    digest = hashlib.sha256()
    # Stream in 8 KiB chunks so large files are never fully materialised.
    with Path(path_str).open("rb") as fh:
        while chunk := fh.read(8192):
            digest.update(chunk)
    return f"sha256:{digest.hexdigest()}"


def compute_file_checksum(path: Path) -> str:
    """Compute the SHA256 checksum of a file with transparent caching.

    The cache key is (resolved path, mtime, size), so a modified file is
    re-hashed while repeated conversions of an unchanged file hit the cache.

    Args:
        path: Path to file

    Returns:
        Checksum in format 'sha256:abc123...'
    """
    info = path.stat()
    return _compute_checksum_cached(str(path.resolve()), info.st_mtime, info.st_size)
65
+
66
+
67
def compute_image_hash(data: bytes) -> str:
    """Return a short content hash (first 8 hex chars of SHA256) for *data*."""
    return hashlib.sha256(data).hexdigest()[:8]


def save_image(data: bytes, images_dir: Path, content_type: str) -> Path:
    """Persist image bytes under a hash-based name for diff-stable output.

    Identical bytes always map to the same filename, so re-running a
    conversion never rewrites or duplicates unchanged images.

    Args:
        data: Image bytes
        images_dir: Directory to save image
        content_type: MIME content type (e.g., "image/png", "image/jpeg")

    Returns:
        Path to saved image file
    """
    # Map MIME type to extension; jpeg checked first, png is the fallback.
    extension = "png"
    for token, ext in (("jpeg", "jpg"), ("jpg", "jpg"), ("png", "png"), ("gif", "gif")):
        if token in content_type:
            extension = ext
            break

    target = images_dir / f"img_{compute_image_hash(data)}.{extension}"

    # Skip the write when the identical image was already extracted.
    if not target.exists():
        images_dir.mkdir(parents=True, exist_ok=True)
        target.write_bytes(data)

    return target
111
+
112
+
113
+ def get_mtime_iso(path: Path) -> str:
114
+ """Get file modification time as ISO 8601 string.
115
+
116
+ Args:
117
+ path: Path to file
118
+
119
+ Returns:
120
+ ISO 8601 timestamp with Z suffix
121
+ """
122
+ mtime = path.stat().st_mtime
123
+ return datetime.fromtimestamp(mtime, tz=UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
124
+
125
+
126
def generate_frontmatter(
    *,
    source: str,
    converted: str,
    pages: int | str,
    checksum: str,
) -> str:
    """Build the YAML frontmatter block for a converted document.

    Args:
        source: Relative path to source file
        converted: ISO 8601 timestamp (source file mtime)
        pages: Page/slide/sheet count (may be prefixed with ~ for estimates)
        checksum: SHA256 hash of source file

    Returns:
        YAML frontmatter block including delimiters and a trailing newline
    """
    fields = [
        "---",
        f"source: {source}",
        f"converted: {converted}",
        f"pages: {pages}",
        f"checksum: {checksum}",
        "---",
        "",  # yields the trailing newline after the closing delimiter
    ]
    return "\n".join(fields)
151
+
152
+
153
def generate_toc(
    headings: Sequence[tuple[int, str, int, int]],
    main_file: str,
    source: str,
    converted: str,
    pages: int | str,
    checksum: str,
) -> str:
    """Generate table of contents as a separate file with line ranges.

    Creates a TOC document with frontmatter and instructions for LLMs
    on how to use the line numbers to navigate the main document.

    Args:
        headings: List of (level, title, start_line, end_line) tuples
            Level 1 = H1, Level 2 = H2, etc.
        main_file: Filename of the main markdown file (for linking)
        source: Original source file path (for reference)
        converted: ISO 8601 timestamp (source file mtime)
        pages: Page/slide/sheet count (may be prefixed with ~ for estimates)
        checksum: SHA256 hash of source file

    Returns:
        Complete markdown TOC document with frontmatter
    """
    # Fixed preamble: frontmatter, usage instructions, then the entry list.
    lines = [
        "---",
        f"source: {source}",
        f"converted: {converted}",
        f"pages: {pages}",
        f"checksum: {checksum}",
        "---",
        "",
        "# Table of Contents",
        "",
        f"**Document:** [{main_file}]({main_file})",
        "",
        "## How to Use This TOC",
        "",
        "Each entry shows `(lines <start>-<end>)` for the main document.",
        "To read a section efficiently:",
        "",
        "1. Find the section you need below",
        f"2. Use the line range to read only that portion of [{main_file}]({main_file})",
        "3. Line numbers are exact - no offset needed",
        "",
        "---",
        "",
        "## Contents",
        "",
    ]

    if not headings:
        lines.append("*No headings found in document.*")
    else:
        # One list entry per heading; nesting depth mirrors heading level.
        for level, title, start_line, end_line in headings:
            indent = " " * (level - 1)
            # Create anchor linking to section in main file
            anchor = _slugify(title)
            lines.append(
                f"{indent}- [{title}]({main_file}#{anchor}) (lines {start_line}-{end_line})"
            )

    lines.append("")
    return "\n".join(lines)
218
+
219
+
220
+ def _slugify(text: str) -> str:
221
+ """Convert text to URL-safe anchor.
222
+
223
+ Args:
224
+ text: Text to slugify
225
+
226
+ Returns:
227
+ Lowercase slug with hyphens
228
+ """
229
+ # Remove non-alphanumeric chars, replace spaces with hyphens
230
+ slug = re.sub(r"[^\w\s-]", "", text.lower())
231
+ return re.sub(r"[\s_]+", "-", slug).strip("-")
232
+
233
+
234
def write_toc_file(
    headings: list[tuple[int, str, int, int]],
    output_dir: Path,
    stem: str,
    source: str,
    converted: str,
    pages: int | str,
    checksum: str,
) -> Path:
    """Render the TOC document and write it beside the main markdown file.

    Args:
        headings: List of (level, title, start_line, end_line) tuples
        output_dir: Directory for output files
        stem: Base filename (without extension)
        source: Original source file path
        converted: ISO 8601 timestamp (source file mtime)
        pages: Page/slide/sheet count (may be prefixed with ~ for estimates)
        checksum: SHA256 hash of source file

    Returns:
        Path to the written '<stem>.toc.md' file
    """
    toc_path = output_dir / f"{stem}.toc.md"
    document = generate_toc(headings, f"{stem}.md", source, converted, pages, checksum)
    toc_path.write_text(document, encoding="utf-8")
    return toc_path
262
+
263
+
264
def normalise_whitespace(content: str) -> str:
    """Normalise text for diff-stable markdown output.

    - CRLF/CR line endings become LF
    - trailing whitespace is stripped from every line
    - runs of blank lines are capped at two
    - output ends with exactly one newline

    Args:
        content: Raw content

    Returns:
        Normalised content
    """
    unified = content.replace("\r\n", "\n").replace("\r", "\n")
    stripped = [line.rstrip() for line in unified.split("\n")]

    kept: list[str] = []
    run_of_blanks = 0
    for line in stripped:
        if line:
            run_of_blanks = 0
            kept.append(line)
        else:
            run_of_blanks += 1
            if run_of_blanks <= 2:  # cap consecutive blank lines at two
                kept.append(line)

    return "\n".join(kept).rstrip("\n") + "\n"
299
+
300
+
301
class IncrementalWriter:
    """Buffer markdown output while tracking heading line numbers for a TOC.

    Headings opened via write_heading are recorded as
    (level, title, start_line, end_line) tuples; a section's end line is
    fixed when the next heading starts or when close_heading/get_headings
    is called.
    """

    def __init__(self) -> None:
        self._buffer = io.StringIO()  # accumulated markdown text
        self._line_count = 0  # newlines written so far
        self._headings: list[tuple[int, str, int, int]] = []
        # Open section as (level, title, start_line), or None when closed.
        self._current_heading: tuple[int, str, int] | None = None

    def write(self, text: str) -> None:
        """Append *text* to the buffer, updating the running line count."""
        self._buffer.write(text)
        self._line_count += text.count("\n")

    def write_heading(self, level: int, title: str) -> None:
        """Emit a markdown heading and begin tracking its section.

        Args:
            level: Heading level (1-6)
            title: Heading text
        """
        # Finalise the previous section before this heading is written,
        # so its end line excludes the new heading itself.
        self.close_heading()
        self._current_heading = (level, title, self._line_count + 1)
        self.write(f"{'#' * level} {title}\n\n")

    def close_heading(self) -> None:
        """Finalise the open section, recording its end line."""
        if self._current_heading is not None:
            level, title, start = self._current_heading
            self._headings.append((level, title, start, self._line_count))
            self._current_heading = None

    def get_content(self) -> str:
        """Return all buffered markdown."""
        return self._buffer.getvalue()

    def get_headings(self) -> list[tuple[int, str, int, int]]:
        """Close any open section and return the recorded headings."""
        self.close_heading()
        return self._headings

    @property
    def line_count(self) -> int:
        """Number of newline characters written so far."""
        return self._line_count