onetool-mcp 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. bench/__init__.py +5 -0
  2. bench/cli.py +69 -0
  3. bench/harness/__init__.py +66 -0
  4. bench/harness/client.py +692 -0
  5. bench/harness/config.py +397 -0
  6. bench/harness/csv_writer.py +109 -0
  7. bench/harness/evaluate.py +512 -0
  8. bench/harness/metrics.py +283 -0
  9. bench/harness/runner.py +899 -0
  10. bench/py.typed +0 -0
  11. bench/reporter.py +629 -0
  12. bench/run.py +487 -0
  13. bench/secrets.py +101 -0
  14. bench/utils.py +16 -0
  15. onetool/__init__.py +4 -0
  16. onetool/cli.py +391 -0
  17. onetool/py.typed +0 -0
  18. onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
  19. onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
  20. onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
  21. onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
  22. onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
  23. onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
  24. ot/__init__.py +37 -0
  25. ot/__main__.py +6 -0
  26. ot/_cli.py +107 -0
  27. ot/_tui.py +53 -0
  28. ot/config/__init__.py +46 -0
  29. ot/config/defaults/bench.yaml +4 -0
  30. ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
  31. ot/config/defaults/diagram-templates/c4-context.puml +30 -0
  32. ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
  33. ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
  34. ot/config/defaults/diagram-templates/microservices.d2 +81 -0
  35. ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
  36. ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
  37. ot/config/defaults/onetool.yaml +25 -0
  38. ot/config/defaults/prompts.yaml +97 -0
  39. ot/config/defaults/servers.yaml +7 -0
  40. ot/config/defaults/snippets.yaml +4 -0
  41. ot/config/defaults/tool_templates/__init__.py +7 -0
  42. ot/config/defaults/tool_templates/extension.py +52 -0
  43. ot/config/defaults/tool_templates/isolated.py +61 -0
  44. ot/config/dynamic.py +121 -0
  45. ot/config/global_templates/__init__.py +2 -0
  46. ot/config/global_templates/bench-secrets-template.yaml +6 -0
  47. ot/config/global_templates/bench.yaml +9 -0
  48. ot/config/global_templates/onetool.yaml +27 -0
  49. ot/config/global_templates/secrets-template.yaml +44 -0
  50. ot/config/global_templates/servers.yaml +18 -0
  51. ot/config/global_templates/snippets.yaml +235 -0
  52. ot/config/loader.py +1087 -0
  53. ot/config/mcp.py +145 -0
  54. ot/config/secrets.py +190 -0
  55. ot/config/tool_config.py +125 -0
  56. ot/decorators.py +116 -0
  57. ot/executor/__init__.py +35 -0
  58. ot/executor/base.py +16 -0
  59. ot/executor/fence_processor.py +83 -0
  60. ot/executor/linter.py +142 -0
  61. ot/executor/pack_proxy.py +260 -0
  62. ot/executor/param_resolver.py +140 -0
  63. ot/executor/pep723.py +288 -0
  64. ot/executor/result_store.py +369 -0
  65. ot/executor/runner.py +496 -0
  66. ot/executor/simple.py +163 -0
  67. ot/executor/tool_loader.py +396 -0
  68. ot/executor/validator.py +398 -0
  69. ot/executor/worker_pool.py +388 -0
  70. ot/executor/worker_proxy.py +189 -0
  71. ot/http_client.py +145 -0
  72. ot/logging/__init__.py +37 -0
  73. ot/logging/config.py +315 -0
  74. ot/logging/entry.py +213 -0
  75. ot/logging/format.py +188 -0
  76. ot/logging/span.py +349 -0
  77. ot/meta.py +1555 -0
  78. ot/paths.py +453 -0
  79. ot/prompts.py +218 -0
  80. ot/proxy/__init__.py +21 -0
  81. ot/proxy/manager.py +396 -0
  82. ot/py.typed +0 -0
  83. ot/registry/__init__.py +189 -0
  84. ot/registry/models.py +57 -0
  85. ot/registry/parser.py +269 -0
  86. ot/registry/registry.py +413 -0
  87. ot/server.py +315 -0
  88. ot/shortcuts/__init__.py +15 -0
  89. ot/shortcuts/aliases.py +87 -0
  90. ot/shortcuts/snippets.py +258 -0
  91. ot/stats/__init__.py +35 -0
  92. ot/stats/html.py +250 -0
  93. ot/stats/jsonl_writer.py +283 -0
  94. ot/stats/reader.py +354 -0
  95. ot/stats/timing.py +57 -0
  96. ot/support.py +63 -0
  97. ot/tools.py +114 -0
  98. ot/utils/__init__.py +81 -0
  99. ot/utils/batch.py +161 -0
  100. ot/utils/cache.py +120 -0
  101. ot/utils/deps.py +403 -0
  102. ot/utils/exceptions.py +23 -0
  103. ot/utils/factory.py +179 -0
  104. ot/utils/format.py +65 -0
  105. ot/utils/http.py +202 -0
  106. ot/utils/platform.py +45 -0
  107. ot/utils/sanitize.py +130 -0
  108. ot/utils/truncate.py +69 -0
  109. ot_tools/__init__.py +4 -0
  110. ot_tools/_convert/__init__.py +12 -0
  111. ot_tools/_convert/excel.py +279 -0
  112. ot_tools/_convert/pdf.py +254 -0
  113. ot_tools/_convert/powerpoint.py +268 -0
  114. ot_tools/_convert/utils.py +358 -0
  115. ot_tools/_convert/word.py +283 -0
  116. ot_tools/brave_search.py +604 -0
  117. ot_tools/code_search.py +736 -0
  118. ot_tools/context7.py +495 -0
  119. ot_tools/convert.py +614 -0
  120. ot_tools/db.py +415 -0
  121. ot_tools/diagram.py +1604 -0
  122. ot_tools/diagram.yaml +167 -0
  123. ot_tools/excel.py +1372 -0
  124. ot_tools/file.py +1348 -0
  125. ot_tools/firecrawl.py +732 -0
  126. ot_tools/grounding_search.py +646 -0
  127. ot_tools/package.py +604 -0
  128. ot_tools/py.typed +0 -0
  129. ot_tools/ripgrep.py +544 -0
  130. ot_tools/scaffold.py +471 -0
  131. ot_tools/transform.py +213 -0
  132. ot_tools/web_fetch.py +384 -0
ot_tools/_convert/__init__.py
@@ -0,0 +1,12 @@
+ """Document conversion utilities for OneTool.
+
+ Provides PDF, Word, PowerPoint, and Excel to Markdown conversion
+ with LLM-optimised output including YAML frontmatter and TOC.
+ """
+
+ from ot_tools._convert.excel import convert_excel
+ from ot_tools._convert.pdf import convert_pdf
+ from ot_tools._convert.powerpoint import convert_powerpoint
+ from ot_tools._convert.word import convert_word
+
+ __all__ = ["convert_excel", "convert_pdf", "convert_powerpoint", "convert_word"]
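
Taken together, these re-exports are the package-level conversion API. A minimal usage sketch (file paths are hypothetical; the optional openpyxl / pymupdf dependencies must be installed, as the converters themselves check):

from pathlib import Path

from ot_tools._convert import convert_excel, convert_pdf

# convert_excel returns a dict with "output", "toc", "sheets" and "rows" keys.
excel_result = convert_excel(
    Path("report.xlsx"),
    Path("converted"),
    "report.xlsx",
    include_formulas=True,
)

# convert_pdf returns a dict with "output", "toc", "pages" and "images" keys.
pdf_result = convert_pdf(Path("paper.pdf"), Path("converted"), "paper.pdf")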
ot_tools/_convert/excel.py
@@ -0,0 +1,279 @@
+ """Excel workbook to Markdown converter.
+
+ Converts XLSX spreadsheets to Markdown with:
+ - Streaming row processing via openpyxl read_only mode
+ - Sheet-based sections
+ - Optional formula extraction
+ - YAML frontmatter and TOC generation
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path  # noqa: TC003 (used at runtime)
+ from typing import Any
+
+ from ot_tools._convert.utils import (
+     IncrementalWriter,
+     compute_file_checksum,
+     get_mtime_iso,
+     normalise_whitespace,
+     write_toc_file,
+ )
+
+
+ def convert_excel(
+     input_path: Path,
+     output_dir: Path,
+     source_rel: str,
+     *,
+     include_formulas: bool = False,
+     compute_formulas: bool = False,
+ ) -> dict[str, Any]:
+     """Convert Excel workbook to Markdown.
+
+     Args:
+         input_path: Path to XLSX file
+         output_dir: Directory for output files
+         source_rel: Relative path to source for frontmatter
+         include_formulas: Include cell formulas as comments
+         compute_formulas: Evaluate formulas when cached values are missing
+             (requires 'formulas' library: pip install formulas)
+
+     Returns:
+         Dict with 'output', 'sheets', 'rows' keys
+     """
+     try:
+         from openpyxl import load_workbook  # type: ignore[import-untyped]
+     except ImportError as e:
+         raise ImportError(
+             "openpyxl is required for convert. Install with: pip install openpyxl"
+         ) from e
+
+     # Load formula model if compute_formulas is enabled
+     formula_model: Any = None
+     if compute_formulas:
+         try:
+             import formulas  # type: ignore[import-untyped]
+
+             formula_model = formulas.ExcelModel().loads(str(input_path)).finish()
+         except ImportError:
+             raise ImportError(
+                 "formulas library is required for compute_formulas. "
+                 "Install with: pip install formulas"
+             ) from None
+         except Exception:
+             # If formula model fails to load, continue without it
+             formula_model = None
+
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # Load workbook
+     # - read_only=False when computing formulas (need full access)
+     # - data_only=True: get cached computed values (no formulas)
+     # - data_only=False: get formulas as cell values (when include_formulas=True)
+     read_only = not compute_formulas
+     wb = load_workbook(input_path, read_only=read_only, data_only=not include_formulas)
+
+     # Get metadata for frontmatter
+     checksum = compute_file_checksum(input_path)
+     mtime = get_mtime_iso(input_path)
+     total_sheets = len(wb.sheetnames)
+
+     writer = IncrementalWriter()
+     total_rows = 0
+
+     # Process each sheet (single workbook - no double loading)
+     for sheet_name in wb.sheetnames:
+         ws = wb[sheet_name]
+         rows = _process_sheet(writer, sheet_name, ws, include_formulas, formula_model)
+         total_rows += rows
+
+     wb.close()
+
+     # Write main output (pure content, no frontmatter - line numbers start at 1)
+     content = normalise_whitespace(writer.get_content())
+     output_path = output_dir / f"{input_path.stem}.md"
+     output_path.write_text(content, encoding="utf-8")
+
+     # Write separate TOC file (includes frontmatter)
+     headings = writer.get_headings()
+     toc_path = write_toc_file(
+         headings=headings,
+         output_dir=output_dir,
+         stem=input_path.stem,
+         source=source_rel,
+         converted=mtime,
+         pages=total_sheets,
+         checksum=checksum,
+     )
+
+     return {
+         "output": str(output_path),
+         "toc": str(toc_path),
+         "sheets": total_sheets,
+         "rows": total_rows,
+     }
+
+
+ def _process_sheet(
+     writer: IncrementalWriter,
+     sheet_name: str,
+     ws: Any,
+     include_formulas: bool,
+     formula_model: Any = None,
+ ) -> int:
+     """Process a single worksheet with streaming (O(1) memory for row data).
+
+     When include_formulas=True, the workbook was loaded with data_only=False,
+     so formula cells contain the formula string as their value.
+
+     Args:
+         writer: IncrementalWriter for output
+         sheet_name: Name of the worksheet
+         ws: Worksheet object
+         include_formulas: Whether to include formulas in output
+         formula_model: Optional formulas.ExcelModel for computing formula values
+
+     Returns:
+         Number of rows processed
+     """
+     writer.write_heading(2, f"Sheet: {sheet_name}")
+
+     # First pass: count max columns (streaming, no data storage)
+     max_cols = 0
+     row_count = 0
+     for row in ws.iter_rows():
+         max_cols = max(max_cols, len(row))
+         row_count += 1
+
+     if row_count == 0:
+         writer.write("(empty sheet)\n\n")
+         return 0
+
+     # Second pass: stream rows directly to writer
+     rows_iter = iter(ws.iter_rows())
+
+     # Get header (first row)
+     first_row = next(rows_iter)
+     header = [
+         _get_cell_value(cell, sheet_name, 1, j + 1, formula_model)
+         for j, cell in enumerate(first_row)
+     ]
+     # Pad header to max_cols
+     while len(header) < max_cols:
+         header.append("")
+
+     # Write header
+     writer.write("| " + " | ".join(_escape_pipe(c) for c in header) + " |\n")
+     writer.write("| " + " | ".join("---" for _ in header) + " |\n")
+
+     # Collect formulas as we go (just formula tuples, not full row data)
+     # Format: (col_letter, row_num, formula_string)
+     formulas: list[tuple[str, int, str]] = []
+
+     # Check first row for formulas (cell values are formulas when include_formulas=True)
+     if include_formulas:
+         for j, cell in enumerate(first_row):
+             try:
+                 value = cell.value
+                 if isinstance(value, str) and value.startswith("="):
+                     formulas.append((_col_letter(j + 1), 1, value))
+             except Exception:
+                 pass
+
+     # Stream remaining rows directly to writer
+     current_row = 2  # 1-indexed, header was row 1
+     for row in rows_iter:
+         row_values = [
+             _get_cell_value(cell, sheet_name, current_row, j + 1, formula_model)
+             for j, cell in enumerate(row)
+         ]
+         # Pad row to max_cols
+         while len(row_values) < max_cols:
+             row_values.append("")
+
+         writer.write("| " + " | ".join(_escape_pipe(c) for c in row_values[:len(header)]) + " |\n")
+
+         # Track formulas for this row (cell values are formulas when include_formulas=True)
+         if include_formulas:
+             for j, cell in enumerate(row):
+                 try:
+                     value = cell.value
+                     if isinstance(value, str) and value.startswith("="):
+                         formulas.append((_col_letter(j + 1), current_row, value))
+                 except Exception:
+                     pass
+
+         current_row += 1
+
+     writer.write("\n")
+
+     # Add formulas section if any formulas found
+     if formulas:
+         writer.write("**Formulas:**\n\n")
+         writer.write("```\n")
+         for col_letter, row_num, formula in formulas:
+             writer.write(f"{col_letter}{row_num}: {formula}\n")
+         writer.write("```\n\n")
+
+     return row_count
+
+
+ def _get_cell_value(
+     cell: Any,
+     sheet_name: str,
+     row_num: int,
+     col_num: int,
+     formula_model: Any,
+ ) -> str:
+     """Get cell value, optionally computing from formula model.
+
+     Args:
+         cell: openpyxl cell object
+         sheet_name: Name of the worksheet (for formula lookup)
+         row_num: 1-indexed row number
+         col_num: 1-indexed column number
+         formula_model: Optional formulas.ExcelModel for computing values
+
+     Returns:
+         String representation of cell value
+     """
+     value = cell.value
+
+     # If we have a value, use it
+     if value is not None:
+         return str(value)
+
+     # If no formula model, return empty
+     if formula_model is None:
+         return ""
+
+     # Try to compute value from formula model
+     try:
+         # Build cell reference like "'Sheet1'!A1"
+         col_letter = _col_letter(col_num)
+         cell_ref = f"'{sheet_name}'!{col_letter}{row_num}"
+         computed = formula_model.calculate(cell_ref)
+         if computed is not None and computed != cell_ref:
+             # Handle numpy arrays and other types
+             if hasattr(computed, "item"):
+                 computed = computed.item()
+             return str(computed)
+     except Exception:
+         pass
+
+     return ""
+
+
+ def _escape_pipe(text: str) -> str:
+     """Escape pipe characters for Markdown tables."""
+     return text.replace("|", "\\|").replace("\n", " ")
+
+
+ def _col_letter(n: int) -> str:
+     """Convert column number to letter (1=A, 2=B, ..., 27=AA)."""
+     result = ""
+     while n > 0:
+         n, remainder = divmod(n - 1, 26)
+         result = chr(65 + remainder) + result
+     return result
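
For each sheet, _process_sheet writes a level-2 "Sheet: <name>" heading, a pipe table (header row plus a "---" separator row), and, when include_formulas=True, a "**Formulas:**" block listing cell references. The helpers above are small enough to pin down exactly; a quick sketch (importing the private functions purely for illustration):

from ot_tools._convert.excel import _col_letter, _escape_pipe

assert _col_letter(1) == "A"
assert _col_letter(27) == "AA"
assert _col_letter(703) == "AAA"

# Pipes are escaped and newlines flattened so a cell stays on one table row.
assert _escape_pipe("a|b\nc") == "a\\|b c"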
ot_tools/_convert/pdf.py
@@ -0,0 +1,254 @@
+ """PDF to Markdown converter.
+
+ Converts PDF documents to Markdown with:
+ - Lazy page loading via PyMuPDF
+ - Outline-based heading extraction
+ - Hash-based image naming for diff stability
+ - YAML frontmatter and TOC generation
+ """
+
+ from __future__ import annotations
+
+ import io
+ from pathlib import Path  # noqa: TC003 (used at runtime)
+ from typing import TYPE_CHECKING, Any
+
+ try:
+     import fitz  # type: ignore[import-untyped]  # PyMuPDF
+ except ImportError as e:
+     raise ImportError(
+         "pymupdf is required for convert. Install with: pip install pymupdf"
+     ) from e
+
+ from PIL import Image
+
+ if TYPE_CHECKING:
+     from PIL.Image import Image as PILImage
+
+ from ot_tools._convert.utils import (
+     IncrementalWriter,
+     compute_file_checksum,
+     compute_image_hash,
+     get_mtime_iso,
+     normalise_whitespace,
+     write_toc_file,
+ )
+
+
+ def _merge_smask(image_bytes: bytes, sm_bytes: bytes) -> bytes:
+     """Merge soft-mask into image for transparency.
+
+     Args:
+         image_bytes: Base image bytes
+         sm_bytes: Soft-mask bytes
+
+     Returns:
+         PNG bytes with transparency
+     """
+     with (
+         Image.open(io.BytesIO(image_bytes)) as im_file,
+         Image.open(io.BytesIO(sm_bytes)) as mask_file,
+     ):
+         mask: PILImage = mask_file.convert("L")
+         im: PILImage = im_file.convert("RGBA")
+         if mask.size != im.size:
+             mask = mask.resize(im.size)
+         im.putalpha(mask)
+         buf = io.BytesIO()
+         im.save(buf, format="PNG")
+         return buf.getvalue()
+
+
+ def _detect_image_format(image_bytes: bytes) -> str:
+     """Detect image format from bytes.
+
+     Args:
+         image_bytes: Image data
+
+     Returns:
+         File extension (e.g., 'png', 'jpg')
+     """
+     try:
+         with Image.open(io.BytesIO(image_bytes)) as im:
+             format_map = {
+                 "JPEG": "jpg",
+                 "PNG": "png",
+                 "GIF": "gif",
+                 "BMP": "bmp",
+                 "TIFF": "tiff",
+                 "WEBP": "webp",
+             }
+             return format_map.get(im.format or "", "png")
+     except Exception:
+         return "png"
+
+
+ def _get_outline_headings(doc: fitz.Document) -> list[tuple[int, str, int]]:
+     """Extract outline/bookmarks from PDF.
+
+     Args:
+         doc: PyMuPDF document
+
+     Returns:
+         List of (level, title, page_number) tuples
+     """
+     try:
+         toc = doc.get_toc()
+         return [(level, title, page) for level, title, page in toc]
+     except Exception:
+         return []
+
+
+ def _extract_and_save_image(
+     doc: fitz.Document,
+     xref: int,
+     images_dir: Path,
+     writer: IncrementalWriter,
+ ) -> bool:
+     """Extract a single image and save to disk.
+
+     This function encapsulates image processing so that memory (image_bytes)
+     is freed when the function returns, preventing accumulation.
+
+     Args:
+         doc: PyMuPDF document
+         xref: Image xref in the document
+         images_dir: Directory for saving images
+         writer: Incremental writer for markdown output
+
+     Returns:
+         True if image was successfully extracted, False otherwise
+     """
+     base_image = doc.extract_image(xref)
+     image_bytes = base_image.get("image")
+     smask = base_image.get("smask")
+
+     if not image_bytes:
+         return False
+
+     # Handle soft-mask (transparency)
+     if smask:
+         try:
+             sm_base = doc.extract_image(smask)
+             sm_bytes = sm_base.get("image")
+             if sm_bytes:
+                 image_bytes = _merge_smask(image_bytes, sm_bytes)
+                 extension = "png"
+             else:
+                 extension = _detect_image_format(image_bytes)
+         except Exception:
+             extension = _detect_image_format(image_bytes)
+     else:
+         extension = _detect_image_format(image_bytes)
+
+     # Hash-based naming for diff stability
+     img_hash = compute_image_hash(image_bytes)
+     img_name = f"img_{img_hash}.{extension}"
+     img_path = images_dir / img_name
+
+     # Only write if not already extracted (dedup by hash)
+     if not img_path.exists():
+         images_dir.mkdir(parents=True, exist_ok=True)
+         img_path.write_bytes(image_bytes)
+
+     rel_path = f"{images_dir.name}/{img_name}"
+     writer.write(f"![{img_name}]({rel_path})\n\n")
+
+     return True
+
+
+ def convert_pdf(
+     input_path: Path,
+     output_dir: Path,
+     source_rel: str,
+ ) -> dict[str, Any]:
+     """Convert PDF to Markdown.
+
+     Args:
+         input_path: Path to PDF file
+         output_dir: Directory for output files
+         source_rel: Relative path to source for frontmatter
+
+     Returns:
+         Dict with 'output', 'pages', 'images' keys
+     """
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     doc = fitz.open(input_path)
+     try:
+         total_pages = len(doc)
+
+         # Get metadata for frontmatter
+         checksum = compute_file_checksum(input_path)
+         mtime = get_mtime_iso(input_path)
+
+         # Get outline for heading insertion
+         outline = _get_outline_headings(doc)
+         outline_by_page: dict[int, list[tuple[int, str]]] = {}
+         for level, title, page in outline:
+             if page not in outline_by_page:
+                 outline_by_page[page] = []
+             outline_by_page[page].append((level, title))
+
+         # Set up images directory
+         images_dir = output_dir / f"{input_path.stem}_images"
+         writer = IncrementalWriter()
+         images_extracted = 0
+
+         # Process pages with lazy loading
+         for pageno in range(total_pages):
+             page = doc[pageno]
+             page_num = pageno + 1
+
+             # Insert outline headings for this page
+             if page_num in outline_by_page:
+                 for level, title in outline_by_page[page_num]:
+                     writer.write_heading(min(level, 6), title)
+             elif not outline:
+                 # No outline - use page numbers as structure
+                 writer.write_heading(1, f"Page {page_num}")
+
+             # Extract text
+             text = page.get_text("text")
+             if text.strip():
+                 writer.write(text.rstrip() + "\n\n")
+
+             # Extract images - process one at a time to minimize memory
+             image_list = page.get_images(full=True)
+             for img in image_list:
+                 xref = img[0]
+                 try:
+                     result = _extract_and_save_image(
+                         doc, xref, images_dir, writer
+                     )
+                     if result:
+                         images_extracted += 1
+                 except Exception:
+                     # Skip failed image extraction
+                     continue
+     finally:
+         doc.close()
+
+     # Write main output (pure content, no frontmatter - line numbers start at 1)
+     content = normalise_whitespace(writer.get_content())
+     output_path = output_dir / f"{input_path.stem}.md"
+     output_path.write_text(content, encoding="utf-8")
+
+     # Write separate TOC file (includes frontmatter)
+     headings = writer.get_headings()
+     toc_path = write_toc_file(
+         headings=headings,
+         output_dir=output_dir,
+         stem=input_path.stem,
+         source=source_rel,
+         converted=mtime,
+         pages=total_pages,
+         checksum=checksum,
+     )
+
+     return {
+         "output": str(output_path),
+         "toc": str(toc_path),
+         "pages": total_pages,
+         "images": images_extracted,
+     }
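
convert_pdf derives its document structure from the PDF outline: doc.get_toc() entries are grouped per page and written as headings (levels capped at 6) ahead of that page's text, and the "Page N" fallback heading is used only when the document has no outline at all. A small sketch of the grouping step with made-up outline data (setdefault is equivalent to the explicit membership check used above):

outline = [(1, "Introduction", 1), (2, "Background", 1), (1, "Methods", 3)]

outline_by_page: dict[int, list[tuple[int, str]]] = {}
for level, title, page in outline:
    outline_by_page.setdefault(page, []).append((level, title))

# Page 1 gets two headings (levels 1 and 2), page 3 gets one, page 2 gets none.
assert outline_by_page == {1: [(1, "Introduction"), (2, "Background")], 3: [(1, "Methods")]}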