markitai 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,606 @@
|
|
|
1
|
+
"""Legacy Office format converters (DOC, PPT, XLS - Office 97-2003).
|
|
2
|
+
|
|
3
|
+
These formats require conversion to modern formats first.
|
|
4
|
+
Conversion priority:
|
|
5
|
+
1. MS Office COM (Windows) - faster and more accurate
|
|
6
|
+
2. LibreOffice CLI (cross-platform) - fallback
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import platform
|
|
12
|
+
import subprocess
|
|
13
|
+
import tempfile
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
|
+
|
|
19
|
+
from loguru import logger
|
|
20
|
+
|
|
21
|
+
from markitai.converter.base import (
|
|
22
|
+
BaseConverter,
|
|
23
|
+
ConvertResult,
|
|
24
|
+
FileFormat,
|
|
25
|
+
register_converter,
|
|
26
|
+
)
|
|
27
|
+
from markitai.converter.office import OfficeConverter, PptxConverter
|
|
28
|
+
from markitai.utils.office import (
|
|
29
|
+
check_ms_excel_available,
|
|
30
|
+
check_ms_powerpoint_available,
|
|
31
|
+
check_ms_word_available,
|
|
32
|
+
find_libreoffice,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from markitai.config import MarkitaiConfig
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# =============================================================================
|
|
40
|
+
# COM Application Configuration
|
|
41
|
+
# =============================================================================
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class COMAppConfig:
    """Describe how one MS Office application is driven through COM.

    An instance bundles the constants and PowerShell fragments that the
    script builders in this module splice together to convert one legacy
    format into its modern counterpart.

    Attributes:
        name: Display name (Word, PowerPoint, Excel).
        com_class: COM ProgID (Word.Application, etc.).
        input_ext: Source extension (.doc, .ppt, .xls).
        output_ext: Target extension (.docx, .pptx, .xlsx).
        save_format: SaveAs format code.
        init_script: PowerShell initialization lines.
        open_script: PowerShell document open command (uses {input}).
        save_script: PowerShell save command (uses {output}, {format}).
        close_script: PowerShell close command.
        cleanup_script: PowerShell cleanup lines.
        availability_check: Function to check if the app is available.
    """

    name: str
    com_class: str
    input_ext: str
    output_ext: str
    save_format: int
    init_script: str
    open_script: str
    save_script: str
    close_script: str
    cleanup_script: str
    availability_check: Callable[[], bool]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# PowerPoint: .ppt -> .pptx
POWERPOINT_CONFIG = COMAppConfig(
    name="PowerPoint",
    com_class="PowerPoint.Application",
    input_ext=".ppt",
    output_ext=".pptx",
    save_format=24,  # ppSaveAsOpenXMLPresentation
    init_script="$app.Visible = [Microsoft.Office.Core.MsoTriState]::msoFalse",
    open_script="$doc = $app.Presentations.Open('{input}', $true, $false, $false)",
    save_script="$doc.SaveAs('{output}', {format})",
    close_script="$doc.Close()",
    cleanup_script="",
    availability_check=check_ms_powerpoint_available,
)

# Word: .doc -> .docx
WORD_CONFIG = COMAppConfig(
    name="Word",
    com_class="Word.Application",
    input_ext=".doc",
    output_ext=".docx",
    save_format=16,  # wdFormatDocumentDefault
    init_script="$app.Visible = $false",
    open_script="$doc = $app.Documents.Open('{input}')",
    save_script="$doc.SaveAs2('{output}', {format})",
    close_script="$doc.Close()",
    cleanup_script="",
    availability_check=check_ms_word_available,
)

# Excel: .xls -> .xlsx
EXCEL_CONFIG = COMAppConfig(
    name="Excel",
    com_class="Excel.Application",
    input_ext=".xls",
    output_ext=".xlsx",
    save_format=51,  # xlOpenXMLWorkbook
    init_script="$app.Visible = $false\n$app.DisplayAlerts = $false",
    open_script="$doc = $app.Workbooks.Open('{input}')",
    save_script="$doc.SaveAs('{output}', {format})",
    close_script="$doc.Close($false)",
    cleanup_script="",
    availability_check=check_ms_excel_available,
)

# Lookup from legacy extension to the configuration that handles it.
# Keys come from each config's own input_ext (".ppt", ".doc", ".xls").
COM_CONFIGS: dict[str, COMAppConfig] = {
    app_config.input_ext: app_config
    for app_config in (POWERPOINT_CONFIG, WORD_CONFIG, EXCEL_CONFIG)
}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# =============================================================================
|
|
115
|
+
# Single File COM Conversion
|
|
116
|
+
# =============================================================================
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _build_single_file_script(
    config: COMAppConfig, input_path: str, output_path: str
) -> str:
    """Assemble the PowerShell script that converts exactly one file.

    The script creates the COM application, runs the open/save/close
    commands from ``config`` inside a try block, prints ``SUCCESS`` or
    ``FAILED: <error>``, and always quits and releases the application.

    Args:
        config: COM application configuration
        input_path: Escaped input file path
        output_path: Escaped output file path

    Returns:
        PowerShell script string
    """
    # Fill the path / format placeholders of the per-application templates.
    open_statement = config.open_script.format(input=input_path)
    save_statement = config.save_script.format(
        output=output_path, format=config.save_format
    )

    # The leading and trailing empty entries give the script the same
    # opening and closing newline as a triple-quoted block would.
    script_lines = [
        "",
        f"$app = New-Object -ComObject {config.com_class}",
        config.init_script,
        "try {",
        f"    {open_statement}",
        f"    {save_statement}",
        f"    {config.close_script}",
        '    Write-Host "SUCCESS"',
        "} catch {",
        '    Write-Host "FAILED: $_"',
        "} finally {",
        "    $app.Quit()",
        "    [System.Runtime.Interopservices.Marshal]::ReleaseComObject($app) | Out-Null",
        "}",
        "",
    ]
    return "\n".join(script_lines)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _convert_with_com(
    input_file: Path,
    output_dir: Path,
    config: COMAppConfig,
) -> Path | None:
    """Convert a file using MS Office COM (Windows only).

    Uses PowerShell subprocess for COM access, which provides:
    - Process isolation (safe for concurrent execution)
    - No pywin32 dependency required
    - Automatic COM object cleanup

    Args:
        input_file: Path to the source file
        output_dir: Directory for the converted file
        config: COM application configuration

    Returns:
        Path to the converted file, or None if conversion failed
        (including non-Windows platforms, timeouts and launch errors)
    """
    if platform.system() != "Windows":
        return None

    output_file = output_dir / (input_file.stem + config.output_ext)

    # Escape single quotes for PowerShell string
    input_path = str(input_file.resolve()).replace("'", "''")
    output_path = str(output_file.resolve()).replace("'", "''")

    ps_script = _build_single_file_script(config, input_path, output_path)

    try:
        result = subprocess.run(
            ["powershell", "-NoProfile", "-Command", ps_script],
            capture_output=True,
            text=True,
            # PowerShell may emit bytes that are invalid in the locale
            # encoding (e.g. localized error text); never let decoding
            # raise and mask the real outcome.
            errors="replace",
            timeout=120,
        )

        # Both signals are required: the script's own marker and the file.
        if "SUCCESS" in result.stdout and output_file.exists():
            logger.debug(f"MS {config.name} conversion succeeded: {output_file}")
            return output_file

        # Errors raised outside the script's try block (for example a
        # failing New-Object) only appear on stderr, so report both streams.
        details = result.stdout.strip()
        stderr = result.stderr.strip()
        if stderr:
            details = f"{details} (stderr: {stderr})" if details else stderr
        logger.warning(f"MS {config.name} conversion failed: {details}")
        return None

    except subprocess.TimeoutExpired:
        logger.warning(f"MS {config.name} conversion timed out")
        return None
    except Exception as e:
        # Best-effort path: any launch/IO problem means "fall back", not crash.
        logger.warning(f"MS {config.name} conversion error: {e}")
        return None
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# =============================================================================
|
|
207
|
+
# Batch COM Conversion
|
|
208
|
+
# =============================================================================
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _build_batch_script(config: COMAppConfig, files_array: str) -> str:
    """Assemble the PowerShell script that converts many files in one session.

    The script iterates over ``$files`` (entries exposing ``Input`` and
    ``Output``), converts each inside its own try/catch, collects one
    ``OK:<input>`` or ``FAIL:<input>:<error>`` record per file, quits the
    application, and finally prints the records joined by newlines.

    Args:
        config: COM application configuration
        files_array: PowerShell array string of file entries

    Returns:
        PowerShell script string
    """
    # Swap the quoted single-file placeholders for the loop variable's
    # properties; the format code is substituted as plain text.
    open_statement = config.open_script.replace("'{input}'", "$file.Input")
    save_statement = config.save_script.replace("'{output}'", "$file.Output")
    save_statement = save_statement.replace("{format}", str(config.save_format))

    # The leading and trailing empty entries give the script the same
    # opening and closing newline as a triple-quoted block would.
    script_lines = [
        "",
        f"$files = {files_array}",
        f"$app = New-Object -ComObject {config.com_class}",
        config.init_script,
        "$results = @()",
        "try {",
        "    foreach ($file in $files) {",
        "        try {",
        f"            {open_statement}",
        f"            {save_statement}",
        f"            {config.close_script}",
        '            $results += "OK:" + $file.Input',
        "        } catch {",
        '            $results += "FAIL:" + $file.Input + ":" + $_',
        "        }",
        "    }",
        "} finally {",
        "    $app.Quit()",
        "    [System.Runtime.Interopservices.Marshal]::ReleaseComObject($app) | Out-Null",
        "}",
        '$results -join "`n"',
        "",
    ]
    return "\n".join(script_lines)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _split_failure_record(record: str) -> tuple[str, str]:
    """Split a ``<path>:<error>`` failure record into (path, error).

    A Windows absolute path starts with ``<letter>:``, so the first colon
    is not necessarily the separator; the search starts after the drive
    prefix when one is present. Returns ``"unknown"`` as the error when
    no separator is found.
    """
    has_drive = len(record) >= 2 and record[0].isalpha() and record[1] == ":"
    separator = record.find(":", 2 if has_drive else 0)
    if separator == -1:
        return record, "unknown"
    return record[:separator], record[separator + 1 :]


def _run_batch_conversion(
    ps_script: str,
    files: list[Path],
    output_dir: Path,
    new_ext: str,
    app_name: str,
) -> dict[Path, Path]:
    """Execute batch conversion PowerShell script and parse results.

    The script is expected to print one ``OK:<input path>`` or
    ``FAIL:<input path>:<error>`` record per line. A file is reported as
    converted only when its ``OK`` record is present and the expected
    output file exists in ``output_dir``.

    Args:
        ps_script: PowerShell script to execute
        files: List of input files
        output_dir: Output directory for converted files
        new_ext: New file extension
        app_name: Application name for logging

    Returns:
        Dict mapping original file path to converted file path.
        Timeouts and execution errors are logged and yield whatever was
        collected so far (normally an empty dict).
    """
    results: dict[Path, Path] = {}

    try:
        logger.info(f"Batch converting {len(files)} files with MS {app_name}...")

        # Index the inputs once by lower-cased resolved path so each OK
        # record is matched in O(1) (case-insensitive, first entry wins).
        inputs_by_path: dict[str, Path] = {}
        for f in files:
            inputs_by_path.setdefault(str(f.resolve()).lower(), f)

        # Make PowerShell emit UTF-8 so echoed paths round-trip regardless
        # of the console (OEM) vs. locale (ANSI) code pages.
        utf8_script = (
            "[Console]::OutputEncoding = [System.Text.Encoding]::UTF8\n" + ps_script
        )
        proc_result = subprocess.run(
            ["powershell", "-NoProfile", "-Command", utf8_script],
            capture_output=True,
            text=True,
            encoding="utf-8-sig",  # tolerate a leading BOM
            errors="replace",  # one bad byte must not discard the batch
            timeout=120 * max(len(files), 1),  # Scale timeout with file count
        )

        # Parse results
        for raw_line in proc_result.stdout.splitlines():
            line = raw_line.strip()
            if line.startswith("OK:"):
                reported = line[3:].strip()
                input_path = Path(reported)
                output_path = output_dir / (input_path.stem + new_ext)
                if not output_path.exists():
                    continue
                original = inputs_by_path.get(reported.lower())
                if original is None:
                    # Fall back to path-object equality for spellings that
                    # differ from the resolved string form.
                    original = next(
                        (f for f in files if f.resolve() == input_path), None
                    )
                if original is not None:
                    results[original] = output_path
            elif line.startswith("FAIL:"):
                failed_path, reason = _split_failure_record(line[5:])
                logger.warning(f"MS {app_name} failed for {failed_path}: {reason}")

        # Errors raised outside the per-file try blocks only reach stderr.
        stderr = proc_result.stderr.strip()
        if stderr:
            logger.debug(f"MS {app_name} batch conversion stderr: {stderr}")

        logger.info(
            f"MS {app_name} batch conversion: {len(results)}/{len(files)} succeeded"
        )

    except subprocess.TimeoutExpired:
        logger.warning(f"MS {app_name} batch conversion timed out")
    except Exception as e:
        # Best-effort path: callers fall back to per-file conversion.
        logger.warning(f"MS {app_name} batch conversion error: {e}")

    return results
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _batch_convert_with_com(
    files: list[Path],
    output_dir: Path,
    config: COMAppConfig,
) -> dict[Path, Path]:
    """Convert several files of one type within a single COM session.

    Builds a PowerShell array of ``@{Input=...; Output=...}`` entries (one
    per file), generates the batch script for ``config`` and runs it.

    Args:
        files: List of files to convert
        output_dir: Output directory
        config: COM application configuration

    Returns:
        Dict mapping original file path to converted file path
    """
    if not files:
        return {}

    def quoted(path: Path) -> str:
        # Resolve, then double single quotes for a PowerShell '...' literal.
        return str(path.resolve()).replace("'", "''")

    def entry_for(source: Path) -> str:
        target = output_dir / (source.stem + config.output_ext)
        return f"@{{Input='{quoted(source)}'; Output='{quoted(target)}'}}"

    # Build file list for PowerShell
    files_array = "@(" + ", ".join(entry_for(source) for source in files) + ")"

    return _run_batch_conversion(
        _build_batch_script(config, files_array),
        files,
        output_dir,
        config.output_ext,
        config.name,
    )
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def batch_convert_legacy_files(
    files: list[Path],
    output_dir: Path,
) -> dict[Path, Path]:
    """Batch convert legacy Office files, one COM session per application.

    Files are grouped by extension; each group whose application passes
    its availability check is converted on its own worker thread, so the
    different Office applications run in parallel while each one is
    started only once.

    Args:
        files: List of legacy format files (.doc, .ppt, .xls)
        output_dir: Directory for converted files

    Returns:
        Dict mapping original file path to converted file path.
        Files that failed conversion are not included. Always empty on
        non-Windows platforms.
    """
    if platform.system() != "Windows":
        return {}

    import concurrent.futures

    # Group files by type, ignoring extensions without a COM configuration.
    grouped: dict[str, list[Path]] = {}
    for path in files:
        ext = path.suffix.lower()
        if ext in COM_CONFIGS:
            grouped.setdefault(ext, []).append(path)

    merged: dict[Path, Path] = {}

    # Run conversions for different Office apps in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as pool:
        pending = [
            pool.submit(_batch_convert_with_com, group, output_dir, COM_CONFIGS[ext])
            for ext, group in grouped.items()
            if group and COM_CONFIGS[ext].availability_check()
        ]

        # Collect results
        for finished in concurrent.futures.as_completed(pending):
            try:
                merged.update(finished.result())
            except Exception as e:
                logger.warning(f"Batch conversion failed: {e}")

    return merged
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
# =============================================================================
|
|
409
|
+
# Legacy Office Converter Class
|
|
410
|
+
# =============================================================================
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
class LegacyOfficeConverter(BaseConverter):
    """Base converter for legacy Office documents (DOC, PPT, XLS).

    A legacy file is first converted to its modern counterpart in a
    temporary directory and then handed to the modern-format converter.

    Conversion priority:
    1. MS Office COM (Windows) - faster and more accurate
    2. LibreOffice CLI (cross-platform) - fallback
    """

    # Mapping of legacy format to target format
    TARGET_FORMAT: dict[str, str] = {
        ".doc": "docx",
        ".ppt": "pptx",
        ".xls": "xlsx",
    }

    def __init__(self, config: MarkitaiConfig | None = None) -> None:
        """Create the delegate converters and locate LibreOffice.

        Args:
            config: Optional configuration forwarded to the base class and
                to the modern-format converters.
        """
        super().__init__(config)
        self._office_converter = OfficeConverter(config)
        self._pptx_converter = PptxConverter(config)
        # Result of find_libreoffice(); falsy when no installation was found.
        self._soffice_path = find_libreoffice()

    def _convert_legacy_format(
        self,
        input_path: Path,
        target_format: str,
        output_dir: Path,
    ) -> Path:
        """Convert legacy format to modern format.

        Tries MS Office COM first (Windows), falls back to LibreOffice.

        Args:
            input_path: Path to the legacy format file
            target_format: Target format (docx, pptx, xlsx)
            output_dir: Directory for converted file

        Returns:
            Path to the converted file

        Raises:
            RuntimeError: If conversion fails with all methods
        """
        suffix = input_path.suffix.lower()
        failed_com_app: str | None = None

        # Try MS Office COM first (Windows only)
        config = COM_CONFIGS.get(suffix)
        if config and config.availability_check():
            logger.info(f"Converting {input_path.name} with MS {config.name}...")
            converted_path = _convert_with_com(input_path, output_dir, config)
            if converted_path:
                return converted_path
            failed_com_app = config.name
            if self._soffice_path:
                logger.warning(
                    f"MS {config.name} conversion failed, trying LibreOffice..."
                )
            else:
                logger.warning(f"MS {config.name} conversion failed")

        # Fallback to LibreOffice
        if self._soffice_path:
            logger.info(f"Converting {input_path.name} with LibreOffice...")
            return self._convert_with_libreoffice(input_path, target_format, output_dir)

        # MS Office was present but failed: telling the user to install it
        # would be misleading, so report the actual situation.
        if failed_com_app:
            raise RuntimeError(
                f"Cannot convert {input_path.name}: MS {failed_com_app} conversion "
                "failed and LibreOffice is not installed. "
                "Install LibreOffice to enable the fallback conversion."
            )

        # No conversion method available
        if platform.system() == "Windows":
            raise RuntimeError(
                f"Cannot convert {suffix} files. "
                "Install Microsoft Office (recommended) or LibreOffice."
            )
        raise RuntimeError(f"Cannot convert {suffix} files. Install LibreOffice.")

    def _convert_with_libreoffice(
        self,
        input_path: Path,
        target_format: str,
        output_dir: Path,
    ) -> Path:
        """Convert legacy format using LibreOffice CLI.

        Uses isolated user profile to support concurrent LibreOffice processes.

        Args:
            input_path: Path to the legacy format file
            target_format: Target format passed to ``--convert-to``
            output_dir: Directory passed to ``--outdir``

        Returns:
            Path to the converted file inside ``output_dir``

        Raises:
            RuntimeError: If LibreOffice is missing, exits non-zero, times
                out, or does not produce the expected output file
        """
        if not self._soffice_path:
            raise RuntimeError(
                "LibreOffice not found. Install LibreOffice to convert "
                f"{input_path.suffix} files."
            )

        # Create isolated user profile for concurrent execution
        # LibreOffice uses a shared user config directory by default,
        # which causes conflicts when multiple processes run simultaneously
        with tempfile.TemporaryDirectory(prefix="lo_profile_") as profile_dir:
            profile_url = Path(profile_dir).as_uri()

            # Run LibreOffice conversion with isolated profile
            cmd = [
                self._soffice_path,
                "--headless",
                f"-env:UserInstallation={profile_url}",
                "--convert-to",
                target_format,
                "--outdir",
                str(output_dir),
                str(input_path),
            ]

            logger.debug(f"Running LibreOffice: {' '.join(cmd)}")

            try:
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    # Undecodable output must not escape as UnicodeDecodeError.
                    errors="replace",
                    timeout=120,
                )
            except subprocess.TimeoutExpired as exc:
                # Chain the cause so the timeout details stay in the traceback.
                raise RuntimeError("LibreOffice conversion timed out") from exc

            if result.returncode != 0:
                raise RuntimeError(f"LibreOffice conversion failed: {result.stderr}")

            # Find converted file; a zero exit code alone is not trusted.
            converted_name = input_path.stem + "." + target_format
            converted_path = output_dir / converted_name

            if not converted_path.exists():
                raise RuntimeError(f"Converted file not found: {converted_path}")

            return converted_path

    def convert(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """Convert legacy Office document to Markdown.

        Args:
            input_path: Path to the input file
            output_dir: Optional output directory for extracted images

        Returns:
            ConvertResult containing markdown and extracted images

        Raises:
            ValueError: If the file extension is not a supported legacy format
            RuntimeError: If no conversion method succeeds
        """
        input_path = Path(input_path)
        suffix = input_path.suffix.lower()

        target_format = self.TARGET_FORMAT.get(suffix)
        if not target_format:
            raise ValueError(f"Unsupported format: {suffix}")

        # The intermediate modern-format file only lives inside this
        # temporary directory, so it must be processed before the block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Convert to modern format (COM first, LibreOffice fallback)
            converted_path = self._convert_legacy_format(
                input_path, target_format, temp_path
            )

            # Process with appropriate converter based on target format
            if target_format == "pptx":
                result = self._pptx_converter.convert(converted_path, output_dir)
            else:
                result = self._office_converter.convert(converted_path, output_dir)

            # Point metadata at the original legacy file, not the temp copy.
            result.metadata["original_format"] = suffix.lstrip(".").upper()
            result.metadata["source"] = str(input_path)

            return result
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
# =============================================================================
|
|
584
|
+
# Registered Converters
|
|
585
|
+
# =============================================================================
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
@register_converter(FileFormat.DOC)
class DocConverter(LegacyOfficeConverter):
    """Converter for legacy DOC (Word 97-2003) documents.

    Adds no logic of its own: conversion is inherited from
    LegacyOfficeConverter, and the class is registered for FileFormat.DOC.
    """

    supported_formats = [FileFormat.DOC]
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
@register_converter(FileFormat.PPT)
class PptConverter(LegacyOfficeConverter):
    """Converter for legacy PPT (PowerPoint 97-2003) documents.

    Adds no logic of its own: conversion is inherited from
    LegacyOfficeConverter, and the class is registered for FileFormat.PPT.
    """

    supported_formats = [FileFormat.PPT]
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
@register_converter(FileFormat.XLS)
class XlsConverter(LegacyOfficeConverter):
    """Converter for legacy XLS (Excel 97-2003) documents.

    Adds no logic of its own: conversion is inherited from
    LegacyOfficeConverter, and the class is registered for FileFormat.XLS.
    """

    supported_formats = [FileFormat.XLS]
|