markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,606 @@
1
+ """Legacy Office format converters (DOC, PPT, XLS - Office 97-2003).
2
+
3
+ These formats require conversion to modern formats first.
4
+ Conversion priority:
5
+ 1. MS Office COM (Windows) - faster and more accurate
6
+ 2. LibreOffice CLI (cross-platform) - fallback
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import platform
12
+ import subprocess
13
+ import tempfile
14
+ from collections.abc import Callable
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import TYPE_CHECKING
18
+
19
+ from loguru import logger
20
+
21
+ from markitai.converter.base import (
22
+ BaseConverter,
23
+ ConvertResult,
24
+ FileFormat,
25
+ register_converter,
26
+ )
27
+ from markitai.converter.office import OfficeConverter, PptxConverter
28
+ from markitai.utils.office import (
29
+ check_ms_excel_available,
30
+ check_ms_powerpoint_available,
31
+ check_ms_word_available,
32
+ find_libreoffice,
33
+ )
34
+
35
+ if TYPE_CHECKING:
36
+ from markitai.config import MarkitaiConfig
37
+
38
+
39
+ # =============================================================================
40
+ # COM Application Configuration
41
+ # =============================================================================
42
+
43
+
44
@dataclass
class COMAppConfig:
    """Describe how one Office application is driven through COM.

    The ``*_script`` attributes hold PowerShell fragments; the script
    builders in this module splice them into a generated conversion script.

    Attributes:
        name: Display name (Word, PowerPoint, Excel).
        com_class: COM ProgID (``Word.Application``, etc.).
        input_ext: Source extension (``.doc``, ``.ppt``, ``.xls``).
        output_ext: Target extension (``.docx``, ``.pptx``, ``.xlsx``).
        save_format: Numeric format code passed to the save command.
        init_script: PowerShell initialization lines.
        open_script: PowerShell document open command (uses ``{input}``).
        save_script: PowerShell save command (uses ``{output}`` and
            ``{format}``).
        close_script: PowerShell close command.
        cleanup_script: PowerShell cleanup lines.
        availability_check: Zero-argument callable that reports whether the
            application is available.
    """

    name: str
    com_class: str
    input_ext: str
    output_ext: str
    save_format: int
    init_script: str
    open_script: str
    save_script: str
    close_script: str
    cleanup_script: str
    availability_check: Callable[[], bool]
59
+
60
+
61
# Legacy PowerPoint presentations (.ppt -> .pptx).
POWERPOINT_CONFIG = COMAppConfig(
    name="PowerPoint",
    com_class="PowerPoint.Application",
    input_ext=".ppt",
    output_ext=".pptx",
    save_format=24,  # ppSaveAsOpenXMLPresentation
    init_script="$app.Visible = [Microsoft.Office.Core.MsoTriState]::msoFalse",
    open_script="$doc = $app.Presentations.Open('{input}', $true, $false, $false)",
    save_script="$doc.SaveAs('{output}', {format})",
    close_script="$doc.Close()",
    cleanup_script="",
    availability_check=check_ms_powerpoint_available,
)

# Legacy Word documents (.doc -> .docx).
WORD_CONFIG = COMAppConfig(
    name="Word",
    com_class="Word.Application",
    input_ext=".doc",
    output_ext=".docx",
    save_format=16,  # wdFormatDocumentDefault
    init_script="$app.Visible = $false",
    open_script="$doc = $app.Documents.Open('{input}')",
    save_script="$doc.SaveAs2('{output}', {format})",
    close_script="$doc.Close()",
    cleanup_script="",
    availability_check=check_ms_word_available,
)

# Legacy Excel workbooks (.xls -> .xlsx).
EXCEL_CONFIG = COMAppConfig(
    name="Excel",
    com_class="Excel.Application",
    input_ext=".xls",
    output_ext=".xlsx",
    save_format=51,  # xlOpenXMLWorkbook
    init_script="$app.Visible = $false\n$app.DisplayAlerts = $false",
    open_script="$doc = $app.Workbooks.Open('{input}')",
    save_script="$doc.SaveAs('{output}', {format})",
    close_script="$doc.Close($false)",
    cleanup_script="",
    availability_check=check_ms_excel_available,
)

# Lookup from legacy extension to its COM configuration. Keys come from each
# config's own ``input_ext`` so the mapping cannot drift from the definitions.
COM_CONFIGS: dict[str, COMAppConfig] = {
    app_config.input_ext: app_config
    for app_config in (POWERPOINT_CONFIG, WORD_CONFIG, EXCEL_CONFIG)
}
112
+
113
+
114
+ # =============================================================================
115
+ # Single File COM Conversion
116
+ # =============================================================================
117
+
118
+
119
def _build_single_file_script(
    config: COMAppConfig, input_path: str, output_path: str
) -> str:
    """Assemble the PowerShell script that converts exactly one file.

    The script creates the COM application, opens the input, saves it in the
    target format and closes it. It prints ``SUCCESS`` on success or
    ``FAILED: <error>`` on failure, and always quits the application and
    releases the COM object in a ``finally`` block.

    Args:
        config: COM application configuration supplying the script fragments.
        input_path: Input file path, already escaped for a PowerShell
            single-quoted string.
        output_path: Output file path, escaped the same way.

    Returns:
        The PowerShell script text.
    """
    script_lines = [
        "",  # the script text starts with a newline
        f"$app = New-Object -ComObject {config.com_class}",
        config.init_script,
        "try {",
        config.open_script.format(input=input_path),
        config.save_script.format(output=output_path, format=config.save_format),
        config.close_script,
        'Write-Host "SUCCESS"',
        "} catch {",
        'Write-Host "FAILED: $_"',
        "} finally {",
        "$app.Quit()",
        "[System.Runtime.Interopservices.Marshal]::ReleaseComObject($app) | Out-Null",
        "}",
        "",  # ...and ends with one
    ]
    return "\n".join(script_lines)
150
+
151
+
152
def _convert_with_com(
    input_file: Path,
    output_dir: Path,
    config: COMAppConfig,
) -> Path | None:
    """Convert a file using MS Office COM (Windows only).

    The conversion runs in a PowerShell subprocess, so no pywin32 dependency
    is needed and each conversion gets its own process.

    Args:
        input_file: Path to the source file.
        output_dir: Directory for the converted file.
        config: COM application configuration.

    Returns:
        Path to the converted file, or None if not running on Windows or if
        the conversion failed, timed out, or raised.
    """
    if platform.system() != "Windows":
        return None

    output_file = output_dir / (input_file.stem + config.output_ext)

    # The paths are embedded in PowerShell single-quoted strings. PowerShell
    # treats the typographic quotes U+2018, U+2019, U+201A and U+201B as
    # single-quote delimiters as well, so doubling only the ASCII apostrophe
    # would let a file name containing one of them terminate the literal and
    # inject script. Double every variant.
    ps_quote_escapes = {ord(ch): ch * 2 for ch in "'\u2018\u2019\u201a\u201b"}
    input_path = str(input_file.resolve()).translate(ps_quote_escapes)
    output_path = str(output_file.resolve()).translate(ps_quote_escapes)

    ps_script = _build_single_file_script(config, input_path, output_path)

    try:
        result = subprocess.run(
            ["powershell", "-NoProfile", "-Command", ps_script],
            capture_output=True,
            text=True,
            timeout=120,
        )
    except subprocess.TimeoutExpired:
        logger.warning(f"MS {config.name} conversion timed out")
        return None
    except Exception as e:
        # Best effort: any launch failure means "fall back to another method".
        logger.warning(f"MS {config.name} conversion error: {e}")
        return None

    # Trust the script's marker only when the output file really exists.
    if "SUCCESS" in result.stdout and output_file.exists():
        logger.debug(f"MS {config.name} conversion succeeded: {output_file}")
        return output_file

    logger.warning(f"MS {config.name} conversion failed: {result.stdout}")
    return None
204
+
205
+
206
+ # =============================================================================
207
+ # Batch COM Conversion
208
+ # =============================================================================
209
+
210
+
211
def _build_batch_script(config: COMAppConfig, files_array: str) -> str:
    """Assemble the PowerShell script that converts many files in one session.

    The script starts the COM application once, loops over ``$files``, and
    records one ``OK:<input>`` or ``FAIL:<input>:<error>`` entry per file.
    The entries are emitted newline-joined after the application has been
    quit and released.

    Args:
        config: COM application configuration supplying the script fragments.
        files_array: PowerShell array expression whose items expose ``Input``
            and ``Output`` properties.

    Returns:
        The PowerShell script text.
    """
    # In the loop the paths come from the current item, not quoted literals.
    loop_open = config.open_script.replace("'{input}'", "$file.Input")
    loop_save = config.save_script.replace("'{output}'", "$file.Output")
    loop_save = loop_save.replace("{format}", str(config.save_format))

    script_lines = [
        "",  # the script text starts with a newline
        f"$files = {files_array}",
        f"$app = New-Object -ComObject {config.com_class}",
        config.init_script,
        "$results = @()",
        "try {",
        "foreach ($file in $files) {",
        "try {",
        loop_open,
        loop_save,
        config.close_script,
        '$results += "OK:" + $file.Input',
        "} catch {",
        '$results += "FAIL:" + $file.Input + ":" + $_',
        "}",
        "}",
        "} finally {",
        "$app.Quit()",
        "[System.Runtime.Interopservices.Marshal]::ReleaseComObject($app) | Out-Null",
        "}",
        '$results -join "`n"',
        "",  # ...and ends with one
    ]
    return "\n".join(script_lines)
249
+
250
+
251
def _run_batch_conversion(
    ps_script: str,
    files: list[Path],
    output_dir: Path,
    new_ext: str,
    app_name: str,
) -> dict[Path, Path]:
    """Execute a batch conversion PowerShell script and parse its results.

    The script is expected to print one line per file, either ``OK:<input>``
    or ``FAIL:<input>:<error>``. An ``OK`` entry is accepted only if the
    expected output file exists in ``output_dir``.

    Args:
        ps_script: PowerShell script to execute.
        files: Input files that were handed to the script.
        output_dir: Output directory for converted files.
        new_ext: Extension (including the dot) of the converted files.
        app_name: Application name, used for logging only.

    Returns:
        Dict mapping each successfully converted original path to its
        converted path. Empty if the script timed out or could not be run.
    """
    results: dict[Path, Path] = {}

    try:
        # Index inputs once by lower-cased resolved path so each reported
        # line is matched case-insensitively in O(1); ``setdefault`` keeps
        # the first of any duplicates, like a first-match scan would.
        inputs_by_key: dict[str, Path] = {}
        for f in files:
            inputs_by_key.setdefault(str(f.resolve()).lower(), f)

        logger.info(f"Batch converting {len(files)} files with MS {app_name}...")
        proc_result = subprocess.run(
            ["powershell", "-NoProfile", "-Command", ps_script],
            capture_output=True,
            text=True,
            # Scale timeout with file count; never pass 0, which would
            # expire immediately.
            timeout=120 * max(1, len(files)),
        )

        for line in proc_result.stdout.strip().splitlines():
            if line.startswith("OK:"):
                reported = line[3:].strip()
                input_path = Path(reported)
                output_path = output_dir / (input_path.stem + new_ext)
                if not output_path.exists():
                    continue
                original = inputs_by_key.get(reported.lower())
                if original is None:
                    # Retry with the Path-normalised spelling of the line.
                    original = inputs_by_key.get(str(input_path).lower())
                if original is not None:
                    results[original] = output_path
            elif line.startswith("FAIL:"):
                rest = line[5:]
                # The path usually starts with a drive letter ("C:\..."), so
                # the first colon is NOT the path/error separator; skip it.
                search_from = 2 if rest[1:2] == ":" else 0
                sep = rest.find(":", search_from)
                if sep == -1:
                    failed_path, reason = rest, "unknown"
                else:
                    failed_path, reason = rest[:sep], rest[sep + 1 :]
                logger.warning(f"MS {app_name} failed for {failed_path}: {reason}")

        logger.info(
            f"MS {app_name} batch conversion: {len(results)}/{len(files)} succeeded"
        )

    except subprocess.TimeoutExpired:
        logger.warning(f"MS {app_name} batch conversion timed out")
    except Exception as e:
        # Best effort: callers treat missing entries as "not converted".
        logger.warning(f"MS {app_name} batch conversion error: {e}")

    return results
311
+
312
+
313
def _batch_convert_with_com(
    files: list[Path],
    output_dir: Path,
    config: COMAppConfig,
) -> dict[Path, Path]:
    """Batch convert files of one type using a single COM session.

    Args:
        files: Files to convert; all are handled by ``config``'s application.
        output_dir: Directory that receives the converted files.
        config: COM application configuration.

    Returns:
        Dict mapping original file path to converted file path. Empty when
        ``files`` is empty.
    """
    if not files:
        return {}

    # The paths are embedded in PowerShell single-quoted strings. PowerShell
    # treats the typographic quotes U+2018, U+2019, U+201A and U+201B as
    # single-quote delimiters as well, so every variant must be doubled to
    # keep a crafted file name from terminating the literal.
    ps_quote_escapes = {ord(ch): ch * 2 for ch in "'\u2018\u2019\u201a\u201b"}

    # One hashtable literal per file: @{Input='...'; Output='...'}
    file_entries = []
    for f in files:
        input_path = str(f.resolve()).translate(ps_quote_escapes)
        output_path = str(
            (output_dir / (f.stem + config.output_ext)).resolve()
        ).translate(ps_quote_escapes)
        file_entries.append(f"@{{Input='{input_path}'; Output='{output_path}'}}")

    files_array = "@(" + ", ".join(file_entries) + ")"
    ps_script = _build_batch_script(config, files_array)

    return _run_batch_conversion(
        ps_script, files, output_dir, config.output_ext, config.name
    )
346
+
347
+
348
def batch_convert_legacy_files(
    files: list[Path],
    output_dir: Path,
) -> dict[Path, Path]:
    """Convert legacy Office files with one COM session per application.

    Files are grouped by extension and each group goes to its Office
    application in a single batch. The groups run concurrently on a small
    thread pool, so each application is started at most once.

    Args:
        files: Legacy format files (.doc, .ppt, .xls). Files with other
            extensions are ignored.
        output_dir: Directory for converted files.

    Returns:
        Dict mapping original file path to converted file path. Files that
        failed conversion are not included. Always empty off Windows.
    """
    if platform.system() != "Windows":
        return {}

    import concurrent.futures

    # Bucket the inputs by (lower-cased) extension, keeping supported ones.
    grouped: dict[str, list[Path]] = {}
    for path in files:
        suffix = path.suffix.lower()
        if suffix in COM_CONFIGS:
            grouped.setdefault(suffix, []).append(path)

    converted_by_source: dict[Path, Path] = {}

    # One worker per Office application (Word, PowerPoint, Excel).
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as pool:
        pending = []
        for suffix, group in grouped.items():
            app_config = COM_CONFIGS[suffix]
            if group and app_config.availability_check():
                pending.append(
                    pool.submit(_batch_convert_with_com, group, output_dir, app_config)
                )

        # Merge each application's results as soon as it finishes.
        for finished in concurrent.futures.as_completed(pending):
            try:
                converted_by_source.update(finished.result())
            except Exception as e:
                logger.warning(f"Batch conversion failed: {e}")

    return converted_by_source
406
+
407
+
408
+ # =============================================================================
409
+ # Legacy Office Converter Class
410
+ # =============================================================================
411
+
412
+
413
class LegacyOfficeConverter(BaseConverter):
    """Base converter for legacy Office documents (DOC, PPT, XLS).

    A legacy file is first converted to its modern counterpart, which is
    then handed to the matching modern-format converter.

    Conversion priority:
    1. MS Office COM (Windows) - faster and more accurate
    2. LibreOffice CLI (cross-platform) - fallback
    """

    # Mapping of legacy extension to target format (without the dot)
    TARGET_FORMAT: dict[str, str] = {
        ".doc": "docx",
        ".ppt": "pptx",
        ".xls": "xlsx",
    }

    def __init__(self, config: MarkitaiConfig | None = None) -> None:
        """Create the delegate converters and locate LibreOffice once.

        Args:
            config: Optional configuration, forwarded to the base class and
                to the modern-format converters.
        """
        super().__init__(config)
        self._office_converter = OfficeConverter(config)
        self._pptx_converter = PptxConverter(config)
        self._soffice_path = find_libreoffice()

    def _convert_legacy_format(
        self,
        input_path: Path,
        target_format: str,
        output_dir: Path,
    ) -> Path:
        """Convert legacy format to modern format.

        Tries MS Office COM first (Windows), falls back to LibreOffice.

        Args:
            input_path: Path to the legacy format file
            target_format: Target format (docx, pptx, xlsx)
            output_dir: Directory for converted file

        Returns:
            Path to the converted file

        Raises:
            RuntimeError: If conversion fails with all methods
        """
        suffix = input_path.suffix.lower()

        # Try MS Office COM first (Windows only)
        config = COM_CONFIGS.get(suffix)
        if config and config.availability_check():
            logger.info(f"Converting {input_path.name} with MS {config.name}...")
            converted_path = _convert_with_com(input_path, output_dir, config)
            if converted_path:
                return converted_path
            logger.warning(f"MS {config.name} conversion failed, trying LibreOffice...")

        # Fallback to LibreOffice
        if self._soffice_path:
            logger.info(f"Converting {input_path.name} with LibreOffice...")
            return self._convert_with_libreoffice(input_path, target_format, output_dir)

        # No conversion method available
        if platform.system() == "Windows":
            raise RuntimeError(
                f"Cannot convert {suffix} files. "
                "Install Microsoft Office (recommended) or LibreOffice."
            )
        raise RuntimeError(f"Cannot convert {suffix} files. Install LibreOffice.")

    def _convert_with_libreoffice(
        self,
        input_path: Path,
        target_format: str,
        output_dir: Path,
    ) -> Path:
        """Convert legacy format using LibreOffice CLI.

        Uses isolated user profile to support concurrent LibreOffice processes.

        Args:
            input_path: Path to the legacy format file
            target_format: Target format (docx, pptx, xlsx)
            output_dir: Directory for converted file

        Returns:
            Path to the converted file

        Raises:
            RuntimeError: If LibreOffice is missing, exits non-zero, times
                out, or does not produce the expected output file
        """
        if not self._soffice_path:
            raise RuntimeError(
                "LibreOffice not found. Install LibreOffice to convert "
                f"{input_path.suffix} files."
            )

        # Create isolated user profile for concurrent execution.
        # LibreOffice uses a shared user config directory by default,
        # which causes conflicts when multiple processes run simultaneously.
        with tempfile.TemporaryDirectory(prefix="lo_profile_") as profile_dir:
            profile_url = Path(profile_dir).as_uri()

            cmd = [
                self._soffice_path,
                "--headless",
                f"-env:UserInstallation={profile_url}",
                "--convert-to",
                target_format,
                "--outdir",
                str(output_dir),
                str(input_path),
            ]

            logger.debug(f"Running LibreOffice: {' '.join(cmd)}")

            try:
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=120,
                )
            except subprocess.TimeoutExpired as exc:
                # Chain the cause so the original timeout details survive.
                raise RuntimeError("LibreOffice conversion timed out") from exc

            if result.returncode != 0:
                raise RuntimeError(f"LibreOffice conversion failed: {result.stderr}")

            # The output is expected at <outdir>/<stem>.<format>; a zero exit
            # code alone is not taken as proof, so verify the file exists.
            converted_name = input_path.stem + "." + target_format
            converted_path = output_dir / converted_name

            if not converted_path.exists():
                raise RuntimeError(f"Converted file not found: {converted_path}")

            return converted_path

    def convert(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """Convert legacy Office document to Markdown.

        Args:
            input_path: Path to the input file
            output_dir: Optional output directory for extracted images

        Returns:
            ConvertResult containing markdown and extracted images

        Raises:
            ValueError: If the file extension is not a supported legacy format
            RuntimeError: If no conversion method succeeds
        """
        input_path = Path(input_path)
        suffix = input_path.suffix.lower()

        target_format = self.TARGET_FORMAT.get(suffix)
        if not target_format:
            raise ValueError(f"Unsupported format: {suffix}")

        # The intermediate modern-format file only needs to live while the
        # delegate converter reads it, so it goes in a temp directory.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Convert to modern format (COM first, LibreOffice fallback)
            converted_path = self._convert_legacy_format(
                input_path, target_format, temp_path
            )

            # Process with appropriate converter based on target format
            if target_format == "pptx":
                result = self._pptx_converter.convert(converted_path, output_dir)
            else:
                result = self._office_converter.convert(converted_path, output_dir)

            # Report the user's file, not the temporary intermediate one
            result.metadata["original_format"] = suffix.lstrip(".").upper()
            result.metadata["source"] = str(input_path)

            return result
581
+
582
+
583
+ # =============================================================================
584
+ # Registered Converters
585
+ # =============================================================================
586
+
587
+
588
@register_converter(FileFormat.DOC)
class DocConverter(LegacyOfficeConverter):
    """Handle legacy Word 97-2003 ``.doc`` files.

    All behaviour is inherited from ``LegacyOfficeConverter``; this subclass
    only declares the format it is registered for.
    """

    supported_formats = [FileFormat.DOC]
593
+
594
+
595
@register_converter(FileFormat.PPT)
class PptConverter(LegacyOfficeConverter):
    """Handle legacy PowerPoint 97-2003 ``.ppt`` files.

    All behaviour is inherited from ``LegacyOfficeConverter``; this subclass
    only declares the format it is registered for.
    """

    supported_formats = [FileFormat.PPT]
600
+
601
+
602
@register_converter(FileFormat.XLS)
class XlsConverter(LegacyOfficeConverter):
    """Handle legacy Excel 97-2003 ``.xls`` files.

    All behaviour is inherited from ``LegacyOfficeConverter``; this subclass
    only declares the format it is registered for.
    """

    supported_formats = [FileFormat.XLS]