markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/urls.py ADDED
@@ -0,0 +1,187 @@
1
+ """URL list parsing module for batch URL processing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+
10
+ from loguru import logger
11
+
12
+ # URL pattern for validation
13
+ _URL_PATTERN = re.compile(r"^https?://", re.IGNORECASE)
14
+
15
+
16
@dataclass
class UrlEntry:
    """One record read from a URL list file.

    Attributes:
        url: Address that should be processed.
        output_name: Custom output filename without extension, or None when
            the list file did not provide one.
    """

    url: str
    output_name: str | None = None
27
+
28
+
29
class UrlListParseError(Exception):
    """Signals that the contents of a URL list file could not be parsed."""
33
+
34
+
35
def is_url_list_file(path: Path) -> bool:
    """Tell whether *path* names a URL list file.

    The decision is made purely on the filename: the final suffix must be
    ".urls", compared without regard to case. The filesystem is not touched.

    Args:
        path: Path to check

    Returns:
        True if the file has .urls extension
    """
    suffix = path.suffix
    return ".urls" == suffix.lower()
47
+
48
+
49
def parse_url_list(file_path: Path) -> list[UrlEntry]:
    """Parse a URL list file.

    Supported formats:
    1. Plain text: one URL per line
       - Empty lines are ignored
       - Lines starting with # are comments
       - Lines can optionally have a custom output name after whitespace:
         https://example.com custom_name

    2. JSON array of strings:
       ["https://example1.com", "https://example2.com"]

    3. JSON array of objects:
       [
         {"url": "https://example1.com"},
         {"url": "https://example2.com", "output_name": "custom"}
       ]

    The format is chosen from the first non-whitespace character: a leading
    "[" selects the JSON parser, anything else the plain-text parser. A UTF-8
    byte-order mark at the start of the file is ignored.

    Args:
        file_path: Path to the URL list file

    Returns:
        List of UrlEntry objects (empty when the file has no content)

    Raises:
        UrlListParseError: If the file cannot be parsed
        FileNotFoundError: If the file does not exist
    """
    if not file_path.exists():
        raise FileNotFoundError(f"URL list file not found: {file_path}")

    # "utf-8-sig" reads ordinary UTF-8 and additionally drops a leading BOM.
    # With plain "utf-8" the BOM survives .strip() (it is not whitespace),
    # which hides the "[" JSON marker and makes the first URL of a text list
    # fail the scheme check.
    content = file_path.read_text(encoding="utf-8-sig").strip()

    if not content:
        return []

    # Try JSON first
    if content.startswith("["):
        return _parse_json_url_list(content, file_path)

    # Fall back to plain text
    return _parse_text_url_list(content, file_path)
92
+
93
+
94
def _parse_json_url_list(content: str, file_path: Path) -> list[UrlEntry]:
    """Parse JSON format URL list.

    The document must be a JSON array. Each element is either a URL string or
    an object with a "url" key and an optional "output_name" key. Elements
    that cannot be used are skipped with a warning instead of aborting the
    whole list.

    Args:
        content: Raw JSON text
        file_path: Source file, used only in error messages

    Returns:
        Entries for every element that carries a valid http(s) URL

    Raises:
        UrlListParseError: If the text is not valid JSON or the top-level
            value is not an array
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        # Chain the decoder error so the original position info is preserved.
        raise UrlListParseError(f"Invalid JSON in {file_path}: {e}") from e

    if not isinstance(data, list):
        raise UrlListParseError(
            f"Expected JSON array in {file_path}, got {type(data).__name__}"
        )

    entries: list[UrlEntry] = []
    for i, item in enumerate(data):
        if isinstance(item, str):
            # Simple string URL
            url = item.strip()
            if not url:
                continue
            if not _URL_PATTERN.match(url):
                logger.warning(f"Skipping invalid URL at index {i}: {url[:50]}...")
                continue
            entries.append(UrlEntry(url=url))

        elif isinstance(item, dict):
            # Object with url and optional output_name
            raw_url = item.get("url")
            if raw_url is None:
                logger.warning(f"Skipping entry at index {i}: missing 'url' field")
                continue
            if not isinstance(raw_url, str):
                # e.g. {"url": 123}; calling .strip() on it would raise.
                logger.warning(
                    f"Skipping entry at index {i}: 'url' must be a string, got {type(raw_url).__name__}"
                )
                continue

            url = raw_url.strip()
            if not url:
                logger.warning(f"Skipping entry at index {i}: missing 'url' field")
                continue
            if not _URL_PATTERN.match(url):
                logger.warning(f"Skipping invalid URL at index {i}: {url[:50]}...")
                continue

            # Empty, blank or otherwise falsy names all collapse to None so
            # the result matches what the plain-text parser produces.
            raw_name = item.get("output_name")
            output_name = str(raw_name).strip() if raw_name else ""

            entries.append(UrlEntry(url=url, output_name=output_name or None))

        else:
            logger.warning(
                f"Skipping entry at index {i}: expected string or object, got {type(item).__name__}"
            )

    return entries
140
+
141
+
142
def _parse_text_url_list(content: str, file_path: Path) -> list[UrlEntry]:
    """Parse plain text format URL list.

    Every non-blank line that is not a "#" comment is read as a URL, optionally
    followed by whitespace and a custom output name. The name may be wrapped
    in a matching pair of single or double quotes, which are removed. Lines
    whose first token is not an http(s) URL are skipped with a warning.

    Args:
        content: Text of the list file
        file_path: Source file (not used by this parser)

    Returns:
        Entries for every line that carries a valid URL
    """
    parsed = []

    for line_num, raw_line in enumerate(content.splitlines(), start=1):
        stripped = raw_line.strip()

        # Blank lines and comments carry no entry.
        if stripped == "" or stripped.startswith("#"):
            continue

        # First token is the URL; whatever follows the first run of
        # whitespace (if anything) is the output name.
        url, *remainder = stripped.split(None, 1)

        if _URL_PATTERN.match(url) is None:
            logger.warning(f"Skipping invalid URL at line {line_num}: {url[:50]}...")
            continue

        name = None
        if remainder:
            name = remainder[0].strip()
            # Drop one layer of surrounding quotes when both ends match.
            for quote in ('"', "'"):
                if name.startswith(quote) and name.endswith(quote):
                    name = name[1:-1]
                    break

        parsed.append(UrlEntry(url=url, output_name=name if name else None))

    return parsed
173
+
174
+
175
def find_url_list_files(directory: Path) -> list[Path]:
    """Collect every .urls file below a directory, searching recursively.

    Args:
        directory: Directory to search

    Returns:
        List of paths to .urls files, sorted by path; empty when *directory*
        is not an existing directory
    """
    if directory.is_dir():
        matches = list(directory.glob("**/*.urls"))
        matches.sort()
        return matches

    return []
@@ -0,0 +1,33 @@
1
+ """Markitai utilities."""
2
+
3
+ from markitai.utils.executor import (
4
+ get_converter_executor,
5
+ run_in_converter_thread,
6
+ shutdown_converter_executor,
7
+ )
8
+ from markitai.utils.mime import get_extension_from_mime, get_mime_type
9
+ from markitai.utils.office import find_libreoffice, has_ms_office
10
+ from markitai.utils.output import resolve_output_path
11
+ from markitai.utils.paths import (
12
+ ensure_assets_dir,
13
+ ensure_dir,
14
+ ensure_screenshots_dir,
15
+ ensure_subdir,
16
+ )
17
+ from markitai.utils.text import normalize_markdown_whitespace
18
+
19
+ __all__ = [
20
+ "ensure_assets_dir",
21
+ "ensure_dir",
22
+ "ensure_screenshots_dir",
23
+ "ensure_subdir",
24
+ "find_libreoffice",
25
+ "get_converter_executor",
26
+ "get_extension_from_mime",
27
+ "get_mime_type",
28
+ "has_ms_office",
29
+ "normalize_markdown_whitespace",
30
+ "resolve_output_path",
31
+ "run_in_converter_thread",
32
+ "shutdown_converter_executor",
33
+ ]
@@ -0,0 +1,69 @@
1
+ """Shared ThreadPoolExecutor for CPU-bound converter operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import os
7
+ import threading
8
+ from collections.abc import Callable
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ from typing import Any, TypeVar
11
+
12
+ T = TypeVar("T")
13
+
14
+ # Global converter thread pool executor with thread-safe initialization
15
+ _CONVERTER_EXECUTOR: ThreadPoolExecutor | None = None
16
+ _CONVERTER_MAX_WORKERS = min(os.cpu_count() or 4, 8)
17
+ _EXECUTOR_LOCK = threading.Lock()
18
+
19
+
20
def get_converter_executor() -> ThreadPoolExecutor:
    """Return the shared converter thread pool, creating it on first use.

    The module-level pool is checked once without the lock and, if still
    missing, checked again while holding _EXECUTOR_LOCK before it is built,
    so concurrent first callers end up with a single pool.

    Returns:
        Shared ThreadPoolExecutor instance for converter operations
    """
    global _CONVERTER_EXECUTOR

    if _CONVERTER_EXECUTOR is not None:
        return _CONVERTER_EXECUTOR

    with _EXECUTOR_LOCK:
        # Re-test under the lock: another thread may have built the pool
        # between the unlocked check above and acquiring the lock.
        if _CONVERTER_EXECUTOR is None:
            _CONVERTER_EXECUTOR = ThreadPoolExecutor(
                max_workers=_CONVERTER_MAX_WORKERS,
                thread_name_prefix="markitai-converter",
            )

    return _CONVERTER_EXECUTOR
38
+
39
+
40
async def run_in_converter_thread(
    func: Callable[..., T], *args: Any, **kwargs: Any
) -> T:
    """Await the result of *func* executed on the shared converter pool.

    Intended for CPU-bound converter work (PDF parsing, document conversion,
    etc.) that would otherwise block the event loop.

    Args:
        func: Function to run in thread pool
        *args: Positional arguments to pass to func
        **kwargs: Keyword arguments to pass to func

    Returns:
        Result of func(*args, **kwargs)
    """

    def _invoke() -> T:
        # run_in_executor forwards positional arguments only, so the keyword
        # arguments are bound here in a closure.
        return func(*args, **kwargs)

    running_loop = asyncio.get_running_loop()
    pool = get_converter_executor()
    return await running_loop.run_in_executor(pool, _invoke)
59
+
60
+
61
def shutdown_converter_executor() -> None:
    """Stop the shared converter pool, if one was ever created.

    Waits for the pool's pending work to finish, then clears the module-level
    reference so a later get_converter_executor() call builds a new pool.
    Call this during application cleanup to ensure clean shutdown.
    """
    global _CONVERTER_EXECUTOR

    if _CONVERTER_EXECUTOR is None:
        # Nothing was started, nothing to stop.
        return

    _CONVERTER_EXECUTOR.shutdown(wait=True)
    _CONVERTER_EXECUTOR = None
markitai/utils/mime.py ADDED
@@ -0,0 +1,85 @@
1
+ """MIME type utilities for image handling.
2
+
3
+ This module provides helper functions for MIME type operations,
4
+ using the centralized mappings defined in constants.py.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from markitai.constants import EXTENSION_TO_MIME, MIME_TO_EXTENSION
10
+
11
# Image MIME types accepted by vision LLMs (Anthropic Claude, Google Gemini,
# OpenAI GPT-4V). Other formats such as SVG, BMP or ICO are NOT supported.
LLM_SUPPORTED_MIME_TYPES = frozenset(
    f"image/{subtype}" for subtype in ("jpeg", "png", "gif", "webp")
)
16
+
17
+
18
def get_mime_type(extension: str, default: str = "image/jpeg") -> str:
    """Look up the MIME type registered for a file extension.

    The extension is lower-cased and given a leading dot if it lacks one
    before it is looked up in EXTENSION_TO_MIME.

    Args:
        extension: File extension (with or without leading dot), e.g. ".jpg" or "jpg"
        default: Default MIME type if extension is not recognized

    Returns:
        MIME type string, e.g. "image/jpeg"

    Examples:
        >>> get_mime_type(".jpg")
        'image/jpeg'
        >>> get_mime_type("png")
        'image/png'
        >>> get_mime_type(".unknown")
        'image/jpeg'
    """
    lowered = extension.lower()
    key = lowered if lowered.startswith(".") else "." + lowered
    return EXTENSION_TO_MIME.get(key, default)
41
+
42
+
43
def get_extension_from_mime(mime_type: str, default: str = ".jpg") -> str:
    """Look up the file extension registered for a MIME type.

    Args:
        mime_type: MIME type string, e.g. "image/jpeg"
        default: Default extension if MIME type is not recognized

    Returns:
        File extension with leading dot, e.g. ".jpg"

    Examples:
        >>> get_extension_from_mime("image/jpeg")
        '.jpg'
        >>> get_extension_from_mime("image/png")
        '.png'
        >>> get_extension_from_mime("image/unknown")
        '.jpg'
    """
    # A full Content-Type value may carry parameters after ";"
    # (e.g. "image/jpeg; charset=utf-8"); only the part before it is the type.
    essence, _, _params = mime_type.lower().partition(";")
    return MIME_TO_EXTENSION.get(essence.strip(), default)
64
+
65
+
66
def is_llm_supported_image(extension: str) -> bool:
    """Report whether an image extension maps to an LLM-supported MIME type.

    Vision LLMs (Claude, Gemini, GPT-4V) only support jpeg, png, gif, webp.
    Formats like SVG, BMP, ICO are NOT supported.

    Args:
        extension: File extension (with or without leading dot)

    Returns:
        True if the format is supported by vision LLMs

    Examples:
        >>> is_llm_supported_image(".jpg")
        True
        >>> is_llm_supported_image(".svg")
        False
    """
    # An empty default means unrecognized extensions resolve to "", which is
    # never a member of the supported set.
    return get_mime_type(extension, default="") in LLM_SUPPORTED_MIME_TYPES
@@ -0,0 +1,262 @@
1
+ """Office application detection utilities.
2
+
3
+ Provides detection for MS Office (Windows) and LibreOffice (cross-platform).
4
+ - MS Office COM: Used for legacy format conversion (.doc/.ppt) and PPTX slide rendering
5
+ - LibreOffice: Used as fallback for legacy format conversion and PDF export
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import platform
11
+ import shutil
12
+ from functools import lru_cache
13
+ from pathlib import Path
14
+
15
+ from loguru import logger
16
+
17
+ # Common MS Office installation paths on Windows
18
+ _MS_OFFICE_PATHS = [
19
+ # Microsoft 365 / Office 2019+ (Click-to-Run)
20
+ r"C:\Program Files\Microsoft Office\root\Office16",
21
+ r"C:\Program Files (x86)\Microsoft Office\root\Office16",
22
+ # Office 2016 (MSI)
23
+ r"C:\Program Files\Microsoft Office\Office16",
24
+ r"C:\Program Files (x86)\Microsoft Office\Office16",
25
+ # Office 2013
26
+ r"C:\Program Files\Microsoft Office\Office15",
27
+ r"C:\Program Files (x86)\Microsoft Office\Office15",
28
+ # Office 2010
29
+ r"C:\Program Files\Microsoft Office\Office14",
30
+ r"C:\Program Files (x86)\Microsoft Office\Office14",
31
+ ]
32
+
33
+
34
+ def _is_windows() -> bool:
35
+ """Check if running on Windows."""
36
+ return platform.system() == "Windows"
37
+
38
+
39
def _check_office_exe_exists(app_name: str) -> bool:
    """Look for an Office executable in the directories of _MS_OFFICE_PATHS.

    Args:
        app_name: Application name without extension (e.g., "POWERPNT", "WINWORD", "EXCEL")

    Returns:
        True if the executable is found in any common path.
    """
    exe_name = f"{app_name}.EXE"
    candidates = (Path(directory) / exe_name for directory in _MS_OFFICE_PATHS)

    # The generator is lazy, so probing stops at the first existing file.
    found = next((candidate for candidate in candidates if candidate.exists()), None)
    if found is None:
        return False

    logger.debug(f"Found {app_name} at: {found}")
    return True
55
+
56
+
57
def _check_ms_app_available(prog_id: str, exe_name: str, label: str) -> bool:
    """Detect one MS Office application; shared by the public check_* functions.

    Detection strategy:
    1. Windows Registry lookup (fast, preferred)
    2. Direct file path check (fallback for Click-to-Run installations)

    Args:
        prog_id: Key name looked up under HKEY_CLASSES_ROOT, e.g. "Word.Application"
        exe_name: Executable name without extension, e.g. "WINWORD"
        label: Application name used in debug log messages, e.g. "Word"

    Returns:
        True if the application is found by either method, False otherwise
        (always False when not running on Windows).
    """
    if not _is_windows():
        return False

    # Method 1: Registry lookup
    try:
        import winreg  # type: ignore[import-not-found]

        try:
            # The key handle is a context manager, so it is closed on exit
            # without an explicit CloseKey call.
            with winreg.OpenKey(winreg.HKEY_CLASSES_ROOT, prog_id):  # type: ignore[attr-defined]
                logger.debug(f"MS {label} detected via registry")
                return True
        except OSError:
            pass  # Registry key not found, try file path
    except ImportError:
        pass  # winreg not available

    # Method 2: Direct file path check (for Click-to-Run installations)
    if _check_office_exe_exists(exe_name):
        logger.debug(f"MS {label} detected via file path")
        return True

    logger.debug(f"MS {label} not found")
    return False


@lru_cache(maxsize=1)
def check_ms_powerpoint_available() -> bool:
    """Check if MS Office PowerPoint is installed (Windows only).

    Detection strategy:
    1. Windows Registry lookup (fast, preferred)
    2. Direct file path check (fallback for Click-to-Run installations)

    The result is cached after the first call.

    Returns:
        True if PowerPoint is installed, False otherwise.
    """
    return _check_ms_app_available("PowerPoint.Application", "POWERPNT", "PowerPoint")


@lru_cache(maxsize=1)
def check_ms_word_available() -> bool:
    """Check if MS Office Word is installed (Windows only).

    Detection strategy:
    1. Windows Registry lookup (fast, preferred)
    2. Direct file path check (fallback for Click-to-Run installations)

    The result is cached after the first call.

    Returns:
        True if Word is installed, False otherwise.
    """
    return _check_ms_app_available("Word.Application", "WINWORD", "Word")


@lru_cache(maxsize=1)
def check_ms_excel_available() -> bool:
    """Check if MS Office Excel is installed (Windows only).

    Detection strategy:
    1. Windows Registry lookup (fast, preferred)
    2. Direct file path check (fallback for Click-to-Run installations)

    The result is cached after the first call.

    Returns:
        True if Excel is installed, False otherwise.
    """
    return _check_ms_app_available("Excel.Application", "EXCEL", "Excel")
166
+
167
+
168
+ import threading
169
+
170
+ # Thread-safe cache for has_ms_office result
171
+ _ms_office_check_lock = threading.Lock()
172
+ _ms_office_checked = False
173
+ _ms_office_available = False
174
+
175
+
176
def has_ms_office() -> bool:
    """Detect if MS Office PowerPoint is available via COM (Windows only).

    Used for optional high-quality PPTX slide rendering.
    Text extraction uses MarkItDown (cross-platform) and doesn't need COM.

    The probe dispatches "PowerPoint.Application" through COM and then asks
    it to quit. The outcome is stored in module globals, so the probe runs at
    most once per process.

    Note: For checking installation status, prefer `check_ms_powerpoint_available()`
    which uses registry lookup and is faster.

    Returns:
        True if PowerPoint COM is available, False otherwise.
    """
    global _ms_office_checked, _ms_office_available

    # Fast path: a result is already stored, no lock needed.
    if _ms_office_checked:
        return _ms_office_available

    if not _is_windows():
        _ms_office_checked = True
        _ms_office_available = False
        return False

    with _ms_office_check_lock:
        # Another thread may have completed the probe while this one waited
        # for the lock; only probe when no result is stored yet.
        if not _ms_office_checked:
            try:
                import pythoncom  # type: ignore[import-not-found]
                import win32com.client  # type: ignore[import-not-found]

                # Initialize COM for this thread (required in worker threads)
                pythoncom.CoInitialize()
                try:
                    # Check PowerPoint availability (most relevant for PPTX)
                    app = win32com.client.Dispatch("PowerPoint.Application")
                    app.Quit()
                    logger.debug("MS Office (PowerPoint) detected via COM")
                    _ms_office_available = True
                finally:
                    pythoncom.CoUninitialize()
            except Exception:
                # Deliberately broad: a missing pywin32 import or any COM
                # failure is treated as "not available".
                logger.debug("MS Office not available via COM")
                _ms_office_available = False

            _ms_office_checked = True

        return _ms_office_available
225
+
226
+
227
@lru_cache(maxsize=1)
def find_libreoffice() -> str | None:
    """Locate the LibreOffice soffice executable (result is cached).

    The commands "soffice" and "libreoffice" are resolved via PATH first;
    if neither is found, a fixed list of common installation paths for
    Linux, macOS and Windows is tried.

    Returns:
        Path to soffice executable, or None if not found.
    """
    # map() is lazy, so "libreoffice" is only looked up when "soffice" is not
    # on PATH.
    on_path = next(filter(None, map(shutil.which, ("soffice", "libreoffice"))), None)
    if on_path:
        logger.debug(f"LibreOffice found in PATH: {on_path}")
        return on_path

    well_known_locations = (
        # Linux
        "/usr/bin/soffice",
        "/usr/local/bin/soffice",
        "/opt/libreoffice/program/soffice",
        # macOS
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        # Windows
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
    )

    for candidate in well_known_locations:
        # shutil.which is used as the "does it exist and can it be run" test.
        if shutil.which(candidate):
            logger.debug(f"LibreOffice found at: {candidate}")
            return candidate

    logger.debug("LibreOffice not found")
    return None
@@ -0,0 +1,53 @@
1
+ """Output path utilities for Markitai."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+
8
def resolve_output_path(
    base_path: Path,
    on_conflict: str,
) -> Path | None:
    """Resolve output path based on conflict strategy.

    Args:
        base_path: The original output file path
        on_conflict: Conflict resolution strategy ("skip", "overwrite", "rename").
            Any other value is treated like "rename".

    Returns:
        Resolved path, or None if file should be skipped.
        When base_path does not exist yet it is returned unchanged.
        For rename strategy: file.pdf.md -> file.pdf.v2.md -> file.pdf.v3.md
        For rename with .llm.md: file.pdf.llm.md -> file.pdf.v2.llm.md
        For any other name the version goes before the last extension:
        notes.txt -> notes.v2.txt
        This ensures files sort in natural order (A-Z).
    """
    if not base_path.exists():
        return base_path

    if on_conflict == "skip":
        return None
    if on_conflict == "overwrite":
        return base_path

    # rename: insert ".v<N>" in front of the output suffix.
    name = base_path.name

    # ".llm.md" must be tested before ".md" because it also ends in ".md".
    for markitai_suffix in (".llm.md", ".md"):
        if name.endswith(markitai_suffix):
            base_stem = name[: -len(markitai_suffix)]
            break
    else:
        # Not a markitai output name: keep its real extension instead of
        # blindly cutting three characters and appending ".md".
        markitai_suffix = base_path.suffix
        base_stem = base_path.stem

    # Find next available sequence number
    seq = 2
    while True:
        new_path = base_path.parent / f"{base_stem}.v{seq}{markitai_suffix}"
        if not new_path.exists():
            return new_path
        seq += 1