markitai 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/constants.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""Centralized constants for markitai.
|
|
2
|
+
|
|
3
|
+
This module contains all hardcoded constants used throughout the codebase.
|
|
4
|
+
Grouping them here makes it easier to:
|
|
5
|
+
- Find and modify default values
|
|
6
|
+
- Understand system limits at a glance
|
|
7
|
+
- Maintain consistency across modules
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
# =============================================================================
|
|
13
|
+
# File Size Limits
|
|
14
|
+
# =============================================================================
|
|
15
|
+
|
|
16
|
+
# All limits in this group are byte counts (N * 1024 * 1024 == N MB).
MAX_STATE_FILE_SIZE: int = 10 * 1024 * 1024  # 10 MB - batch state file
MAX_IMAGE_SIZE: int = 100 * 1024 * 1024  # 100 MB - single image
MAX_TOTAL_IMAGES_SIZE: int = 500 * 1024 * 1024  # 500 MB - all images combined
MAX_DOCUMENT_SIZE: int = 500 * 1024 * 1024  # 500 MB - input document
|
|
20
|
+
|
|
21
|
+
# =============================================================================
|
|
22
|
+
# LLM Processing
|
|
23
|
+
# =============================================================================
|
|
24
|
+
|
|
25
|
+
# Retry settings
|
|
26
|
+
DEFAULT_MAX_RETRIES: int = 2  # NOTE(review): presumably retries per LLM call - confirm in llm.py
DEFAULT_RETRY_BASE_DELAY: float = 1.0  # seconds
DEFAULT_RETRY_MAX_DELAY: float = 60.0  # seconds

# Instructor retry settings (for structured JSON output validation).
# When the LLM returns malformed JSON, Instructor can retry with validation
# error feedback, allowing the LLM to fix issues like incorrect escaping.
DEFAULT_INSTRUCTOR_MAX_RETRIES: int = 1

# Token limits
DEFAULT_MAX_OUTPUT_TOKENS: int = 8192  # Conservative default for most models
DEFAULT_MAX_CONTENT_CHARS: int = (
    32000  # Max chars for document processing (truncation threshold)
)

# Concurrency
DEFAULT_IO_CONCURRENCY: int = 20  # I/O operations (file reads, etc.)
DEFAULT_LLM_CONCURRENCY: int = 10  # LLM API calls (config default)
DEFAULT_BATCH_CONCURRENCY: int = 10  # Batch file processing (config default)
DEFAULT_URL_CONCURRENCY: int = 5  # URL fetching (separate from file processing)

# Batch sizes
DEFAULT_MAX_IMAGES_PER_BATCH: int = 10  # Images per LLM vision call
# Pages per LLM call for document processing
# (reduced from 10 to avoid hitting max_tokens).
DEFAULT_MAX_PAGES_PER_BATCH: int = 5

# Router settings
DEFAULT_ROUTER_NUM_RETRIES: int = 2
DEFAULT_ROUTER_TIMEOUT: int = 120  # seconds
|
|
54
|
+
|
|
55
|
+
# Note: RETRYABLE_ERRORS tuple is defined in llm.py as it contains
|
|
56
|
+
# actual exception classes from litellm that cannot be imported here
|
|
57
|
+
|
|
58
|
+
# =============================================================================
|
|
59
|
+
# Image Processing
|
|
60
|
+
# =============================================================================
|
|
61
|
+
|
|
62
|
+
DEFAULT_IMAGE_QUALITY: int = 75  # JPEG quality (1-100)
DEFAULT_RENDER_DPI: int = 150  # DPI for page screenshots (PDF, PPTX, etc.)
DEFAULT_IMAGE_IO_CONCURRENCY: int = 8  # Concurrent I/O for image saving (optimized for NVMe)
DEFAULT_IMAGE_MULTIPROCESS_THRESHOLD: int = (
    10  # Use multiprocess compression when the image count exceeds this
)
DEFAULT_IMAGE_MAX_WIDTH: int = 1920  # NOTE(review): presumably pixels - confirm in image.py
DEFAULT_IMAGE_MAX_HEIGHT: int = 99999  # Effectively unlimited (thumbnail won't upscale)

# Image filter thresholds
# NOTE(review): presumably pixels / square pixels, with smaller images
# being dropped - confirm against the filter implementation.
DEFAULT_IMAGE_FILTER_MIN_WIDTH: int = 50
DEFAULT_IMAGE_FILTER_MIN_HEIGHT: int = 50
DEFAULT_IMAGE_FILTER_MIN_AREA: int = 5000
|
|
75
|
+
|
|
76
|
+
# =============================================================================
|
|
77
|
+
# Cache Settings
|
|
78
|
+
# =============================================================================
|
|
79
|
+
|
|
80
|
+
# In-memory cache (legacy, still used for image bytes cache)
|
|
81
|
+
DEFAULT_CACHE_MAXSIZE: int = 100  # Max entries in LLM content cache
DEFAULT_CACHE_TTL_SECONDS: int = 300  # Cache TTL (5 minutes)

# Persistent SQLite cache
DEFAULT_CACHE_SIZE_LIMIT: int = 512 * 1024 * 1024  # 512 MB per cache file (bytes)
DEFAULT_GLOBAL_CACHE_DIR: str = "~/.markitai"  # Global cache directory ("~" is not expanded here)
DEFAULT_PROJECT_CACHE_DIR: str = ".markitai"  # Project-level cache directory (relative path)
DEFAULT_CACHE_DB_FILENAME: str = "cache.db"  # SQLite database filename
DEFAULT_CACHE_CONTENT_TRUNCATE: int = 50000  # Truncate content for hash key (chars)
|
|
90
|
+
|
|
91
|
+
# =============================================================================
|
|
92
|
+
# Batch Processing
|
|
93
|
+
# =============================================================================
|
|
94
|
+
|
|
95
|
+
DEFAULT_STATE_FLUSH_INTERVAL_SECONDS: int = 10  # Seconds; increased to reduce I/O overhead
# NOTE(review): presumably bounds on directory scanning (nesting depth and
# number of files collected) - confirm against the batch scanner.
DEFAULT_SCAN_MAX_DEPTH: int = 5
DEFAULT_SCAN_MAX_FILES: int = 10000
|
|
98
|
+
|
|
99
|
+
# =============================================================================
|
|
100
|
+
# Logging
|
|
101
|
+
# =============================================================================
|
|
102
|
+
|
|
103
|
+
# Human-readable size/duration strings.
# NOTE(review): format looks like loguru's rotation/retention syntax - confirm.
DEFAULT_LOG_ROTATION: str = "10 MB"
DEFAULT_LOG_RETENTION: str = "7 days"
|
|
105
|
+
|
|
106
|
+
# =============================================================================
|
|
107
|
+
# UI / Display
|
|
108
|
+
# =============================================================================
|
|
109
|
+
|
|
110
|
+
DEFAULT_LOG_PANEL_MAX_LINES: int = 8  # Lines shown in verbose mode log panel
DEFAULT_JSON_INDENT: int = 2  # JSON output indentation (spaces)
|
|
112
|
+
|
|
113
|
+
# =============================================================================
|
|
114
|
+
# Paths and Filenames
|
|
115
|
+
# =============================================================================
|
|
116
|
+
|
|
117
|
+
# Paths are stored as plain strings; "~" is left unexpanded in this module.
DEFAULT_OUTPUT_DIR: str = "./output"
DEFAULT_PROMPTS_DIR: str = "~/.markitai/prompts"
DEFAULT_LOG_DIR: str = "~/.markitai/logs"
CONFIG_FILENAME: str = "markitai.json"
|
|
121
|
+
|
|
122
|
+
# =============================================================================
|
|
123
|
+
# OCR
|
|
124
|
+
# =============================================================================
|
|
125
|
+
|
|
126
|
+
DEFAULT_OCR_LANG: str = "en"  # OCR language code
DEFAULT_OCR_SAMPLE_PAGES: int = 3  # Pages to sample for scanned PDF detection
|
|
128
|
+
|
|
129
|
+
# =============================================================================
|
|
130
|
+
# Misc Defaults
|
|
131
|
+
# =============================================================================
|
|
132
|
+
|
|
133
|
+
DEFAULT_MODEL_WEIGHT: int = 1  # Default model weight in router
# 3.5 MB max: base64 adds ~33% (3.5 MB * 4/3 is about 4.67 MB), so the
# encoded payload stays under the 5 MB API limit.
DEFAULT_SCREENSHOT_MAX_BYTES: int = int(
    3.5 * 1024 * 1024
)

# URL Screenshot settings
DEFAULT_SCREENSHOT_VIEWPORT_WIDTH: int = 1920
DEFAULT_SCREENSHOT_VIEWPORT_HEIGHT: int = 1080
DEFAULT_SCREENSHOT_QUALITY: int = 75  # JPEG quality (1-100)
DEFAULT_SCREENSHOT_MAX_HEIGHT: int = 10000  # Max height for full-page URL screenshots
DEFAULT_ROUTING_STRATEGY: str = "simple-shuffle"  # Router strategy name
DEFAULT_IMAGE_FORMAT: str = "jpeg"
DEFAULT_ON_CONFLICT: str = "rename"  # NOTE(review): presumably the output-file conflict policy - confirm
DEFAULT_LOG_LEVEL: str = "INFO"
|
|
147
|
+
|
|
148
|
+
# =============================================================================
|
|
149
|
+
# URL Fetch Settings
|
|
150
|
+
# =============================================================================
|
|
151
|
+
|
|
152
|
+
DEFAULT_FETCH_STRATEGY: str = "auto"  # auto | static | browser | jina
DEFAULT_AGENT_BROWSER_COMMAND: str = "agent-browser"  # Command name of the browser agent
DEFAULT_AGENT_BROWSER_TIMEOUT: int = 30000  # ms
DEFAULT_AGENT_BROWSER_WAIT_FOR: str = (
    "domcontentloaded"  # load | domcontentloaded | networkidle
)
DEFAULT_AGENT_BROWSER_EXTRA_WAIT_MS: int = (
    1000  # Extra wait after load state (for JS rendering), in ms
)
# Note the unit difference: the Jina timeout is in seconds, while the agent
# browser timeout above is in milliseconds.
DEFAULT_JINA_TIMEOUT: int = 30  # seconds
DEFAULT_JINA_BASE_URL: str = "https://r.jina.ai"
|
|
163
|
+
|
|
164
|
+
# Domains that typically require JavaScript rendering
|
|
165
|
+
# Bare domain names (no scheme, no "www." prefix).
# NOTE(review): how these are matched against a URL (substring, suffix, ...)
# is decided by the consumer - confirm in fetch.py.
DEFAULT_FETCH_FALLBACK_PATTERNS: tuple[str, ...] = (
    "twitter.com",
    "x.com",
    "instagram.com",
    "facebook.com",
    "linkedin.com",
    "threads.net",
)
|
|
173
|
+
|
|
174
|
+
# Patterns that indicate JavaScript is required
|
|
175
|
+
# Contains both capitalised phrases and all-lowercase variants.
# NOTE(review): whether matching is case-sensitive is up to the consumer;
# "noscript" is a very broad token - confirm it does not over-trigger.
JS_REQUIRED_PATTERNS: tuple[str, ...] = (
    "JavaScript is disabled",
    "JavaScript is required",
    "Please enable JavaScript",
    "This page requires JavaScript",
    "You need to enable JavaScript",
    "enable javascript",
    "requires javascript",
    "noscript",
)
|
|
185
|
+
|
|
186
|
+
# =============================================================================
|
|
187
|
+
# MIME Type Mappings
|
|
188
|
+
# =============================================================================
|
|
189
|
+
|
|
190
|
+
# Extension to MIME type mapping (for encoding images to send to LLM APIs)
|
|
191
|
+
# Keys are lowercase extensions including the leading dot.
# ".jpg" and ".jpeg" both map to "image/jpeg".
EXTENSION_TO_MIME: dict[str, str] = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".bmp": "image/bmp",
    ".svg": "image/svg+xml",
    ".ico": "image/x-icon",
}
|
|
201
|
+
|
|
202
|
+
# MIME type to extension mapping (for decoding content-type headers)
|
|
203
|
+
# Not a strict inverse of the extension-to-MIME table: "image/jpeg" always
# yields ".jpg", and both icon MIME types yield ".ico".
MIME_TO_EXTENSION: dict[str, str] = {
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "image/gif": ".gif",
    "image/webp": ".webp",
    "image/svg+xml": ".svg",
    "image/bmp": ".bmp",
    "image/x-icon": ".ico",
    "image/vnd.microsoft.icon": ".ico",
}
|
|
213
|
+
|
|
214
|
+
# Supported image extensions for standalone image detection
|
|
215
|
+
# Lowercase extensions including the leading dot. ".svg" and ".ico" are
# deliberately absent here even though they have MIME mappings in this module.
IMAGE_EXTENSIONS: tuple[str, ...] = (
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".gif",
    ".bmp",
)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Converter modules for various document formats."""
|
|
2
|
+
|
|
3
|
+
# Apply compatibility patches before importing converters
|
|
4
|
+
from markitai.converter._patches import apply_all_patches
|
|
5
|
+
|
|
6
|
+
apply_all_patches()
|
|
7
|
+
|
|
8
|
+
from markitai.converter.base import (
|
|
9
|
+
BaseConverter,
|
|
10
|
+
ConvertResult,
|
|
11
|
+
ExtractedImage,
|
|
12
|
+
FileFormat,
|
|
13
|
+
detect_format,
|
|
14
|
+
get_converter,
|
|
15
|
+
)
|
|
16
|
+
from markitai.converter.image import (
|
|
17
|
+
JpegConverter,
|
|
18
|
+
JpgConverter,
|
|
19
|
+
PngConverter,
|
|
20
|
+
WebpConverter,
|
|
21
|
+
)
|
|
22
|
+
from markitai.converter.legacy import DocConverter, PptConverter, XlsConverter
|
|
23
|
+
|
|
24
|
+
# Import converters to register them
|
|
25
|
+
from markitai.converter.office import DocxConverter, PptxConverter, XlsxConverter
|
|
26
|
+
from markitai.converter.pdf import PdfConverter
|
|
27
|
+
from markitai.converter.text import MarkdownConverter, TxtConverter
|
|
28
|
+
|
|
29
|
+
# Public API of the converter package: the base types and lookup helpers
# first, then the concrete converter classes imported above.
__all__ = [
    # Base types and helpers
    "BaseConverter",
    "ConvertResult",
    "ExtractedImage",
    "FileFormat",
    "get_converter",
    "detect_format",
    # Office Open XML converters
    "DocxConverter",
    "PptxConverter",
    "XlsxConverter",
    # PDF and plain-text converters
    "PdfConverter",
    "TxtConverter",
    "MarkdownConverter",
    # Legacy Office converters
    "DocConverter",
    "PptConverter",
    "XlsConverter",
    # Image converters
    "JpegConverter",
    "JpgConverter",
    "PngConverter",
    "WebpConverter",
]
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Compatibility patches for third-party libraries.
|
|
2
|
+
|
|
3
|
+
This module applies monkey patches to fix known issues in dependencies:
|
|
4
|
+
- openpyxl 3.1.x FileVersion TypeError ('bg' argument)
|
|
5
|
+
- lxml XMLSyntaxError from malformed PPTX files
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import functools
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
_patches_applied = False
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def apply_openpyxl_patches() -> None:
    """Make openpyxl's ``FileVersion`` constructor ignore the ``bg`` keyword.

    Works around
    ``TypeError: FileVersion.__init__() got an unexpected keyword argument 'bg'``,
    reported with openpyxl 3.1.x for Excel files created by older versions
    of MS Office or converted from .xls format.

    Does nothing when openpyxl cannot be imported, or when the constructor
    already carries the ``_markitai_patched`` marker.
    """
    try:
        from openpyxl.workbook.properties import FileVersion
    except ImportError:
        # openpyxl is not installed, so there is nothing to patch.
        return

    # A previous call already installed the wrapper; leave it alone.
    if getattr(FileVersion.__init__, "_markitai_patched", False):
        return

    wrapped_init = FileVersion.__init__
    dropped_kwargs = ("bg",)

    @functools.wraps(wrapped_init)
    def _tolerant_init(self: Any, *args: Any, **kwargs: Any) -> None:
        # Discard keywords the real constructor rejects, then delegate.
        for name in dropped_kwargs:
            kwargs.pop(name, None)
        wrapped_init(self, *args, **kwargs)

    # Mark the wrapper so later calls can recognise it.
    _tolerant_init._markitai_patched = True  # type: ignore[attr-defined]
    FileVersion.__init__ = _tolerant_init
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def apply_pptx_patches() -> None:
    """Wrap ``pptx.oxml.parse_xml`` with a lenient fallback parser.

    Fixes: XMLSyntaxError from malformed PPTX files converted from PPT.

    When MS Office converts .ppt to .pptx, it may produce XML with
    mismatched tags (e.g., 'rupB' vs 'bgPr'). The wrapper first calls the
    original ``parse_xml``; only if that raises ``XMLSyntaxError`` does it
    re-parse the same payload with an lxml parser created with
    ``recover=True``.

    Does nothing when lxml or python-pptx cannot be imported, or when
    ``pptx.oxml.parse_xml`` already carries the ``_markitai_patched`` marker.
    """
    try:
        from lxml import etree
    except ImportError:
        return  # lxml not installed

    try:
        import pptx.oxml
    except ImportError:
        return  # python-pptx not installed

    # Already patched: avoid building another parser and wrapper.
    if getattr(pptx.oxml.parse_xml, "_markitai_patched", False):
        return

    # Presentation XML is untrusted input, so the fallback parser must not
    # expand entities (resolve_entities=False) while it recovers from
    # syntax errors.
    # NOTE(review): no element class lookup is configured on this parser, so
    # elements from the fallback path are presumably plain lxml elements
    # rather than python-pptx's custom classes - confirm downstream code copes.
    _lenient_parser = etree.XMLParser(
        recover=True,
        remove_blank_text=True,
        resolve_entities=False,
    )

    original_parse_xml = pptx.oxml.parse_xml

    @functools.wraps(original_parse_xml)
    def patched_parse_xml(xml: bytes | str) -> Any:
        try:
            return original_parse_xml(xml)
        except etree.XMLSyntaxError:
            # Fallback to lenient parser. Encode str input first so the
            # parser always receives bytes.
            if isinstance(xml, str):
                xml = xml.encode("utf-8")
            return etree.fromstring(xml, parser=_lenient_parser)

    pptx.oxml.parse_xml = patched_parse_xml
    pptx.oxml.parse_xml._markitai_patched = True  # type: ignore[attr-defined]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def apply_all_patches() -> None:
    """Run every compatibility patch, at most once.

    The module-level ``_patches_applied`` flag is set after the first run,
    so later calls return without doing anything.
    """
    global _patches_applied
    if not _patches_applied:
        apply_openpyxl_patches()
        apply_pptx_patches()
        _patches_applied = True
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Base converter classes and utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from markitai.config import MarkitaiConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FileFormat(Enum):
    """Supported file formats.

    Each member's value is the lowercase format name as a string.
    ``JPEG`` and ``JPG`` are distinct members, and ``UNKNOWN`` stands for
    any file whose format is not recognised.
    """

    # Office Open XML formats (2007+)
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"

    # Legacy Office formats (97-2003), requires LibreOffice
    DOC = "doc"
    PPT = "ppt"
    XLS = "xls"

    # PDF
    PDF = "pdf"

    # Text
    TXT = "txt"
    MD = "md"

    # Images (".jpeg" and ".jpg" are kept as separate formats)
    JPEG = "jpeg"
    JPG = "jpg"
    PNG = "png"
    WEBP = "webp"

    # Unknown (fallback for unrecognised files)
    UNKNOWN = "unknown"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Mapping of file extensions to formats
|
|
46
|
+
# Keys are lowercase suffixes including the leading dot. ".md" and
# ".markdown" share one format, while ".jpeg" and ".jpg" map to separate
# members.
EXTENSION_MAP: dict[str, FileFormat] = {
    ".docx": FileFormat.DOCX,
    ".doc": FileFormat.DOC,
    ".pptx": FileFormat.PPTX,
    ".ppt": FileFormat.PPT,
    ".xlsx": FileFormat.XLSX,
    ".xls": FileFormat.XLS,
    ".pdf": FileFormat.PDF,
    ".txt": FileFormat.TXT,
    ".md": FileFormat.MD,
    ".markdown": FileFormat.MD,
    ".jpeg": FileFormat.JPEG,
    ".jpg": FileFormat.JPG,
    ".png": FileFormat.PNG,
    ".webp": FileFormat.WEBP,
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def detect_format(path: Path | str) -> FileFormat:
    """Return the format of *path*, judged only by its file extension.

    The final suffix is lowercased before the lookup, so matching is
    case-insensitive. A missing or unmapped suffix gives
    ``FileFormat.UNKNOWN``.
    """
    suffix = Path(path).suffix.lower()
    return EXTENSION_MAP.get(suffix, FileFormat.UNKNOWN)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class ExtractedImage:
    """Represents an image extracted from a document.

    All fields except ``data`` are required.
    """

    path: Path  # NOTE(review): presumably where the image is or will be saved - confirm
    index: int  # NOTE(review): presumably the image's position within the document - confirm base (0 or 1)
    original_name: str  # Name the image had in the source document
    mime_type: str  # MIME type string, e.g. "image/png"
    width: int  # NOTE(review): presumably pixels - confirm
    height: int  # NOTE(review): presumably pixels - confirm
    data: bytes | None = None  # Raw image data before saving
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class ConvertResult:
    """Outcome of converting one document.

    Holds the generated Markdown text plus any extracted images and a
    free-form metadata dictionary. ``images`` and ``metadata`` default to a
    fresh empty list / dict per instance.
    """

    markdown: str
    images: list[ExtractedImage] = field(default_factory=list)
    metadata: dict = field(default_factory=dict)

    @property
    def has_images(self) -> bool:
        """Return True when at least one image was extracted."""
        image_count = len(self.images)
        return image_count >= 1
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class BaseConverter(ABC):
    """Common interface that every document converter implements."""

    # Formats the concrete converter handles; the base class handles none.
    supported_formats: list[FileFormat] = []

    def __init__(self, config: MarkitaiConfig | None = None) -> None:
        """Keep the optional configuration on the instance."""
        self.config = config

    @abstractmethod
    def convert(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """
        Turn the document at *input_path* into Markdown.

        Args:
            input_path: Path to the input file
            output_dir: Optional output directory for extracted images

        Returns:
            ConvertResult containing markdown and extracted images
        """

    def can_convert(self, path: Path | str) -> bool:
        """Tell whether the format detected for *path* is one this converter supports."""
        return detect_format(path) in self.supported_formats
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# Registry of converters by format
|
|
131
|
+
_converter_registry: dict[FileFormat, type[BaseConverter]] = {}


def register_converter(fmt: FileFormat):
    """Build a class decorator that records the class as the converter for *fmt*.

    The decorated class is stored in the module-level registry under *fmt*
    (replacing any earlier entry for that format) and returned unchanged.
    """

    def _register(cls: type[BaseConverter]):
        _converter_registry[fmt] = cls
        return cls

    return _register
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def get_converter(
    path: Path | str,
    config: MarkitaiConfig | None = None,
) -> BaseConverter | None:
    """
    Look up and instantiate the converter registered for a file's format.

    Args:
        path: Path to the file to convert
        config: Optional configuration passed to the converter's constructor

    Returns:
        A new converter instance, or None when no converter is registered
        for the detected format
    """
    registered_cls = _converter_registry.get(detect_format(path))
    return None if registered_cls is None else registered_cls(config=config)
|