markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/constants.py ADDED
@@ -0,0 +1,222 @@
1
+ """Centralized constants for markitai.
2
+
3
+ This module contains all hardcoded constants used throughout the codebase.
4
+ Grouping them here makes it easier to:
5
+ - Find and modify default values
6
+ - Understand system limits at a glance
7
+ - Maintain consistency across modules
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
# =============================================================================
# File Size Limits
# =============================================================================

# Upper bounds for the files the tool handles; each value is a byte count
# (N * 1024**2 == N MB).
MAX_STATE_FILE_SIZE = 10 * 1024**2  # batch state file: 10 MB
MAX_IMAGE_SIZE = 100 * 1024**2  # single image: 100 MB
MAX_TOTAL_IMAGES_SIZE = 500 * 1024**2  # all images combined: 500 MB
MAX_DOCUMENT_SIZE = 500 * 1024**2  # input document: 500 MB
20
+
21
# =============================================================================
# LLM Processing
# =============================================================================

# --- Retries (delays are given in seconds) ---
DEFAULT_MAX_RETRIES = 2
DEFAULT_RETRY_BASE_DELAY = 1.0
DEFAULT_RETRY_MAX_DELAY = 60.0

# --- Instructor retries (structured JSON output validation) ---
# If the LLM returns malformed JSON, Instructor can retry and feed the
# validation error back, giving the LLM a chance to fix problems such as
# incorrect escaping.
DEFAULT_INSTRUCTOR_MAX_RETRIES = 1

# --- Token limits ---
# Conservative default that suits most models.
DEFAULT_MAX_OUTPUT_TOKENS = 8192
# Maximum characters for document processing (truncation threshold).
DEFAULT_MAX_CONTENT_CHARS = 32_000

# --- Concurrency ---
DEFAULT_IO_CONCURRENCY = 20  # I/O operations (file reads, etc.)
DEFAULT_LLM_CONCURRENCY = 10  # LLM API calls (config default)
DEFAULT_BATCH_CONCURRENCY = 10  # batch file processing (config default)
DEFAULT_URL_CONCURRENCY = 5  # URL fetching (separate from file processing)

# --- Batch sizes ---
# Images sent in one LLM vision call.
DEFAULT_MAX_IMAGES_PER_BATCH = 10
# Pages sent in one LLM call for document processing
# (reduced from 10 to avoid hitting max_tokens).
DEFAULT_MAX_PAGES_PER_BATCH = 5

# --- Router ---
DEFAULT_ROUTER_NUM_RETRIES = 2
DEFAULT_ROUTER_TIMEOUT = 120  # seconds

# Note: the RETRYABLE_ERRORS tuple is defined in llm.py because it holds
# actual exception classes from litellm that cannot be imported here.
57
+
58
# =============================================================================
# Image Processing
# =============================================================================

# JPEG quality (1-100).
DEFAULT_IMAGE_QUALITY = 75
# DPI for page screenshots (PDF, PPTX, etc.).
DEFAULT_RENDER_DPI = 150
# Concurrent I/O for image saving (optimized for NVMe).
DEFAULT_IMAGE_IO_CONCURRENCY = 8
# Multiprocess compression is used when the image count exceeds this value.
DEFAULT_IMAGE_MULTIPROCESS_THRESHOLD = 10
DEFAULT_IMAGE_MAX_WIDTH = 1920
# Effectively unlimited (thumbnail won't upscale).
DEFAULT_IMAGE_MAX_HEIGHT = 99_999

# Image filter thresholds
DEFAULT_IMAGE_FILTER_MIN_WIDTH = 50
DEFAULT_IMAGE_FILTER_MIN_HEIGHT = 50
DEFAULT_IMAGE_FILTER_MIN_AREA = 5000
75
+
76
# =============================================================================
# Cache Settings
# =============================================================================

# --- In-memory cache (legacy, still used for image bytes cache) ---
# Max entries in LLM content cache.
DEFAULT_CACHE_MAXSIZE = 100
# Cache TTL: 300 seconds (5 minutes).
DEFAULT_CACHE_TTL_SECONDS = 300

# --- Persistent SQLite cache ---
# 512 MB per cache file.
DEFAULT_CACHE_SIZE_LIMIT = 512 * 1024**2
# Global cache directory.
DEFAULT_GLOBAL_CACHE_DIR = "~/.markitai"
# Project-level cache directory.
DEFAULT_PROJECT_CACHE_DIR = ".markitai"
# SQLite database filename.
DEFAULT_CACHE_DB_FILENAME = "cache.db"
# Content is truncated to this many characters for the hash key.
DEFAULT_CACHE_CONTENT_TRUNCATE = 50_000
90
+
91
# =============================================================================
# Batch Processing
# =============================================================================

# Increased to reduce I/O overhead.
DEFAULT_STATE_FLUSH_INTERVAL_SECONDS = 10
DEFAULT_SCAN_MAX_DEPTH = 5
DEFAULT_SCAN_MAX_FILES = 10_000

# =============================================================================
# Logging
# =============================================================================

DEFAULT_LOG_ROTATION = "10 MB"
DEFAULT_LOG_RETENTION = "7 days"

# =============================================================================
# UI / Display
# =============================================================================

# Lines shown in the verbose-mode log panel.
DEFAULT_LOG_PANEL_MAX_LINES = 8
# JSON output indentation.
DEFAULT_JSON_INDENT = 2

# =============================================================================
# Paths and Filenames
# =============================================================================

DEFAULT_OUTPUT_DIR = "./output"
DEFAULT_PROMPTS_DIR = "~/.markitai/prompts"
DEFAULT_LOG_DIR = "~/.markitai/logs"
CONFIG_FILENAME = "markitai.json"

# =============================================================================
# OCR
# =============================================================================

DEFAULT_OCR_LANG = "en"
# Pages to sample for scanned-PDF detection.
DEFAULT_OCR_SAMPLE_PAGES = 3
128
+
129
# =============================================================================
# Misc Defaults
# =============================================================================

# Default model weight in router.
DEFAULT_MODEL_WEIGHT = 1
# 3.5 MB max: base64 adds ~33%, and the result must stay under the 5MB API
# limit.
DEFAULT_SCREENSHOT_MAX_BYTES = int(3.5 * 1024 * 1024)

# URL Screenshot settings
DEFAULT_SCREENSHOT_VIEWPORT_WIDTH = 1920
DEFAULT_SCREENSHOT_VIEWPORT_HEIGHT = 1080
# JPEG quality (1-100).
DEFAULT_SCREENSHOT_QUALITY = 75
# Max height for full-page URL screenshots.
DEFAULT_SCREENSHOT_MAX_HEIGHT = 10_000

# Remaining string-valued defaults
DEFAULT_ROUTING_STRATEGY = "simple-shuffle"
DEFAULT_IMAGE_FORMAT = "jpeg"
DEFAULT_ON_CONFLICT = "rename"
DEFAULT_LOG_LEVEL = "INFO"
147
+
148
# =============================================================================
# URL Fetch Settings
# =============================================================================

# One of: auto | static | browser | jina
DEFAULT_FETCH_STRATEGY = "auto"
DEFAULT_AGENT_BROWSER_COMMAND = "agent-browser"
# Milliseconds.
DEFAULT_AGENT_BROWSER_TIMEOUT = 30_000
# One of: load | domcontentloaded | networkidle
DEFAULT_AGENT_BROWSER_WAIT_FOR = "domcontentloaded"
# Extra wait after the load state, in milliseconds (for JS rendering).
DEFAULT_AGENT_BROWSER_EXTRA_WAIT_MS = 1000
# Seconds.
DEFAULT_JINA_TIMEOUT = 30
DEFAULT_JINA_BASE_URL = "https://r.jina.ai"

# Domains that typically require JavaScript rendering
DEFAULT_FETCH_FALLBACK_PATTERNS: tuple[str, ...] = (
    "twitter.com",
    "x.com",
    "instagram.com",
    "facebook.com",
    "linkedin.com",
    "threads.net",
)

# Patterns that indicate JavaScript is required
JS_REQUIRED_PATTERNS: tuple[str, ...] = (
    # Mixed-case phrases
    "JavaScript is disabled",
    "JavaScript is required",
    "Please enable JavaScript",
    "This page requires JavaScript",
    "You need to enable JavaScript",
    # Lower-case fragments
    "enable javascript",
    "requires javascript",
    "noscript",
)
185
+
186
# =============================================================================
# MIME Type Mappings
# =============================================================================

# File extension -> MIME type (used when encoding images to send to LLM APIs).
# Both JPEG spellings resolve to the same MIME type.
EXTENSION_TO_MIME: dict[str, str] = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".bmp": "image/bmp",
    ".svg": "image/svg+xml",
    ".ico": "image/x-icon",
}

# MIME type -> file extension (used when decoding content-type headers).
# Both icon MIME types resolve to ".ico".
MIME_TO_EXTENSION: dict[str, str] = {
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "image/gif": ".gif",
    "image/webp": ".webp",
    "image/svg+xml": ".svg",
    "image/bmp": ".bmp",
    "image/x-icon": ".ico",
    "image/vnd.microsoft.icon": ".ico",
}

# Extensions recognised when detecting standalone image files.
IMAGE_EXTENSIONS: tuple[str, ...] = (
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".gif",
    ".bmp",
)
markitai/converter/__init__.py ADDED
@@ -0,0 +1,49 @@
1
"""Converter modules for various document formats."""

# Apply compatibility patches before importing converters.
# The call below runs at import time of this package, ahead of every converter
# import that follows, so the import order in this file must be preserved.
from markitai.converter._patches import apply_all_patches

apply_all_patches()

from markitai.converter.base import (
    BaseConverter,
    ConvertResult,
    ExtractedImage,
    FileFormat,
    detect_format,
    get_converter,
)
from markitai.converter.image import (
    JpegConverter,
    JpgConverter,
    PngConverter,
    WebpConverter,
)
from markitai.converter.legacy import DocConverter, PptConverter, XlsConverter

# Import converters to register them
from markitai.converter.office import DocxConverter, PptxConverter, XlsxConverter
from markitai.converter.pdf import PdfConverter
from markitai.converter.text import MarkdownConverter, TxtConverter

# Public API of the package: every name imported above is re-exported here.
__all__ = [
    "BaseConverter",
    "ConvertResult",
    "ExtractedImage",
    "FileFormat",
    "get_converter",
    "detect_format",
    "DocxConverter",
    "PptxConverter",
    "XlsxConverter",
    "PdfConverter",
    "TxtConverter",
    "MarkdownConverter",
    "DocConverter",
    "PptConverter",
    "XlsConverter",
    "JpegConverter",
    "JpgConverter",
    "PngConverter",
    "WebpConverter",
]
markitai/converter/_patches.py ADDED
@@ -0,0 +1,98 @@
1
+ """Compatibility patches for third-party libraries.
2
+
3
+ This module applies monkey patches to fix known issues in dependencies:
4
+ - openpyxl 3.1.x FileVersion TypeError ('bg' argument)
5
+ - lxml XMLSyntaxError from malformed PPTX files
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import functools
11
+ from typing import Any
12
+
13
# Module-level guard: apply_all_patches() checks this flag on entry and sets
# it to True once both patch functions have been called.
_patches_applied = False
14
+
15
+
16
def apply_openpyxl_patches() -> None:
    """Work around an openpyxl ``FileVersion`` constructor incompatibility.

    Fixes: TypeError: FileVersion.__init__() got an unexpected keyword argument 'bg'

    The error shows up with openpyxl 3.1.x when reading Excel files created
    by older versions of MS Office or converted from .xls format. The patch
    wraps ``FileVersion.__init__`` so the offending keyword is discarded
    before the original initializer runs.

    Does nothing when openpyxl is not installed or when the initializer
    already carries the ``_markitai_patched`` marker.
    """
    try:
        from openpyxl.workbook.properties import FileVersion
    except ImportError:
        # openpyxl not installed
        return

    # Only patch if not already patched
    if getattr(FileVersion.__init__, "_markitai_patched", False):
        return

    wrapped_init = FileVersion.__init__
    dropped_keywords = ("bg",)

    @functools.wraps(wrapped_init)
    def patched_init(self: Any, *args: Any, **kwargs: Any) -> None:
        # Forward everything except the keyword arguments FileVersion rejects.
        accepted = {
            name: value
            for name, value in kwargs.items()
            if name not in dropped_keywords
        }
        wrapped_init(self, *args, **accepted)

    # Tag the wrapper so a later call can recognise it, then install it.
    patched_init._markitai_patched = True  # type: ignore[attr-defined]
    FileVersion.__init__ = patched_init
43
+
44
+
45
def apply_pptx_patches() -> None:
    """Make python-pptx tolerate malformed XML in PPTX files.

    Fixes: XMLSyntaxError from malformed PPTX files converted from PPT

    When MS Office converts .ppt to .pptx, it may produce XML with
    mismatched tags (e.g., 'rupB' vs 'bgPr'). The patch wraps
    ``pptx.oxml.parse_xml`` so that, when the original parser raises
    ``XMLSyntaxError``, the same payload is re-parsed with a recovering
    (lenient) lxml parser.

    The lenient parser is configured defensively:

    - ``resolve_entities=False`` so the fallback path does not expand XML
      entities from a (possibly untrusted) document.
    - If ``pptx.oxml`` exposes ``element_class_lookup``, it is attached so the
      recovered tree is built from the same element classes as the normal
      path instead of plain lxml elements.

    Does nothing when lxml or python-pptx is not installed, or when
    ``pptx.oxml.parse_xml`` already carries the ``_markitai_patched`` marker.
    """
    # Check if lxml is available
    try:
        from lxml import etree
    except ImportError:
        return  # lxml not installed

    # Check if python-pptx is installed
    try:
        import pptx.oxml
    except ImportError:
        return  # python-pptx not installed

    # Only patch if not already patched
    if getattr(pptx.oxml.parse_xml, "_markitai_patched", False):
        return

    # Lenient parser that recovers from syntax errors. Entity resolution is
    # switched off because this parser handles exactly the inputs that are
    # already known to be malformed.
    lenient_parser = etree.XMLParser(
        recover=True,
        remove_blank_text=True,
        resolve_entities=False,
    )

    # Reuse python-pptx's element class lookup when it is available, so the
    # fallback returns the custom element classes callers of parse_xml expect.
    # NOTE(review): attribute name taken from python-pptx's oxml package;
    # guarded with getattr in case a given version does not expose it.
    class_lookup = getattr(pptx.oxml, "element_class_lookup", None)
    if class_lookup is not None:
        lenient_parser.set_element_class_lookup(class_lookup)

    original_parse_xml = pptx.oxml.parse_xml

    @functools.wraps(original_parse_xml)
    def patched_parse_xml(xml: bytes | str) -> Any:
        try:
            return original_parse_xml(xml)
        except etree.XMLSyntaxError:
            # Fallback to lenient parser; hand lxml bytes rather than str.
            if isinstance(xml, str):
                xml = xml.encode("utf-8")
            return etree.fromstring(xml, parser=lenient_parser)

    # NOTE(review): this rebinds the module attribute only. Modules that ran
    # ``from pptx.oxml import parse_xml`` before this point presumably keep
    # the original function - confirm the patch reaches the intended callers.
    patched_parse_xml._markitai_patched = True  # type: ignore[attr-defined]
    pptx.oxml.parse_xml = patched_parse_xml
84
+
85
+
86
def apply_all_patches() -> None:
    """Install every compatibility patch defined in this module, once.

    The openpyxl patch is applied first, then the python-pptx one. A
    module-level flag records that this has happened, so the function is
    idempotent - calling it multiple times has no effect.
    """
    global _patches_applied

    if not _patches_applied:
        apply_openpyxl_patches()
        apply_pptx_patches()
        _patches_applied = True
markitai/converter/base.py ADDED
@@ -0,0 +1,164 @@
1
+ """Base converter classes and utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+ from enum import Enum
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ from markitai.config import MarkitaiConfig
13
+
14
+
15
class FileFormat(Enum):
    """Enumeration of the file formats the converters know about.

    Each member's value is the lower-case format name. ``UNKNOWN`` is the
    catch-all member for anything that is not recognised.
    """

    # --- Office Open XML formats (2007+) ---
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"

    # --- Legacy Office formats (97-2003), requires LibreOffice ---
    DOC = "doc"
    PPT = "ppt"
    XLS = "xls"

    # --- PDF ---
    PDF = "pdf"

    # --- Text ---
    TXT = "txt"
    MD = "md"

    # --- Images (JPEG and JPG are distinct members) ---
    JPEG = "jpeg"
    JPG = "jpg"
    PNG = "png"
    WEBP = "webp"

    # --- Unknown ---
    UNKNOWN = "unknown"
43
+
44
+
45
# Mapping of file extensions to formats.
# Keys are lower-case and include the leading dot; detect_format() lower-cases
# a path's suffix before looking it up here. ".md" and ".markdown" both map to
# FileFormat.MD, while ".jpeg" and ".jpg" map to separate members.
EXTENSION_MAP: dict[str, FileFormat] = {
    ".docx": FileFormat.DOCX,
    ".doc": FileFormat.DOC,
    ".pptx": FileFormat.PPTX,
    ".ppt": FileFormat.PPT,
    ".xlsx": FileFormat.XLSX,
    ".xls": FileFormat.XLS,
    ".pdf": FileFormat.PDF,
    ".txt": FileFormat.TXT,
    ".md": FileFormat.MD,
    ".markdown": FileFormat.MD,
    ".jpeg": FileFormat.JPEG,
    ".jpg": FileFormat.JPG,
    ".png": FileFormat.PNG,
    ".webp": FileFormat.WEBP,
}
62
+
63
+
64
def detect_format(path: Path | str) -> FileFormat:
    """Return the file format implied by a path's extension.

    Only the final suffix is considered, compared case-insensitively; the
    file itself is never opened.

    Args:
        path: File path, given as a ``Path`` or a string.

    Returns:
        The matching ``FileFormat``, or ``FileFormat.UNKNOWN`` when the
        suffix is missing or not in ``EXTENSION_MAP``.
    """
    suffix = Path(path).suffix.lower()
    return EXTENSION_MAP.get(suffix, FileFormat.UNKNOWN)
69
+
70
+
71
@dataclass
class ExtractedImage:
    """Record describing one image pulled out of a source document.

    All fields except ``data`` are required.
    """

    # Location of the image file.
    path: Path
    # Position of the image within its document.
    index: int
    # Name the image had in the source document.
    original_name: str
    # MIME type of the image.
    mime_type: str
    # Image dimensions.
    width: int
    height: int
    # Raw image data before saving; None when no bytes are held.
    data: bytes | None = None
82
+
83
+
84
@dataclass
class ConvertResult:
    """Outcome of converting one document.

    Only ``markdown`` is required; ``images`` and ``metadata`` default to a
    fresh empty list and a fresh empty dict for every instance.
    """

    # The converted Markdown text.
    markdown: str
    # Images extracted during the conversion.
    images: list[ExtractedImage] = field(default_factory=list)
    # Additional information attached to the result.
    metadata: dict = field(default_factory=dict)

    @property
    def has_images(self) -> bool:
        """Tell whether at least one image was extracted."""
        return bool(self.images)
96
+
97
+
98
class BaseConverter(ABC):
    """Common interface that every document converter implements.

    Subclasses list the formats they handle in ``supported_formats`` and
    provide ``convert``.
    """

    # Formats this converter can handle; empty on the base class.
    supported_formats: list[FileFormat] = []

    def __init__(self, config: MarkitaiConfig | None = None) -> None:
        """Store the optional configuration on the instance.

        Args:
            config: Configuration object, or None when none is supplied.
        """
        self.config = config

    @abstractmethod
    def convert(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """
        Convert a document to Markdown.

        Args:
            input_path: Path to the input file
            output_dir: Optional output directory for extracted images

        Returns:
            ConvertResult containing markdown and extracted images
        """
        ...

    def can_convert(self, path: Path | str) -> bool:
        """Report whether the given file's detected format is supported.

        Args:
            path: Path of the file to check.

        Returns:
            True when the format detected for ``path`` is listed in
            ``supported_formats``.
        """
        return detect_format(path) in self.supported_formats
128
+
129
+
130
# Registry of converters by format.
# Filled by the register_converter() decorator and read by get_converter();
# each format maps to a single converter class.
_converter_registry: dict[FileFormat, type[BaseConverter]] = {}
132
+
133
+
134
def register_converter(fmt: FileFormat):
    """Build a class decorator that registers a converter for ``fmt``.

    The decorated class is stored in the module-level registry under ``fmt``,
    replacing any class previously stored for that format, and is returned
    unchanged.

    Args:
        fmt: The file format the decorated converter class handles.

    Returns:
        A decorator to apply to a ``BaseConverter`` subclass.
    """

    def _register(converter_cls: type[BaseConverter]):
        _converter_registry[fmt] = converter_cls
        return converter_cls

    return _register
142
+
143
+
144
def get_converter(
    path: Path | str,
    config: MarkitaiConfig | None = None,
) -> BaseConverter | None:
    """
    Look up and instantiate the converter registered for a file.

    The format is detected from the path and used as the key into the
    converter registry.

    Args:
        path: Path to the file to convert
        config: Optional configuration, passed to the converter's constructor

    Returns:
        A new converter instance, or None if no converter is registered for
        the detected format
    """
    converter_cls = _converter_registry.get(detect_format(path))
    return None if converter_cls is None else converter_cls(config=config)