markitai 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""Image file converters using OCR or LLM Vision."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import shutil
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from loguru import logger
|
|
10
|
+
|
|
11
|
+
from markitai.converter.base import (
|
|
12
|
+
BaseConverter,
|
|
13
|
+
ConvertResult,
|
|
14
|
+
FileFormat,
|
|
15
|
+
register_converter,
|
|
16
|
+
)
|
|
17
|
+
from markitai.utils.paths import ensure_assets_dir
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
pass
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ImageConverter(BaseConverter):
    """Converter for image files using OCR or LLM Vision.

    Extracts text from images using RapidOCR by default,
    or LLM Vision when --llm --alt|--desc flags are used.

    The converter itself never calls an LLM: when LLM processing is enabled
    it only emits a Markdown placeholder that references the image, and the
    vision analysis happens later in the pipeline.
    """

    supported_formats = [
        FileFormat.JPEG,
        FileFormat.JPG,
        FileFormat.PNG,
        FileFormat.WEBP,
    ]

    def convert(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """Convert an image to Markdown.

        Three modes are selected from ``self.config``:

        * OCR and LLM both enabled: return a placeholder that references the
          image; OCR is skipped because LLM Vision processes the image later.
        * OCR only: run OCR and return the recognized text (falling back to
          the placeholder when OCR yields nothing or fails).
        * Otherwise: return the placeholder.

        Args:
            input_path: Path to the image file
            output_dir: Optional output directory; when given, the image is
                copied into its assets directory

        Returns:
            ConvertResult whose markdown holds either the OCR text or an
            image placeholder. ``images`` is always empty; ``metadata``
            carries ``format``, ``source`` and ``asset_path`` (plus
            ``ocr_used`` outside the OCR+LLM mode).
        """
        input_path = Path(input_path)

        # Coerce to real booleans: `self.config and ...` evaluates to None
        # when no config is set, and that value would otherwise leak into
        # the "ocr_used" metadata entry below.
        use_ocr = bool(self.config and self.config.ocr.enabled)
        use_llm = bool(self.config and self.config.llm.enabled)

        # Copy image to assets directory and get relative path
        image_ref_path = self._copy_to_assets(input_path, output_dir)

        if use_ocr and use_llm:
            # --ocr --llm: Skip OCR, let LLM Vision analyze the image later
            # Just return a placeholder - LLM will process it in cli.py
            markdown = self._create_image_placeholder(input_path, image_ref_path)
            return ConvertResult(
                markdown=markdown,
                images=[],
                metadata={
                    "format": input_path.suffix.lstrip(".").upper(),
                    "source": str(input_path),
                    "asset_path": image_ref_path,
                },
            )
        elif use_ocr:
            # --ocr only: Use RapidOCR
            markdown = self._convert_with_ocr(input_path, image_ref_path)
        else:
            # Just return a placeholder with image reference
            markdown = self._create_image_placeholder(input_path, image_ref_path)

        return ConvertResult(
            markdown=markdown,
            images=[],  # No embedded images to extract
            metadata={
                "format": input_path.suffix.lstrip(".").upper(),
                "source": str(input_path),
                # True whenever the OCR path was selected, even if OCR then
                # fell back to the placeholder.
                "ocr_used": use_ocr and not use_llm,
                "asset_path": image_ref_path,
            },
        )

    def _copy_to_assets(self, input_path: Path, output_dir: Path | None) -> str:
        """Copy image to assets directory and return relative path.

        Args:
            input_path: Path to the source image file
            output_dir: Output directory (assets will be created inside)

        Returns:
            Relative path to use in markdown (e.g., "assets/image.jpg"), or
            just the file name when no output directory is given.
        """
        if output_dir is None:
            # No output directory specified, use original filename
            return input_path.name

        assets_dir = ensure_assets_dir(output_dir)

        # Copy image to assets directory. An existing file with the same
        # name is left untouched and reused as-is.
        dest_path = assets_dir / input_path.name
        if not dest_path.exists():
            shutil.copy2(input_path, dest_path)
            logger.debug(f"Copied {input_path.name} to {dest_path}")

        # NOTE(review): the returned reference hard-codes "assets/"; this
        # presumably matches the directory ensure_assets_dir() creates —
        # confirm against markitai.utils.paths.
        return f"assets/{input_path.name}"

    def _convert_with_ocr(self, input_path: Path, image_ref_path: str) -> str:
        """Convert image using OCR.

        OCR is best-effort: a missing OCR dependency, any OCR error, or an
        empty recognition result all fall back to the image placeholder
        (after logging a warning) instead of raising.

        Args:
            input_path: Path to the image file
            image_ref_path: Relative path for image reference in markdown

        Returns:
            Markdown with OCR extracted text, or the image placeholder when
            no text could be extracted
        """
        try:
            # Imported lazily so the OCR stack is only loaded when needed.
            from markitai.ocr import OCRProcessor

            processor = OCRProcessor(self.config.ocr if self.config else None)
            result = processor.recognize_to_markdown(input_path)

            if result.strip():
                logger.debug(f"OCR extracted text from {input_path.name}")
                return f"# {input_path.stem}\n\n{result}"
            else:
                logger.warning(f"OCR found no text in {input_path.name}")
                return self._create_image_placeholder(input_path, image_ref_path)

        except ImportError:
            logger.warning("RapidOCR not available, returning placeholder")
            return self._create_image_placeholder(input_path, image_ref_path)
        except Exception as e:
            # Deliberately broad: a failing OCR engine must not abort the
            # conversion; degrade to the placeholder instead.
            logger.warning(f"OCR failed for {input_path.name}: {e}")
            return self._create_image_placeholder(input_path, image_ref_path)

    def _create_image_placeholder(self, input_path: Path, image_ref_path: str) -> str:
        """Create a placeholder markdown for the image.

        The placeholder is a heading with the file stem followed by a
        Markdown image that points at ``image_ref_path``, so the output
        still references the image when no text was extracted.

        Args:
            input_path: Path to the image file
            image_ref_path: Relative path for image reference in markdown

        Returns:
            Markdown with image placeholder
        """
        return f"# {input_path.stem}\n\n![{input_path.stem}]({image_ref_path})\n"
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@register_converter(FileFormat.JPEG)
class JpegConverter(ImageConverter):
    """Image converter registered for the JPEG file format.

    All conversion behavior is inherited from ImageConverter; this subclass
    only narrows the advertised formats to the one it is registered for.
    """

    # Restrict the inherited format list to the registered format.
    supported_formats = [FileFormat.JPEG]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@register_converter(FileFormat.JPG)
class JpgConverter(ImageConverter):
    """Image converter registered for the JPG file format.

    All conversion behavior is inherited from ImageConverter; this subclass
    only narrows the advertised formats to the one it is registered for.
    """

    # Restrict the inherited format list to the registered format.
    supported_formats = [FileFormat.JPG]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@register_converter(FileFormat.PNG)
class PngConverter(ImageConverter):
    """Image converter registered for the PNG file format.

    All conversion behavior is inherited from ImageConverter; this subclass
    only narrows the advertised formats to the one it is registered for.
    """

    # Restrict the inherited format list to the registered format.
    supported_formats = [FileFormat.PNG]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@register_converter(FileFormat.WEBP)
class WebpConverter(ImageConverter):
    """Image converter registered for the WebP file format.

    All conversion behavior is inherited from ImageConverter; this subclass
    only narrows the advertised formats to the one it is registered for.
    """

    # Restrict the inherited format list to the registered format.
    supported_formats = [FileFormat.WEBP]
|