markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,181 @@
1
+ """Image file converters using OCR or LLM Vision."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING
8
+
9
+ from loguru import logger
10
+
11
+ from markitai.converter.base import (
12
+ BaseConverter,
13
+ ConvertResult,
14
+ FileFormat,
15
+ register_converter,
16
+ )
17
+ from markitai.utils.paths import ensure_assets_dir
18
+
19
+ if TYPE_CHECKING:
20
+ pass
21
+
22
+
23
class ImageConverter(BaseConverter):
    """Converter that turns a single image file into Markdown.

    Depending on the ``ocr.enabled`` / ``llm.enabled`` flags read from
    ``self.config``, the image is either run through OCR or represented by a
    Markdown placeholder that references the (optionally copied) image file.
    """

    supported_formats = [
        FileFormat.JPEG,
        FileFormat.JPG,
        FileFormat.PNG,
        FileFormat.WEBP,
    ]

    def convert(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """Convert an image to Markdown.

        Three modes are selected from the configuration:

        * OCR and LLM both enabled: OCR is skipped and a placeholder is
          returned (the original note says the LLM step happens later,
          outside this class).
        * OCR only: text is extracted via ``_convert_with_ocr``.
        * Otherwise: a placeholder with an image reference is returned.

        Args:
            input_path: Path to the image file.
            output_dir: Optional output directory; when given, the image is
                copied into its assets directory.

        Returns:
            ConvertResult whose ``markdown`` holds the OCR text or the
            placeholder, with an empty ``images`` list and descriptive
            ``metadata``.
        """
        input_path = Path(input_path)

        # ``self.config and ...`` evaluates to None when no config is set.
        # Coerce to real booleans so that None never leaks into the
        # ``ocr_used`` metadata value below.
        use_ocr = bool(self.config and self.config.ocr.enabled)
        use_llm = bool(self.config and self.config.llm.enabled)

        # Copy image to the assets directory (if any) and get the path to
        # reference from the generated Markdown.
        image_ref_path = self._copy_to_assets(input_path, output_dir)
        image_format = input_path.suffix.lstrip(".").upper()

        if use_ocr and use_llm:
            # OCR + LLM: skip OCR here and only emit a placeholder.
            # This branch intentionally keeps its original metadata keys
            # (no "ocr_used" entry).
            return ConvertResult(
                markdown=self._create_image_placeholder(input_path, image_ref_path),
                images=[],
                metadata={
                    "format": image_format,
                    "source": str(input_path),
                    "asset_path": image_ref_path,
                },
            )

        if use_ocr:
            markdown = self._convert_with_ocr(input_path, image_ref_path)
        else:
            markdown = self._create_image_placeholder(input_path, image_ref_path)

        return ConvertResult(
            markdown=markdown,
            images=[],  # No embedded images to extract
            metadata={
                "format": image_format,
                "source": str(input_path),
                # Reflects the selected mode, not whether OCR succeeded:
                # _convert_with_ocr falls back to a placeholder on failure.
                # The LLM case returned above, so this equals use_ocr.
                "ocr_used": use_ocr,
                "asset_path": image_ref_path,
            },
        )

    def _copy_to_assets(self, input_path: Path, output_dir: Path | None) -> str:
        """Copy the image into the assets directory and return its reference.

        Args:
            input_path: Path to the source image file.
            output_dir: Output directory, or None to skip copying.

        Returns:
            The bare file name when ``output_dir`` is None, otherwise
            ``"assets/<file name>"``.
        """
        if output_dir is None:
            # No output directory specified, reference the original file name.
            return input_path.name

        # presumably returns (and creates) ``output_dir / "assets"`` —
        # verify against markitai.utils.paths.ensure_assets_dir.
        assets_dir = ensure_assets_dir(output_dir)

        dest_path = assets_dir / input_path.name
        # An existing file with the same name is left untouched (not
        # overwritten).
        if not dest_path.exists():
            shutil.copy2(input_path, dest_path)
            logger.debug(f"Copied {input_path.name} to {dest_path}")

        return f"assets/{input_path.name}"

    def _convert_with_ocr(self, input_path: Path, image_ref_path: str) -> str:
        """Convert the image using OCR, falling back to a placeholder.

        Args:
            input_path: Path to the image file.
            image_ref_path: Relative path for the image reference used by
                the fallback placeholder.

        Returns:
            Markdown with a title and the OCR text, or the placeholder when
            OCR yields no text, cannot be imported, or raises.
        """
        try:
            # Imported lazily so a missing OCR dependency only degrades this
            # code path (handled by the ImportError branch below).
            from markitai.ocr import OCRProcessor

            processor = OCRProcessor(self.config.ocr if self.config else None)
            result = processor.recognize_to_markdown(input_path)

            if result.strip():
                logger.debug(f"OCR extracted text from {input_path.name}")
                return f"# {input_path.stem}\n\n{result}"

            logger.warning(f"OCR found no text in {input_path.name}")
            return self._create_image_placeholder(input_path, image_ref_path)

        except ImportError:
            logger.warning("RapidOCR not available, returning placeholder")
            return self._create_image_placeholder(input_path, image_ref_path)
        except Exception as e:
            # Deliberate best-effort: any OCR failure degrades to a
            # placeholder instead of aborting the conversion.
            logger.warning(f"OCR failed for {input_path.name}: {e}")
            return self._create_image_placeholder(input_path, image_ref_path)

    def _create_image_placeholder(self, input_path: Path, image_ref_path: str) -> str:
        """Create placeholder Markdown for the image.

        Args:
            input_path: Path to the image file; its stem is used as the
                heading and as the image alt text.
            image_ref_path: Path written as the image link target.

        Returns:
            Markdown consisting of a level-1 heading and an image reference.
        """
        return f"# {input_path.stem}\n\n![{input_path.stem}]({image_ref_path})\n"
154
+
155
+
156
@register_converter(FileFormat.JPEG)
class JpegConverter(ImageConverter):
    """ImageConverter registered for the ``FileFormat.JPEG`` format."""

    # Narrow the inherited format list to the single registered format.
    supported_formats = [FileFormat.JPEG]
161
+
162
+
163
@register_converter(FileFormat.JPG)
class JpgConverter(ImageConverter):
    """ImageConverter registered for the ``FileFormat.JPG`` format."""

    # Narrow the inherited format list to the single registered format.
    supported_formats = [FileFormat.JPG]
168
+
169
+
170
@register_converter(FileFormat.PNG)
class PngConverter(ImageConverter):
    """ImageConverter registered for the ``FileFormat.PNG`` format."""

    # Narrow the inherited format list to the single registered format.
    supported_formats = [FileFormat.PNG]
175
+
176
+
177
@register_converter(FileFormat.WEBP)
class WebpConverter(ImageConverter):
    """ImageConverter registered for the ``FileFormat.WEBP`` format."""

    # Narrow the inherited format list to the single registered format.
    supported_formats = [FileFormat.WEBP]