markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,526 @@
1
+ """Office document converters (DOCX, PPTX, XLSX)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING
8
+
9
+ from loguru import logger
10
+ from markitdown import MarkItDown
11
+
12
+ from markitai.constants import DEFAULT_RENDER_DPI
13
+ from markitai.converter.base import (
14
+ BaseConverter,
15
+ ConvertResult,
16
+ ExtractedImage,
17
+ FileFormat,
18
+ register_converter,
19
+ )
20
+ from markitai.image import ImageProcessor
21
+ from markitai.utils.office import find_libreoffice, has_ms_office
22
+ from markitai.utils.paths import ensure_screenshots_dir
23
+
24
+ if TYPE_CHECKING:
25
+ from markitai.config import MarkitaiConfig
26
+
27
+
28
class OfficeConverter(BaseConverter):
    """Shared base class for the Office document converters.

    Text extraction is delegated to a single ``MarkItDown`` instance that is
    created once per converter and reused for every conversion.
    """

    def __init__(self, config: MarkitaiConfig | None = None) -> None:
        """Initialise the base converter and create the MarkItDown backend."""
        super().__init__(config)
        self._markitdown = MarkItDown()

    def convert(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """Convert an Office document to Markdown.

        Args:
            input_path: Location of the document; coerced to ``Path``.
            output_dir: Accepted for interface compatibility; not used here.

        Returns:
            The result produced by ``_convert_with_markitdown``.
        """
        source = Path(input_path)
        return self._convert_with_markitdown(source)

    def _convert_with_markitdown(self, input_path: Path) -> ConvertResult:
        """Run MarkItDown on ``input_path`` and wrap its output.

        Data URIs are kept in the Markdown (``keep_data_uris=True``). The
        returned result carries no extracted images; its metadata records the
        source path, the upper-cased file extension and, when MarkItDown
        reports one, the document title.
        """
        converted = self._markitdown.convert(input_path, keep_data_uris=True)

        info = {
            "source": str(input_path),
            "format": input_path.suffix.lstrip(".").upper(),
            "converter": "markitdown",
        }
        # Only record a title when MarkItDown actually found one.
        if converted.title:
            info["title"] = converted.title

        return ConvertResult(markdown=converted.markdown, images=[], metadata=info)
63
+
64
+
65
@register_converter(FileFormat.DOCX)
class DocxConverter(OfficeConverter):
    """Converter registered for DOCX (Word) documents.

    Adds no behaviour of its own: conversion is inherited unchanged from
    ``OfficeConverter``, i.e. text extraction through MarkItDown.
    """

    # Formats this converter declares support for.
    supported_formats = [FileFormat.DOCX]
73
+
74
+
75
@register_converter(FileFormat.PPTX)
class PptxConverter(OfficeConverter):
    """Converter for PPTX (PowerPoint) documents.

    Text is always extracted with MarkItDown. Slides can additionally be
    rendered to images: PowerPoint COM automation is tried first when
    ``has_ms_office()`` reports it is available, otherwise (or when COM
    fails) the deck is converted to PDF with LibreOffice and the pages are
    rasterised with pymupdf.

    Modes, selected from ``self.config``:
        - Default: text extraction; slide screenshots are added when
          ``screenshot.enabled`` is set and an output directory is given.
        - ``ocr.enabled``: text plus slide images referenced inside HTML
          comments.
        - ``ocr.enabled`` and ``llm.enabled``: text plus slide images exposed
          through ``metadata["page_images"]``.
    """

    supported_formats = [FileFormat.PPTX]

    # Maps the configured image format to the format name handed to
    # ImageProcessor.compress(); unknown formats fall back to JPEG.
    _PIL_FORMATS = {
        "jpg": "JPEG",
        "jpeg": "JPEG",
        "png": "PNG",
        "webp": "WEBP",
    }

    def convert(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """Convert a PPTX document to Markdown.

        Args:
            input_path: Path to the PPTX file; coerced to ``Path``.
            output_dir: Directory under which slide screenshots are stored.
                In the default mode no screenshots are rendered without it.

        Returns:
            ConvertResult for the mode selected by the configuration (see the
            class docstring).
        """
        input_path = Path(input_path)

        use_ocr = self.config and self.config.ocr.enabled
        use_llm = self.config and self.config.llm.enabled

        if use_ocr and use_llm:
            # --ocr --llm: extract text + render slides for LLM
            logger.info("PPTX OCR+LLM mode: extracting text and rendering slides")
            return self._render_slides_for_llm(input_path, output_dir)
        if use_ocr:
            # --ocr only: extract text + commented slide images
            logger.info("PPTX OCR mode: extracting text with slide images (commented)")
            return self._convert_with_ocr(input_path, output_dir)

        # Standard conversion: MarkItDown alone is enough for text.
        result = self._convert_with_markitdown(input_path)

        # Slide screenshots are independent of OCR but need an output dir.
        enable_screenshot = self.config and self.config.screenshot.enabled
        if enable_screenshot and output_dir:
            screenshots_dir = ensure_screenshots_dir(output_dir)
            images, slide_images = self._render_slides_to_images(
                input_path, screenshots_dir, self._resolve_image_format()
            )

            # Expose the rendered pages in metadata for later LLM processing.
            result.metadata["page_images"] = slide_images
            result.metadata["pages"] = len(slide_images)
            result.metadata["extracted_text"] = result.markdown
            result.images = images

            logger.debug(f"Rendered {len(slide_images)} slide screenshots")

        return result

    def _resolve_image_format(self) -> str:
        """Return the screenshot file extension, normalising "jpeg" to "jpg"."""
        if not self.config:
            return "jpg"
        fmt = self.config.image.format
        return "jpg" if fmt == "jpeg" else fmt

    @staticmethod
    def _mime_type_for(image_format: str) -> str:
        """Return the MIME type for an image format/extension.

        "jpg" is only a file extension; the registered media type is
        "image/jpeg", so both spellings map to it.
        """
        if image_format in ("jpg", "jpeg"):
            return "image/jpeg"
        return f"image/{image_format}"

    def _resolve_screenshots_dir(self, output_dir: Path | None) -> Path:
        """Return the directory slide images are written to.

        Uses the screenshots directory of ``output_dir`` when one is given,
        otherwise a fresh temporary directory.
        """
        if output_dir:
            return ensure_screenshots_dir(output_dir)
        # NOTE(review): this temporary directory is never removed here. The
        # returned image paths point into it, so cleanup is presumably the
        # caller's responsibility - confirm.
        return Path(tempfile.mkdtemp())

    def _render_screenshots_if_enabled(
        self, input_path: Path, output_dir: Path | None
    ) -> tuple[list[ExtractedImage], list[dict]]:
        """Render slide images when ``screenshot.enabled`` is set.

        Returns:
            Tuple of (ExtractedImage list, slide info list); both are empty
            when screenshots are disabled or there is no configuration.
        """
        if not (self.config and self.config.screenshot.enabled):
            return [], []
        screenshots_dir = self._resolve_screenshots_dir(output_dir)
        return self._render_slides_to_images(
            input_path, screenshots_dir, self._resolve_image_format()
        )

    def _convert_with_ocr(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """Convert PPTX with text extraction + commented slide images.

        Args:
            input_path: Path to the PPTX file
            output_dir: Output directory for slide images

        Returns:
            ConvertResult with text content and commented image references
        """
        # First, extract text using MarkItDown
        text_result = self._convert_with_markitdown(input_path)
        extracted_text = text_result.markdown

        # Then render the slides (no-op unless screenshots are enabled)
        images, slide_images = self._render_screenshots_if_enabled(
            input_path, output_dir
        )

        # Append the slide images as HTML comments so they do not show up in
        # the rendered Markdown.
        markdown_parts = [extracted_text]
        if slide_images:
            markdown_parts.append("\n\n<!-- Slide images for reference -->")
            markdown_parts.extend(
                f"<!-- ![Slide {slide_info['page']}](screenshots/{slide_info['name']}) -->"
                for slide_info in slide_images
            )

        return ConvertResult(
            markdown="\n".join(markdown_parts),
            images=images,
            metadata={
                "source": str(input_path),
                "format": "PPTX",
                "ocr_used": True,
                "slides": len(images),
            },
        )

    def _render_slides_to_images(
        self, input_path: Path, screenshots_dir: Path, image_format: str
    ) -> tuple[list[ExtractedImage], list[dict]]:
        """Render slides to images using the best available method.

        Args:
            input_path: Path to the PPTX file
            screenshots_dir: Directory to save screenshot images
            image_format: Image format (jpg, png, etc.)

        Returns:
            Tuple of (ExtractedImage list, slide info list for metadata)
        """
        import platform

        # Try Windows COM first
        if has_ms_office():
            try:
                return self._render_slides_with_com(
                    input_path, screenshots_dir, image_format
                )
            except Exception as e:
                # Any COM failure is non-fatal: fall through to the PDF path.
                logger.warning(f"COM rendering failed, trying PDF fallback: {e}")
        elif platform.system() == "Windows":
            # Hint only when Office is genuinely absent; a COM failure with
            # Office installed has already been reported above.
            logger.warning(
                "[PPTX] MS Office not available. "
                "Install Microsoft Office for faster slide rendering. "
                "Falling back to LibreOffice PDF conversion..."
            )

        # Fallback: Convert to PDF and render pages
        return self._render_slides_via_pdf(input_path, screenshots_dir, image_format)

    def _render_slides_with_com(
        self, input_path: Path, screenshots_dir: Path, image_format: str
    ) -> tuple[list[ExtractedImage], list[dict]]:
        """Render slides using PowerPoint COM automation.

        Each slide is exported to ``screenshots_dir`` and, when
        ``image.compress`` is enabled in the configuration, re-encoded through
        ``ImageProcessor.compress``.

        Args:
            input_path: Path to the PPTX file
            screenshots_dir: Directory to save screenshot images
            image_format: Image format / file extension (jpg, png, etc.)

        Returns:
            Tuple of (ExtractedImage list, slide info list for metadata)

        Raises:
            Exception: Import or COM errors propagate to the caller after the
                presentation and application have been released.
        """
        import pythoncom  # type: ignore[import-not-found]
        import win32com.client  # type: ignore[import-not-found]
        from PIL import Image

        logger.debug(f"Rendering slides with PowerPoint COM: {input_path.name}")

        ppt = None
        presentation = None
        images: list[ExtractedImage] = []
        slide_images: list[dict] = []

        # Create ImageProcessor for compression with config
        img_processor = ImageProcessor(self.config.image if self.config else None)

        # Per-call invariants, computed once instead of once per slide.
        compress = bool(self.config and self.config.image.compress)
        export_format = "JPG" if image_format == "jpg" else image_format.upper()
        output_format = self._PIL_FORMATS.get(image_format, "JPEG")
        mime_type = self._mime_type_for(image_format)

        # Initialize COM for this thread (required for asyncio thread pool)
        pythoncom.CoInitialize()
        try:
            ppt = win32com.client.Dispatch("PowerPoint.Application")
            presentation = ppt.Presentations.Open(
                str(input_path.resolve()),
                ReadOnly=True,
                Untitled=False,
                WithWindow=False,
            )
            slide_count = len(presentation.Slides)

            for i, slide in enumerate(presentation.Slides, 1):
                image_name = f"{input_path.name}.slide{i:04d}.{image_format}"
                image_path = screenshots_dir / image_name

                slide.Export(str(image_path.resolve()), export_format)

                with Image.open(image_path) as img:
                    width, height = img.size

                    if compress:
                        # Re-encode with the configured quality and size
                        # limits, overwriting the exported file.
                        compressed_img, compressed_data = img_processor.compress(
                            img.copy(),
                            quality=self.config.image.quality,
                            max_size=(
                                self.config.image.max_width,
                                self.config.image.max_height,
                            ),
                            output_format=output_format,
                        )
                        image_path.write_bytes(compressed_data)
                        width, height = compressed_img.size

                images.append(
                    ExtractedImage(
                        path=image_path,
                        index=i,
                        original_name=image_name,
                        mime_type=mime_type,
                        width=width,
                        height=height,
                    )
                )
                slide_images.append(
                    {
                        "page": i,
                        "path": str(image_path),
                        "name": image_name,
                    }
                )
                logger.debug(f"Rendered slide {i}/{slide_count}")

            presentation.Close()
            presentation = None

        finally:
            # Best-effort cleanup: failures are logged but never mask the
            # original error or abort the conversion.
            if presentation is not None:
                try:
                    presentation.Close()
                except Exception as e:
                    logger.debug(f"Failed to close presentation: {e}")
            if ppt is not None:
                try:
                    ppt.Quit()
                except Exception as e:
                    logger.debug(f"Failed to quit PowerPoint: {e}")
            pythoncom.CoUninitialize()

        return images, slide_images

    def _render_slides_via_pdf(
        self, input_path: Path, screenshots_dir: Path, image_format: str
    ) -> tuple[list[ExtractedImage], list[dict]]:
        """Render slides by converting to PDF first.

        LibreOffice converts the deck to PDF inside a temporary directory,
        then every PDF page is rasterised with pymupdf at
        ``DEFAULT_RENDER_DPI`` and stored through
        ``ImageProcessor.save_screenshot``.

        Args:
            input_path: Path to the PPTX file
            screenshots_dir: Directory to save screenshot images
            image_format: Image format / file extension (jpg, png, etc.)

        Returns:
            Tuple of (ExtractedImage list, slide info list for metadata).
            Both lists are empty when LibreOffice or pymupdf is unavailable
            or the PDF conversion fails; those cases are logged, not raised.
        """
        import subprocess
        import time

        logger.info(f"[PPTX] Rendering slides via PDF: {input_path.name}")

        soffice_cmd = find_libreoffice()
        if not soffice_cmd:
            import platform

            if platform.system() == "Windows":
                logger.warning(
                    "[PPTX] Cannot render slides: Neither MS Office nor LibreOffice found. "
                    "Install Microsoft Office (recommended) or LibreOffice to enable slide rendering."
                )
            else:
                logger.warning(
                    "[PPTX] Cannot render slides: LibreOffice not found. "
                    "Install LibreOffice to enable slide rendering."
                )
            return [], []

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            pdf_path = temp_path / f"{input_path.stem}.pdf"

            # Create isolated user profile for concurrent LibreOffice execution
            profile_path = temp_path / "lo_profile"
            profile_path.mkdir()
            profile_url = profile_path.as_uri()

            try:
                lo_start = time.perf_counter()
                result = subprocess.run(
                    [
                        soffice_cmd,
                        "--headless",
                        f"-env:UserInstallation={profile_url}",
                        "--convert-to",
                        "pdf",
                        "--outdir",
                        str(temp_path),
                        str(input_path),
                    ],
                    capture_output=True,
                    text=True,
                    timeout=600,
                )
                lo_time = time.perf_counter() - lo_start
                logger.info(f"[PPTX] LibreOffice conversion: {lo_time:.2f}s")
                if result.returncode != 0 or not pdf_path.exists():
                    logger.warning(f"[PPTX] LibreOffice failed: {result.stderr}")
                    return [], []
            except subprocess.TimeoutExpired:
                logger.error("[PPTX] LibreOffice timeout (>600s)")
                return [], []
            except Exception as e:
                logger.error(f"[PPTX] LibreOffice error: {e}")
                return [], []

            try:
                import pymupdf
            except ImportError:
                # Previously silent: tell the user why no slides were rendered.
                logger.warning(
                    "[PPTX] Cannot render slides: pymupdf is not installed."
                )
                return [], []

            render_start = time.perf_counter()
            # Create ImageProcessor for compression
            img_processor = ImageProcessor(self.config.image if self.config else None)
            mime_type = self._mime_type_for(image_format)

            doc = pymupdf.open(pdf_path)
            try:
                images: list[ExtractedImage] = []
                slide_images: list[dict] = []
                # PDF user space is 72 units per inch; scale to the target DPI.
                zoom = DEFAULT_RENDER_DPI / 72
                mat = pymupdf.Matrix(zoom, zoom)

                for slide_no, page in enumerate(doc, 1):
                    pix = page.get_pixmap(matrix=mat)

                    image_name = f"{input_path.name}.slide{slide_no:04d}.{image_format}"
                    image_path = screenshots_dir / image_name
                    # Save with compression (ensures < 5MB for LLM)
                    final_size = img_processor.save_screenshot(
                        pix.samples, pix.width, pix.height, image_path
                    )

                    images.append(
                        ExtractedImage(
                            path=image_path,
                            index=slide_no,
                            original_name=image_name,
                            mime_type=mime_type,
                            width=final_size[0],
                            height=final_size[1],
                        )
                    )
                    slide_images.append(
                        {
                            "page": slide_no,
                            "path": str(image_path),
                            "name": image_name,
                        }
                    )

                render_time = time.perf_counter() - render_start
                logger.info(f"[PPTX] Rendered {len(doc)} slides: {render_time:.2f}s")
                return images, slide_images
            finally:
                doc.close()

    def _render_slides_for_llm(
        self, input_path: Path, output_dir: Path | None = None
    ) -> ConvertResult:
        """Extract text and render slides for LLM Vision analysis.

        This method:
        1. Extracts text using MarkItDown
        2. Renders each slide as an image (if screenshots are enabled)

        Both the text and the slide image list are placed in the result
        metadata (``extracted_text`` / ``page_images``).

        Args:
            input_path: Path to the PPTX file
            output_dir: Optional output directory for slide images

        Returns:
            ConvertResult with extracted text and slide images
        """
        # Step 1: Extract text using MarkItDown
        text_result = self._convert_with_markitdown(input_path)
        extracted_text = text_result.markdown

        # Step 2: Render slides to images (no-op unless screenshots enabled)
        images, slide_images = self._render_screenshots_if_enabled(
            input_path, output_dir
        )

        return ConvertResult(
            markdown=extracted_text,
            images=images,
            metadata={
                "source": str(input_path),
                "format": "PPTX",
                "slides": len(images),
                "extracted_text": extracted_text,
                "page_images": slide_images,
            },
        )
517
+
518
+
519
@register_converter(FileFormat.XLSX)
class XlsxConverter(OfficeConverter):
    """Converter registered for XLSX (Excel) documents.

    Adds no behaviour of its own: conversion is inherited unchanged from
    ``OfficeConverter``, i.e. text extraction through MarkItDown.
    """

    # Formats this converter declares support for.
    supported_formats = [FileFormat.XLSX]