markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,369 @@
1
+ """Single file workflow processing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import re
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ from loguru import logger
13
+
14
+ from markitai.security import atomic_write_text
15
+
16
+ if TYPE_CHECKING:
17
+ from markitai.config import MarkitaiConfig
18
+ from markitai.llm import LLMProcessor
19
+
20
+
21
@dataclass
class ImageAnalysisResult:
    """Result of image analysis for a single source file."""

    # Resolved path of the analyzed input file, or the output file stem when
    # no input path was available (see analyze_images).
    source_file: str
    # One dict per analyzed image asset; entries carry the keys
    # "asset", "alt", "desc", "text", "llm_usage", "created"
    # as built in analyze_images.
    assets: list[dict[str, Any]]
27
+
28
+
29
@dataclass
class WorkflowResult:
    """Result of processing a file through the workflow."""

    # Final markdown content for the processed file.
    markdown: str
    # Accumulated LLM cost in USD for this file.
    llm_cost: float = 0.0
    # Nested usage statistics from LLM calls (outer key presumably the
    # model/context identifier — verify against markitai.llm).
    llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
    # Set only when image descriptions were collected (desc_enabled).
    image_analysis: ImageAnalysisResult | None = None
37
+
38
+
39
class SingleFileWorkflow:
    """Workflow for processing a single file with LLM enhancement.

    This class encapsulates the LLM processing logic extracted from cli.py,
    including document processing, image analysis, and vision enhancement.
    """

    def __init__(
        self,
        config: MarkitaiConfig,
        processor: LLMProcessor | None = None,
        project_dir: Path | None = None,
        no_cache: bool = False,
        no_cache_patterns: list[str] | None = None,
    ) -> None:
        """Initialize workflow.

        Args:
            config: Markitai configuration
            processor: Optional shared LLMProcessor (created if not provided)
            project_dir: Optional project directory for project-level cache
            no_cache: If True, skip reading from cache but still write results
            no_cache_patterns: List of glob patterns to skip cache for specific files
        """
        self.config = config
        # Lazily created by the `processor` property when not supplied.
        self._processor = processor
        self._project_dir = project_dir
        # Cache overrides are kept separately from `config` and folded into a
        # config copy when the processor is created (see `processor`).
        self._no_cache = no_cache
        self._no_cache_patterns = no_cache_patterns
        # Running totals for LLM spend/usage across this workflow instance.
        self._llm_cost = 0.0
        self._llm_usage: dict[str, dict[str, Any]] = {}
70
+
71
+ @property
72
+ def processor(self) -> LLMProcessor:
73
+ """Get or create LLM processor."""
74
+ if self._processor is None:
75
+ from markitai.workflow.helpers import create_llm_processor
76
+
77
+ # Create a temporary config with the no_cache settings
78
+ # This is needed because SingleFileWorkflow stores these separately
79
+ temp_config = self.config.model_copy()
80
+ temp_config.cache.no_cache = self._no_cache
81
+ temp_config.cache.no_cache_patterns = self._no_cache_patterns or []
82
+
83
+ self._processor = create_llm_processor(
84
+ temp_config, project_dir=self._project_dir
85
+ )
86
+ return self._processor
87
+
88
    def _merge_usage(self, usage: dict[str, dict[str, Any]]) -> None:
        """Merge usage statistics into workflow totals.

        Delegates to the shared helper so accumulation semantics stay
        consistent with the rest of the workflow package.
        """
        # Local import mirrors the file's pattern for helper access
        # (presumably to avoid an import cycle — confirm in helpers.py).
        from markitai.workflow.helpers import merge_llm_usage

        merge_llm_usage(self._llm_usage, usage)
93
+
94
+ async def process_document_with_llm(
95
+ self,
96
+ markdown: str,
97
+ source: str,
98
+ output_file: Path,
99
+ page_images: list[dict] | None = None,
100
+ ) -> tuple[str, float, dict[str, dict[str, Any]]]:
101
+ """Process markdown with LLM (clean + frontmatter).
102
+
103
+ Args:
104
+ markdown: Markdown content to process
105
+ source: Source file name
106
+ output_file: Output file path for .llm.md
107
+ page_images: Optional list of page image info dicts
108
+
109
+ Returns:
110
+ Tuple of (markdown, cost_usd, llm_usage)
111
+ """
112
+ try:
113
+ cleaned, frontmatter = await self.processor.process_document(
114
+ markdown, source
115
+ )
116
+
117
+ # Write LLM version
118
+ llm_output = output_file.with_suffix(".llm.md")
119
+ llm_content = self.processor.format_llm_output(cleaned, frontmatter)
120
+
121
+ # Append commented image links if provided
122
+ if page_images:
123
+ commented_images = [
124
+ f"<!-- ![Page {img['page']}](screenshots/{img['name']}) -->"
125
+ for img in sorted(page_images, key=lambda x: x.get("page", 0))
126
+ ]
127
+ llm_content += "\n\n<!-- Page images for reference -->\n" + "\n".join(
128
+ commented_images
129
+ )
130
+
131
+ atomic_write_text(llm_output, llm_content)
132
+ logger.info(f"Written LLM version: {llm_output}")
133
+
134
+ # Use context-based tracking for accurate per-file usage in concurrent scenarios
135
+ cost = self.processor.get_context_cost(source)
136
+ usage = self.processor.get_context_usage(source)
137
+ return markdown, cost, usage
138
+
139
+ except Exception as e:
140
+ logger.warning(f"LLM processing failed: {e}")
141
+ return markdown, 0.0, {}
142
+
143
    async def analyze_images(
        self,
        image_paths: list[Path],
        markdown: str,
        output_file: Path,
        input_path: Path | None = None,
        concurrency_limit: int | None = None,
    ) -> tuple[str, float, dict[str, dict[str, Any]], ImageAnalysisResult | None]:
        """Analyze images with LLM Vision.

        Runs vision analysis over all images with a bounded worker pool,
        optionally rewrites image alt text in the markdown, and for a
        standalone image input writes a rich ``.llm.md`` file directly.
        Failures are downgraded to warnings; a total failure returns the
        markdown unchanged with zero cost/usage.

        Args:
            image_paths: List of image file paths
            markdown: Original markdown content
            output_file: Output markdown file path
            input_path: Source input file path
            concurrency_limit: Max concurrent requests

        Returns:
            Tuple of (updated markdown, cost_usd, llm_usage, image_analysis_result)
        """
        from markitai.llm import ImageAnalysis
        from markitai.workflow.helpers import detect_language

        alt_enabled = self.config.image.alt_enabled
        desc_enabled = self.config.image.desc_enabled

        # Use unique context for accurate per-file usage tracking in concurrent scenarios
        source_path = (
            str(input_path.resolve()) if input_path else str(output_file.resolve())
        )
        context = f"{source_path}:images"

        try:
            # Detect document language from markdown content
            language = detect_language(markdown)

            async def analyze_single_image(
                image_path: Path,
            ) -> tuple[Path, ImageAnalysis | None, str]:
                """Analyze a single image, returning None analysis on failure."""
                timestamp = datetime.now().astimezone().isoformat()
                try:
                    analysis = await self.processor.analyze_image(
                        image_path, language=language, context=context
                    )
                    return image_path, analysis, timestamp
                except Exception as e:
                    # Per-image failure is non-fatal; the image is still
                    # recorded downstream with placeholder values.
                    logger.warning(f"Failed to analyze image {image_path.name}: {e}")
                    return image_path, None, timestamp

            # Queue-based analysis with concurrency limit
            logger.info(f"Analyzing {len(image_paths)} images...")
            limit = (
                concurrency_limit
                if concurrency_limit is not None
                else self.config.llm.concurrency
            )
            # Never spawn more workers than images; always at least one.
            worker_count = min(len(image_paths), max(1, limit))
            queue: asyncio.Queue[Path] = asyncio.Queue()
            for image_path in image_paths:
                queue.put_nowait(image_path)

            results_map: dict[Path, tuple[Path, ImageAnalysis | None, str]] = {}

            async def worker() -> None:
                # Drain the queue until empty; get_nowait is safe here because
                # all items were enqueued before the workers started.
                while True:
                    try:
                        image_path = queue.get_nowait()
                    except asyncio.QueueEmpty:
                        break
                    result = await analyze_single_image(image_path)
                    results_map[image_path] = result
                    queue.task_done()

            workers = [asyncio.create_task(worker()) for _ in range(worker_count)]
            await queue.join()
            # Workers exit on QueueEmpty; cancel is a belt-and-braces cleanup
            # and gather(return_exceptions=True) swallows the CancelledErrors.
            for task in workers:
                task.cancel()
            await asyncio.gather(*workers, return_exceptions=True)

            # Re-establish the caller's ordering (results_map is fill-order).
            results = [results_map[p] for p in image_paths if p in results_map]

            # Collect asset descriptions for JSON output
            asset_descriptions: list[dict[str, Any]] = []

            # Process results
            for image_path, analysis, timestamp in results:
                # Use default values if analysis failed
                # This ensures the image is still recorded in images.json
                if analysis is None:
                    analysis_caption = "Image"
                    analysis_desc = "Image analysis failed"
                    analysis_text = ""
                    analysis_usage: dict[str, Any] = {}
                else:
                    analysis_caption = analysis.caption
                    analysis_desc = analysis.description
                    analysis_text = analysis.extracted_text or ""
                    analysis_usage = analysis.llm_usage or {}

                # Collect for JSON output (if desc_enabled)
                if desc_enabled:
                    asset_descriptions.append(
                        {
                            "asset": str(image_path.resolve()),
                            "alt": analysis_caption,
                            "desc": analysis_desc,
                            "text": analysis_text,
                            "llm_usage": analysis_usage,
                            "created": timestamp,
                        }
                    )

                # Update alt text in markdown (if alt_enabled).
                # Replaces every image reference whose URL ends with this
                # file name, rewriting the target to assets/<name>.
                if alt_enabled:
                    old_pattern = rf"!\[[^\]]*\]\([^)]*{re.escape(image_path.name)}\)"
                    new_ref = f"![{analysis_caption}](assets/{image_path.name})"
                    markdown = re.sub(old_pattern, new_ref, markdown)

            # Check if this is a standalone image file
            from markitai.constants import IMAGE_EXTENSIONS

            is_standalone_image = (
                input_path is not None
                and input_path.suffix.lower() in IMAGE_EXTENSIONS
                and len(image_paths) == 1
            )

            # Update/create .llm.md file
            llm_output = output_file.with_suffix(".llm.md")
            if is_standalone_image and results and results[0][1] is not None:
                # For standalone images, create rich formatted content with frontmatter
                from markitai.utils.text import normalize_markdown_whitespace
                from markitai.workflow.helpers import format_standalone_image_markdown

                # input_path is guaranteed non-None by is_standalone_image check
                assert input_path is not None
                _, analysis, _ = results[0]
                if analysis:
                    rich_content = format_standalone_image_markdown(
                        input_path,
                        analysis,
                        f"assets/{input_path.name}",
                        include_frontmatter=True,
                    )
                    rich_content = normalize_markdown_whitespace(rich_content)
                    atomic_write_text(llm_output, rich_content)
                    logger.info(f"Written LLM version: {llm_output}")
            elif alt_enabled:
                # NOTE: Alt text update moved to caller (workflow/core.py) to avoid race condition.
                # The caller will apply alt text updates after document processing completes.
                # See P0-4 fix: image analysis no longer waits for .llm.md file.
                pass

            # Build analysis result for caller to aggregate
            analysis_result: ImageAnalysisResult | None = None
            if desc_enabled and asset_descriptions:
                source_path = (
                    str(input_path.resolve()) if input_path else output_file.stem
                )
                analysis_result = ImageAnalysisResult(
                    source_file=source_path,
                    assets=asset_descriptions,
                )

            # Use context-based tracking for accurate per-file usage in concurrent scenarios
            return (
                markdown,
                self.processor.get_context_cost(context),
                self.processor.get_context_usage(context),
                analysis_result,
            )

        except Exception as e:
            logger.warning(f"Image analysis failed: {e}")
            return markdown, 0.0, {}, None
319
+
320
+ async def enhance_with_vision(
321
+ self,
322
+ extracted_text: str,
323
+ page_images: list[dict],
324
+ source: str = "document",
325
+ ) -> tuple[str, str, float, dict[str, dict[str, Any]]]:
326
+ """Enhance document by combining extracted text with page images.
327
+
328
+ Args:
329
+ extracted_text: Text extracted by pymupdf4llm/markitdown
330
+ page_images: List of page image info dicts with 'path' key
331
+ source: Source file name for logging context
332
+
333
+ Returns:
334
+ Tuple of (cleaned_markdown, frontmatter_yaml, cost_usd, llm_usage)
335
+ """
336
+ try:
337
+ # Sort images by page number
338
+ def get_page_num(img_info: dict) -> int:
339
+ return img_info.get("page", 0)
340
+
341
+ sorted_images = sorted(page_images, key=get_page_num)
342
+
343
+ # Convert to Path list
344
+ image_paths = [Path(img["path"]) for img in sorted_images]
345
+
346
+ logger.info(
347
+ f"[START] {source}: Enhancing with {len(image_paths)} page images..."
348
+ )
349
+
350
+ # Call the combined enhancement method (clean + frontmatter)
351
+ (
352
+ cleaned_content,
353
+ frontmatter,
354
+ ) = await self.processor.enhance_document_complete(
355
+ extracted_text, image_paths, source=source
356
+ )
357
+
358
+ # Use context-based tracking for accurate per-file usage in concurrent scenarios
359
+ return (
360
+ cleaned_content,
361
+ frontmatter,
362
+ self.processor.get_context_cost(source),
363
+ self.processor.get_context_usage(source),
364
+ )
365
+
366
+ except Exception as e:
367
+ logger.warning(f"Document enhancement failed: {e}")
368
+ basic_frontmatter = f"title: {source}\nsource: {source}"
369
+ return extracted_text, basic_frontmatter, 0.0, {}
@@ -0,0 +1,159 @@
1
+ Metadata-Version: 2.4
2
+ Name: markitai
3
+ Version: 0.3.0
4
+ Summary: Document to Markdown converter with LLM enhancement
5
+ Project-URL: Homepage, https://markitai.ynewtime.com
6
+ Project-URL: Documentation, https://markitai.ynewtime.com/guide/getting-started
7
+ Project-URL: Repository, https://github.com/Ynewtime/markitai
8
+ Project-URL: Changelog, https://github.com/Ynewtime/markitai/blob/main/CHANGELOG.md
9
+ Author-email: Ynewtime <longqiliuye@gmail.com>
10
+ License-Expression: MIT
11
+ Keywords: converter,docx,llm,markdown,ocr,pdf
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
22
+ Classifier: Topic :: Utilities
23
+ Requires-Python: >=3.11
24
+ Requires-Dist: aiofiles>=25.1.0
25
+ Requires-Dist: click>=8.1.0
26
+ Requires-Dist: instructor>=1.14.0
27
+ Requires-Dist: litellm>=1.80.16
28
+ Requires-Dist: loguru>=0.7.3
29
+ Requires-Dist: markitdown[all]>=0.1.4
30
+ Requires-Dist: pillow>=12.1.0
31
+ Requires-Dist: pydantic>=2.10.0
32
+ Requires-Dist: pymupdf4llm>=0.2.9
33
+ Requires-Dist: python-dotenv>=1.2.1
34
+ Requires-Dist: pywin32>=310; sys_platform == 'win32'
35
+ Requires-Dist: rapidocr>=3.5.0
36
+ Requires-Dist: rich>=14.2.0
37
+ Provides-Extra: all
38
+ Description-Content-Type: text/markdown
39
+
40
+ # Markitai
41
+
42
+ 开箱即用的 Markdown 转换器,原生支持 LLM 增强。
43
+
44
+ ## 特性
45
+
46
+ - **多格式支持** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
47
+ - **LLM 增强** - 格式清洗、元数据生成、图片分析
48
+ - **批量处理** - 并发转换、断点恢复、进度显示
49
+ - **OCR 识别** - 扫描版 PDF 和图片文字提取
50
+ - **URL 转换** - 直接转换网页,支持 SPA 浏览器渲染
51
+
52
+ ## 安装
53
+
54
+ ### 一键安装(推荐)
55
+
56
+ ```bash
57
+ # Linux/macOS
58
+ curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.sh | sh
59
+
60
+ # Windows (PowerShell)
61
+ irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
62
+ ```
63
+
64
+ ### 手动安装
65
+
66
+ ```bash
67
+ # 需要 Python 3.11+
68
+ uv tool install markitai
69
+
70
+ # 或使用 pip
71
+ pip install --user markitai
72
+ ```
73
+
74
+ ## 快速开始
75
+
76
+ ```bash
77
+ # 基础转换
78
+ markitai document.docx
79
+
80
+ # URL 转换
81
+ markitai https://example.com/article
82
+
83
+ # LLM 增强
84
+ markitai document.docx --llm
85
+
86
+ # 使用预设
87
+ markitai document.pdf --preset rich # LLM + alt + desc + screenshot
88
+ markitai document.pdf --preset standard # LLM + alt + desc
89
+ markitai document.pdf --preset minimal # 仅基础转换
90
+
91
+ # 批量处理
92
+ markitai ./docs -o ./output
93
+
94
+ # 断点恢复
95
+ markitai ./docs -o ./output --resume
96
+
97
+ # URL 批量处理(自动识别 .urls 文件)
98
+ markitai urls.urls -o ./output
99
+ ```
100
+
101
+ ## 输出结构
102
+
103
+ ```
104
+ output/
105
+ ├── document.docx.md # 基础 Markdown
106
+ ├── document.docx.llm.md # LLM 优化版
107
+ ├── assets/
108
+ │ ├── document.docx.0001.jpg
109
+ │ └── images.json # 图片描述
110
+ ├── screenshots/ # 页面截图(--screenshot 时)
111
+ │ └── example_com.full.jpg
112
+ ```
113
+
114
+ ## 配置
115
+
116
+ 优先级:命令行 > 环境变量 > 配置文件 > 默认值
117
+
118
+ ```bash
119
+ # 查看配置
120
+ markitai config list
121
+
122
+ # 初始化配置文件
123
+ markitai config init -o .
124
+
125
+ # 查看缓存状态
126
+ markitai cache stats
127
+
128
+ # 清理缓存
129
+ markitai cache clear
130
+ ```
131
+
132
+ 配置文件路径:`./markitai.json` 或 `~/.markitai/config.json`
133
+
134
+ ## 环境变量
135
+
136
+ | 变量 | 说明 |
137
+ |------|------|
138
+ | `OPENAI_API_KEY` | OpenAI API Key |
139
+ | `GEMINI_API_KEY` | Google Gemini API Key |
140
+ | `DEEPSEEK_API_KEY` | DeepSeek API Key |
141
+ | `ANTHROPIC_API_KEY` | Anthropic API Key |
142
+ | `JINA_API_KEY` | Jina Reader API Key(URL 转换) |
143
+
144
+ ## 依赖
145
+
146
+ - [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF 转换
147
+ - [markitdown](https://github.com/microsoft/markitdown) - Office 文档和 URL 转换
148
+ - [LiteLLM](https://github.com/BerriAI/litellm) - LLM 网关
149
+ - [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR 识别
150
+
151
+ ## 文档
152
+
153
+ - [快速开始](https://ynewtime.github.io/markitai/guide/getting-started)
154
+ - [配置说明](https://ynewtime.github.io/markitai/guide/configuration)
155
+ - [CLI 命令参考](https://ynewtime.github.io/markitai/guide/cli)
156
+
157
+ ## License
158
+
159
+ MIT
@@ -0,0 +1,48 @@
1
+ markitai/__init__.py,sha256=nL8_TGxWstLYM-_InoDAX9IVbfSHHjzgH4iroiwdQVI,93
2
+ markitai/batch.py,sha256=qNz6_AF12vK3CEzslvhFcAY7KVSYqtSWhjRM8TdmqoA,49576
3
+ markitai/cli.py,sha256=jnvNvr-sMh3bVCooDnBYtND_kpyblAUCUWncK0_-DcY,144061
4
+ markitai/config.py,sha256=Vop1pal-m2tZ3aNjX6YCMhwsyp4rFMdbaOjJFIlL8oQ,19821
5
+ markitai/config.schema.json,sha256=PiVwmPo3g5fI0qciH7OPhUon-t1gaHa8z4bmDQZaBoo,17400
6
+ markitai/constants.py,sha256=onIkq0He4d6sd_z0dBvsJleurx_6QMQIfVtNKzeeQPg,8084
7
+ markitai/fetch.py,sha256=3ciywXpxvcx0Pw_Ho3gSZVCL8-dTfHdwL9cIlDXEGIA,61460
8
+ markitai/image.py,sha256=KlTCV4GKKXRCjl7vhhJzmTLtO3ATM21R82EcPlJ0xDs,49327
9
+ markitai/json_order.py,sha256=JcBU_49SuQWd0YyJesspurpVDkp8YdibB3ADSMAvQlU,16905
10
+ markitai/llm.py,sha256=XhoG4m6ys_j7uZt5L-b6E6M99x9Zwy1e2aAginawU7w,167464
11
+ markitai/ocr.py,sha256=eiaY1g39w4a-00xp1JZtvO8eHZO34STP_o6yk18m_-w,10394
12
+ markitai/security.py,sha256=QaEeblAAaShkdbF_ojvsHflhoxyCde7licR0BbbCm7A,8113
13
+ markitai/types.py,sha256=dTTpUjBV_GwYZi7JRFFi4_teVJ_gVjpsVVDrcCxKvrw,839
14
+ markitai/urls.py,sha256=ILcZf41C7JzignXV9Ru1QgciogQXQ_W7x0GeD72R-DY,5213
15
+ markitai/converter/__init__.py,sha256=Dr8pHofoGbOTbzfCSBOOxsA_LUa-qrJOpxSPISKdgKY,1184
16
+ markitai/converter/_patches.py,sha256=tPiP3vwpsdr4BGPU2zReBBPwHQOtXvu9vzDZmZfLPHc,3067
17
+ markitai/converter/base.py,sha256=EOU1qLWPNbF9SUQX5bmoWP-AqJz7vypllEV6fMi72Hs,3851
18
+ markitai/converter/image.py,sha256=tSKgi4oDYaXJabGocWE982Soe0k955NY5nNM-br37Es,5874
19
+ markitai/converter/legacy.py,sha256=R089rJyacE4K0KATMjfOQii_sgxK8agjvYm_7jIiVz0,19504
20
+ markitai/converter/office.py,sha256=p9Wl5q_Z7C8LDuSXOdYpIIcDsabrFrIuGmVQ7kX8oms,19059
21
+ markitai/converter/pdf.py,sha256=Riq577qBJI2uz6RzzbMjwDaMfGwNZ50a3gxezqFzQok,26418
22
+ markitai/converter/text.py,sha256=ldWv0iUVblAB7MfqyoV2CGs4k36aBxrhSU_6JwLpWds,1432
23
+ markitai/prompts/__init__.py,sha256=fsVei-9jCqInY2OvtxYQVGxyjriU-r1bOoic8N2lS24,4595
24
+ markitai/prompts/cleaner.md,sha256=QJS-ttDYhkSqFJ5lx5MwCfckGkvk6FzZ1bIRuVqoT5I,4656
25
+ markitai/prompts/document_enhance.md,sha256=mM48qXa38kUdjOuirZBpb4HKFSBpVoqAvV9tEG21dJ0,4116
26
+ markitai/prompts/document_enhance_complete.md,sha256=EoGeODZAY-DkJ0dqJTYqPglMqCgwfZ9N-kRQvnzFFR0,3098
27
+ markitai/prompts/document_process.md,sha256=kOiElomRQBg92hPaN6-QaELcq49ynF9Esby5D0OCzBo,2011
28
+ markitai/prompts/frontmatter.md,sha256=PNzgql9RDwDEaqi-5dDf278XxcVuuYNqe7TMSt7gyNE,998
29
+ markitai/prompts/image_analysis.md,sha256=jn8bvR89MS0ZGcXeeKHXWiWEdVyvWwrf8gHi1jSE-zs,842
30
+ markitai/prompts/image_caption.md,sha256=EZXvBjzdjveTKcM8ePfiBJJ5jkzCjlL5ixmw10OqDJM,305
31
+ markitai/prompts/image_description.md,sha256=Wmwq2V96cawKE3p3QpDOHkybNd3ocJUf1bZGU8_qMDM,403
32
+ markitai/prompts/page_content.md,sha256=Sndp_MABteHImrWln3_k3ZJMQfX0fQH2_AI5Hxcao7E,821
33
+ markitai/prompts/url_enhance.md,sha256=3wzTx_QGSuf7k1zuDKXOtcrA7ofqn0fYmwD7Oh9CaTE,3465
34
+ markitai/utils/__init__.py,sha256=K1OWjX--LON8hbc_yB3zYR8TsZ_sc39CjzMWbcgUznQ,899
35
+ markitai/utils/executor.py,sha256=nEONsq41ZJtnqpne0pXQIMjET1rtyMJTbMYNUUcgKgc,2179
36
+ markitai/utils/mime.py,sha256=uak2YY2Z3Bl6dbZk4Xi47b2BPmFvRJJlzILHeW0G2P0,2566
37
+ markitai/utils/office.py,sha256=apeHynsrcspUJX392wnVheiAf-I1t1WOos41l-Nh_04,8221
38
+ markitai/utils/output.py,sha256=9ZZbuY6G2DoLzpcUcSiKCEdgfZ2jW-2GhRQer7uwJWw,1779
39
+ markitai/utils/paths.py,sha256=7TPh7kkWIzFSDhW67TzAZGFNaTQxaYUfPj6ng5K2SGw,1962
40
+ markitai/utils/text.py,sha256=dtpVk8qz_O7ZLCSEj55EeVPGnZ7Zpif4koaW2pH9mkU,12242
41
+ markitai/workflow/__init__.py,sha256=lw_gRgl_M-kGFuBHMXU5biEu8xI5I2z9SOFb8cLU3Ck,898
42
+ markitai/workflow/core.py,sha256=Q2F8i7Bd5WJ2m3GXYTtqdb7AL6InYHND_w1uh2s4cQ4,25172
43
+ markitai/workflow/helpers.py,sha256=Z256Cm6a4NasqedWwp205IVWZ_-whHlS8mO5xYyGOgQ,16824
44
+ markitai/workflow/single.py,sha256=qwRkmK-3wgYM_EGv9osvXeT_6L0gRJjaHD0RuLNX6uU,14357
45
+ markitai-0.3.0.dist-info/METADATA,sha256=afBehA4YbHk31XfyPXPt4FOJzZtGBCx2XvRT9h7sR8A,4395
46
+ markitai-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
47
+ markitai-0.3.0.dist-info/entry_points.txt,sha256=6gpnr_12uwxTs9MqPxphvcZeFQUIk2PtqGRMMneEHsI,46
48
+ markitai-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ markitai = markitai.cli:app