markitai 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
"""Single file workflow processing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
from loguru import logger
|
|
13
|
+
|
|
14
|
+
from markitai.security import atomic_write_text
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from markitai.config import MarkitaiConfig
|
|
18
|
+
from markitai.llm import LLMProcessor
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ImageAnalysisResult:
    """Result of image analysis for a single source file."""

    # Resolved path of the source document (or the output stem when no
    # input path was available) that the analyzed images belong to.
    source_file: str
    # One dict per analyzed image, with keys "asset", "alt", "desc",
    # "text", "llm_usage", and "created" (ISO timestamp).
    assets: list[dict[str, Any]]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class WorkflowResult:
    """Result of processing a file through the workflow."""

    # Final markdown content produced for the file.
    markdown: str
    # Total LLM spend in USD accrued while processing this file.
    llm_cost: float = 0.0
    # Aggregated LLM usage statistics; nested dicts — keys presumably
    # model/context identifiers (confirm against markitai.llm).
    llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
    # Populated only when image descriptions were generated for the file.
    image_analysis: ImageAnalysisResult | None = None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class SingleFileWorkflow:
|
|
40
|
+
"""Workflow for processing a single file with LLM enhancement.
|
|
41
|
+
|
|
42
|
+
This class encapsulates the LLM processing logic extracted from cli.py,
|
|
43
|
+
including document processing, image analysis, and vision enhancement.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
def __init__(
|
|
47
|
+
self,
|
|
48
|
+
config: MarkitaiConfig,
|
|
49
|
+
processor: LLMProcessor | None = None,
|
|
50
|
+
project_dir: Path | None = None,
|
|
51
|
+
no_cache: bool = False,
|
|
52
|
+
no_cache_patterns: list[str] | None = None,
|
|
53
|
+
) -> None:
|
|
54
|
+
"""Initialize workflow.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
config: Markitai configuration
|
|
58
|
+
processor: Optional shared LLMProcessor (created if not provided)
|
|
59
|
+
project_dir: Optional project directory for project-level cache
|
|
60
|
+
no_cache: If True, skip reading from cache but still write results
|
|
61
|
+
no_cache_patterns: List of glob patterns to skip cache for specific files
|
|
62
|
+
"""
|
|
63
|
+
self.config = config
|
|
64
|
+
self._processor = processor
|
|
65
|
+
self._project_dir = project_dir
|
|
66
|
+
self._no_cache = no_cache
|
|
67
|
+
self._no_cache_patterns = no_cache_patterns
|
|
68
|
+
self._llm_cost = 0.0
|
|
69
|
+
self._llm_usage: dict[str, dict[str, Any]] = {}
|
|
70
|
+
|
|
71
|
+
@property
|
|
72
|
+
def processor(self) -> LLMProcessor:
|
|
73
|
+
"""Get or create LLM processor."""
|
|
74
|
+
if self._processor is None:
|
|
75
|
+
from markitai.workflow.helpers import create_llm_processor
|
|
76
|
+
|
|
77
|
+
# Create a temporary config with the no_cache settings
|
|
78
|
+
# This is needed because SingleFileWorkflow stores these separately
|
|
79
|
+
temp_config = self.config.model_copy()
|
|
80
|
+
temp_config.cache.no_cache = self._no_cache
|
|
81
|
+
temp_config.cache.no_cache_patterns = self._no_cache_patterns or []
|
|
82
|
+
|
|
83
|
+
self._processor = create_llm_processor(
|
|
84
|
+
temp_config, project_dir=self._project_dir
|
|
85
|
+
)
|
|
86
|
+
return self._processor
|
|
87
|
+
|
|
88
|
+
def _merge_usage(self, usage: dict[str, dict[str, Any]]) -> None:
|
|
89
|
+
"""Merge usage statistics into workflow totals."""
|
|
90
|
+
from markitai.workflow.helpers import merge_llm_usage
|
|
91
|
+
|
|
92
|
+
merge_llm_usage(self._llm_usage, usage)
|
|
93
|
+
|
|
94
|
+
async def process_document_with_llm(
|
|
95
|
+
self,
|
|
96
|
+
markdown: str,
|
|
97
|
+
source: str,
|
|
98
|
+
output_file: Path,
|
|
99
|
+
page_images: list[dict] | None = None,
|
|
100
|
+
) -> tuple[str, float, dict[str, dict[str, Any]]]:
|
|
101
|
+
"""Process markdown with LLM (clean + frontmatter).
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
markdown: Markdown content to process
|
|
105
|
+
source: Source file name
|
|
106
|
+
output_file: Output file path for .llm.md
|
|
107
|
+
page_images: Optional list of page image info dicts
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
Tuple of (markdown, cost_usd, llm_usage)
|
|
111
|
+
"""
|
|
112
|
+
try:
|
|
113
|
+
cleaned, frontmatter = await self.processor.process_document(
|
|
114
|
+
markdown, source
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
# Write LLM version
|
|
118
|
+
llm_output = output_file.with_suffix(".llm.md")
|
|
119
|
+
llm_content = self.processor.format_llm_output(cleaned, frontmatter)
|
|
120
|
+
|
|
121
|
+
# Append commented image links if provided
|
|
122
|
+
if page_images:
|
|
123
|
+
commented_images = [
|
|
124
|
+
f"<!-- ![Page {img['page']}](screenshots/{img['name']}) -->"
|
|
125
|
+
for img in sorted(page_images, key=lambda x: x.get("page", 0))
|
|
126
|
+
]
|
|
127
|
+
llm_content += "\n\n<!-- Page images for reference -->\n" + "\n".join(
|
|
128
|
+
commented_images
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
atomic_write_text(llm_output, llm_content)
|
|
132
|
+
logger.info(f"Written LLM version: {llm_output}")
|
|
133
|
+
|
|
134
|
+
# Use context-based tracking for accurate per-file usage in concurrent scenarios
|
|
135
|
+
cost = self.processor.get_context_cost(source)
|
|
136
|
+
usage = self.processor.get_context_usage(source)
|
|
137
|
+
return markdown, cost, usage
|
|
138
|
+
|
|
139
|
+
except Exception as e:
|
|
140
|
+
logger.warning(f"LLM processing failed: {e}")
|
|
141
|
+
return markdown, 0.0, {}
|
|
142
|
+
|
|
143
|
+
    async def analyze_images(
        self,
        image_paths: list[Path],
        markdown: str,
        output_file: Path,
        input_path: Path | None = None,
        concurrency_limit: int | None = None,
    ) -> tuple[str, float, dict[str, dict[str, Any]], ImageAnalysisResult | None]:
        """Analyze images with LLM Vision.

        Images are fed through a bounded worker pool; failed analyses are
        recorded with placeholder values rather than dropped, and the whole
        method degrades to a no-op result on unexpected errors.

        Args:
            image_paths: List of image file paths
            markdown: Original markdown content
            output_file: Output markdown file path
            input_path: Source input file path
            concurrency_limit: Max concurrent requests

        Returns:
            Tuple of (updated markdown, cost_usd, llm_usage, image_analysis_result)
        """
        from markitai.llm import ImageAnalysis
        from markitai.workflow.helpers import detect_language

        alt_enabled = self.config.image.alt_enabled
        desc_enabled = self.config.image.desc_enabled

        # Use unique context for accurate per-file usage tracking in concurrent scenarios
        source_path = (
            str(input_path.resolve()) if input_path else str(output_file.resolve())
        )
        context = f"{source_path}:images"

        try:
            # Detect document language from markdown content so captions/
            # descriptions are generated in the document's language.
            language = detect_language(markdown)

            async def analyze_single_image(
                image_path: Path,
            ) -> tuple[Path, ImageAnalysis | None, str]:
                """Analyze a single image; returns None analysis on failure."""
                timestamp = datetime.now().astimezone().isoformat()
                try:
                    analysis = await self.processor.analyze_image(
                        image_path, language=language, context=context
                    )
                    return image_path, analysis, timestamp
                except Exception as e:
                    # Per-image failures are non-fatal; the image is still
                    # recorded below with placeholder values.
                    logger.warning(f"Failed to analyze image {image_path.name}: {e}")
                    return image_path, None, timestamp

            # Queue-based analysis with concurrency limit: fill a queue once,
            # then drain it with at most `limit` workers.
            logger.info(f"Analyzing {len(image_paths)} images...")
            limit = (
                concurrency_limit
                if concurrency_limit is not None
                else self.config.llm.concurrency
            )
            worker_count = min(len(image_paths), max(1, limit))
            queue: asyncio.Queue[Path] = asyncio.Queue()
            for image_path in image_paths:
                queue.put_nowait(image_path)

            results_map: dict[Path, tuple[Path, ImageAnalysis | None, str]] = {}

            async def worker() -> None:
                # Drain the queue; QueueEmpty is the normal exit signal since
                # the queue is fully populated before workers start.
                while True:
                    try:
                        image_path = queue.get_nowait()
                    except asyncio.QueueEmpty:
                        break
                    result = await analyze_single_image(image_path)
                    results_map[image_path] = result
                    queue.task_done()

            workers = [asyncio.create_task(worker()) for _ in range(worker_count)]
            await queue.join()
            # Workers exit on their own once the queue is empty; cancel +
            # gather here is defensive cleanup.
            for task in workers:
                task.cancel()
            await asyncio.gather(*workers, return_exceptions=True)

            # Re-order results to match the caller's input order.
            results = [results_map[p] for p in image_paths if p in results_map]

            # Collect asset descriptions for JSON output
            asset_descriptions: list[dict[str, Any]] = []

            # Process results
            for image_path, analysis, timestamp in results:
                # Use default values if analysis failed
                # This ensures the image is still recorded in images.json
                if analysis is None:
                    analysis_caption = "Image"
                    analysis_desc = "Image analysis failed"
                    analysis_text = ""
                    analysis_usage: dict[str, Any] = {}
                else:
                    analysis_caption = analysis.caption
                    analysis_desc = analysis.description
                    analysis_text = analysis.extracted_text or ""
                    analysis_usage = analysis.llm_usage or {}

                # Collect for JSON output (if desc_enabled)
                if desc_enabled:
                    asset_descriptions.append(
                        {
                            "asset": str(image_path.resolve()),
                            "alt": analysis_caption,
                            "desc": analysis_desc,
                            "text": analysis_text,
                            "llm_usage": analysis_usage,
                            "created": timestamp,
                        }
                    )

                # Update alt text in markdown (if alt_enabled)
                if alt_enabled:
                    old_pattern = rf"!\[[^\]]*\]\([^)]*{re.escape(image_path.name)}\)"
                    # NOTE(review): the empty replacement below deletes every
                    # matched image reference outright. It looks like the
                    # original f-string template (presumably an updated
                    # ![alt](path) link using analysis_caption) may have been
                    # lost — confirm intended behavior.
                    new_ref = f""
                    markdown = re.sub(old_pattern, new_ref, markdown)

            # Check if this is a standalone image file
            from markitai.constants import IMAGE_EXTENSIONS

            is_standalone_image = (
                input_path is not None
                and input_path.suffix.lower() in IMAGE_EXTENSIONS
                and len(image_paths) == 1
            )

            # Update/create .llm.md file
            llm_output = output_file.with_suffix(".llm.md")
            if is_standalone_image and results and results[0][1] is not None:
                # For standalone images, create rich formatted content with frontmatter
                from markitai.utils.text import normalize_markdown_whitespace
                from markitai.workflow.helpers import format_standalone_image_markdown

                # input_path is guaranteed non-None by is_standalone_image check
                assert input_path is not None
                _, analysis, _ = results[0]
                if analysis:
                    rich_content = format_standalone_image_markdown(
                        input_path,
                        analysis,
                        f"assets/{input_path.name}",
                        include_frontmatter=True,
                    )
                    rich_content = normalize_markdown_whitespace(rich_content)
                    atomic_write_text(llm_output, rich_content)
                    logger.info(f"Written LLM version: {llm_output}")
            elif alt_enabled:
                # NOTE: Alt text update moved to caller (workflow/core.py) to avoid race condition.
                # The caller will apply alt text updates after document processing completes.
                # See P0-4 fix: image analysis no longer waits for .llm.md file.
                pass

            # Build analysis result for caller to aggregate
            analysis_result: ImageAnalysisResult | None = None
            if desc_enabled and asset_descriptions:
                source_path = (
                    str(input_path.resolve()) if input_path else output_file.stem
                )
                analysis_result = ImageAnalysisResult(
                    source_file=source_path,
                    assets=asset_descriptions,
                )

            # Use context-based tracking for accurate per-file usage in concurrent scenarios
            return (
                markdown,
                self.processor.get_context_cost(context),
                self.processor.get_context_usage(context),
                analysis_result,
            )

        except Exception as e:
            # Best effort: return the input markdown untouched on failure.
            logger.warning(f"Image analysis failed: {e}")
            return markdown, 0.0, {}, None
|
|
319
|
+
|
|
320
|
+
async def enhance_with_vision(
|
|
321
|
+
self,
|
|
322
|
+
extracted_text: str,
|
|
323
|
+
page_images: list[dict],
|
|
324
|
+
source: str = "document",
|
|
325
|
+
) -> tuple[str, str, float, dict[str, dict[str, Any]]]:
|
|
326
|
+
"""Enhance document by combining extracted text with page images.
|
|
327
|
+
|
|
328
|
+
Args:
|
|
329
|
+
extracted_text: Text extracted by pymupdf4llm/markitdown
|
|
330
|
+
page_images: List of page image info dicts with 'path' key
|
|
331
|
+
source: Source file name for logging context
|
|
332
|
+
|
|
333
|
+
Returns:
|
|
334
|
+
Tuple of (cleaned_markdown, frontmatter_yaml, cost_usd, llm_usage)
|
|
335
|
+
"""
|
|
336
|
+
try:
|
|
337
|
+
# Sort images by page number
|
|
338
|
+
def get_page_num(img_info: dict) -> int:
|
|
339
|
+
return img_info.get("page", 0)
|
|
340
|
+
|
|
341
|
+
sorted_images = sorted(page_images, key=get_page_num)
|
|
342
|
+
|
|
343
|
+
# Convert to Path list
|
|
344
|
+
image_paths = [Path(img["path"]) for img in sorted_images]
|
|
345
|
+
|
|
346
|
+
logger.info(
|
|
347
|
+
f"[START] {source}: Enhancing with {len(image_paths)} page images..."
|
|
348
|
+
)
|
|
349
|
+
|
|
350
|
+
# Call the combined enhancement method (clean + frontmatter)
|
|
351
|
+
(
|
|
352
|
+
cleaned_content,
|
|
353
|
+
frontmatter,
|
|
354
|
+
) = await self.processor.enhance_document_complete(
|
|
355
|
+
extracted_text, image_paths, source=source
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
# Use context-based tracking for accurate per-file usage in concurrent scenarios
|
|
359
|
+
return (
|
|
360
|
+
cleaned_content,
|
|
361
|
+
frontmatter,
|
|
362
|
+
self.processor.get_context_cost(source),
|
|
363
|
+
self.processor.get_context_usage(source),
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
except Exception as e:
|
|
367
|
+
logger.warning(f"Document enhancement failed: {e}")
|
|
368
|
+
basic_frontmatter = f"title: {source}\nsource: {source}"
|
|
369
|
+
return extracted_text, basic_frontmatter, 0.0, {}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markitai
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Document to Markdown converter with LLM enhancement
|
|
5
|
+
Project-URL: Homepage, https://markitai.ynewtime.com
|
|
6
|
+
Project-URL: Documentation, https://markitai.ynewtime.com/guide/getting-started
|
|
7
|
+
Project-URL: Repository, https://github.com/Ynewtime/markitai
|
|
8
|
+
Project-URL: Changelog, https://github.com/Ynewtime/markitai/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Ynewtime <longqiliuye@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
Keywords: converter,docx,llm,markdown,ocr,pdf
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
22
|
+
Classifier: Topic :: Utilities
|
|
23
|
+
Requires-Python: >=3.11
|
|
24
|
+
Requires-Dist: aiofiles>=25.1.0
|
|
25
|
+
Requires-Dist: click>=8.1.0
|
|
26
|
+
Requires-Dist: instructor>=1.14.0
|
|
27
|
+
Requires-Dist: litellm>=1.80.16
|
|
28
|
+
Requires-Dist: loguru>=0.7.3
|
|
29
|
+
Requires-Dist: markitdown[all]>=0.1.4
|
|
30
|
+
Requires-Dist: pillow>=12.1.0
|
|
31
|
+
Requires-Dist: pydantic>=2.10.0
|
|
32
|
+
Requires-Dist: pymupdf4llm>=0.2.9
|
|
33
|
+
Requires-Dist: python-dotenv>=1.2.1
|
|
34
|
+
Requires-Dist: pywin32>=310; sys_platform == 'win32'
|
|
35
|
+
Requires-Dist: rapidocr>=3.5.0
|
|
36
|
+
Requires-Dist: rich>=14.2.0
|
|
37
|
+
Provides-Extra: all
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# Markitai
|
|
41
|
+
|
|
42
|
+
开箱即用的 Markdown 转换器,原生支持 LLM 增强。
|
|
43
|
+
|
|
44
|
+
## 特性
|
|
45
|
+
|
|
46
|
+
- **多格式支持** - DOCX/DOC, PPTX/PPT, XLSX/XLS, PDF, TXT, MD, JPG/PNG/WebP, URLs
|
|
47
|
+
- **LLM 增强** - 格式清洗、元数据生成、图片分析
|
|
48
|
+
- **批量处理** - 并发转换、断点恢复、进度显示
|
|
49
|
+
- **OCR 识别** - 扫描版 PDF 和图片文字提取
|
|
50
|
+
- **URL 转换** - 直接转换网页,支持 SPA 浏览器渲染
|
|
51
|
+
|
|
52
|
+
## 安装
|
|
53
|
+
|
|
54
|
+
### 一键安装(推荐)
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Linux/macOS
|
|
58
|
+
curl -fsSL https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.sh | sh
|
|
59
|
+
|
|
60
|
+
# Windows (PowerShell)
|
|
61
|
+
irm https://raw.githubusercontent.com/Ynewtime/markitai/main/scripts/setup.ps1 | iex
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### 手动安装
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# 需要 Python 3.11+
|
|
68
|
+
uv tool install markitai
|
|
69
|
+
|
|
70
|
+
# 或使用 pip
|
|
71
|
+
pip install --user markitai
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 快速开始
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# 基础转换
|
|
78
|
+
markitai document.docx
|
|
79
|
+
|
|
80
|
+
# URL 转换
|
|
81
|
+
markitai https://example.com/article
|
|
82
|
+
|
|
83
|
+
# LLM 增强
|
|
84
|
+
markitai document.docx --llm
|
|
85
|
+
|
|
86
|
+
# 使用预设
|
|
87
|
+
markitai document.pdf --preset rich # LLM + alt + desc + screenshot
|
|
88
|
+
markitai document.pdf --preset standard # LLM + alt + desc
|
|
89
|
+
markitai document.pdf --preset minimal # 仅基础转换
|
|
90
|
+
|
|
91
|
+
# 批量处理
|
|
92
|
+
markitai ./docs -o ./output
|
|
93
|
+
|
|
94
|
+
# 断点恢复
|
|
95
|
+
markitai ./docs -o ./output --resume
|
|
96
|
+
|
|
97
|
+
# URL 批量处理(自动识别 .urls 文件)
|
|
98
|
+
markitai urls.urls -o ./output
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## 输出结构
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
output/
|
|
105
|
+
├── document.docx.md # 基础 Markdown
|
|
106
|
+
├── document.docx.llm.md # LLM 优化版
|
|
107
|
+
├── assets/
|
|
108
|
+
│ ├── document.docx.0001.jpg
|
|
109
|
+
│ └── images.json # 图片描述
|
|
110
|
+
├── screenshots/ # 页面截图(--screenshot 时)
|
|
111
|
+
│ └── example_com.full.jpg
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## 配置
|
|
115
|
+
|
|
116
|
+
优先级:命令行 > 环境变量 > 配置文件 > 默认值
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# 查看配置
|
|
120
|
+
markitai config list
|
|
121
|
+
|
|
122
|
+
# 初始化配置文件
|
|
123
|
+
markitai config init -o .
|
|
124
|
+
|
|
125
|
+
# 查看缓存状态
|
|
126
|
+
markitai cache stats
|
|
127
|
+
|
|
128
|
+
# 清理缓存
|
|
129
|
+
markitai cache clear
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
配置文件路径:`./markitai.json` 或 `~/.markitai/config.json`
|
|
133
|
+
|
|
134
|
+
## 环境变量
|
|
135
|
+
|
|
136
|
+
| 变量 | 说明 |
|
|
137
|
+
|------|------|
|
|
138
|
+
| `OPENAI_API_KEY` | OpenAI API Key |
|
|
139
|
+
| `GEMINI_API_KEY` | Google Gemini API Key |
|
|
140
|
+
| `DEEPSEEK_API_KEY` | DeepSeek API Key |
|
|
141
|
+
| `ANTHROPIC_API_KEY` | Anthropic API Key |
|
|
142
|
+
| `JINA_API_KEY` | Jina Reader API Key(URL 转换) |
|
|
143
|
+
|
|
144
|
+
## 依赖
|
|
145
|
+
|
|
146
|
+
- [pymupdf4llm](https://github.com/pymupdf/RAG) - PDF 转换
|
|
147
|
+
- [markitdown](https://github.com/microsoft/markitdown) - Office 文档和 URL 转换
|
|
148
|
+
- [LiteLLM](https://github.com/BerriAI/litellm) - LLM 网关
|
|
149
|
+
- [RapidOCR](https://github.com/RapidAI/RapidOCR) - OCR 识别
|
|
150
|
+
|
|
151
|
+
## 文档
|
|
152
|
+
|
|
153
|
+
- [快速开始](https://ynewtime.github.io/markitai/guide/getting-started)
|
|
154
|
+
- [配置说明](https://ynewtime.github.io/markitai/guide/configuration)
|
|
155
|
+
- [CLI 命令参考](https://ynewtime.github.io/markitai/guide/cli)
|
|
156
|
+
|
|
157
|
+
## License
|
|
158
|
+
|
|
159
|
+
MIT
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
markitai/__init__.py,sha256=nL8_TGxWstLYM-_InoDAX9IVbfSHHjzgH4iroiwdQVI,93
|
|
2
|
+
markitai/batch.py,sha256=qNz6_AF12vK3CEzslvhFcAY7KVSYqtSWhjRM8TdmqoA,49576
|
|
3
|
+
markitai/cli.py,sha256=jnvNvr-sMh3bVCooDnBYtND_kpyblAUCUWncK0_-DcY,144061
|
|
4
|
+
markitai/config.py,sha256=Vop1pal-m2tZ3aNjX6YCMhwsyp4rFMdbaOjJFIlL8oQ,19821
|
|
5
|
+
markitai/config.schema.json,sha256=PiVwmPo3g5fI0qciH7OPhUon-t1gaHa8z4bmDQZaBoo,17400
|
|
6
|
+
markitai/constants.py,sha256=onIkq0He4d6sd_z0dBvsJleurx_6QMQIfVtNKzeeQPg,8084
|
|
7
|
+
markitai/fetch.py,sha256=3ciywXpxvcx0Pw_Ho3gSZVCL8-dTfHdwL9cIlDXEGIA,61460
|
|
8
|
+
markitai/image.py,sha256=KlTCV4GKKXRCjl7vhhJzmTLtO3ATM21R82EcPlJ0xDs,49327
|
|
9
|
+
markitai/json_order.py,sha256=JcBU_49SuQWd0YyJesspurpVDkp8YdibB3ADSMAvQlU,16905
|
|
10
|
+
markitai/llm.py,sha256=XhoG4m6ys_j7uZt5L-b6E6M99x9Zwy1e2aAginawU7w,167464
|
|
11
|
+
markitai/ocr.py,sha256=eiaY1g39w4a-00xp1JZtvO8eHZO34STP_o6yk18m_-w,10394
|
|
12
|
+
markitai/security.py,sha256=QaEeblAAaShkdbF_ojvsHflhoxyCde7licR0BbbCm7A,8113
|
|
13
|
+
markitai/types.py,sha256=dTTpUjBV_GwYZi7JRFFi4_teVJ_gVjpsVVDrcCxKvrw,839
|
|
14
|
+
markitai/urls.py,sha256=ILcZf41C7JzignXV9Ru1QgciogQXQ_W7x0GeD72R-DY,5213
|
|
15
|
+
markitai/converter/__init__.py,sha256=Dr8pHofoGbOTbzfCSBOOxsA_LUa-qrJOpxSPISKdgKY,1184
|
|
16
|
+
markitai/converter/_patches.py,sha256=tPiP3vwpsdr4BGPU2zReBBPwHQOtXvu9vzDZmZfLPHc,3067
|
|
17
|
+
markitai/converter/base.py,sha256=EOU1qLWPNbF9SUQX5bmoWP-AqJz7vypllEV6fMi72Hs,3851
|
|
18
|
+
markitai/converter/image.py,sha256=tSKgi4oDYaXJabGocWE982Soe0k955NY5nNM-br37Es,5874
|
|
19
|
+
markitai/converter/legacy.py,sha256=R089rJyacE4K0KATMjfOQii_sgxK8agjvYm_7jIiVz0,19504
|
|
20
|
+
markitai/converter/office.py,sha256=p9Wl5q_Z7C8LDuSXOdYpIIcDsabrFrIuGmVQ7kX8oms,19059
|
|
21
|
+
markitai/converter/pdf.py,sha256=Riq577qBJI2uz6RzzbMjwDaMfGwNZ50a3gxezqFzQok,26418
|
|
22
|
+
markitai/converter/text.py,sha256=ldWv0iUVblAB7MfqyoV2CGs4k36aBxrhSU_6JwLpWds,1432
|
|
23
|
+
markitai/prompts/__init__.py,sha256=fsVei-9jCqInY2OvtxYQVGxyjriU-r1bOoic8N2lS24,4595
|
|
24
|
+
markitai/prompts/cleaner.md,sha256=QJS-ttDYhkSqFJ5lx5MwCfckGkvk6FzZ1bIRuVqoT5I,4656
|
|
25
|
+
markitai/prompts/document_enhance.md,sha256=mM48qXa38kUdjOuirZBpb4HKFSBpVoqAvV9tEG21dJ0,4116
|
|
26
|
+
markitai/prompts/document_enhance_complete.md,sha256=EoGeODZAY-DkJ0dqJTYqPglMqCgwfZ9N-kRQvnzFFR0,3098
|
|
27
|
+
markitai/prompts/document_process.md,sha256=kOiElomRQBg92hPaN6-QaELcq49ynF9Esby5D0OCzBo,2011
|
|
28
|
+
markitai/prompts/frontmatter.md,sha256=PNzgql9RDwDEaqi-5dDf278XxcVuuYNqe7TMSt7gyNE,998
|
|
29
|
+
markitai/prompts/image_analysis.md,sha256=jn8bvR89MS0ZGcXeeKHXWiWEdVyvWwrf8gHi1jSE-zs,842
|
|
30
|
+
markitai/prompts/image_caption.md,sha256=EZXvBjzdjveTKcM8ePfiBJJ5jkzCjlL5ixmw10OqDJM,305
|
|
31
|
+
markitai/prompts/image_description.md,sha256=Wmwq2V96cawKE3p3QpDOHkybNd3ocJUf1bZGU8_qMDM,403
|
|
32
|
+
markitai/prompts/page_content.md,sha256=Sndp_MABteHImrWln3_k3ZJMQfX0fQH2_AI5Hxcao7E,821
|
|
33
|
+
markitai/prompts/url_enhance.md,sha256=3wzTx_QGSuf7k1zuDKXOtcrA7ofqn0fYmwD7Oh9CaTE,3465
|
|
34
|
+
markitai/utils/__init__.py,sha256=K1OWjX--LON8hbc_yB3zYR8TsZ_sc39CjzMWbcgUznQ,899
|
|
35
|
+
markitai/utils/executor.py,sha256=nEONsq41ZJtnqpne0pXQIMjET1rtyMJTbMYNUUcgKgc,2179
|
|
36
|
+
markitai/utils/mime.py,sha256=uak2YY2Z3Bl6dbZk4Xi47b2BPmFvRJJlzILHeW0G2P0,2566
|
|
37
|
+
markitai/utils/office.py,sha256=apeHynsrcspUJX392wnVheiAf-I1t1WOos41l-Nh_04,8221
|
|
38
|
+
markitai/utils/output.py,sha256=9ZZbuY6G2DoLzpcUcSiKCEdgfZ2jW-2GhRQer7uwJWw,1779
|
|
39
|
+
markitai/utils/paths.py,sha256=7TPh7kkWIzFSDhW67TzAZGFNaTQxaYUfPj6ng5K2SGw,1962
|
|
40
|
+
markitai/utils/text.py,sha256=dtpVk8qz_O7ZLCSEj55EeVPGnZ7Zpif4koaW2pH9mkU,12242
|
|
41
|
+
markitai/workflow/__init__.py,sha256=lw_gRgl_M-kGFuBHMXU5biEu8xI5I2z9SOFb8cLU3Ck,898
|
|
42
|
+
markitai/workflow/core.py,sha256=Q2F8i7Bd5WJ2m3GXYTtqdb7AL6InYHND_w1uh2s4cQ4,25172
|
|
43
|
+
markitai/workflow/helpers.py,sha256=Z256Cm6a4NasqedWwp205IVWZ_-whHlS8mO5xYyGOgQ,16824
|
|
44
|
+
markitai/workflow/single.py,sha256=qwRkmK-3wgYM_EGv9osvXeT_6L0gRJjaHD0RuLNX6uU,14357
|
|
45
|
+
markitai-0.3.0.dist-info/METADATA,sha256=afBehA4YbHk31XfyPXPt4FOJzZtGBCx2XvRT9h7sR8A,4395
|
|
46
|
+
markitai-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
47
|
+
markitai-0.3.0.dist-info/entry_points.txt,sha256=6gpnr_12uwxTs9MqPxphvcZeFQUIk2PtqGRMMneEHsI,46
|
|
48
|
+
markitai-0.3.0.dist-info/RECORD,,
|