markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/ocr.py ADDED
@@ -0,0 +1,347 @@
1
+ """OCR module using RapidOCR."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ from loguru import logger
10
+
11
+ from markitai.constants import DEFAULT_OCR_SAMPLE_PAGES, DEFAULT_RENDER_DPI
12
+
13
+ if TYPE_CHECKING:
14
+ from markitai.config import OCRConfig
15
+
16
+
17
@dataclass
class OCRResult:
    """Outcome of a single OCR run.

    Attributes:
        text: All recognized text blocks joined into one string.
        confidence: Mean recognition score over the text blocks
            (0.0 when nothing was recognized).
        boxes: One entry per detected text region, converted to plain
            Python lists.
    """

    text: str
    confidence: float
    boxes: list[list[float]]
24
+
25
+
26
class OCRProcessor:
    """OCR processor using RapidOCR.

    The RapidOCR engine is created lazily on first access of ``engine`` so
    that constructing a processor does not require RapidOCR to be importable.
    """

    # Average number of extracted characters per sampled page below which a
    # PDF is considered scanned (image-based).
    _SCANNED_TEXT_THRESHOLD = 100

    def __init__(self, config: OCRConfig | None = None) -> None:
        """
        Initialize OCR processor.

        Args:
            config: Optional OCR configuration
        """
        self.config = config
        self._engine: Any = None

    @property
    def engine(self) -> Any:
        """Get or create the RapidOCR engine (lazy loading)."""
        if self._engine is None:
            self._engine = self._create_engine()
        return self._engine

    def _get_lang_enum(self, lang_code: str) -> Any:
        """
        Map language code to RapidOCR LangRec enum.

        Args:
            lang_code: Language code such as "zh", "en" or "ja" (case-insensitive)

        Returns:
            The matching LangRec member; LangRec.CH when the code is unknown
        """
        from rapidocr import LangRec

        lang_map = {
            "zh": LangRec.CH,
            "ch": LangRec.CH,
            "en": LangRec.EN,
            "ja": LangRec.JAPAN,
            "japan": LangRec.JAPAN,
            "ko": LangRec.KOREAN,
            "korean": LangRec.KOREAN,
            "ar": LangRec.ARABIC,
            "arabic": LangRec.ARABIC,
            "th": LangRec.TH,
            "latin": LangRec.LATIN,
        }

        key = lang_code.lower()
        if key not in lang_map:
            # Keep the historical fallback, but make it visible instead of
            # silently recognizing with the Chinese model.
            logger.warning(
                f"Unsupported OCR language '{lang_code}', falling back to 'zh'"
            )
            return LangRec.CH
        return lang_map[key]

    def _create_engine(self) -> Any:
        """
        Create RapidOCR engine with configuration.

        Raises:
            ImportError: If RapidOCR is not installed
        """
        try:
            from rapidocr import RapidOCR
        except ImportError as e:
            raise ImportError(
                "RapidOCR is not installed. "
                "Install with: pip install rapidocr onnxruntime"
            ) from e

        # Build params
        params: dict[str, Any] = {
            "Global.log_level": "warning",  # Reduce logging noise
        }

        # Set language if configured (must use LangRec enum)
        if self.config and self.config.lang:
            params["Rec.lang_type"] = self._get_lang_enum(self.config.lang)

        return RapidOCR(params=params)

    @staticmethod
    def _build_result(result: Any) -> OCRResult:
        """Convert a raw RapidOCR result object into an OCRResult."""
        # Use 'is not None' to avoid numpy array boolean ambiguity
        texts = list(result.txts) if result.txts is not None else []
        scores = list(result.scores) if result.scores is not None else []
        boxes = list(result.boxes) if result.boxes is not None else []

        # Join all recognized text
        full_text = "\n".join(texts)

        # Average confidence; cast so the dataclass always holds a plain float
        avg_confidence = float(sum(scores) / len(scores)) if scores else 0.0

        logger.debug(
            f"OCR completed: {len(texts)} text blocks, "
            f"avg confidence: {avg_confidence:.2f}"
        )

        return OCRResult(
            text=full_text,
            confidence=avg_confidence,
            boxes=[
                box.tolist() if hasattr(box, "tolist") else list(box) for box in boxes
            ],
        )

    def recognize(self, image_path: Path | str) -> OCRResult:
        """
        Perform OCR on an image file.

        Args:
            image_path: Path to the image file

        Returns:
            OCRResult with recognized text and metadata

        Raises:
            FileNotFoundError: If the image file does not exist
        """
        image_path = Path(image_path)

        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")

        logger.debug(f"Running OCR on: {image_path.name}")

        # RapidOCR returns a union type with incomplete stubs, hence Any
        result: Any = self.engine(str(image_path))
        return self._build_result(result)

    def recognize_numpy(self, image_array: Any) -> OCRResult:
        """
        Perform OCR on a numpy array (RGB image data).

        This is more efficient than recognize_bytes as it avoids
        intermediate file I/O when the image is already in memory.

        Args:
            image_array: numpy array of shape (H, W, 3) or (H, W, 4) in RGB(A) format

        Returns:
            OCRResult with recognized text and metadata

        Raises:
            TypeError: If image_array is not a numpy array
        """
        import numpy as np

        # Ensure we have a proper numpy array
        if not isinstance(image_array, np.ndarray):
            raise TypeError(f"Expected numpy array, got {type(image_array)}")

        logger.debug(f"Running OCR on numpy array: shape={image_array.shape}")

        # RapidOCR can accept numpy arrays directly
        result: Any = self.engine(image_array)
        return self._build_result(result)

    def recognize_bytes(self, image_data: bytes) -> OCRResult:
        """
        Perform OCR on image bytes.

        Args:
            image_data: Raw image bytes

        Returns:
            OCRResult with recognized text and metadata
        """
        import io

        import numpy as np
        from PIL import Image

        # Decode inside a context manager so the PIL image is released as
        # soon as the pixel data has been copied into the numpy array.
        with Image.open(io.BytesIO(image_data)) as image:
            # Convert to RGB if needed (RapidOCR works best with RGB)
            rgb_image = image if image.mode == "RGB" else image.convert("RGB")
            image_array = np.array(rgb_image)

        # Hand over to recognize_numpy; this avoids temporary file I/O
        return self.recognize_numpy(image_array)

    def recognize_pdf_page(
        self,
        pdf_path: Path,
        page_num: int,
        dpi: int = DEFAULT_RENDER_DPI,
    ) -> OCRResult:
        """
        Perform OCR on a specific PDF page.

        Args:
            pdf_path: Path to the PDF file
            page_num: Page number (0-indexed)
            dpi: Resolution for rendering

        Returns:
            OCRResult with recognized text

        Raises:
            ImportError: If PyMuPDF is not installed
            ValueError: If page_num is beyond the last page
        """
        try:
            import fitz  # pymupdf
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is not installed. Install with: pip install pymupdf"
            ) from e

        doc = fitz.open(pdf_path)
        try:
            if page_num >= len(doc):
                raise ValueError(
                    f"Page {page_num} out of range. PDF has {len(doc)} pages."
                )

            page = doc[page_num]

            # Render page to image; PDF user space is 72 units per inch
            zoom = dpi / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # Use recognize_pixmap for direct processing
            return self.recognize_pixmap(pix.samples, pix.width, pix.height, pix.n)

        finally:
            doc.close()

    def recognize_pixmap(
        self,
        samples: bytes,
        width: int,
        height: int,
        n_channels: int,
    ) -> OCRResult:
        """
        Perform OCR on raw pixel data (e.g., from pymupdf pixmap).

        This method is optimized for use with pymupdf's pixmap.samples,
        avoiding redundant image encoding/decoding.

        Args:
            samples: Raw pixel data bytes
            width: Image width in pixels
            height: Image height in pixels
            n_channels: Number of color channels (3 for RGB, 4 for RGBA)

        Returns:
            OCRResult with recognized text
        """
        import numpy as np

        # Convert raw bytes to numpy array
        image_array = np.frombuffer(samples, dtype=np.uint8).reshape(
            (height, width, n_channels)
        )

        # If RGBA, drop the alpha channel
        if n_channels == 4:
            image_array = image_array[:, :, :3]

        return self.recognize_numpy(image_array)

    def is_scanned_pdf(
        self, pdf_path: Path, sample_pages: int = DEFAULT_OCR_SAMPLE_PAGES
    ) -> bool:
        """
        Check if a PDF is likely scanned (image-based).

        Args:
            pdf_path: Path to the PDF file
            sample_pages: Number of pages to sample

        Returns:
            True if PDF appears to be scanned. False when PyMuPDF is not
            installed or when there are no pages to sample.
        """
        try:
            import fitz
        except ImportError:
            # Best effort: without PyMuPDF the PDF cannot be inspected
            logger.debug("PyMuPDF not installed; skipping scanned-PDF detection")
            return False

        doc = fitz.open(pdf_path)
        try:
            pages_to_check = min(sample_pages, len(doc))
            if pages_to_check <= 0:
                # Nothing to sample (empty document or sample_pages <= 0);
                # avoids a division by zero below.
                return False

            total_text_length = 0
            for index in range(pages_to_check):
                # Note: pymupdf get_text() returns str but type stubs say Any
                text: str = doc[index].get_text()  # type: ignore[assignment]
                total_text_length += len(text.strip())

            # If very little text extracted, likely scanned
            avg_text_per_page = total_text_length / pages_to_check
            return avg_text_per_page < self._SCANNED_TEXT_THRESHOLD

        finally:
            doc.close()

    def recognize_to_markdown(self, image_path: Path | str) -> str:
        """
        Perform OCR and format result as markdown.

        Uses RapidOCR's built-in to_markdown() method if available.

        Args:
            image_path: Path to the image file

        Returns:
            Markdown formatted text
        """
        image_path = Path(image_path)

        result: Any = self.engine(
            str(image_path),
            return_word_box=True,
            return_single_char_box=True,
        )

        # Try to use built-in markdown conversion
        if hasattr(result, "to_markdown"):
            return result.to_markdown()

        # Fallback: simple text extraction.
        # Use 'is not None' (as in recognize) to avoid numpy array boolean ambiguity
        texts = list(result.txts) if result.txts is not None else []
        return "\n\n".join(texts)
@@ -0,0 +1,159 @@
1
+ """Prompt management module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ from markitai.config import PromptsConfig
11
+
12
+
13
# Directory holding the built-in prompt templates: the package directory of
# this module, where the bundled ``<name>.md`` files are looked up.
BUILTIN_PROMPTS_DIR = Path(__file__).parent
15
+
16
+
17
class PromptManager:
    """Manager for loading and rendering prompts.

    Templates are looked up by name, cached after the first load, and
    rendered with simple ``{variable}`` substitution.
    """

    # Available prompt names
    PROMPT_NAMES = (
        "cleaner",
        "frontmatter",
        "image_caption",
        "image_description",
        "image_analysis",
        "page_content",
        "document_enhance",
        "document_process",
        "document_enhance_complete",
        "url_enhance",
    )

    def __init__(self, config: PromptsConfig | None = None) -> None:
        """
        Initialize prompt manager.

        Args:
            config: Optional prompts configuration
        """
        self.config = config
        self._cache: dict[str, str] = {}

    def get_prompt(self, name: str, **variables: str) -> str:
        """
        Get a prompt by name with variables substituted.

        Args:
            name: Prompt name; must be one of PROMPT_NAMES
            **variables: Variables to substitute in the template

        Returns:
            Rendered prompt string

        Raises:
            ValueError: If prompt name is not valid
        """
        if name not in self.PROMPT_NAMES:
            raise ValueError(
                f"Unknown prompt: {name}. Valid names: {', '.join(self.PROMPT_NAMES)}"
            )

        template = self._load_prompt(name)
        return self._render(template, **variables)

    def _resolve_override(self, name: str) -> Path | None:
        """
        Locate a user-supplied template that overrides the built-in one.

        Priority:
        1. Config-specified path (attribute ``name`` on the config)
        2. ``<name>.md`` inside the configured custom directory

        Returns:
            Path of the first existing override, or None if there is none
            (including when no config was given)
        """
        if not self.config:
            return None

        # 1. Check config-specified path
        config_path = getattr(self.config, name, None)
        if config_path:
            path = Path(config_path).expanduser()
            if path.exists():
                return path

        # 2. Check custom directory
        custom_path = Path(self.config.dir).expanduser() / f"{name}.md"
        if custom_path.exists():
            return custom_path

        return None

    def _load_prompt(self, name: str) -> str:
        """
        Load prompt template from file.

        Priority:
        1. Config-specified path
        2. Custom directory
        3. Built-in prompts

        Raises:
            FileNotFoundError: If no override exists and the built-in
                template is missing
        """
        # Check cache first
        if name in self._cache:
            return self._cache[name]

        override = self._resolve_override(name)
        if override is not None:
            template = override.read_text(encoding="utf-8")
        else:
            # 3. Fall back to built-in
            builtin_path = BUILTIN_PROMPTS_DIR / f"{name}.md"
            if not builtin_path.exists():
                raise FileNotFoundError(f"Built-in prompt not found: {name}")
            template = builtin_path.read_text(encoding="utf-8")

        # Cache the template
        self._cache[name] = template
        return template

    def _render(self, template: str, **variables: str) -> str:
        """
        Render a template with variables.

        Uses simple {variable} substitution. A ``timestamp`` variable holding
        the current local time in ISO format is added when not supplied.
        Placeholders without a matching variable are left untouched.
        """
        import re

        # Add default variables
        if "timestamp" not in variables:
            variables["timestamp"] = datetime.now().astimezone().isoformat()

        # Substitute in a single pass over the template. Chained
        # str.replace() calls would also rewrite placeholder-looking text
        # inside an already substituted value (e.g. a document that
        # literally contains "{timestamp}").
        pattern = re.compile(
            "|".join(re.escape(f"{{{key}}}") for key in variables)
        )
        return pattern.sub(
            lambda match: str(variables[match.group(0)[1:-1]]), template
        )

    def clear_cache(self) -> None:
        """Clear the prompt cache."""
        self._cache.clear()

    def list_prompts(self) -> dict[str, str]:
        """
        List all available prompts with their sources.

        Returns:
            Dict mapping prompt names to their source paths
            ("built-in" when no override file exists)
        """
        sources: dict[str, str] = {}

        for name in self.PROMPT_NAMES:
            override = self._resolve_override(name)
            sources[name] = str(override) if override is not None else "built-in"

        return sources
@@ -0,0 +1,93 @@
1
+ Markdown 格式优化任务。
2
+
3
+ 【输出要求】
4
+ - 直接输出纯 Markdown,不要包裹在代码块中
5
+ - 不要添加任何解释或说明
6
+
7
+ 【核心原则 - 必须严格遵守】
8
+ - **禁止翻译(CRITICAL - DO NOT TRANSLATE)**:
9
+ - 英文输入 → 英文输出(English in → English out)
10
+ - 中文输入 → 中文输出(Chinese in → Chinese out)
11
+ - 绝对禁止将任何语言翻译成另一种语言
12
+ - 违反此规则将导致输出无效
13
+ - **禁止改写**:保留原文的用词和表达方式,只做格式调整
14
+ - 英文内容用英文标点,中文内容用中文标点
15
+ - 混合语言文档 → 根据上下文判断,保持局部一致性
16
+
17
+ 【清理规范】
18
+ - 保留幻灯片标记(如 `<!-- Slide number: X -->`),不要添加新的 slide 注释
19
+ - 保留页面图片注释(如 `<!-- Page images for reference -->` 和 `<!-- ![Page X](...) -->`)
20
+ - 删除其他 HTML 注释
21
+ - **删除 PPT/PDF 页眉页脚**(IMPORTANT - 必须删除):
22
+ - 特征:每页/slide 末尾重复出现的 2-4 行短文本(每行 < 30 字符)+ 页码
23
+ - 示例 1:`FTD\nFREE TEST DATA\n2`(品牌缩写 + 品牌名 + 页码)
24
+ - 示例 2:`Company Name\n© 2024\n5`
25
+ - 删除条件:相同模式在 ≥3 页重复出现
26
+ - 位置:通常在每个 `<!-- Slide number: X -->` 或 `<!-- Page number: X -->` 块的末尾
27
+ - 删除图表残留的孤立数字行(如坐标轴标签被单独提取成行)
28
+ - 删除无意义的重复标题(如每页都有相同的文档标题)
29
+ - 删除标题前的孤立短文本(无格式的1-3个词,如重复的文档名)
30
+
31
+ 【空行规范】
32
+ - 标题(#)前后各保留一个空行
33
+ - 代码块(```)前后各保留一个空行
34
+ - 列表块前后各保留一个空行
35
+ - 表格前后各保留一个空行
36
+ - HTML 注释(如 `<!-- Page number: X -->`、`<!-- Slide number: X -->`)前后各保留一个空行
37
+ - 段落间保留一个空行,删除多余空行
38
+
39
+ 【标点与强调】
40
+ - 根据文档语言使用对应标点
41
+ - 闭合强调标记放在标点内侧:错误「内容。**」→ 正确「内容**。」
42
+ - 合并连续强调标记:「**这是****一个****强调**」→「**这是一个强调**」
43
+ - 粗体/斜体标记与中文之间不加空格
44
+
45
+ 【代码块规范】
46
+ - 嵌套时外层反引号数 = 内层最大反引号数 + 1
47
+
48
+ 【列表规范】
49
+ - 无序列表统一使用 - 符号
50
+ - 有序列表使用 1. 2. 3. 格式
51
+ - 嵌套列表缩进 2 空格
52
+
53
+ 【段落规范】
54
+ - 合并不应断行的段落(同一句话被错误换行)
55
+ - 保留有意义的换行(如诗歌、地址)
56
+
57
+ 【链接格式修复】
58
+ - 修复换行的链接:将 `[标题\n\n描述](url)` 合并为 `[标题](url)`
59
+ - 示例:
60
+ - 错误:`[Evergreen notes\n\nEvergreen notes allow you to...](/evergreen-notes)`
61
+ - 正确:`[Evergreen notes](/evergreen-notes)`
62
+
63
+ 【表格规范】
64
+ - 若列头为空且内容语义清晰,可根据语义补充列头
65
+ - 若第一列是纯数字行号且无列头,可补充行号列头(如 # 或 No.)
66
+ - **CRITICAL: 列头语言必须严格与表格数据内容的语言一致**(非 frontmatter 的语言)
67
+ - 英文/拉丁文数据 → 英文列头(**绝对禁止使用中文列头**)
68
+ - 中文数据 → 中文列头(**绝对禁止使用英文列头**)
69
+ - Lorem ipsum 视为英文/拉丁文,必须用英文列头
70
+ - 示例:数据 "Dulce, Abril, Female, United States" → 列头 "# | First Name | Last Name | Gender | Country"
71
+ - 示例:数据 "Lorem ipsum, Dolor sit" → 列头 "# | Text | Category" (NOT 文本内容 | 分类)
72
+ - 示例:数据 "张三, 李四, 男, 中国" → 列头 "# | 姓 | 名 | 性别 | 国家"
73
+
74
+ 【保持不变 - 严格保留,不得删除或修改】
75
+ - 代码块内容
76
+ - 表格行列结构(列头补充除外)
77
+ - **所有图片链接 `![...](...)` 必须完整保留,URL 不得修改**
78
+ - **所有超链接 `[...](...)` 必须完整保留,URL 不得修改**
79
+ - HTML 注释中的图片链接(如 `<!-- ![Page X](...) -->`)
80
+ - **所有 `__MARKITAI_*__` 占位符必须原样保留**(如 `__MARKITAI_IMG_0__`、`__MARKITAI_SLIDE_0__`)
81
+ - **原文内容(禁止翻译或改写)**
82
+
83
+ 【URL 保护 - CRITICAL】
84
+ - **禁止修改任何 URL** - 图片链接和超链接的 URL 必须与原文完全一致
85
+ - **禁止编造 URL** - 绝对不能猜测、推断或生成原文中不存在的 URL
86
+ - **禁止替换 URL** - 即使 URL 看起来"不正确",也必须保留原样
87
+ - 示例:原文 `![](https://example.com/a.jpg)` → 输出必须是 `![](https://example.com/a.jpg)`
88
+ - 违反此规则将导致输出无效
89
+
90
+ 输入:
91
+ {content}
92
+
93
+ 直接输出优化后的 Markdown:
@@ -0,0 +1,77 @@
1
+ 你是一个文档格式清理专家。你的任务是清理提取文本中的格式问题,同时保持内容完整性。
2
+
3
+ 你会收到:
4
+ 1. **提取的文本**:程序提取的 Markdown 内容
5
+ 2. **页面图片**:用于验证格式的视觉参考
6
+
7
+ ## 核心原则 - 必须严格遵守
8
+
9
+ - **禁止翻译(CRITICAL - DO NOT TRANSLATE)**:
10
+ - 英文输入 → 英文输出(English in → English out)
11
+ - 中文输入 → 中文输出(Chinese in → Chinese out)
12
+ - 绝对禁止将任何语言翻译成另一种语言
13
+ - 违反此规则将导致输出无效
14
+ - **禁止改写**:保留原文的用词和表达方式,只做格式调整
15
+
16
+ ## 清理任务
17
+
18
+ 【删除残留 - 仅删除明显垃圾,不删除正文】
19
+ - 删除图表提取残留的孤立数字行(如单独一行的 "12", "10", "8" 等,通常是坐标轴标签)
20
+ - 删除 PPT/PDF 页眉页脚:
21
+ - 特征:每页末尾重复出现的 2-4 行短文本(每行 < 30 字符)
22
+ - 示例:`FTD\nFREE TEST DATA\n2`(品牌名 + 页码)
23
+ - 示例:`Company Name\n© 2024\n5`
24
+ - **仅当相同文本在多页重复出现时才删除**
25
+ - 删除无意义的重复标题(如每页都有相同的文档名)
26
+
27
+ 【格式修正】
28
+ - 参考页面图片修正标题层级(##、###等)
29
+ - 修正列表格式(缩进、符号)
30
+ - 修正表格结构
31
+ - 为 `![](assets/...)` 图片添加简短 alt text
32
+ - 修复换行的链接格式:将 `[文本\n\n描述](url)` 合并为 `[文本](url)`
33
+
34
+ 【空行规范】
35
+ - 标题(#)前后各保留一个空行
36
+ - 列表块/表格前后各保留一个空行
37
+ - 段落间保留一个空行,删除多余空行
38
+
39
+ ## 禁止事项 - CRITICAL
40
+
41
+ - **禁止翻译任何内容** - 原文是什么语言就保留什么语言
42
+ - **禁止删除任何正文段落**(CRITICAL - DO NOT DELETE CONTENT):
43
+ - 每个 `<!-- Page number: X -->` 标记的页面内容必须完整保留
44
+ - 输入有多少页,输出就必须有多少页
45
+ - 只能删除明显的残留/垃圾(孤立数字、重复页眉页脚)
46
+ - 如果不确定是否应该删除,就保留
47
+ - **页码注释必须与内容对齐**(CRITICAL - PAGE MARKER ALIGNMENT):
48
+ - `<!-- Page number: X -->` 注释后面的内容必须是第 X 页的实际内容
49
+ - 禁止将一个页面的内容移动到另一个页码注释下
50
+ - 如果某页内容为空,保留页码注释即可,不要删除
51
+ - 输出的页码顺序必须与输入完全一致(1, 2, 3... 不能变成 1, 3, 2...)
52
+ - **禁止移动内容位置** - 保持原有顺序
53
+ - **禁止重写或改述内容** - 保留原文
54
+ - **禁止添加新内容** - 只做清理
55
+ - **禁止用代码块包裹输出** - 直接输出纯 Markdown,不要用 \`\`\`markdown 包裹
56
+ - **必须保留所有链接** - `[文本](url)` 原样保留,URL 不得修改
57
+ - **必须保留所有图片引用** - `![...](assets/...)` 位置不变,URL 不得修改
58
+ - **禁止修改任何 URL** - 图片链接和超链接的 URL 必须与原文完全一致
59
+ - **禁止编造 URL** - 绝对不能猜测、推断或生成原文中不存在的 URL
60
+ - **必须保留所有 Slide 注释** - `<!-- Slide number: X -->` 原样保留在每个 slide 内容开头,位置不变,不要添加新的 slide 注释
61
+ - **必须保留所有页码注释** - `<!-- Page number: X -->` 原样保留在每页内容开头,位置不变,不要添加新的页码注释
62
+ - **必须保留所有占位符** - `__MARKITAI_*__` 格式的占位符原样保留,位置不变
63
+ - **禁止输出页面截图引用** - 不要输出 `![Page X](screenshots/...)`
64
+ - **禁止输出页面/图片标记** - 不要输出 `## Page X Image:`、`__MARKITAI_PAGE_LABEL_X__`、`__MARKITAI_IMG_LABEL_X__` 等系统内部标记
65
+
66
+ ## 图片语法规范
67
+
68
+ 图片引用必须严格遵循 Markdown 语法,**不要添加多余的括号**:
69
+ - 正确: `![alt text](assets/image.jpg)`
70
+ - 错误: `![alt text](assets/image.jpg))` (多余的右括号)
71
+ - 错误: `![alt text](assets/image.jpg)))` (多余的右括号)
72
+
73
+ ## 输出要求
74
+
75
+ - 仅输出清理后的 Markdown 内容
76
+ - 输出语言与源文档保持一致
77
+ - 不要添加任何说明文字