markitdown-plus 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitdown_plus/__about__.py +7 -0
- markitdown_plus/__init__.py +12 -0
- markitdown_plus/assets.py +154 -0
- markitdown_plus/batch.py +387 -0
- markitdown_plus/chunker.py +433 -0
- markitdown_plus/cleaner.py +158 -0
- markitdown_plus/cli.py +205 -0
- markitdown_plus/converter.py +58 -0
- markitdown_plus/errors.py +13 -0
- markitdown_plus/manifest.py +164 -0
- markitdown_plus/metadata.py +97 -0
- markitdown_plus/utils.py +52 -0
- markitdown_plus-0.2.0.dist-info/METADATA +292 -0
- markitdown_plus-0.2.0.dist-info/RECORD +17 -0
- markitdown_plus-0.2.0.dist-info/WHEEL +4 -0
- markitdown_plus-0.2.0.dist-info/entry_points.txt +2 -0
- markitdown_plus-0.2.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,433 @@
|
|
|
1
|
+
"""RAG-ready Markdown chunking utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from collections.abc import Iterable
|
|
8
|
+
from dataclasses import asdict, dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .utils import safe_stem, short_hash
|
|
12
|
+
|
|
13
|
+
# Per-model divisor used by estimate_tokens(): the word count is divided by
# the entry for the lower-cased model name (unknown names use the "gpt4" entry).
# NOTE(review): 0.75 reads as words-per-token, while 3.5 and 4.0 look like
# characters-per-token figures; with the current formula they give much
# smaller estimates for "claude"/"gemini" than for the GPT family — confirm
# the intended unit.
TOKEN_RATIOS: dict[str, float] = {
    "gpt4": 0.75,
    "gpt-4": 0.75,
    "gpt-4o": 0.75,
    "deepseek": 0.75,
    "claude": 3.5,
    "gemini": 4.0,
}

# Strategy names accepted by chunk_markdown().
CHUNK_STRATEGIES = {"heading", "fixed", "semantic-lite"}

# Abbreviations whose text is masked by _protect_abbreviations() so their
# periods are not mistaken for sentence ends.
ABBREVIATIONS = (
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Sr.", "Jr.", "St.",
    "vs.", "e.g.", "i.e.", "etc.", "Fig.", "Eq.", "No.", "Inc.",
    "Ltd.", "Co.", "U.S.", "U.K.", "Ph.D.", "M.D.", "a.m.", "p.m.",
    "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sep.",
    "Sept.", "Oct.", "Nov.", "Dec.",
)

# Lower-case paragraph openings that _is_semantic_break() treats as the start
# of a new topic.
SEMANTIC_BREAK_PREFIXES = (
    "summary", "conclusion", "key takeaway", "key takeaways", "next steps",
    "recommendation", "recommendations", "background", "methodology", "results",
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class Chunk:
    """A single piece of chunked document text together with its metadata."""

    # Identifier string supplied by whoever builds the chunk.
    id: str
    # Source name the chunk was produced from.
    source: str
    # Position of the chunk within its source.
    index: int
    # Heading titles associated with the chunk.
    heading_path: list[str]
    # The chunk body.
    text: str
    # Estimated token count for the chunk.
    token_estimate: int
    # Name of the chunking strategy that produced the chunk.
    strategy: str = "heading"

    def to_json(self) -> str:
        """Return all fields as one JSON object string, keeping non-ASCII characters unescaped."""
        payload = asdict(self)
        return json.dumps(payload, ensure_ascii=False)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def estimate_tokens(text: str, model: str = "gpt4", url_penalty: float = 5.0) -> int:
    """Return a rough, dependency-free token estimate for *text*.

    The estimate adds three terms: half the number of characters in the
    U+4E00..U+9FFF range, the number of ``\\w+`` words divided by the
    ``TOKEN_RATIOS`` entry for the lower-cased *model* (falling back to the
    "gpt4" entry), and *url_penalty* for every ``http(s)://`` URL.

    Returns 0 for empty text and at least 1 otherwise.
    """
    if not text:
        return 0

    han_chars = sum(1 for _ in re.finditer(r"[\u4e00-\u9fff]", text))
    words = sum(1 for _ in re.finditer(r"\b\w+\b", text))
    urls = sum(1 for _ in re.finditer(r"https?://\S+", text))

    divisor = TOKEN_RATIOS.get(model.lower(), TOKEN_RATIOS["gpt4"])
    estimate = int(han_chars / 2 + words / divisor + urls * url_penalty)
    # Non-empty text never reports zero tokens.
    return estimate if estimate > 1 else 1
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _protect_abbreviations(text: str) -> tuple[str, dict[str, str]]:
    """Mask known abbreviations so their periods are not read as sentence ends.

    Each entry of ``ABBREVIATIONS`` is replaced by a period-free placeholder
    ``__MDP_ABBR_<index>__``.

    Returns:
        The masked text and a placeholder -> abbreviation mapping (one entry
        per known abbreviation, whether or not it occurred) for
        ``_restore_abbreviations``.
    """
    replacements: dict[str, str] = {}
    protected = text
    for index, abbr in enumerate(ABBREVIATIONS):
        token = f"__MDP_ABBR_{index}__"
        # Only mask the abbreviation when it is not the tail of a longer word:
        # a plain substring replace would turn "devs." into "de<token>" and
        # hide a genuine sentence end. Digits may precede it ("9a.m.").
        pattern = r"(?<![^\W\d_])" + re.escape(abbr)
        protected = re.sub(pattern, token, protected)
        replacements[token] = abbr
    return protected, replacements
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _restore_abbreviations(text: str, replacements: dict[str, str]) -> str:
|
|
81
|
+
restored = text
|
|
82
|
+
for token, abbr in replacements.items():
|
|
83
|
+
restored = restored.replace(token, abbr)
|
|
84
|
+
return restored
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _split_sentences(text: str) -> list[str]:
    """Split text into sentences with common abbreviation protection.

    Whitespace runs are collapsed to single spaces first. A run of sentence
    terminators (plus any closing quotes/brackets) ends a sentence when it is
    at the end of the text or followed by whitespace. CJK terminators
    (ideographic full stop, fullwidth ``!`` and ``?``) end a sentence even
    without following whitespace, because CJK text is not space-separated.

    Returns an empty list for blank input.
    """
    normalized = re.sub(r"\s+", " ", text).strip()
    if not normalized:
        return []

    # Written as escapes so the fullwidth forms cannot be silently folded to
    # ASCII; ASCII "!"/"?" must NOT be in this set, otherwise "a?b=c" in a URL
    # or "Hello!World" is cut mid-token.
    cjk_terminators = "\u3002\uff01\uff1f"
    boundary = re.compile(r"[.!?\u3002\uff01\uff1f]+(?:[\"'”’」』\)\]\}】]*)")

    protected, replacements = _protect_abbreviations(normalized)
    parts: list[str] = []
    start = 0
    for match in boundary.finditer(protected):
        end = match.end()
        next_char = protected[end : end + 1]
        should_split = (
            end == len(protected)
            or next_char.isspace()
            or protected[match.start()] in cjk_terminators
        )
        if not should_split:
            continue
        piece = protected[start:end].strip()
        if piece:
            parts.append(_restore_abbreviations(piece, replacements))
        # Continue after the terminator, skipping separating whitespace.
        start = end
        while start < len(protected) and protected[start].isspace():
            start += 1

    tail = protected[start:].strip()
    if tail:
        parts.append(_restore_abbreviations(tail, replacements))
    return parts or [_restore_abbreviations(protected, replacements)]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _split_paragraphs(text: str) -> list[str]:
|
|
115
|
+
"""Split Markdown into paragraphs without breaking fenced code blocks."""
|
|
116
|
+
paragraphs: list[str] = []
|
|
117
|
+
current: list[str] = []
|
|
118
|
+
in_code_block = False
|
|
119
|
+
|
|
120
|
+
def flush() -> None:
|
|
121
|
+
nonlocal current
|
|
122
|
+
if current and any(line.strip() for line in current):
|
|
123
|
+
paragraphs.append("\n".join(current).strip())
|
|
124
|
+
current = []
|
|
125
|
+
|
|
126
|
+
for line in text.split("\n"):
|
|
127
|
+
stripped = line.strip()
|
|
128
|
+
if stripped.startswith(("```", "~~~")):
|
|
129
|
+
current.append(line)
|
|
130
|
+
in_code_block = not in_code_block
|
|
131
|
+
continue
|
|
132
|
+
if not in_code_block and not stripped:
|
|
133
|
+
flush()
|
|
134
|
+
continue
|
|
135
|
+
current.append(line)
|
|
136
|
+
|
|
137
|
+
flush()
|
|
138
|
+
return paragraphs
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _split_oversized_sentence(sentence: str, max_tokens: int, model: str) -> list[str]:
|
|
142
|
+
words = re.findall(r"\S+", sentence)
|
|
143
|
+
if len(words) <= 1:
|
|
144
|
+
return [sentence]
|
|
145
|
+
|
|
146
|
+
pieces: list[str] = []
|
|
147
|
+
current: list[str] = []
|
|
148
|
+
for word in words:
|
|
149
|
+
candidate = " ".join(current + [word])
|
|
150
|
+
if current and estimate_tokens(candidate, model=model) > max_tokens:
|
|
151
|
+
pieces.append(" ".join(current).strip())
|
|
152
|
+
current = [word]
|
|
153
|
+
else:
|
|
154
|
+
current.append(word)
|
|
155
|
+
if current:
|
|
156
|
+
pieces.append(" ".join(current).strip())
|
|
157
|
+
return pieces
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _split_long_text(text: str, max_tokens: int, model: str = "gpt4") -> list[str]:
    """Pack *text* into pieces whose ``estimate_tokens`` value stays within *max_tokens* where possible.

    Paragraphs (from ``_split_paragraphs``) are joined with blank lines while
    the combined estimate fits. A paragraph that is too large on its own is
    split into sentences, which are packed with single spaces; a sentence
    that is still too large is handed to ``_split_oversized_sentence``.
    """
    chunks: list[str] = []
    pending: list[str] = []

    def too_big(candidate: str) -> bool:
        return estimate_tokens(candidate, model=model) > max_tokens

    def emit_pending() -> None:
        nonlocal pending
        if pending:
            chunks.append("\n\n".join(pending).strip())
        pending = []

    for paragraph in _split_paragraphs(text):
        if not too_big(paragraph):
            # Start a new piece when this paragraph would overflow the current one.
            if pending and too_big("\n\n".join([*pending, paragraph]).strip()):
                emit_pending()
            pending.append(paragraph)
            continue

        # Oversized paragraph: close the running piece, then pack its sentences.
        emit_pending()
        run: list[str] = []
        for sentence in _split_sentences(paragraph):
            if too_big(sentence):
                if run:
                    chunks.append(" ".join(run).strip())
                    run = []
                chunks.extend(_split_oversized_sentence(sentence, max_tokens=max_tokens, model=model))
            elif run and too_big(" ".join([*run, sentence]).strip()):
                chunks.append(" ".join(run).strip())
                run = [sentence]
            else:
                run.append(sentence)
        if run:
            chunks.append(" ".join(run).strip())

    emit_pending()
    return chunks
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _source_hash(source: str) -> str:
    """Return ``short_hash`` of the user-expanded, resolved form of *source*.

    Falls back to hashing *source* as given when resolving raises ``OSError``.
    """
    try:
        hashed_value = str(Path(source).expanduser().resolve(strict=False))
    except OSError:
        hashed_value = source
    return short_hash(hashed_value)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _make_chunk(
    *,
    source: str,
    source_hash: str,
    source_stem: str,
    index: int,
    heading_path: list[str],
    text: str,
    model: str,
    strategy: str,
) -> Chunk:
    """Build a ``Chunk`` for *text*.

    The id has the form ``<source_stem>-<source_hash>-<index>`` with the index
    zero-padded to four digits. The stored text is stripped; the token
    estimate is computed from *text* as passed in.
    """
    chunk_id = f"{source_stem}-{source_hash}-{index:04d}"
    body = text.strip()
    tokens = estimate_tokens(text, model=model)
    return Chunk(
        id=chunk_id,
        source=source,
        index=index,
        heading_path=heading_path,
        text=body,
        token_estimate=tokens,
        strategy=strategy,
    )
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _apply_overlap(piece: str, previous_tail: str, overlap: int) -> str:
|
|
237
|
+
if overlap and previous_tail:
|
|
238
|
+
return previous_tail + "\n\n" + piece
|
|
239
|
+
return piece
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _tail(piece: str, overlap: int) -> str:
|
|
243
|
+
if not overlap:
|
|
244
|
+
return ""
|
|
245
|
+
words = re.findall(r"\S+", piece)
|
|
246
|
+
return " ".join(words[-overlap:]) if words else ""
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _sections_from_headings(markdown: str) -> list[tuple[list[str], list[str]]]:
|
|
250
|
+
heading_stack: list[tuple[int, str]] = []
|
|
251
|
+
sections: list[tuple[list[str], list[str]]] = []
|
|
252
|
+
current_lines: list[str] = []
|
|
253
|
+
|
|
254
|
+
def current_heading_path() -> list[str]:
|
|
255
|
+
return [title for _, title in heading_stack]
|
|
256
|
+
|
|
257
|
+
def flush_section() -> None:
|
|
258
|
+
nonlocal current_lines
|
|
259
|
+
text = "\n".join(current_lines).strip()
|
|
260
|
+
if text:
|
|
261
|
+
sections.append((current_heading_path(), current_lines.copy()))
|
|
262
|
+
current_lines = []
|
|
263
|
+
|
|
264
|
+
for line in markdown.split("\n"):
|
|
265
|
+
heading = re.match(r"^(#{1,6})\s+(.+?)\s*$", line)
|
|
266
|
+
if heading:
|
|
267
|
+
flush_section()
|
|
268
|
+
level = len(heading.group(1))
|
|
269
|
+
title = heading.group(2).strip()
|
|
270
|
+
heading_stack = [(lvl, txt) for lvl, txt in heading_stack if lvl < level]
|
|
271
|
+
heading_stack.append((level, title))
|
|
272
|
+
current_lines.append(line)
|
|
273
|
+
else:
|
|
274
|
+
current_lines.append(line)
|
|
275
|
+
flush_section()
|
|
276
|
+
|
|
277
|
+
if not sections and markdown.strip():
|
|
278
|
+
sections = [([], markdown.split("\n"))]
|
|
279
|
+
return sections
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _chunk_heading(markdown: str, source: str, max_tokens: int, overlap: int, model: str) -> list[Chunk]:
    """Chunk *markdown* section by section, as delimited by ``_sections_from_headings``.

    Each section is split with ``_split_long_text``. Every piece becomes one
    chunk carrying the section's heading path, numbered from 1 across the
    whole document. When *overlap* is non-zero, each chunk is prefixed with
    the last *overlap* words of the preceding piece, including across
    section boundaries.
    """
    digest = _source_hash(source)
    stem = safe_stem(Path(source))
    chunks: list[Chunk] = []
    carry = ""

    for heading_path, lines in _sections_from_headings(markdown):
        body = "\n".join(lines).strip()
        if not body:
            continue
        for piece in _split_long_text(body, max_tokens=max_tokens, model=model):
            chunk = _make_chunk(
                source=source,
                source_hash=digest,
                source_stem=stem,
                index=len(chunks) + 1,
                heading_path=heading_path,
                text=_apply_overlap(piece, carry, overlap),
                model=model,
                strategy="heading",
            )
            chunks.append(chunk)
            # The overlap is taken from the piece itself, not from the prefixed text.
            carry = _tail(piece, overlap)
    return chunks
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _chunk_fixed(markdown: str, source: str, max_tokens: int, overlap: int, model: str) -> list[Chunk]:
    """Chunk the whole stripped document with ``_split_long_text``, ignoring headings.

    Chunks are numbered from 1 and carry an empty heading path. When *overlap*
    is non-zero, each chunk is prefixed with the last *overlap* words of the
    preceding piece.
    """
    digest = _source_hash(source)
    stem = safe_stem(Path(source))
    chunks: list[Chunk] = []
    carry = ""

    for piece in _split_long_text(markdown.strip(), max_tokens=max_tokens, model=model):
        chunk = _make_chunk(
            source=source,
            source_hash=digest,
            source_stem=stem,
            index=len(chunks) + 1,
            heading_path=[],
            text=_apply_overlap(piece, carry, overlap),
            model=model,
            strategy="fixed",
        )
        chunks.append(chunk)
        carry = _tail(piece, overlap)
    return chunks
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _is_semantic_break(paragraph: str) -> bool:
|
|
335
|
+
stripped = paragraph.strip()
|
|
336
|
+
if not stripped:
|
|
337
|
+
return False
|
|
338
|
+
if re.match(r"^#{1,6}\s+", stripped):
|
|
339
|
+
return True
|
|
340
|
+
lower = re.sub(r"[^a-z\s-]", "", stripped.lower()).strip()
|
|
341
|
+
return any(lower.startswith(prefix) for prefix in SEMANTIC_BREAK_PREFIXES)
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _chunk_semantic_lite(markdown: str, source: str, max_tokens: int, overlap: int, model: str) -> list[Chunk]:
    """Chunk by paragraphs, starting a new chunk at rule-based topic cues.

    Paragraphs are grouped until the next one either satisfies
    ``_is_semantic_break`` or would push the group's token estimate over
    *max_tokens*. A paragraph that is oversized on its own is split with
    ``_split_long_text``. Each chunk's heading path holds the last three ATX
    headings found inside that piece.
    """
    digest = _source_hash(source)
    stem = safe_stem(Path(source))

    pieces: list[str] = []
    group: list[str] = []

    def close_group() -> None:
        nonlocal group
        if group:
            pieces.append("\n\n".join(group).strip())
        group = []

    for paragraph in _split_paragraphs(markdown):
        if estimate_tokens(paragraph, model=model) > max_tokens:
            close_group()
            pieces.extend(_split_long_text(paragraph, max_tokens=max_tokens, model=model))
            continue

        # An empty group never breaks: the first paragraph always starts it.
        if group and (
            _is_semantic_break(paragraph)
            or estimate_tokens("\n\n".join([*group, paragraph]).strip(), model=model) > max_tokens
        ):
            close_group()
        group.append(paragraph)
    close_group()

    chunks: list[Chunk] = []
    carry = ""
    for piece in pieces:
        titles = [
            found.group(2).strip()
            for found in re.finditer(r"^(#{1,6})\s+(.+?)$", piece, flags=re.MULTILINE)
        ]
        chunk = _make_chunk(
            source=source,
            source_hash=digest,
            source_stem=stem,
            index=len(chunks) + 1,
            heading_path=titles[-3:],
            text=_apply_overlap(piece, carry, overlap),
            model=model,
            strategy="semantic-lite",
        )
        chunks.append(chunk)
        carry = _tail(piece, overlap)
    return chunks
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def chunk_markdown(
    markdown: str,
    source: str = "document.md",
    max_tokens: int = 800,
    overlap: int = 0,
    model: str = "gpt4",
    strategy: str = "heading",
) -> list[Chunk]:
    """Split Markdown into chunks using the selected strategy.

    Strategies:
    - heading: one or more chunks per heading-delimited section (default).
    - fixed: size-based chunks that ignore heading boundaries.
    - semantic-lite: paragraph groups broken at rule-based topic cues.

    Returns an empty list for blank input.

    Raises:
        ValueError: if *max_tokens* is below 100, *overlap* is negative, or
            *strategy* is not one of ``CHUNK_STRATEGIES``.
    """
    if max_tokens < 100:
        raise ValueError("max_tokens must be at least 100")
    if overlap < 0:
        raise ValueError("overlap cannot be negative")
    if strategy not in CHUNK_STRATEGIES:
        raise ValueError(f"Unknown chunk strategy: {strategy}. Choose from: {', '.join(sorted(CHUNK_STRATEGIES))}")

    if not markdown.strip():
        return []

    if strategy == "fixed":
        chunker = _chunk_fixed
    elif strategy == "semantic-lite":
        chunker = _chunk_semantic_lite
    else:
        chunker = _chunk_heading
    return chunker(markdown, source, max_tokens, overlap, model)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def write_jsonl(chunks: Iterable[Chunk], output_path: str | Path) -> Path:
    """Write one ``to_json()`` row per chunk to *output_path* as UTF-8.

    Missing parent directories are created and an existing file is
    overwritten. Returns the output path as a ``Path``.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as handle:
        handle.writelines(chunk.to_json() + "\n" for chunk in chunks)
    return destination
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""Markdown cleanup utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from collections import Counter
|
|
7
|
+
|
|
8
|
+
# Patterns that identify a standalone page marker. Callers test them with
# fullmatch() against a stripped line: bare numbers, "page N [of M]", numbers
# wrapped in bullets/dashes/brackets, "N / M", and Roman numerals.
PAGE_NUM_PATTERNS = [
    re.compile(r"^\d{1,5}$"),
    re.compile(r"^page\s+\d+(\s+of\s+\d+)?$", re.IGNORECASE),
    re.compile(r"^[•∙\-–—\[\]\(\)]?\s*\d{1,5}\s*[•∙\-–—\[\]\(\)]?$"),
    re.compile(r"^\d{1,5}\s*/\s*\d{1,5}$"),
    # Roman numerals must be well-formed (thousands, hundreds, tens, units).
    # Accepting any mix of the letters i/v/x/l/c/d/m deleted ordinary
    # one-word lines such as "Civil", "mild" or "Did". The lookahead keeps the
    # previous 1-12 letter limit and rejects an empty numeral. Words that are
    # also valid numerals (e.g. "mix") still match.
    re.compile(
        r"^[•∙\-–—]?\s*"
        r"(?=[ivxlcdm]{1,12}(?![ivxlcdm]))"
        r"m{0,3}(?:cm|cd|d?c{0,3})(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})"
        r"\s*[•∙\-–—]?$",
        re.IGNORECASE,
    ),
]
|
|
15
|
+
|
|
16
|
+
# Characters that mark the end of a sentence: ASCII ". ! ?" plus the
# ideographic full stop and the fullwidth "!" and "?". The fullwidth forms are
# written as escapes so they cannot be folded to their ASCII look-alikes,
# which previously left the tuple with duplicate "!"/"?" entries.
SENTENCE_END = (".", "!", "?", "\u3002", "\uff01", "\uff1f")
# Line prefixes that mark Markdown structure (heading, list item, blockquote,
# table row).
STRUCTURAL_PREFIXES = ("#", "- ", "* ", "> ", "|")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def normalize_newlines(text: str) -> str:
    """Convert CRLF and lone CR line endings to LF."""
    # CRLF pairs first, so they are not turned into two line feeds.
    without_crlf = text.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def remove_cid_artifacts(text: str) -> str:
    """Delete ``(cid:<digits>)`` markers from *text*, ignoring case."""
    cid_marker = re.compile(r"\(cid:\s*\d+\)", re.IGNORECASE)
    return cid_marker.sub("", text)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def is_page_number(line: str) -> bool:
    """Report whether the stripped *line* fully matches any ``PAGE_NUM_PATTERNS`` entry.

    Blank lines are never page numbers.
    """
    candidate = line.strip()
    if candidate:
        for pattern in PAGE_NUM_PATTERNS:
            if pattern.fullmatch(candidate):
                return True
    return False
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def remove_lonely_page_numbers(text: str) -> str:
    """Drop every line for which ``is_page_number`` is true, keeping all others in order."""
    kept: list[str] = []
    for line in text.split("\n"):
        if is_page_number(line):
            continue
        kept.append(line)
    return "\n".join(kept)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def remove_repeated_short_lines(text: str, min_repeats: int | None = None) -> str:
    """Remove repeated short non-structural lines that often come from headers/footers.

    A line is a removal candidate when its whitespace-normalised form is 2-90
    characters long and is not a heading, list item, blockquote, table row or
    numbered item. Candidates occurring at least *min_repeats* times are
    dropped everywhere.

    When *min_repeats* is None the threshold adapts to the document length:
    2 for documents under 80 lines, otherwise ``max(3, line_count // 25)``.

    Fence lines and the contents of fenced code blocks are neither counted nor
    removed. Without this, the opening and closing "```" of a single code
    block count as a line repeated twice and are both deleted in a short
    document, as are repeated code lines such as ``return None``.
    """
    raw_lines = text.split("\n")
    if min_repeats is None:
        line_count = len(raw_lines)
        min_repeats = 2 if line_count < 80 else max(3, line_count // 25)

    normalized = [re.sub(r"\s+", " ", line.strip()) for line in raw_lines]

    # Mark fence lines and everything between them as protected.
    protected: list[bool] = []
    in_code_block = False
    for key in normalized:
        is_fence = key.startswith(("```", "~~~"))
        protected.append(in_code_block or is_fence)
        if is_fence:
            in_code_block = not in_code_block

    def is_candidate(line: str) -> bool:
        if not (2 <= len(line) <= 90):
            return False
        if line.startswith(STRUCTURAL_PREFIXES):
            return False
        if re.match(r"^\d+[.)]\s+", line):
            return False
        return True

    counts = Counter(
        key
        for key, keep in zip(normalized, protected, strict=True)
        if not keep and is_candidate(key)
    )

    cleaned = [
        original
        for original, key, keep in zip(raw_lines, normalized, protected, strict=True)
        if keep or not is_candidate(key) or counts[key] < min_repeats
    ]
    return "\n".join(cleaned)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def normalize_heading_spacing(text: str) -> str:
    """Ensure Markdown headings have one space after the ``#`` markers.

    A line starting with one to six ``#`` immediately followed by a
    non-space, non-``#`` character gets a space inserted after the markers.
    Lines inside fenced code blocks (delimited by lines starting with three
    backticks or tildes) are left untouched, so code such as ``#!/bin/sh`` or
    a CSS ``#id`` selector is not rewritten.
    """
    heading = re.compile(r"^(#{1,6})([^#\s].*)$")
    fixed: list[str] = []
    in_code_block = False
    for line in text.split("\n"):
        if line.strip().startswith(("```", "~~~")):
            in_code_block = not in_code_block
        elif not in_code_block:
            line = heading.sub(r"\1 \2", line)
        fixed.append(line)
    return "\n".join(fixed)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def fix_broken_lines(text: str) -> str:
    """Repair simple PDF line breaks inside paragraphs.

    Consecutive plain-text lines are merged into one line until the
    accumulated text ends with a sentence terminator (``SENTENCE_END``):

    - a trailing "-" is treated as a hyphenation break: the hyphen is removed
      and the next line appended directly;
    - a trailing "/" is treated as a wrapped URL or path: the next line is
      appended directly and the slash is kept;
    - otherwise the next line is appended after a single space.

    Blank lines, fenced code blocks, and structural lines (headings, lists,
    blockquotes, table rows, numbered items) are emitted unmerged with only
    trailing whitespace removed, so they are not destroyed.
    """
    output: list[str] = []
    buffer = ""
    in_code_block = False

    def flush() -> None:
        nonlocal buffer
        if buffer:
            output.append(buffer.strip())
        buffer = ""

    for line in text.split("\n"):
        stripped = line.strip()

        if stripped.startswith(("```", "~~~")):
            flush()
            output.append(line.rstrip())
            in_code_block = not in_code_block
            continue

        if in_code_block:
            output.append(line.rstrip())
            continue

        if not stripped:
            flush()
            output.append("")
            continue

        if stripped.startswith(STRUCTURAL_PREFIXES) or re.match(r"^\d+[.)]\s+", stripped):
            flush()
            output.append(line.rstrip())
            continue

        if not buffer:
            buffer = stripped
        elif buffer.endswith("-"):
            buffer = buffer[:-1] + stripped
        elif buffer.endswith("/"):
            # Keep the slash: dropping it turned "https://a.com/" + "b"
            # into "https://a.comb".
            buffer += stripped
        elif buffer.endswith(SENTENCE_END):
            flush()
            buffer = stripped
        else:
            buffer += " " + stripped

    flush()
    return "\n".join(output)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def remove_excess_blank_lines(text: str, trim_trailing_whitespace: bool = True) -> str:
    """Collapse runs of three or more newlines into exactly two.

    In effect, consecutive blank lines are reduced to a single blank line.
    When *trim_trailing_whitespace* is true (the default), spaces and tabs at
    the end of every line are removed first, so lines holding only spaces or
    tabs count as blank.
    """
    cleaned = text
    if trim_trailing_whitespace:
        trailing_blanks = re.compile(r"[ \t]+$", re.MULTILINE)
        cleaned = trailing_blanks.sub("", cleaned)
    newline_run = re.compile(r"\n{3,}")
    return newline_run.sub("\n\n", cleaned)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def clean_markdown(text: str) -> str:
    """Apply every cleanup step in a fixed order and normalise the ending.

    Returns the cleaned text, stripped and terminated by a single newline, or
    an empty string when nothing but whitespace remains.
    """
    pipeline = (
        normalize_newlines,
        remove_cid_artifacts,
        remove_lonely_page_numbers,
        remove_repeated_short_lines,
        normalize_heading_spacing,
        fix_broken_lines,
        remove_excess_blank_lines,
    )
    for step in pipeline:
        text = step(text)

    trimmed = text.strip()
    if not trimmed:
        return ""
    return trimmed + "\n"
|