markitdown-plus 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,433 @@
1
+ """RAG-ready Markdown chunking utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from collections.abc import Iterable
8
+ from dataclasses import asdict, dataclass
9
+ from pathlib import Path
10
+
11
+ from .utils import safe_stem, short_hash
12
+
13
# Per-model ratio looked up by model name when estimating token counts.
# NOTE(review): 0.75 reads like words-per-token, while 3.5 and 4.0 read like
# characters-per-token -- confirm the consumer handles both units.
TOKEN_RATIOS: dict[str, float] = {
    "gpt4": 0.75,
    "gpt-4": 0.75,
    "gpt-4o": 0.75,
    "deepseek": 0.75,
    "claude": 3.5,
    "gemini": 4.0,
}

# Names accepted as a chunking strategy.
CHUNK_STRATEGIES = {"heading", "fixed", "semantic-lite"}

# Abbreviations whose trailing period must not be read as a sentence end.
# The order is significant only in that placeholders are numbered by position.
ABBREVIATIONS = (
    # personal titles
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Sr.", "Jr.", "St.",
    # Latin and reference shorthands
    "vs.", "e.g.", "i.e.", "etc.", "Fig.", "Eq.", "No.",
    # organisations, countries, degrees, clock time
    "Inc.", "Ltd.", "Co.", "U.S.", "U.K.", "Ph.D.", "M.D.", "a.m.", "p.m.",
    # months
    "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sep.",
    "Sept.", "Oct.", "Nov.", "Dec.",
)

# Lower-case paragraph openers treated as a topical boundary.
SEMANTIC_BREAK_PREFIXES = (
    "summary",
    "conclusion",
    "key takeaway",
    "key takeaways",
    "next steps",
    "recommendation",
    "recommendations",
    "background",
    "methodology",
    "results",
)
36
+
37
+
38
@dataclass
class Chunk:
    """A single retrieval-ready slice of a document.

    All fields are emitted, in declaration order, by :meth:`to_json`.
    """

    id: str  # unique identifier of the chunk
    source: str  # source string the chunk was cut from
    index: int  # position of the chunk within its source
    heading_path: list[str]  # heading titles associated with the chunk
    text: str  # chunk body
    token_estimate: int  # estimated token count of the body
    strategy: str = "heading"  # name of the strategy that produced the chunk

    def to_json(self) -> str:
        """Return the chunk as a single-line JSON object (one JSONL row).

        Non-ASCII characters are written as-is rather than escaped.
        """
        payload = asdict(self)
        return json.dumps(payload, ensure_ascii=False)
53
+
54
+
55
def estimate_tokens(text: str, model: str = "gpt4", url_penalty: float = 5.0) -> int:
    """Estimate the token count of *text* for chunk sizing.

    The estimate is dependency-free and deliberately rough. It adds together
    half a token per CJK ideograph, a model-dependent base estimate, and a
    flat penalty per URL.

    The ratio looked up in ``TOKEN_RATIOS`` is interpreted by magnitude:

    * ``ratio <= 1`` is treated as *words per token* (for example 0.75), so
      the base estimate is ``word_count / ratio``.
    * ``ratio > 1`` is treated as *characters per token* (for example 3.5 or
      4.0), so the base estimate is the non-CJK character count divided by
      the ratio. Dividing a word count by such a value would underestimate
      the token count several-fold.

    Args:
        text: Text to measure.
        model: Key into ``TOKEN_RATIOS`` (case-insensitive). Unknown names
            fall back to the ``"gpt4"`` ratio.
        url_penalty: Extra tokens added for every ``http(s)://`` URL found.

    Returns:
        ``0`` for empty text, otherwise an integer estimate of at least ``1``.
    """
    if not text:
        return 0

    cjk_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
    url_count = len(re.findall(r"https?://\S+", text))
    ratio = TOKEN_RATIOS.get(model.lower(), TOKEN_RATIOS["gpt4"])

    if ratio > 1:
        # Characters-per-token ratio: CJK ideographs are budgeted separately
        # below, so leave them out of the character count.
        base = (len(text) - cjk_chars) / ratio
    else:
        # Words-per-token ratio.
        word_count = len(re.findall(r"\b\w+\b", text))
        base = word_count / ratio

    return max(1, int(cjk_chars / 2 + base + url_count * url_penalty))
68
+
69
+
70
def _protect_abbreviations(text: str) -> tuple[str, dict[str, str]]:
    """Mask every known abbreviation with a period-free placeholder.

    Returns the masked text together with a placeholder -> abbreviation map
    covering every entry of ``ABBREVIATIONS`` (whether or not it occurred),
    so the caller can undo the masking later.
    """
    placeholders = {
        f"__MDP_ABBR_{position}__": abbr
        for position, abbr in enumerate(ABBREVIATIONS)
    }
    masked = text
    for placeholder, abbr in placeholders.items():
        masked = masked.replace(abbr, placeholder)
    return masked, placeholders
78
+
79
+
80
+ def _restore_abbreviations(text: str, replacements: dict[str, str]) -> str:
81
+ restored = text
82
+ for token, abbr in replacements.items():
83
+ restored = restored.replace(token, abbr)
84
+ return restored
85
+
86
+
87
def _split_sentences(text: str) -> list[str]:
    """Split *text* into sentences while protecting common abbreviations.

    Whitespace runs are collapsed to single spaces first. A run of sentence
    terminators, optionally followed by closing quotes or brackets, ends a
    sentence when one of these holds:

    * it is at the very end of the text,
    * it is followed by whitespace, or
    * it starts with a CJK terminator (ideographic full stop, fullwidth
      exclamation mark, or fullwidth question mark), since CJK text does not
      put spaces between sentences.

    The CJK terminators are written as explicit escapes so they cannot be
    confused with ASCII ``!`` and ``?``. Those two must only split when
    whitespace follows; otherwise a ``?`` inside a URL query string would
    end the sentence.

    Returns:
        The sentences in order; an empty list for blank input.
    """
    normalized = re.sub(r"\s+", " ", text).strip()
    if not normalized:
        return []

    # U+3002 ideographic full stop, U+FF01 / U+FF1F fullwidth "!" and "?".
    cjk_terminators = "\u3002\uff01\uff1f"
    boundary = re.compile(r"[.!?\u3002\uff01\uff1f]+(?:[\"'”’」』\)\]\}】]*)")

    protected, replacements = _protect_abbreviations(normalized)
    parts: list[str] = []
    start = 0
    for match in boundary.finditer(protected):
        end = match.end()
        next_char = protected[end : end + 1]
        should_split = (
            end == len(protected)
            or next_char.isspace()
            or protected[match.start()] in cjk_terminators
        )
        if not should_split:
            continue
        piece = protected[start:end].strip()
        if piece:
            parts.append(_restore_abbreviations(piece, replacements))
        # Resume after the terminator, skipping the separating whitespace.
        start = end
        while start < len(protected) and protected[start].isspace():
            start += 1

    tail = protected[start:].strip()
    if tail:
        parts.append(_restore_abbreviations(tail, replacements))
    return parts or [_restore_abbreviations(protected, replacements)]
112
+
113
+
114
+ def _split_paragraphs(text: str) -> list[str]:
115
+ """Split Markdown into paragraphs without breaking fenced code blocks."""
116
+ paragraphs: list[str] = []
117
+ current: list[str] = []
118
+ in_code_block = False
119
+
120
+ def flush() -> None:
121
+ nonlocal current
122
+ if current and any(line.strip() for line in current):
123
+ paragraphs.append("\n".join(current).strip())
124
+ current = []
125
+
126
+ for line in text.split("\n"):
127
+ stripped = line.strip()
128
+ if stripped.startswith(("```", "~~~")):
129
+ current.append(line)
130
+ in_code_block = not in_code_block
131
+ continue
132
+ if not in_code_block and not stripped:
133
+ flush()
134
+ continue
135
+ current.append(line)
136
+
137
+ flush()
138
+ return paragraphs
139
+
140
+
141
def _split_oversized_sentence(sentence: str, max_tokens: int, model: str) -> list[str]:
    """Break one over-long sentence into word-aligned pieces.

    Words are packed greedily: a piece is closed as soon as adding the next
    word would push its estimate above *max_tokens*. A sentence with at most
    one word is returned unchanged, as a single-element list.
    """
    words = re.findall(r"\S+", sentence)
    if len(words) < 2:
        return [sentence]

    pieces: list[str] = []
    window: list[str] = [words[0]]
    for word in words[1:]:
        extended = " ".join([*window, word])
        if estimate_tokens(extended, model=model) > max_tokens:
            pieces.append(" ".join(window).strip())
            window = [word]
        else:
            window.append(word)

    # The window always holds at least one word at this point.
    pieces.append(" ".join(window).strip())
    return pieces
158
+
159
+
160
def _split_long_text(text: str, max_tokens: int, model: str = "gpt4") -> list[str]:
    """Pack *text* into pieces that aim to stay within *max_tokens*.

    Paragraphs are grouped greedily and joined with blank lines. A paragraph
    whose own estimate exceeds the budget is split into sentences, which are
    packed with single spaces. A sentence that is still too large is split on
    word boundaries.
    """
    chunks: list[str] = []
    pending: list[str] = []

    def fits(candidate: str) -> bool:
        return estimate_tokens(candidate, model=model) <= max_tokens

    def emit_pending() -> None:
        nonlocal pending
        if pending:
            chunks.append("\n\n".join(pending).strip())
        pending = []

    def pack_sentences(paragraph: str) -> None:
        run: list[str] = []
        for sentence in _split_sentences(paragraph):
            if not fits(sentence):
                # Close the current run, then split the sentence by words.
                if run:
                    chunks.append(" ".join(run).strip())
                    run = []
                chunks.extend(
                    _split_oversized_sentence(sentence, max_tokens=max_tokens, model=model)
                )
            elif run and not fits(" ".join(run + [sentence]).strip()):
                chunks.append(" ".join(run).strip())
                run = [sentence]
            else:
                run.append(sentence)
        if run:
            chunks.append(" ".join(run).strip())

    for paragraph in _split_paragraphs(text):
        if not fits(paragraph):
            emit_pending()
            pack_sentences(paragraph)
            continue
        if pending and not fits("\n\n".join(pending + [paragraph]).strip()):
            emit_pending()
        pending.append(paragraph)

    emit_pending()
    return chunks
203
+
204
+
205
def _source_hash(source: str) -> str:
    """Return a short hash identifying *source*.

    The hash is taken over the user-expanded, resolved path when that can be
    computed, and over the raw *source* string otherwise.

    ``Path.expanduser`` raises ``RuntimeError`` when a leading ``~`` cannot
    be mapped to a home directory. That happens for ordinary file names such
    as ``~$report.docx``, so the expansion is guarded together with
    ``resolve``. ``ValueError`` covers strings the OS path layer rejects
    (for example an embedded NUL).
    """
    try:
        value = str(Path(source).expanduser().resolve(strict=False))
    except (OSError, RuntimeError, ValueError):
        value = source
    return short_hash(value)
212
+
213
+
214
def _make_chunk(
    *,
    source: str,
    source_hash: str,
    source_stem: str,
    index: int,
    heading_path: list[str],
    text: str,
    model: str,
    strategy: str,
) -> Chunk:
    """Assemble a :class:`Chunk` from its parts.

    The id is ``<stem>-<hash>-<index>`` with the index zero-padded to four
    digits. The stored text is stripped; the token estimate is computed from
    the text as passed in.
    """
    chunk_id = f"{source_stem}-{source_hash}-{index:04d}"
    tokens = estimate_tokens(text, model=model)
    body = text.strip()
    return Chunk(
        id=chunk_id,
        source=source,
        index=index,
        heading_path=heading_path,
        text=body,
        token_estimate=tokens,
        strategy=strategy,
    )
234
+
235
+
236
+ def _apply_overlap(piece: str, previous_tail: str, overlap: int) -> str:
237
+ if overlap and previous_tail:
238
+ return previous_tail + "\n\n" + piece
239
+ return piece
240
+
241
+
242
+ def _tail(piece: str, overlap: int) -> str:
243
+ if not overlap:
244
+ return ""
245
+ words = re.findall(r"\S+", piece)
246
+ return " ".join(words[-overlap:]) if words else ""
247
+
248
+
249
+ def _sections_from_headings(markdown: str) -> list[tuple[list[str], list[str]]]:
250
+ heading_stack: list[tuple[int, str]] = []
251
+ sections: list[tuple[list[str], list[str]]] = []
252
+ current_lines: list[str] = []
253
+
254
+ def current_heading_path() -> list[str]:
255
+ return [title for _, title in heading_stack]
256
+
257
+ def flush_section() -> None:
258
+ nonlocal current_lines
259
+ text = "\n".join(current_lines).strip()
260
+ if text:
261
+ sections.append((current_heading_path(), current_lines.copy()))
262
+ current_lines = []
263
+
264
+ for line in markdown.split("\n"):
265
+ heading = re.match(r"^(#{1,6})\s+(.+?)\s*$", line)
266
+ if heading:
267
+ flush_section()
268
+ level = len(heading.group(1))
269
+ title = heading.group(2).strip()
270
+ heading_stack = [(lvl, txt) for lvl, txt in heading_stack if lvl < level]
271
+ heading_stack.append((level, title))
272
+ current_lines.append(line)
273
+ else:
274
+ current_lines.append(line)
275
+ flush_section()
276
+
277
+ if not sections and markdown.strip():
278
+ sections = [([], markdown.split("\n"))]
279
+ return sections
280
+
281
+
282
def _chunk_heading(markdown: str, source: str, max_tokens: int, overlap: int, model: str) -> list[Chunk]:
    """Chunk Markdown section by section, following its headings.

    Every non-empty section is split to the token budget. Each piece becomes
    a chunk carrying the section's heading path, prefixed with the tail of
    the previous piece when *overlap* is non-zero. Chunk indices start at 1.
    """
    digest = _source_hash(source)
    stem = safe_stem(Path(source))
    chunks: list[Chunk] = []
    carry = ""

    for heading_path, lines in _sections_from_headings(markdown):
        body = "\n".join(lines).strip()
        if not body:
            continue
        for piece in _split_long_text(body, max_tokens=max_tokens, model=model):
            chunk = _make_chunk(
                source=source,
                source_hash=digest,
                source_stem=stem,
                index=len(chunks) + 1,
                heading_path=heading_path,
                text=_apply_overlap(piece, carry, overlap),
                model=model,
                strategy="heading",
            )
            chunks.append(chunk)
            carry = _tail(piece, overlap)
    return chunks
309
+
310
+
311
def _chunk_fixed(markdown: str, source: str, max_tokens: int, overlap: int, model: str) -> list[Chunk]:
    """Chunk Markdown by size alone, ignoring heading structure.

    The stripped document is split to the token budget. Each piece becomes a
    chunk with an empty heading path, prefixed with the tail of the previous
    piece when *overlap* is non-zero. Chunk indices start at 1.
    """
    digest = _source_hash(source)
    stem = safe_stem(Path(source))
    pieces = _split_long_text(markdown.strip(), max_tokens=max_tokens, model=model)

    chunks: list[Chunk] = []
    carry = ""
    for number, piece in enumerate(pieces, start=1):
        chunks.append(
            _make_chunk(
                source=source,
                source_hash=digest,
                source_stem=stem,
                index=number,
                heading_path=[],
                text=_apply_overlap(piece, carry, overlap),
                model=model,
                strategy="fixed",
            )
        )
        carry = _tail(piece, overlap)
    return chunks
332
+
333
+
334
def _is_semantic_break(paragraph: str) -> bool:
    """Tell whether *paragraph* opens a new topic.

    True for a paragraph that begins with an ATX heading marker. Also true
    when its lower-cased text, with everything except ``a-z``, whitespace,
    and hyphens removed, starts with one of ``SEMANTIC_BREAK_PREFIXES``.
    Blank paragraphs are never a break.
    """
    candidate = paragraph.strip()
    if not candidate:
        return False
    if re.match(r"^#{1,6}\s+", candidate):
        return True
    letters_only = re.sub(r"[^a-z\s-]", "", candidate.lower()).strip()
    return letters_only.startswith(tuple(SEMANTIC_BREAK_PREFIXES))
342
+
343
+
344
def _chunk_semantic_lite(markdown: str, source: str, max_tokens: int, overlap: int, model: str) -> list[Chunk]:
    """Chunk Markdown using simple rule-based topical cues.

    Paragraphs are accumulated into a group. The group is closed when the
    next paragraph looks like a topical break or would push the group past
    *max_tokens*. A paragraph that is too large on its own is handed to the
    long-text splitter. Each chunk's heading path is made of the last three
    ATX heading titles found inside that chunk's own text.
    """
    digest = _source_hash(source)
    stem = safe_stem(Path(source))
    pieces: list[str] = []
    group: list[str] = []

    def close_group() -> None:
        nonlocal group
        if group:
            pieces.append("\n\n".join(group).strip())
        group = []

    for paragraph in _split_paragraphs(markdown):
        if estimate_tokens(paragraph, model=model) > max_tokens:
            close_group()
            pieces.extend(_split_long_text(paragraph, max_tokens=max_tokens, model=model))
            continue

        if group:
            merged = "\n\n".join([*group, paragraph]).strip()
            if _is_semantic_break(paragraph) or estimate_tokens(merged, model=model) > max_tokens:
                close_group()
        group.append(paragraph)
    close_group()

    heading_line = re.compile(r"^(#{1,6})\s+(.+?)$", flags=re.MULTILINE)
    chunks: list[Chunk] = []
    carry = ""
    for number, piece in enumerate(pieces, start=1):
        titles = [found.group(2).strip() for found in heading_line.finditer(piece)]
        chunks.append(
            _make_chunk(
                source=source,
                source_hash=digest,
                source_stem=stem,
                index=number,
                heading_path=titles[-3:],
                text=_apply_overlap(piece, carry, overlap),
                model=model,
                strategy="semantic-lite",
            )
        )
        carry = _tail(piece, overlap)
    return chunks
393
+
394
+
395
def chunk_markdown(
    markdown: str,
    source: str = "document.md",
    max_tokens: int = 800,
    overlap: int = 0,
    model: str = "gpt4",
    strategy: str = "heading",
) -> list[Chunk]:
    """Split Markdown into chunks suitable for JSONL export.

    Available strategies:

    - ``heading``: follow the heading structure (the default).
    - ``fixed``: size-based chunks that ignore heading boundaries.
    - ``semantic-lite``: rule-based topical boundary hints.

    Raises:
        ValueError: If *max_tokens* is below 100, *overlap* is negative, or
            *strategy* is not a known strategy name.

    Returns:
        The chunks in document order; an empty list for blank input.
    """
    if max_tokens < 100:
        raise ValueError("max_tokens must be at least 100")
    if overlap < 0:
        raise ValueError("overlap cannot be negative")
    if strategy not in CHUNK_STRATEGIES:
        raise ValueError(f"Unknown chunk strategy: {strategy}. Choose from: {', '.join(sorted(CHUNK_STRATEGIES))}")

    if not markdown.strip():
        return []

    # Any validated strategy without its own entry uses heading chunking.
    dispatch = {
        "fixed": _chunk_fixed,
        "semantic-lite": _chunk_semantic_lite,
    }
    chunker = dispatch.get(strategy, _chunk_heading)
    return chunker(markdown, source, max_tokens, overlap, model)
424
+
425
+
426
def write_jsonl(chunks: Iterable[Chunk], output_path: str | Path) -> Path:
    """Write *chunks* to *output_path*, one JSON object per line.

    Missing parent directories are created and an existing file is
    overwritten. The file is written as UTF-8. Returns the output path as a
    ``Path``.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as handle:
        handle.writelines(chunk.to_json() + "\n" for chunk in chunks)
    return destination
@@ -0,0 +1,158 @@
1
+ """Markdown cleanup utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from collections import Counter
7
+
8
# Patterns describing a line that consists of nothing but a page marker.
# They are meant to be applied to an already-stripped line.
PAGE_NUM_PATTERNS = [
    # Bare number: "12"
    re.compile(r"^\d{1,5}$"),
    # "Page 3" / "page 3 of 10"
    re.compile(r"^page\s+\d+(\s+of\s+\d+)?$", re.IGNORECASE),
    # Number with an optional decoration on either side: "- 12 -", "[12]"
    re.compile(r"^[•∙\-–—\[\]\(\)]?\s*\d{1,5}\s*[•∙\-–—\[\]\(\)]?$"),
    # "3 / 10"
    re.compile(r"^\d{1,5}\s*/\s*\d{1,5}$"),
    # Roman numeral with an optional decoration: "iv", "- xii -".
    # Only well-formed numerals are accepted. A plain character class such as
    # [ivxlcdm]+ also matches ordinary words ("civil", "did", "mild"), which
    # would then be deleted as page numbers. The lookahead rejects the empty
    # numeral that the all-optional groups would otherwise allow.
    re.compile(
        r"^[•∙\-–—]?\s*"
        r"(?=[ivxlcdm])m{0,4}(?:cm|cd|d?c{0,3})(?:xc|xl|l?x{0,3})(?:ix|iv|v?i{0,3})"
        r"\s*[•∙\-–—]?$",
        re.IGNORECASE,
    ),
]
15
+
16
# Characters that end a sentence: ASCII ".", "!", "?" plus their CJK
# counterparts. The CJK forms are written as escapes (U+3002 ideographic full
# stop, U+FF01 fullwidth "!", U+FF1F fullwidth "?") so they cannot be silently
# turned into duplicates of the ASCII characters.
SENTENCE_END = (".", "!", "?", "\u3002", "\uff01", "\uff1f")

# Line prefixes that mark Markdown structure: heading, list item, blockquote,
# table row.
STRUCTURAL_PREFIXES = ("#", "- ", "* ", "> ", "|")
18
+
19
+
20
def normalize_newlines(text: str) -> str:
    """Convert Windows (CRLF) and old Mac (CR) line endings to ``\\n``."""
    # CRLF pairs first, so the remaining CRs are all lone carriage returns.
    without_crlf = text.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")
23
+
24
+
25
def remove_cid_artifacts(text: str) -> str:
    """Strip ``(cid:123)``-style markers left behind by PDF text extraction.

    Matching is case-insensitive and tolerates whitespace after the colon.
    """
    cid_marker = re.compile(r"\(cid:\s*\d+\)", flags=re.IGNORECASE)
    return cid_marker.sub("", text)
28
+
29
+
30
def is_page_number(line: str) -> bool:
    """Report whether *line* is nothing but a page marker.

    The stripped line must fully match one of ``PAGE_NUM_PATTERNS``. Blank
    lines are never page markers.
    """
    candidate = line.strip()
    if candidate:
        for pattern in PAGE_NUM_PATTERNS:
            if pattern.fullmatch(candidate):
                return True
    return False
36
+
37
+
38
def remove_lonely_page_numbers(text: str) -> str:
    """Drop every line that consists solely of a page number or page marker."""
    kept = [line for line in text.split("\n") if not is_page_number(line)]
    return "\n".join(kept)
41
+
42
+
43
def remove_repeated_short_lines(text: str, min_repeats: int | None = None) -> str:
    """Remove repeated short lines that usually come from page headers/footers.

    A line is a removal candidate when, after whitespace normalization, it is
    2-90 characters long, does not start with a structural Markdown prefix,
    and is not a numbered list item. Candidates occurring at least
    *min_repeats* times are removed everywhere.

    Fence markers and the lines inside fenced code blocks are never counted
    or removed. A bare fence marker repeats by construction (opening and
    closing), and code routinely contains identical short lines. Treating
    either as a header/footer would corrupt the code block.

    Args:
        text: Newline-normalized Markdown.
        min_repeats: Occurrence threshold. When ``None`` it adapts to the
            document length: 2 for documents under 80 lines, otherwise
            ``max(3, line_count // 25)``.

    Returns:
        The text with repeated candidate lines removed.
    """
    raw_lines = text.split("\n")
    line_count = len(raw_lines)
    if min_repeats is None:
        min_repeats = 2 if line_count < 80 else max(3, line_count // 25)

    def is_candidate(line: str) -> bool:
        if not (2 <= len(line) <= 90):
            return False
        if line.startswith(STRUCTURAL_PREFIXES):
            return False
        if re.match(r"^\d+[.)]\s+", line):
            return False
        return True

    # One key per line; None marks a line that must always be kept.
    keys: list[str | None] = []
    in_code_block = False
    for line in raw_lines:
        stripped = line.strip()
        if stripped.startswith(("```", "~~~")):
            in_code_block = not in_code_block
            keys.append(None)
            continue
        if in_code_block:
            keys.append(None)
            continue
        key = re.sub(r"\s+", " ", stripped)
        keys.append(key if is_candidate(key) else None)

    counts = Counter(key for key in keys if key is not None)
    cleaned = [
        original
        for original, key in zip(raw_lines, keys)
        if key is None or counts[key] < min_repeats
    ]
    return "\n".join(cleaned)
74
+
75
+
76
def normalize_heading_spacing(text: str) -> str:
    """Insert the missing space after the ``#`` markers of ATX headings.

    ``#Title`` becomes ``# Title``. Lines inside fenced code blocks
    (delimited by lines starting with three backticks or three tildes) are
    left untouched, so shebangs, preprocessor directives, and comments such
    as ``#!/bin/sh`` or ``#include`` are not rewritten.
    """
    fixed: list[str] = []
    in_code_block = False
    for line in text.split("\n"):
        if line.strip().startswith(("```", "~~~")):
            in_code_block = not in_code_block
        elif not in_code_block:
            line = re.sub(r"^(#{1,6})([^#\s].*)$", r"\1 \2", line)
        fixed.append(line)
    return "\n".join(fixed)
79
+
80
+
81
def fix_broken_lines(text: str) -> str:
    """Repair simple PDF line breaks inside paragraphs.

    Consecutive plain-text lines are merged into one line until the merged
    text ends with a sentence terminator. The function stays conservative:
    fenced code blocks, blank lines, structural Markdown lines (headings,
    list items, blockquotes, table rows, numbered items), and horizontal
    rules / setext underlines are emitted as-is and end the current paragraph.

    Joining rules for a continuation line:

    * previous text ends with ``-``: the hyphen is dropped and the lines are
      glued together (de-hyphenation);
    * previous text ends with ``/``: the lines are glued together and the
      slash is kept, so wrapped URLs and paths survive intact;
    * previous text ends with a sentence terminator: a new line is started;
    * otherwise the lines are joined with a single space.
    """
    output: list[str] = []
    buffer = ""
    in_code_block = False
    # "---", "***", "___" (optionally spaced) and "===" style lines. Without
    # this guard a "---" line would either be merged into the previous
    # paragraph or lose a hyphen through the de-hyphenation rule.
    rule_or_underline = re.compile(
        r"(?:-[ \t]*){3,}|(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|=+"
    )

    def flush() -> None:
        nonlocal buffer
        if buffer:
            output.append(buffer.strip())
            buffer = ""

    for line in text.split("\n"):
        stripped = line.strip()

        if stripped.startswith(("```", "~~~")):
            flush()
            output.append(line.rstrip())
            in_code_block = not in_code_block
            continue

        if in_code_block:
            output.append(line.rstrip())
            continue

        if not stripped:
            flush()
            output.append("")
            continue

        if (
            stripped.startswith(STRUCTURAL_PREFIXES)
            or re.match(r"^\d+[.)]\s+", stripped)
            or rule_or_underline.fullmatch(stripped)
        ):
            flush()
            output.append(line.rstrip())
            continue

        if not buffer:
            buffer = stripped
        elif buffer.endswith("-"):
            buffer = buffer[:-1] + stripped
        elif buffer.endswith("/"):
            # Keep the slash: it is part of the URL or path being wrapped.
            buffer += stripped
        elif buffer.endswith(SENTENCE_END):
            flush()
            buffer = stripped
        else:
            buffer += " " + stripped

    flush()
    return "\n".join(output)
136
+
137
+
138
def remove_excess_blank_lines(text: str, trim_trailing_whitespace: bool = True) -> str:
    """Collapse runs of three or more newlines into exactly two.

    When *trim_trailing_whitespace* is true (the default), trailing spaces
    and tabs are removed from every line first. Lines holding only
    whitespace therefore count as blank when the runs are collapsed.
    """
    if trim_trailing_whitespace:
        text = "\n".join(line.rstrip(" \t") for line in text.split("\n"))
    newline_run = re.compile(r"\n{3,}")
    return newline_run.sub("\n\n", text)
147
+
148
+
149
def clean_markdown(text: str) -> str:
    """Apply the default cleanup steps in order and return the result.

    Non-empty output is stripped and terminated by exactly one newline.
    Output that is empty after cleanup is returned as an empty string.
    """
    pipeline = (
        normalize_newlines,
        remove_cid_artifacts,
        remove_lonely_page_numbers,
        remove_repeated_short_lines,
        normalize_heading_spacing,
        fix_broken_lines,
        remove_excess_blank_lines,
    )
    for step in pipeline:
        text = step(text)

    body = text.strip()
    if not body:
        return ""
    return body + "\n"