markitdown-plus 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ """Package metadata."""
2
+
3
# Release version of the package.
__version__ = "0.2.0"

# One-sentence summary of what the package provides.
__description__ = "Batch conversion, asset extraction, and RAG-ready output toolkit for Microsoft MarkItDown."
@@ -0,0 +1,12 @@
1
+ """MarkItDown Plus.
2
+
3
+ Batch conversion, Markdown cleanup, JSONL chunks, and manifest output for
4
+ Microsoft MarkItDown powered document pipelines.
5
+ """
6
+
7
+ from .__about__ import __version__
8
+ from .cleaner import clean_markdown
9
+ from .chunker import chunk_markdown
10
+ from .converter import PlusConverter
11
+
12
+ __all__ = ["PlusConverter", "clean_markdown", "chunk_markdown", "__version__"]
@@ -0,0 +1,154 @@
1
+ """Asset extraction helpers for converted documents.
2
+
3
+ The first implementation focuses on dependency-light extraction:
4
+ - Office Open XML containers (.docx, .pptx, .xlsx): extract */media/* files.
5
+ - HTML files: copy local <img src="..."> assets next to the batch output.
6
+
7
+ PDF image extraction is intentionally not implemented here because reliable PDF
8
+ asset recovery needs heavier format-specific dependencies. The function returns
9
+ an empty list for unsupported formats instead of failing the conversion.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import html
15
+ import re
16
+ import os
17
+ import shutil
18
+ import zipfile
19
+ from dataclasses import asdict, dataclass
20
+ from pathlib import Path
21
+ from urllib.parse import unquote, urlparse
22
+
23
+ from .utils import ensure_dir, safe_stem
24
+
25
# Source suffixes handled as zip containers whose */media/* members are extracted.
OFFICE_EXTENSIONS = {".docx", ".pptx", ".xlsx"}

# Lower-case suffixes accepted as image assets; anything else is ignored.
IMAGE_EXTENSIONS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".bmp",
    ".webp",
    ".tiff",
    ".svg",
}
27
+
28
+
29
@dataclass(frozen=True)
class AssetRecord:
    """Immutable description of a single asset that was extracted or copied."""

    # Where the asset came from: a zip member name or a resolved file path.
    source: str
    # Location of the written copy, as a string.
    output_path: str
    # POSIX-style path used to reference the copy from the Markdown file.
    markdown_path: str
    # Asset category; every record created in this module keeps the default.
    kind: str = "image"

    def to_dict(self) -> dict[str, str]:
        """Convert the record into a plain dict suitable for JSON output."""
        payload = asdict(self)
        return payload
41
+
42
+
43
def _asset_name(stem: str, index: int, suffix: str) -> str:
    """Build ``<stem>_img_<NNN><.ext>`` with a zero-padded index and lower-case suffix.

    A suffix given without its leading dot gets one added.
    """
    extension = suffix.lower()
    if not extension.startswith("."):
        extension = "." + extension
    return f"{stem}_img_{index:03d}{extension}"
46
+
47
+
48
def _markdown_relative_path(markdown_path: Path, asset_path: Path) -> str:
    """Express ``asset_path`` relative to the Markdown file's folder, POSIX style.

    When no relative path can be computed (``os.path.relpath`` raises
    ``ValueError``, e.g. for paths on different Windows drives) the asset
    path itself is returned in POSIX form.
    """
    try:
        relative = os.path.relpath(asset_path, start=markdown_path.parent)
    except ValueError:
        return asset_path.as_posix()
    return Path(relative).as_posix()
54
+
55
+
56
def _extract_office_assets(source: Path, markdown_path: Path, assets_dir: Path) -> list[AssetRecord]:
    """Copy image members stored under ``*/media/*`` in an Office zip container.

    Members are processed in sorted name order and written to ``assets_dir``
    under names produced by ``_asset_name``.

    Args:
        source: The Office document to open as a zip archive.
        markdown_path: Markdown file the assets will be linked from.
        assets_dir: Directory that receives the extracted files.

    Returns:
        One record per asset that was written completely. If the archive is
        unreadable or an I/O error interrupts extraction, the records gathered
        up to that point are returned, so files already on disk stay
        referenced. Nothing is raised for those errors.
    """
    records: list[AssetRecord] = []
    stem = safe_stem(source)
    try:
        with zipfile.ZipFile(source) as archive:
            # The suffix filter guarantees every selected member has an image suffix.
            media_names = sorted(
                name
                for name in archive.namelist()
                if "/media/" in name.lower() and Path(name).suffix.lower() in IMAGE_EXTENSIONS
            )
            for index, name in enumerate(media_names, start=1):
                output = assets_dir / _asset_name(stem, index, Path(name).suffix)
                ensure_dir(output.parent)
                with archive.open(name) as src, output.open("wb") as dst:
                    shutil.copyfileobj(src, dst)
                # Only record the asset once its bytes are fully written.
                records.append(
                    AssetRecord(
                        source=name,
                        output_path=str(output),
                        markdown_path=_markdown_relative_path(markdown_path, output),
                    )
                )
    except (zipfile.BadZipFile, OSError):
        # Best effort: keep what was extracted instead of failing the conversion.
        pass
    return records
82
+
83
+
84
def _local_html_image_paths(source: Path) -> list[str]:
    """Collect the local file paths referenced by ``<img src="...">`` tags.

    The file is read as UTF-8 with undecodable bytes ignored. Only quoted
    ``src`` attributes are recognised. Each value is HTML-unescaped, parsed as
    a URL, and its path component is percent-decoded.

    Skipped references:
        - anything with a network location (``http://``, ``//host/...``);
        - any multi-character scheme other than ``file`` (``data:``, ``cid:``,
          ``blob:``, ``javascript:`` ...), since those never name a local file;
        - values ``urlparse`` rejects as malformed;
        - values whose path component is empty.

    Args:
        source: The HTML file to scan.

    Returns:
        Paths in document order, duplicates included. An unreadable file
        yields an empty list.
    """
    try:
        text = source.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        return []

    # The look-behind keeps attributes such as ``data-src`` from being taken
    # for ``src`` and hiding the tag's real source.
    pattern = re.compile(
        r"<img\b[^>]*?(?<![\w-])src\s*=\s*(['\"])(.*?)\1",
        re.IGNORECASE | re.DOTALL,
    )

    paths: list[str] = []
    for match in pattern.finditer(text):
        raw_src = html.unescape(match.group(2).strip())
        try:
            parsed = urlparse(raw_src)
        except ValueError:
            # e.g. an unbalanced "[" in the host part; one bad tag must not
            # abort the whole document.
            continue
        if parsed.netloc:
            continue
        # Single-letter "schemes" are left alone: they are what urlparse makes
        # of a Windows drive prefix such as "C:".
        if len(parsed.scheme) > 1 and parsed.scheme != "file":
            continue
        local = unquote(parsed.path)
        if local:
            paths.append(local)
    return paths
101
+
102
+
103
def _copy_html_assets(source: Path, markdown_path: Path, assets_dir: Path) -> list[AssetRecord]:
    """Copy the local images referenced by an HTML file into ``assets_dir``.

    References are resolved against the HTML file's directory. A reference is
    skipped when it cannot be resolved, was already copied, is not an existing
    regular file, lacks an image suffix, or cannot be copied. Numbering follows
    the count of successful copies, so output names stay contiguous.

    Args:
        source: The HTML file whose images are copied.
        markdown_path: Markdown file the assets will be linked from.
        assets_dir: Directory that receives the copies.

    Returns:
        One record per image that was copied.
    """
    # NOTE(review): absolute and ``..`` references are followed as written, so
    # image files outside the HTML file's directory can be copied too - confirm
    # that is acceptable for untrusted input.
    records: list[AssetRecord] = []
    stem = safe_stem(source)
    seen: set[Path] = set()
    for src in _local_html_image_paths(source):
        try:
            candidate = (source.parent / src).resolve(strict=False)
        except (OSError, RuntimeError, ValueError):
            # Symlink loops or embedded NUL bytes (from "%00") cannot be resolved.
            continue
        if candidate in seen or not candidate.is_file():
            continue
        if candidate.suffix.lower() not in IMAGE_EXTENSIONS:
            continue
        seen.add(candidate)
        output = assets_dir / _asset_name(stem, len(records) + 1, candidate.suffix)
        try:
            ensure_dir(output.parent)
            shutil.copy2(candidate, output)
        except OSError:
            # Asset extraction is best effort; an unreadable image must not
            # fail the document conversion.
            continue
        records.append(
            AssetRecord(
                source=str(candidate),
                output_path=str(output),
                markdown_path=_markdown_relative_path(markdown_path, output),
            )
        )
    return records
125
+
126
+
127
def extract_assets(source_path: str | Path, markdown_path: str | Path, assets_dir: str | Path) -> list[AssetRecord]:
    """Pull image assets out of a source document for linking from Markdown.

    The assets directory is created first. The source suffix, compared
    case-insensitively, then selects the strategy: Office containers are
    unpacked, ``.html``/``.htm`` files have their local images copied, and any
    other type produces an empty list, so `--extract-assets` stays safe on
    large mixed folders.
    """
    source = Path(source_path)
    markdown = Path(markdown_path)
    output_dir = ensure_dir(Path(assets_dir))

    kind = source.suffix.lower()
    handler = None
    if kind in OFFICE_EXTENSIONS:
        handler = _extract_office_assets
    elif kind in {".html", ".htm"}:
        handler = _copy_html_assets

    if handler is None:
        return []
    return handler(source, markdown, output_dir)
143
+
144
+
145
def append_asset_links(markdown: str, assets: list[AssetRecord]) -> str:
    """Add an "Extracted Assets" section with one image link per asset.

    With no assets the Markdown is returned untouched. Otherwise trailing
    whitespace is stripped, the section follows after a blank line, and the
    result ends with a single newline.
    """
    if not assets:
        return markdown

    image_links = [
        f"![Extracted asset {number}]({record.markdown_path})"
        for number, record in enumerate(assets, start=1)
    ]
    section = "\n".join(["", "", "## Extracted Assets", "", *image_links])
    return f"{markdown.rstrip()}{section}\n"
@@ -0,0 +1,387 @@
1
+ """Batch conversion workflow."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import sys
7
+ import time
8
+ from collections.abc import Iterator
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+
13
+ from .assets import append_asset_links, extract_assets
14
+ from .chunker import chunk_markdown, write_jsonl
15
+ from .cleaner import clean_markdown
16
+ from .converter import PlusConverter
17
+ from .manifest import Manifest, ManifestRecord, append_manifest_record, write_manifest
18
+ from .metadata import build_metadata, write_metadata
19
+ from .utils import ensure_dir, make_output_path
20
+
21
# Suffixes picked up by file discovery when the caller supplies no explicit set.
DEFAULT_EXTENSIONS = {
    # documents and presentations
    ".pdf", ".docx", ".doc", ".pptx", ".ppt", ".epub",
    # spreadsheets and structured text
    ".xlsx", ".xls", ".csv", ".json", ".xml", ".txt",
    # web pages
    ".html", ".htm",
    # images
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp",
    # audio
    ".wav", ".mp3", ".m4a",
    # archives
    ".zip",
}
26
+
27
+
28
@dataclass
class BatchOptions:
    """Settings that control one batch conversion run.

    Only ``input_path`` and ``output_dir`` are required.
    """

    # File or directory to convert, and the root of everything written.
    input_path: Path
    output_dir: Path

    # Discovery: descend into sub-directories; None selects the default suffix set.
    recursive: bool = False
    extensions: set[str] | None = None

    # Post-processing: Markdown cleanup and JSONL chunk output.
    clean: bool = False
    rag: bool = False

    # Chunking parameters handed to the chunker when ``rag`` is on.
    max_tokens: int = 800
    overlap: int = 0
    token_model: str = "gpt4"
    chunk_strategy: str = "heading"

    # Passed through to the converter.
    enable_plugins: bool = False

    # Record the discovered files in the manifest without converting them.
    dry_run: bool = False

    # When False, the run stops after the first failed file.
    continue_on_error: bool = True

    # Write the manifest every N processed files; values <= 0 disable this.
    checkpoint_interval: int = 50

    # Emit progress information on stderr.
    show_progress: bool = False

    # Stream manifest records to JSONL once the file count exceeds this limit;
    # a negative value disables streaming.
    manifest_memory_limit: int = 10_000

    # Worker threads; run_batch treats values below 1 as 1.
    workers: int = 1

    # Extract image assets and link them from the Markdown output.
    extract_assets: bool = False
50
+
51
+
52
@dataclass
class _ProcessResult:
    """Outcome of converting one source file."""

    # The file that was processed and whether processing succeeded.
    source: Path
    success: bool
    # Paths of the written outputs as strings; None when not produced.
    output_path: str | None = None
    chunks_path: str | None = None
    metadata_path: str | None = None
    # Failure message; None on success.
    error: str | None = None
60
+
61
+
62
def parse_extensions(types: str | None) -> set[str] | None:
    """Turn a comma-separated type list from the CLI into a set of suffixes.

    Entries are trimmed, lower-cased and given a leading dot when missing;
    empty entries are dropped. ``None`` is returned when nothing usable is
    left.
    """
    if not types:
        return None

    tokens = (piece.strip().lower() for piece in types.split(","))
    normalized = {
        token if token.startswith(".") else f".{token}"
        for token in tokens
        if token
    }
    return normalized or None
73
+
74
+
75
def discover_files(input_path: str | Path, recursive: bool = False, extensions: set[str] | None = None) -> list[Path]:
    """List the convertible files found at ``input_path``.

    A file input yields itself when its suffix is allowed, otherwise nothing.
    A directory input yields its matching files in sorted order, descending
    into sub-directories only when ``recursive`` is set. Suffixes are compared
    lower-cased; an empty or missing ``extensions`` falls back to
    ``DEFAULT_EXTENSIONS``.

    Raises:
        FileNotFoundError: The path does not exist.
        ValueError: The path exists but is neither a file nor a directory.
    """
    path = Path(input_path)
    allowed = extensions or DEFAULT_EXTENSIONS

    if path.is_file():
        if path.suffix.lower() in allowed:
            return [path]
        return []
    if not path.exists():
        raise FileNotFoundError(f"Input path does not exist: {path}")
    if not path.is_dir():
        raise ValueError(f"Input path is not a file or directory: {path}")

    matches: list[Path] = []
    for entry in path.glob("**/*" if recursive else "*"):
        if entry.is_file() and entry.suffix.lower() in allowed:
            matches.append(entry)
    matches.sort()
    return matches
90
+
91
+
92
def validate_input_output_paths(input_path: Path, output_dir: Path) -> None:
    """Reject input/output combinations that could overwrite or re-convert files.

    Both paths are resolved without requiring them to exist.

    Raises:
        ValueError: The two paths resolve to the same location, or the input
            is an existing directory that contains the output directory.
    """
    resolved_in = input_path.resolve(strict=False)
    resolved_out = output_dir.resolve(strict=False)

    if resolved_in == resolved_out:
        raise ValueError("Input and output paths must be different")

    # Nesting only matters when the input is a directory that gets scanned.
    if not (input_path.exists() and input_path.is_dir()):
        return

    # Equality was ruled out above, so "inside" means a proper ancestor.
    if resolved_in in resolved_out.parents:
        raise ValueError(
            "Output directory must not be inside the input directory. "
            "Choose a separate output path to avoid recursive conversion loops."
        )
109
+
110
+
111
def _print_progress(index: int, total: int, source: Path) -> None:
    """Write a ``[index/total] Converting: <source>`` line to stderr."""
    message = f"[{index}/{total}] Converting: {source}"
    print(message, file=sys.stderr)
113
+
114
+
115
def _is_ci_environment() -> bool:
    """Tell whether the ``CI`` environment variable holds a truthy flag.

    The value is trimmed and lower-cased; "1", "true", "yes" and "on" count
    as set.
    """
    flag = os.environ.get("CI", "")
    truthy = {"1", "true", "yes", "on"}
    return flag.strip().lower() in truthy
117
+
118
+
119
def _iter_with_progress(files: list[Path], show_progress: bool) -> Iterator[tuple[int, Path]]:
    """Yield ``(1-based position, file)`` pairs, reporting progress if requested.

    With progress off the pairs are yielded silently. With progress on, a
    tqdm bar on stderr is used when stderr is a terminal, the run is not in
    CI, and tqdm can be imported; otherwise one line per file is printed
    before the file is yielded.
    """
    if not show_progress:
        yield from enumerate(files, start=1)
        return

    if not _is_ci_environment() and sys.stderr.isatty():
        try:
            from tqdm import tqdm  # type: ignore[import-not-found]
        except ImportError:
            bar = None  # tqdm is optional; fall through to plain lines
        else:
            bar = tqdm(files, desc="Converting", unit="file", file=sys.stderr)
        if bar is not None:
            yield from enumerate(bar, start=1)
            return

    total = len(files)
    for position, source in enumerate(files, start=1):
        _print_progress(position, total, source)
        yield position, source
141
+
142
+
143
def _progress_done(index: int, total: int, source: Path, show_progress: bool) -> None:
    """Print a progress line when progress is on and output is non-interactive.

    "Non-interactive" means the run is in CI or stderr is not a terminal.
    """
    # NOTE(review): on an interactive, non-CI terminal this prints nothing, so
    # a caller relying on it alone shows no progress there - confirm intended.
    if not show_progress:
        return
    if _is_ci_environment() or not sys.stderr.isatty():
        _print_progress(index, total, source)
146
+
147
+
148
def _should_stream_manifest(file_count: int, limit: int) -> bool:
    """Return True when ``file_count`` exceeds a non-negative ``limit``.

    A negative limit never triggers streaming.
    """
    return 0 <= limit < file_count
150
+
151
+
152
def _stream_record(record: ManifestRecord, records_path: Path | None, failed_records_path: Path | None) -> None:
    """Append a manifest record to the streaming files that are configured.

    Every record goes to ``records_path``; records whose status is "failed"
    additionally go to ``failed_records_path``. A path of ``None`` turns the
    corresponding write off.
    """
    targets = []
    if records_path is not None:
        targets.append(records_path)
    if record.status == "failed" and failed_records_path is not None:
        targets.append(failed_records_path)

    for target in targets:
        append_manifest_record(target, record)
157
+
158
+
159
def _make_dirs(output_dir: Path, rag: bool, extract_asset_flag: bool) -> tuple[Path, Path | None, Path, Path | None]:
    """Create the output sub-directories and return them.

    ``markdown`` and ``metadata`` are always created. ``chunks`` is created
    only when ``rag`` is set and ``assets`` only when ``extract_asset_flag``
    is set; the matching tuple slot is ``None`` otherwise.

    Returns:
        ``(markdown_dir, chunks_dir, metadata_dir, assets_dir)``.
    """
    markdown_dir = ensure_dir(output_dir / "markdown")

    chunks_dir = None
    if rag:
        chunks_dir = ensure_dir(output_dir / "chunks")

    metadata_dir = ensure_dir(output_dir / "metadata")

    assets_dir = None
    if extract_asset_flag:
        assets_dir = ensure_dir(output_dir / "assets")

    return markdown_dir, chunks_dir, metadata_dir, assets_dir
165
+
166
+
167
def _process_one(
    source: Path,
    *,
    input_root: Path,
    markdown_dir: Path,
    chunks_dir: Path | None,
    metadata_dir: Path,
    assets_dir: Path | None,
    options: BatchOptions,
) -> _ProcessResult:
    """Convert one file and write its Markdown, optional chunks, and metadata.

    Steps, in order: convert, optionally clean, optionally extract assets and
    append their links, write the Markdown, optionally write JSONL chunks,
    then write the metadata file (which records the elapsed time up to that
    point).

    Returns:
        A successful result carrying the written paths, or a failed result
        whose ``error`` is ``str(exc)`` for any exception raised on the way.
        Exceptions derived from ``Exception`` never propagate.
    """
    clock_start = time.perf_counter()
    try:
        text = PlusConverter(enable_plugins=options.enable_plugins).convert_file(source)
        if options.clean:
            text = clean_markdown(text)

        md_target = make_output_path(source, input_root, markdown_dir, ".md")
        ensure_dir(md_target.parent)

        # Assets are extracted before the Markdown is written so their links
        # end up in the saved file.
        extracted = []
        if options.extract_assets and assets_dir is not None:
            extracted = extract_assets(source, md_target, assets_dir)
            text = append_asset_links(text, extracted)

        md_target.write_text(text, encoding="utf-8")

        jsonl_target: Path | None = None
        if options.rag and chunks_dir is not None:
            jsonl_target = make_output_path(source, input_root, chunks_dir, ".jsonl")
            pieces = chunk_markdown(
                text,
                source=str(source),
                max_tokens=options.max_tokens,
                overlap=options.overlap,
                model=options.token_model,
                strategy=options.chunk_strategy,
            )
            write_jsonl(pieces, jsonl_target)

        duration = time.perf_counter() - clock_start
        meta_target = make_output_path(source, input_root, metadata_dir, ".json")
        write_metadata(
            build_metadata(
                source,
                md_target,
                clean_enabled=options.clean,
                rag_enabled=options.rag,
                extract_assets_enabled=options.extract_assets,
                chunk_strategy=options.chunk_strategy,
                assets=[item.to_dict() for item in extracted],
                conversion_time_seconds=duration,
            ),
            meta_target,
        )

        return _ProcessResult(
            source=source,
            success=True,
            output_path=str(md_target),
            chunks_path=str(jsonl_target) if jsonl_target else None,
            metadata_path=str(meta_target),
        )
    except Exception as exc:
        # Per-file boundary: report the failure instead of aborting the batch.
        return _ProcessResult(source=source, success=False, error=str(exc))
230
+
231
+
232
def _record_result(manifest: Manifest, result: _ProcessResult) -> ManifestRecord:
    """Register a processing result with the manifest and return its record.

    Failures are stored with their error text, or "unknown error" when that
    is empty. Successes are stored with their output paths, a missing output
    path being recorded as an empty string.
    """
    if not result.success:
        message = result.error or "unknown error"
        return manifest.add_failed(str(result.source), message)

    return manifest.add_success(
        source_path=str(result.source),
        output_path=result.output_path or "",
        chunks_path=result.chunks_path,
        metadata_path=result.metadata_path,
    )
241
+
242
+
243
def _prepare_manifest(manifest: Manifest, output_dir: Path, file_count: int, limit: int) -> tuple[Path | None, Path | None]:
    """Switch the manifest to streaming mode when the batch is large enough.

    When streaming applies, leftover ``manifest-records.jsonl`` and
    ``failed.jsonl`` files in ``output_dir`` are removed and streaming is
    enabled on the manifest.

    Returns:
        ``(records_path, failed_records_path)``, or ``(None, None)`` when
        streaming is not used.
    """
    if not _should_stream_manifest(file_count, limit):
        return None, None

    records_path = output_dir / "manifest-records.jsonl"
    failed_records_path = output_dir / "failed.jsonl"
    # Start from clean files so records of an earlier run are not mixed in.
    for stale in (records_path, failed_records_path):
        stale.unlink(missing_ok=True)

    manifest.enable_streaming(records_path=records_path, failed_records_path=failed_records_path)
    return records_path, failed_records_path
253
+
254
+
255
def _run_batch_sequential(
    files: list[Path],
    *,
    input_root: Path,
    output_dir: Path,
    manifest: Manifest,
    records_path: Path | None,
    failed_records_path: Path | None,
    markdown_dir: Path,
    chunks_dir: Path | None,
    metadata_dir: Path,
    assets_dir: Path | None,
    options: BatchOptions,
) -> Manifest:
    """Convert the files one after another in the calling thread.

    Each result is recorded in the manifest and streamed if streaming is
    configured. The manifest is written after every failure, at every
    checkpoint interval, and before a ``KeyboardInterrupt`` is re-raised.
    With ``continue_on_error`` off the loop ends at the first failure.
    """
    interval = options.checkpoint_interval
    try:
        for position, source in _iter_with_progress(files, options.show_progress):
            outcome = _process_one(
                source,
                input_root=input_root,
                markdown_dir=markdown_dir,
                chunks_dir=chunks_dir,
                metadata_dir=metadata_dir,
                assets_dir=assets_dir,
                options=options,
            )
            _stream_record(_record_result(manifest, outcome), records_path, failed_records_path)

            if not outcome.success:
                # Persist failures right away so they survive a later crash.
                write_manifest(manifest, output_dir)
                if not options.continue_on_error:
                    break

            checkpoint_due = interval > 0 and position % interval == 0
            if checkpoint_due:
                write_manifest(manifest, output_dir)
    except KeyboardInterrupt:
        write_manifest(manifest, output_dir)
        raise
    return manifest
292
+
293
+
294
def _run_batch_parallel(
    files: list[Path],
    *,
    input_root: Path,
    output_dir: Path,
    manifest: Manifest,
    records_path: Path | None,
    failed_records_path: Path | None,
    markdown_dir: Path,
    chunks_dir: Path | None,
    metadata_dir: Path,
    assets_dir: Path | None,
    options: BatchOptions,
) -> Manifest:
    """Convert the files on a thread pool, recording results as they finish.

    Results are recorded in completion order, not input order. The manifest
    is written after every failure, at every checkpoint interval, and before
    a ``KeyboardInterrupt`` is re-raised.

    When the loop ends early (first failure with ``continue_on_error`` off, or
    an interrupt) every conversion still waiting in the queue is cancelled.
    Conversions already running cannot be stopped; the pool waits for them,
    but their results are not recorded.

    Returns:
        The same ``manifest`` object, updated in place.
    """
    # Never hand the executor a worker count below 1 (it raises ValueError).
    max_workers = max(1, min(options.workers, len(files)))
    completed = 0
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(
                    _process_one,
                    source,
                    input_root=input_root,
                    markdown_dir=markdown_dir,
                    chunks_dir=chunks_dir,
                    metadata_dir=metadata_dir,
                    assets_dir=assets_dir,
                    options=options,
                ): source
                for source in files
            }
            try:
                for future in as_completed(futures):
                    source = futures[future]
                    completed += 1
                    try:
                        result = future.result()
                    except Exception as exc:  # defensive; _process_one should catch
                        result = _ProcessResult(source=source, success=False, error=str(exc))
                    record = _record_result(manifest, result)
                    _stream_record(record, records_path, failed_records_path)
                    _progress_done(completed, len(files), source, options.show_progress)
                    if not result.success:
                        write_manifest(manifest, output_dir)
                        if not options.continue_on_error:
                            break
                    if options.checkpoint_interval > 0 and completed % options.checkpoint_interval == 0:
                        write_manifest(manifest, output_dir)
            finally:
                # Leaving the ``with`` block waits for all submitted work, so
                # queued futures must be cancelled or a fail-fast stop would
                # still convert every remaining file. This is a no-op once
                # everything has finished.
                for pending in futures:
                    pending.cancel()
    except KeyboardInterrupt:
        write_manifest(manifest, output_dir)
        raise
    return manifest
345
+
346
+
347
def run_batch(options: BatchOptions) -> Manifest:
    """Run a batch conversion job.

    The input/output combination is validated and the input files are
    discovered before anything is created on disk, so an invalid input path
    leaves no empty output tree behind. A dry run records every discovered
    file with the output path ``"DRY_RUN"`` and writes the manifest without
    creating the per-type output sub-directories. Otherwise the files are
    converted on a thread pool when more than one worker and more than one
    file are available, and sequentially in all other cases.

    Args:
        options: Settings for the run; a ``workers`` value below 1 is treated
            as 1.

    Returns:
        The manifest describing the run; it is also written to the output
        directory.

    Raises:
        ValueError: The input and output paths conflict, or the input is
            neither a file nor a directory.
        FileNotFoundError: The input path does not exist.
    """
    input_path = options.input_path
    validate_input_output_paths(input_path, options.output_dir)

    # Discover first: this is what rejects a missing or unusable input path.
    files = discover_files(input_path, options.recursive, options.extensions)
    input_root = input_path if input_path.is_dir() else input_path.parent
    worker_count = max(1, options.workers)

    output_dir = ensure_dir(options.output_dir)

    manifest = Manifest(source=str(input_path), output=str(output_dir))
    records_path, failed_records_path = _prepare_manifest(
        manifest, output_dir, len(files), options.manifest_memory_limit
    )

    if options.dry_run:
        for source in files:
            record = manifest.add_success(str(source), output_path="DRY_RUN")
            _stream_record(record, records_path, failed_records_path)
        write_manifest(manifest, output_dir)
        return manifest

    # Only a real run needs the markdown/chunks/metadata/assets directories.
    markdown_dir, chunks_dir, metadata_dir, assets_dir = _make_dirs(output_dir, options.rag, options.extract_assets)

    runner = _run_batch_parallel if worker_count > 1 and len(files) > 1 else _run_batch_sequential
    runner(
        files,
        input_root=input_root,
        output_dir=output_dir,
        manifest=manifest,
        records_path=records_path,
        failed_records_path=failed_records_path,
        markdown_dir=markdown_dir,
        chunks_dir=chunks_dir,
        metadata_dir=metadata_dir,
        assets_dir=assets_dir,
        # Pass the clamped worker count on without mutating the caller's options.
        options=BatchOptions(**{**options.__dict__, "workers": worker_count}),
    )

    write_manifest(manifest, output_dir)
    return manifest