markitdown-plus 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitdown_plus/__about__.py +7 -0
- markitdown_plus/__init__.py +12 -0
- markitdown_plus/assets.py +154 -0
- markitdown_plus/batch.py +387 -0
- markitdown_plus/chunker.py +433 -0
- markitdown_plus/cleaner.py +158 -0
- markitdown_plus/cli.py +205 -0
- markitdown_plus/converter.py +58 -0
- markitdown_plus/errors.py +13 -0
- markitdown_plus/manifest.py +164 -0
- markitdown_plus/metadata.py +97 -0
- markitdown_plus/utils.py +52 -0
- markitdown_plus-0.2.0.dist-info/METADATA +292 -0
- markitdown_plus-0.2.0.dist-info/RECORD +17 -0
- markitdown_plus-0.2.0.dist-info/WHEEL +4 -0
- markitdown_plus-0.2.0.dist-info/entry_points.txt +2 -0
- markitdown_plus-0.2.0.dist-info/licenses/LICENSE +21 -0
markitdown_plus/cli.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Command-line interface for markitdown-plus."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
import traceback
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .__about__ import __description__, __version__
|
|
12
|
+
from .batch import BatchOptions, parse_extensions, run_batch
|
|
13
|
+
from .chunker import CHUNK_STRATEGIES, chunk_markdown, write_jsonl
|
|
14
|
+
from .cleaner import clean_markdown
|
|
15
|
+
from .converter import PlusConverter
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _ensure_file_output_path(output: str | Path, example: str) -> Path:
|
|
19
|
+
output_path = Path(output)
|
|
20
|
+
if output_path.exists() and output_path.is_dir():
|
|
21
|
+
raise ValueError(
|
|
22
|
+
f"Output path is a directory, not a file: {output_path}. "
|
|
23
|
+
f"Please specify a file path, for example: {example}"
|
|
24
|
+
)
|
|
25
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
26
|
+
return output_path
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _default_workers() -> int:
|
|
30
|
+
return max(1, min(4, os.cpu_count() or 1))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _add_common_options(parser: argparse.ArgumentParser) -> None:
|
|
34
|
+
parser.add_argument("-v", "--verbose", action="store_true", help="Show full traceback on error.")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``markitdown-plus`` parser and its sub-commands.

    Sub-commands are registered in the order ``convert``, ``clean``,
    ``chunk`` and ``single``. A sub-command is required, and each one also
    receives the shared ``-v/--verbose`` option.
    """

    def add_flag(target: argparse.ArgumentParser, *names: str, text: str) -> None:
        # Boolean switch: absent -> False, present -> True.
        target.add_argument(*names, action="store_true", help=text)

    def add_chunking_options(target: argparse.ArgumentParser) -> None:
        # Chunk tuning options declared identically by ``convert`` and ``chunk``.
        target.add_argument("--chunk-size", type=int, default=800, help="Target max token estimate per chunk.")
        target.add_argument("--overlap", type=int, default=0, help="Word overlap between adjacent chunks.")
        target.add_argument(
            "--model",
            default="gpt4",
            help="Token estimate model profile: gpt4, claude, gemini, deepseek.",
        )
        target.add_argument(
            "--chunk-strategy",
            choices=sorted(CHUNK_STRATEGIES),
            default="heading",
            help="RAG chunking strategy: heading, fixed, or semantic-lite.",
        )

    root = argparse.ArgumentParser(prog="markitdown-plus", description=__description__)
    root.add_argument("--version", action="version", version=f"markitdown-plus {__version__}")
    commands = root.add_subparsers(dest="command", required=True)

    # --- convert -----------------------------------------------------------
    convert = commands.add_parser(
        "convert",
        help="Convert one file or a folder to Markdown using Microsoft MarkItDown.",
    )
    convert.add_argument("input", help="Input file or directory.")
    convert.add_argument("-o", "--output", required=True, help="Output directory.")
    add_flag(convert, "-r", "--recursive", text="Scan directories recursively.")
    convert.add_argument(
        "--types",
        help="Comma-separated extensions to include, for example: pdf,docx,pptx,xlsx,html,csv",
    )
    add_flag(convert, "--clean", text="Clean converted Markdown.")
    add_flag(convert, "--rag", text="Export JSONL chunks for RAG pipelines.")
    add_chunking_options(convert)
    simple_switches = (
        ("--plugins", "Enable installed MarkItDown plugins."),
        ("--dry-run", "List matching files without converting."),
        ("--fail-fast", "Stop at the first conversion error."),
        ("--quiet", "Hide progress output."),
        ("--progress", "Show tqdm progress bar when available."),
    )
    for switch, description in simple_switches:
        add_flag(convert, switch, text=description)
    convert.add_argument(
        "--workers",
        type=int,
        default=1,
        help=f"Parallel conversion workers. Use 0 for auto ({_default_workers()}).",
    )
    add_flag(convert, "--extract-assets", text="Extract DOCX/PPTX/XLSX/HTML image assets when possible.")
    _add_common_options(convert)

    # --- clean -------------------------------------------------------------
    clean = commands.add_parser("clean", help="Clean an existing Markdown file.")
    clean.add_argument("input", help="Input Markdown file.")
    clean.add_argument("-o", "--output", required=True, help="Output Markdown file.")
    _add_common_options(clean)

    # --- chunk -------------------------------------------------------------
    chunk = commands.add_parser("chunk", help="Chunk an existing Markdown file to JSONL.")
    chunk.add_argument("input", help="Input Markdown file.")
    chunk.add_argument("-o", "--output", required=True, help="Output JSONL file.")
    add_chunking_options(chunk)
    _add_common_options(chunk)

    # --- single ------------------------------------------------------------
    single = commands.add_parser("single", help="Convert one file and write Markdown directly.")
    single.add_argument("input", help="Input file.")
    single.add_argument("-o", "--output", required=True, help="Output Markdown file.")
    add_flag(single, "--clean", text="Clean converted Markdown.")
    add_flag(single, "--plugins", text="Enable installed MarkItDown plugins.")
    _add_common_options(single)

    return root
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def command_convert(args: argparse.Namespace) -> int:
    """Run the ``convert`` sub-command.

    Builds ``BatchOptions`` from the parsed arguments, runs the batch and
    prints a summary.

    Returns:
        ``0`` for a dry run or when no file failed, ``1`` when the manifest
        reports at least one failed file.
    """
    # ``--workers 0`` means "auto"; any other value is raised to at least 1.
    if args.workers == 0:
        workers = _default_workers()
    else:
        workers = max(1, args.workers)

    output_dir = Path(args.output)
    manifest_location = output_dir / "manifest.json"
    progress_requested = args.progress or not args.quiet

    manifest = run_batch(
        BatchOptions(
            input_path=Path(args.input),
            output_dir=output_dir,
            recursive=args.recursive,
            extensions=parse_extensions(args.types),
            clean=args.clean,
            rag=args.rag,
            max_tokens=args.chunk_size,
            overlap=args.overlap,
            token_model=args.model,
            chunk_strategy=args.chunk_strategy,
            enable_plugins=args.plugins,
            dry_run=args.dry_run,
            continue_on_error=not args.fail_fast,
            # Progress output is always disabled for a dry run.
            show_progress=progress_requested and not args.dry_run,
            workers=workers,
            extract_assets=args.extract_assets,
        )
    )

    if args.dry_run:
        print(f"Found: {manifest.total} file(s)")
        for entry in manifest.files:
            print(entry.source_path)
        print(f"Manifest: {manifest_location}")
        return 0

    print(f"Converted: {manifest.success}/{manifest.total}")
    if workers > 1:
        print(f"Workers: {workers}")

    if not manifest.failed:
        print(f"Manifest: {manifest_location}")
        return 0

    print(
        f"Failed: {manifest.failed} file(s). See failed.json or failed.jsonl in {args.output}",
        file=sys.stderr,
    )
    return 1
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def command_clean(args: argparse.Namespace) -> int:
    """Run the ``clean`` sub-command: clean one Markdown file into ``--output``.

    The output path is validated before the input is read. Always returns
    ``0``; failures surface as exceptions.
    """
    source = Path(args.input)
    destination = _ensure_file_output_path(args.output, "-o output/clean.md")
    cleaned = clean_markdown(source.read_text(encoding="utf-8"))
    destination.write_text(cleaned, encoding="utf-8")
    print(f"Cleaned Markdown: {destination}")
    return 0
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def command_chunk(args: argparse.Namespace) -> int:
    """Run the ``chunk`` sub-command: split a Markdown file into JSONL chunks.

    The output path is validated before the input is read. Always returns
    ``0``; failures surface as exceptions.
    """
    source = Path(args.input)
    destination = _ensure_file_output_path(args.output, "-o output/chunks.jsonl")
    text = source.read_text(encoding="utf-8")

    chunking = {
        "source": str(source),
        "max_tokens": args.chunk_size,
        "overlap": args.overlap,
        "model": args.model,
        "strategy": args.chunk_strategy,
    }
    pieces = chunk_markdown(text, **chunking)

    write_jsonl(pieces, destination)
    print(f"Chunks: {len(pieces)} -> {destination}")
    return 0
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def command_single(args: argparse.Namespace) -> int:
    """Run the ``single`` sub-command: convert one file to one Markdown file.

    The conversion runs first; the output path is validated (and its parent
    directory created) only afterwards. Always returns ``0``; failures
    surface as exceptions.
    """
    text = PlusConverter(enable_plugins=args.plugins).convert_file(args.input)
    if args.clean:
        text = clean_markdown(text)

    destination = _ensure_file_output_path(args.output, "-o output/report.md")
    destination.write_text(text, encoding="utf-8")
    print(f"Markdown: {destination}")
    return 0
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and dispatch to the selected sub-command.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The sub-command's exit code, or ``2`` when it raised an exception.
        With ``--verbose`` the traceback is printed before the one-line error.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    try:
        dispatch = {
            "convert": command_convert,
            "clean": command_clean,
            "chunk": command_chunk,
            "single": command_single,
        }
        handler = dispatch.get(args.command)
        if handler is not None:
            return handler(args)
        # ``parser.error`` exits via SystemExit, which is not caught below.
        parser.error(f"Unknown command: {args.command}")
    except Exception as exc:
        if getattr(args, "verbose", False):
            traceback.print_exc()
        print(f"Error: {exc}", file=sys.stderr)
        return 2
    return 0
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Thin wrapper around Microsoft MarkItDown."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .errors import ConversionError, DependencyError
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PlusConverter:
    """File-to-Markdown converter backed by Microsoft MarkItDown.

    Keeps the rest of the package insulated from small upstream API
    differences: the converted text is read from ``result.markdown`` first,
    then from ``result.text_content``, and finally from ``str(result)``.
    """

    def __init__(self, enable_plugins: bool = False) -> None:
        """Create the underlying MarkItDown instance.

        Args:
            enable_plugins: Forwarded to ``MarkItDown``. If the constructor
                raises ``TypeError`` for this keyword, a warning is logged
                (only when plugins were requested) and the instance is
                created without it.

        Raises:
            DependencyError: If importing ``markitdown`` fails.
        """
        try:
            from markitdown import MarkItDown  # type: ignore
        except Exception as exc:  # pragma: no cover - depends on environment
            raise DependencyError(
                "markitdown package not found. Run: pip install 'markitdown[all]'"
            ) from exc

        try:
            engine = MarkItDown(enable_plugins=enable_plugins)
        except TypeError:  # pragma: no cover - compatibility fallback
            if enable_plugins:
                logger.warning(
                    "Installed markitdown version does not support enable_plugins. "
                    "Upgrade with: pip install --upgrade markitdown"
                )
            engine = MarkItDown()
        self._markitdown = engine

    def convert_file(self, path: str | Path) -> str:
        """Convert one file and return its Markdown, stripped and newline-terminated.

        Raises:
            ConversionError: If *path* does not exist, is not a regular file,
                or the underlying conversion raises.
        """
        source = Path(path)
        if not source.exists():
            raise ConversionError(f"File does not exist: {source}")
        if not source.is_file():
            raise ConversionError(f"Input is not a file: {source}")
        if source.stat().st_size == 0:
            # An empty file is only reported; conversion is still attempted.
            logger.warning("Empty file: %s", source)

        try:
            result = self._markitdown.convert(str(source))
        except Exception as exc:
            raise ConversionError(f"Failed to convert {source}: {exc}") from exc

        # Use the first attribute that is present and not None.
        for attribute in ("markdown", "text_content"):
            text = getattr(result, attribute, None)
            if text is not None:
                break
        else:
            text = str(result)
        return str(text).strip() + "\n"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Custom exceptions used by markitdown-plus."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MarkItDownPlusError(Exception):
    """Root of the markitdown-plus exception hierarchy."""


class ConversionError(MarkItDownPlusError):
    """Signals that converting a file failed."""


class DependencyError(MarkItDownPlusError):
    """Signals that a required or optional dependency is not available."""
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Batch conversion manifest support."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import asdict, dataclass, field
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .__about__ import __version__
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _drop_none(value: Any) -> Any:
|
|
15
|
+
"""Recursively remove None values from dictionaries/lists for cleaner JSON."""
|
|
16
|
+
if isinstance(value, dict):
|
|
17
|
+
return {k: _drop_none(v) for k, v in value.items() if v is not None}
|
|
18
|
+
if isinstance(value, list):
|
|
19
|
+
return [_drop_none(item) for item in value]
|
|
20
|
+
return value
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ManifestRecord:
    """A single per-file entry of a conversion manifest."""

    # Required for every record.
    source_path: str
    status: str
    # Optional details; unset values are left out of the serialized form.
    output_path: str | None = None
    chunks_path: str | None = None
    metadata_path: str | None = None
    error: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a dictionary with ``None`` fields omitted."""
        fields_as_dict = asdict(self)
        return _drop_none(fields_as_dict)

    def to_json(self) -> str:
        """Return the record as a single-line JSON string (one JSONL row)."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class Manifest:
    """A conversion run manifest.

    For very large folders, callers may disable in-memory record storage and
    stream each record to `manifest-records.jsonl`. The summary counters still
    stay in memory, while the heavy per-file data is kept on disk.
    """

    tool: str = "markitdown-plus"
    version: str = __version__
    # UTC creation timestamp in ISO 8601 form.
    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    source: str = ""
    output: str = ""
    # Summary counters, updated by ``_add_record``.
    total: int = 0
    success: int = 0
    failed: int = 0
    files: list[ManifestRecord] = field(default_factory=list)
    # True once records are no longer all kept in ``files``.
    files_truncated: bool = False
    records_path: str | None = None
    failed_records_path: str | None = None
    # Internal switch: when False, new records are counted but not stored.
    _store_records: bool = field(default=True, repr=False, compare=False)

    def enable_streaming(self, records_path: str | Path, failed_records_path: str | Path | None = None) -> None:
        """Store per-file records on disk instead of keeping them all in memory.

        Args:
            records_path: Location recorded as ``records_path`` in the manifest.
            failed_records_path: Optional location recorded as
                ``failed_records_path``; left untouched when ``None``.
        """
        self._store_records = False
        self.files_truncated = True
        self.records_path = str(records_path)
        if failed_records_path is not None:
            self.failed_records_path = str(failed_records_path)

    def _add_record(self, record: ManifestRecord) -> ManifestRecord:
        """Count *record* and, unless streaming, keep it in ``files``."""
        if self._store_records:
            self.files.append(record)
        else:
            self.files_truncated = True

        self.total += 1
        if record.status == "success":
            self.success += 1
        elif record.status == "failed":
            self.failed += 1
        return record

    def add_success(
        self,
        source_path: str,
        output_path: str,
        chunks_path: str | None = None,
        metadata_path: str | None = None,
    ) -> ManifestRecord:
        """Register a successful conversion and return the created record."""
        return self._add_record(
            ManifestRecord(
                source_path=source_path,
                status="success",
                output_path=output_path,
                chunks_path=chunks_path,
                metadata_path=metadata_path,
            )
        )

    def add_failed(self, source_path: str, error: str) -> ManifestRecord:
        """Register a failed conversion and return the created record."""
        return self._add_record(ManifestRecord(source_path=source_path, status="failed", error=error))

    def _recount(self) -> None:
        """Recalculate counters when records are edited externally.

        Does nothing in streaming mode: records are then not appended to
        ``files``, so recounting from that list would overwrite the real
        totals with the counts of an incomplete list.
        """
        if not self._store_records:
            return
        self.total = len(self.files)
        self.success = sum(1 for record in self.files if record.status == "success")
        self.failed = sum(1 for record in self.files if record.status == "failed")

    def to_dict(self) -> dict[str, Any]:
        """Return the manifest as a JSON-friendly dict with ``None`` entries omitted."""
        payload = {
            "tool": self.tool,
            "version": self.version,
            "created_at": self.created_at,
            "source": self.source,
            "output": self.output,
            "total": self.total,
            "success": self.success,
            "failed": self.failed,
            "files_truncated": self.files_truncated,
            "records_path": self.records_path,
            "failed_records_path": self.failed_records_path,
            "files": [record.to_dict() for record in self.files],
        }
        return _drop_none(payload)

    def failed_records(self) -> list[dict[str, Any]]:
        """Return the dict form of every in-memory record whose status is ``failed``."""
        return [record.to_dict() for record in self.files if record.status == "failed"]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def append_manifest_record(path: str | Path, record: ManifestRecord) -> Path:
    """Append *record* as one JSON line to the file at *path*.

    Missing parent directories are created. Returns *path* as a ``Path``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("a", encoding="utf-8") as handle:
        handle.write(f"{record.to_json()}\n")
    return target
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def write_manifest(manifest: Manifest, output_dir: str | Path) -> tuple[Path, Path | None]:
    """Write ``manifest.json`` (and ``failed.json`` when needed) into *output_dir*.

    ``failed.json`` is written only when the manifest reports failures and has
    failed records in memory; otherwise any existing ``failed.json`` in
    *output_dir* is deleted.

    Returns:
        ``(manifest_path, failed_path)``, where ``failed_path`` is ``None``
        when no ``failed.json`` was written.
    """
    root = Path(output_dir)
    root.mkdir(parents=True, exist_ok=True)

    manifest_path = root / "manifest.json"
    manifest_json = json.dumps(manifest.to_dict(), ensure_ascii=False, indent=2)
    manifest_path.write_text(manifest_json, encoding="utf-8")

    failures = manifest.failed_records()
    failed_file = root / "failed.json"
    if manifest.failed and failures:
        failed_file.write_text(
            json.dumps(failures, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        return manifest_path, failed_file

    # Nothing to report: remove a stale file left by an earlier run.
    if failed_file.exists():
        failed_file.unlink()
    return manifest_path, None
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Metadata helpers for converted documents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import asdict, dataclass, field
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .__about__ import __version__
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class FileMetadata:
    """Conversion metadata recorded for one source file."""

    # Required fields.
    source_path: str
    output_path: str
    file_name: str
    extension: str
    source_size_bytes: int
    output_size_bytes: int
    converted_at: str
    # Options that were active for the conversion.
    clean_enabled: bool = False
    rag_enabled: bool = False
    extract_assets_enabled: bool = False
    chunk_strategy: str = "heading"
    # Asset details.
    assets_count: int = 0
    assets: list[dict[str, Any]] = field(default_factory=list)
    conversion_time_seconds: float = 0.0
    # Tool versions; the MarkItDown version may be unknown.
    markitdown_plus_version: str = __version__
    markitdown_version: str | None = None

    @property
    def size_bytes(self) -> int:
        """Alias of ``source_size_bytes`` kept for v0.1.0 compatibility."""
        return self.source_size_bytes

    def to_dict(self) -> dict[str, object]:
        """Return the metadata as a dict.

        ``markitdown_version`` is omitted when it is ``None`` and ``assets``
        is omitted when it is empty.
        """
        data = asdict(self)
        if data["markitdown_version"] is None:
            del data["markitdown_version"]
        if not data["assets"]:
            del data["assets"]
        return data
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _markitdown_version() -> str | None:
|
|
50
|
+
try:
|
|
51
|
+
from importlib.metadata import version
|
|
52
|
+
|
|
53
|
+
return version("markitdown")
|
|
54
|
+
except Exception:
|
|
55
|
+
return None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def build_metadata(
    source_path: str | Path,
    output_path: str | Path,
    *,
    clean_enabled: bool = False,
    rag_enabled: bool = False,
    extract_assets_enabled: bool = False,
    chunk_strategy: str = "heading",
    assets: list[dict[str, Any]] | None = None,
    conversion_time_seconds: float = 0.0,
) -> FileMetadata:
    """Build the ``FileMetadata`` for one converted file.

    File sizes are read from disk and reported as ``0`` for a path that does
    not exist. ``converted_at`` is the current UTC time in ISO 8601 form and
    ``conversion_time_seconds`` is rounded to six decimal places.
    """

    def size_or_zero(candidate: Path) -> int:
        # A missing file counts as zero bytes.
        if candidate.exists():
            return candidate.stat().st_size
        return 0

    source = Path(source_path)
    output = Path(output_path)
    source_bytes = size_or_zero(source)
    output_bytes = size_or_zero(output)
    # None and an empty list both become a new empty list.
    collected_assets = assets if assets else []
    timestamp = datetime.now(timezone.utc).isoformat()

    return FileMetadata(
        source_path=str(source),
        output_path=str(output),
        file_name=source.name,
        extension=source.suffix.lower(),
        source_size_bytes=source_bytes,
        output_size_bytes=output_bytes,
        converted_at=timestamp,
        clean_enabled=clean_enabled,
        rag_enabled=rag_enabled,
        extract_assets_enabled=extract_assets_enabled,
        chunk_strategy=chunk_strategy,
        assets_count=len(collected_assets),
        assets=collected_assets,
        conversion_time_seconds=round(conversion_time_seconds, 6),
        markitdown_version=_markitdown_version(),
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def write_metadata(metadata: FileMetadata, output_path: str | Path) -> Path:
    """Write *metadata* as indented UTF-8 JSON to *output_path*.

    Missing parent directories are created. Returns *output_path* as a ``Path``.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(metadata.to_dict(), ensure_ascii=False, indent=2)
    target.write_text(serialized, encoding="utf-8")
    return target
|
markitdown_plus/utils.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Small utility helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Regex matching one or more consecutive characters from this set:
# < > : " / \ | ? * and the control characters \x00-\x1f.
WINDOWS_UNSAFE_CHARS = r'[<>:"/\\|?*\x00-\x1f]+'
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def ensure_dir(path: Path) -> Path:
    """Make sure the directory *path* exists, then hand it back.

    Missing parents are created as well; an already existing directory is
    not an error.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def safe_stem(path: Path) -> str:
    """Return the stem of *path* with filesystem-unsafe characters replaced.

    Non-ASCII characters are left untouched. Each run of characters matched
    by ``WINDOWS_UNSAFE_CHARS`` becomes ``-``, repeated ``-`` are collapsed,
    and leading/trailing ``-``, ``.`` and ``_`` are trimmed. The result falls
    back to ``"document"`` when nothing remains.
    """
    raw = path.stem.strip()
    if not raw:
        raw = "document"
    replaced = re.sub(WINDOWS_UNSAFE_CHARS, "-", raw)
    collapsed = re.sub(r"-+", "-", replaced)
    trimmed = collapsed.strip("-._")
    return trimmed if trimmed else "document"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def safe_path_part(value: str) -> str:
    """Return *value* sanitized for use as a single path component.

    Surrounding whitespace is stripped, each run of characters matched by
    ``WINDOWS_UNSAFE_CHARS`` becomes ``-``, repeated ``-`` are collapsed, and
    leading/trailing ``-``, ``.`` and ``_`` are trimmed. Non-ASCII text is
    kept. The result falls back to ``"document"`` when nothing remains.
    """
    replaced = re.sub(WINDOWS_UNSAFE_CHARS, "-", value.strip())
    collapsed = re.sub(r"-+", "-", replaced)
    trimmed = collapsed.strip("-._")
    if trimmed:
        return trimmed
    return "document"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def short_hash(value: str, length: int = 8) -> str:
    """Return the first *length* hex characters of the SHA-256 of *value*.

    *value* is encoded as UTF-8; characters that cannot be encoded are
    skipped rather than raising.
    """
    encoded = value.encode("utf-8", errors="ignore")
    hex_digest = hashlib.sha256(encoded).hexdigest()
    return hex_digest[:length]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def make_output_path(source: Path, input_root: Path, output_root: Path, suffix: str) -> Path:
    """Build an output path that preserves relative directory structure.

    Args:
        source: File being converted.
        input_root: Root that *source* is made relative to.
        output_root: Directory the result is placed under.
        suffix: Extension for the output file, including the leading dot.

    Returns:
        ``output_root`` joined with the sanitized relative path of *source*,
        with its final suffix replaced by *suffix*. When *source* is not
        located under *input_root*, or is *input_root* itself, only the
        file name of *source* is used.
    """
    try:
        relative = source.relative_to(input_root)
    except ValueError:
        relative = Path(source.name)

    # ``source == input_root`` yields ``Path('.')``, which has no parts;
    # ``with_suffix`` would then raise ValueError because the name is empty.
    # Fall back to the file name (``safe_path_part`` maps "" to "document").
    parts = relative.parts or (source.name,)

    clean_parts = [safe_path_part(part) for part in parts]
    relative_clean = Path(*clean_parts)
    return output_root / relative_clean.with_suffix(suffix)
|