markitdown-plus 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitdown_plus-0.2.0/.coverage +0 -0
- markitdown_plus-0.2.0/.github/FUNDING.yml +1 -0
- markitdown_plus-0.2.0/.github/workflows/ci.yml +25 -0
- markitdown_plus-0.2.0/.gitignore +13 -0
- markitdown_plus-0.2.0/.hermes/TODO.md +17 -0
- markitdown_plus-0.2.0/.hypothesis/.gitignore +9 -0
- markitdown_plus-0.2.0/.hypothesis/constants/086764ccb2d66f8a +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/110871bc3b4f5e27 +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/1d68e3aa4dbfdf79 +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/332dd3b2262038d8 +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/359268f49f691cc8 +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/4214944546cf5c7c +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/7b4b0b2b153d41d3 +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/8f91968cbf2d23ac +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/92e6eaf999a14c55 +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/c4ac8b71d8e3e21f +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/c55212a6e09c5e51 +4 -0
- markitdown_plus-0.2.0/.hypothesis/constants/e79837f15d416198 +4 -0
- markitdown_plus-0.2.0/.hypothesis/unicode_data/15.1.0/charmap.json.gz +0 -0
- markitdown_plus-0.2.0/.hypothesis/unicode_data/15.1.0/codec-utf-8.json.gz +0 -0
- markitdown_plus-0.2.0/CHANGELOG.md +48 -0
- markitdown_plus-0.2.0/LICENSE +21 -0
- markitdown_plus-0.2.0/PKG-INFO +292 -0
- markitdown_plus-0.2.0/README.md +251 -0
- markitdown_plus-0.2.0/docs/comparison-with-markitdown.md +17 -0
- markitdown_plus-0.2.0/docs/rag-output.md +17 -0
- markitdown_plus-0.2.0/docs/usage.md +25 -0
- markitdown_plus-0.2.0/examples/sample_docs/hello.txt +5 -0
- markitdown_plus-0.2.0/pyproject.toml +88 -0
- markitdown_plus-0.2.0/requirements.txt +5 -0
- markitdown_plus-0.2.0/src/markitdown_plus/__about__.py +7 -0
- markitdown_plus-0.2.0/src/markitdown_plus/__init__.py +12 -0
- markitdown_plus-0.2.0/src/markitdown_plus/assets.py +154 -0
- markitdown_plus-0.2.0/src/markitdown_plus/batch.py +387 -0
- markitdown_plus-0.2.0/src/markitdown_plus/chunker.py +433 -0
- markitdown_plus-0.2.0/src/markitdown_plus/cleaner.py +158 -0
- markitdown_plus-0.2.0/src/markitdown_plus/cli.py +205 -0
- markitdown_plus-0.2.0/src/markitdown_plus/converter.py +58 -0
- markitdown_plus-0.2.0/src/markitdown_plus/errors.py +13 -0
- markitdown_plus-0.2.0/src/markitdown_plus/manifest.py +164 -0
- markitdown_plus-0.2.0/src/markitdown_plus/metadata.py +97 -0
- markitdown_plus-0.2.0/src/markitdown_plus/utils.py +52 -0
- markitdown_plus-0.2.0/tests/test_assets.py +59 -0
- markitdown_plus-0.2.0/tests/test_batch.py +263 -0
- markitdown_plus-0.2.0/tests/test_benchmark_optional.py +18 -0
- markitdown_plus-0.2.0/tests/test_chunker.py +149 -0
- markitdown_plus-0.2.0/tests/test_cleaner.py +97 -0
- markitdown_plus-0.2.0/tests/test_cli.py +102 -0
- markitdown_plus-0.2.0/tests/test_converter.py +61 -0
- markitdown_plus-0.2.0/tests/test_manifest.py +49 -0
- markitdown_plus-0.2.0/tests/test_metadata.py +37 -0
- markitdown_plus-0.2.0/tests/test_property_optional.py +19 -0
- markitdown_plus-0.2.0/tests/test_utils.py +26 -0
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
custom: ["https://www.paypal.me/lamguo"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
strategy:
|
|
11
|
+
matrix:
|
|
12
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: ${{ matrix.python-version }}
|
|
18
|
+
- name: Install package
|
|
19
|
+
run: |
|
|
20
|
+
python -m pip install --upgrade pip
|
|
21
|
+
python -m pip install -e ".[dev]"
|
|
22
|
+
- name: Run tests with coverage
|
|
23
|
+
run: pytest
|
|
24
|
+
- name: Ruff check
|
|
25
|
+
run: ruff check .
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# MarkItDown Plus Technical Debt
|
|
2
|
+
|
|
3
|
+
- [x] `--workers` parallel conversion (v0.2.0)
|
|
4
|
+
- [x] `--extract-assets` basic Office/HTML image extraction (v0.2.0)
|
|
5
|
+
- [x] multiple chunk strategies: `heading`, `fixed`, `semantic-lite` (v0.2.0)
|
|
6
|
+
- [x] optional tqdm progress support (v0.2.0)
|
|
7
|
+
- [x] coverage gate target: 85%+ (v0.2.0)
|
|
8
|
+
- [x] property-test scaffolding with optional Hypothesis (v0.2.0)
|
|
9
|
+
- [x] benchmark-test scaffolding with optional pytest-benchmark (v0.2.0)
|
|
10
|
+
- [ ] deeper PDF asset extraction (v0.3.0)
|
|
11
|
+
- [ ] incremental conversion / retry failed (v0.3.0)
|
|
12
|
+
- [ ] table cleanup enhancements (v0.3.0)
|
|
13
|
+
- [ ] MCP server (v0.3.0)
|
|
14
|
+
- [ ] documentation site (v0.3.0)
|
|
15
|
+
- [ ] release automation to PyPI (v0.3.0)
|
|
16
|
+
- [ ] front matter output (v0.3.0)
|
|
17
|
+
- [ ] type-checking CI with pyright or mypy (long-term)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# This .gitignore file was automatically created by Hypothesis. Hypothesis gitignores
|
|
2
|
+
# .hypothesis by default, because we generally recommend that .hypothesis not be checked
|
|
3
|
+
# into version control.
|
|
4
|
+
#
|
|
5
|
+
# If you *would* like to check .hypothesis into version control, you should delete this
|
|
6
|
+
# file. Hypothesis will not re-create this .gitignore unless .hypothesis is deleted (and
|
|
7
|
+
# if it does, that's a bug - please report it!)
|
|
8
|
+
|
|
9
|
+
*
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
# file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\cli.py
|
|
2
|
+
# hypothesis_version: 6.155.2
|
|
3
|
+
|
|
4
|
+
[800, '--chunk-size', '--chunk-strategy', '--clean', '--dry-run', '--extract-assets', '--fail-fast', '--model', '--output', '--overlap', '--plugins', '--progress', '--quiet', '--rag', '--recursive', '--types', '--verbose', '--version', '--workers', '-o', '-o output/clean.md', '-o output/report.md', '-r', '-v', 'Input Markdown file.', 'Input file.', 'Output JSONL file.', 'Output directory.', '__main__', 'chunk', 'clean', 'command', 'convert', 'gpt4', 'heading', 'input', 'markitdown-plus', 'single', 'store_true', 'utf-8', 'verbose', 'version']
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
# file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\manifest.py
|
|
2
|
+
# hypothesis_version: 6.155.2
|
|
3
|
+
|
|
4
|
+
['a', 'created_at', 'failed', 'failed.json', 'failed_records_path', 'files', 'files_truncated', 'manifest.json', 'markitdown-plus', 'output', 'records_path', 'source', 'success', 'tool', 'total', 'utf-8', 'version']
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
# file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\chunker.py
|
|
2
|
+
# hypothesis_version: 6.155.2
|
|
3
|
+
|
|
4
|
+
[0.75, 3.5, 4.0, 5.0, 100, 800, 'Apr.', 'Aug.', 'Co.', 'Dec.', 'Dr.', 'Eq.', 'Feb.', 'Fig.', 'Inc.', 'Jan.', 'Jr.', 'Jul.', 'Jun.', 'Ltd.', 'M.D.', 'Mar.', 'Mr.', 'Mrs.', 'Ms.', 'No.', 'Nov.', 'Oct.', 'Ph.D.', 'Prof.', 'Sep.', 'Sept.', 'Sr.', 'St.', 'U.K.', 'U.S.', '[\\u4e00-\\u9fff]', '[^a-z\\s-]', '\\S+', '\\b\\w+\\b', '\\s+', '^#{1,6}\\s+', '^(#{1,6})\\s+(.+?)$', '```', 'a.m.', 'background', 'claude', 'conclusion', 'deepseek', 'document.md', 'e.g.', 'etc.', 'fixed', 'gemini', 'gpt-4', 'gpt-4o', 'gpt4', 'heading', 'https?://\\S+', 'i.e.', 'key takeaway', 'key takeaways', 'methodology', 'next steps', 'p.m.', 'recommendation', 'recommendations', 'results', 'semantic-lite', 'summary', 'utf-8', 'vs.', 'w', '~~~', '。!?']
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
# file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\batch.py
|
|
2
|
+
# hypothesis_version: 6.155.2
|
|
3
|
+
|
|
4
|
+
[800, 10000, '*', '**/*', ',', '.', '.bmp', '.csv', '.doc', '.docx', '.epub', '.gif', '.htm', '.html', '.jpeg', '.jpg', '.json', '.jsonl', '.m4a', '.md', '.mp3', '.pdf', '.png', '.ppt', '.pptx', '.txt', '.wav', '.webp', '.xls', '.xlsx', '.xml', '.zip', '1', 'CI', 'Converting', 'DRY_RUN', 'assets', 'chunks', 'failed', 'failed.jsonl', 'file', 'gpt4', 'heading', 'markdown', 'metadata', 'on', 'true', 'unknown error', 'utf-8', 'workers', 'yes']
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
# file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\assets.py
|
|
2
|
+
# hypothesis_version: 6.155.2
|
|
3
|
+
|
|
4
|
+
['## Extracted Assets', '.', '.bin', '.bmp', '.docx', '.gif', '.htm', '.html', '.jpeg', '.jpg', '.png', '.pptx', '.svg', '.tiff', '.webp', '.xlsx', '/media/', 'data', 'http', 'https', 'ignore', 'image', 'utf-8', 'wb']
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
# file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\cleaner.py
|
|
2
|
+
# hypothesis_version: 6.155.2
|
|
3
|
+
|
|
4
|
+
['!', '#', '* ', '-', '- ', '.', '/', '> ', '?', '[ \\t]+$', '\\(cid:\\s*\\d+\\)', '\\1 \\2', '\\n{3,}', '\\r\\n?|\\n', '\\s+', '^(#{1,6})([^#\\s].*)$', '^\\d+[.)]\\s+', '^\\d{1,5}$', '```', '|', '~~~', '。', '!', '?']
|
|
Binary file
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## v0.2.0 - 2026-06-07
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- Added `--workers` for parallel batch conversion.
|
|
8
|
+
- Added `--progress` with optional tqdm support and CI-safe fallback.
|
|
9
|
+
- Added `--extract-assets` for basic DOCX/PPTX/XLSX/HTML image extraction.
|
|
10
|
+
- Added `assets.py` with asset records and Markdown asset link appending.
|
|
11
|
+
- Added `--chunk-strategy` with `heading`, `fixed`, and `semantic-lite` modes.
|
|
12
|
+
- Added a coverage gate with a target of 85%+ test coverage for core code.
|
|
13
|
+
- Added optional property-test scaffolding using Hypothesis.
|
|
14
|
+
- Added optional benchmark-test scaffolding using pytest-benchmark.
|
|
15
|
+
- Added `.hermes/TODO.md` technical-debt tracker.
|
|
16
|
+
|
|
17
|
+
### Changed
|
|
18
|
+
|
|
19
|
+
- Batch conversion now uses a worker-safe per-file processing path.
|
|
20
|
+
- Metadata now records asset extraction status, asset count, chunk strategy, and asset records.
|
|
21
|
+
- README now documents v0.2.0 features and the roadmap-aligned workflow.
|
|
22
|
+
|
|
23
|
+
### Notes
|
|
24
|
+
|
|
25
|
+
- PDF asset extraction is not enabled yet. This remains a v0.3.0+ task because robust PDF image extraction requires heavier dependencies.
|
|
26
|
+
|
|
27
|
+
## v0.1.2 - 2026-06-07
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
|
|
31
|
+
- Optional tqdm progress fallback.
|
|
32
|
+
- Large manifest JSONL streaming support.
|
|
33
|
+
- Improved sentence splitting and code-fence-aware paragraph splitting.
|
|
34
|
+
|
|
35
|
+
## v0.1.1 - 2026-06-07
|
|
36
|
+
|
|
37
|
+
### Fixed
|
|
38
|
+
|
|
39
|
+
- Fixed O(n²) manifest counting.
|
|
40
|
+
- Improved Unicode filename safety.
|
|
41
|
+
- Improved page number cleanup and chunk ID uniqueness.
|
|
42
|
+
- Added checkpoint writing and clearer CLI errors.
|
|
43
|
+
|
|
44
|
+
## v0.1.0 - 2026-06-07
|
|
45
|
+
|
|
46
|
+
### Added
|
|
47
|
+
|
|
48
|
+
- Initial MarkItDown Plus alpha release.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lam Guo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markitdown-plus
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Batch conversion, asset extraction, and RAG-ready output toolkit for Microsoft MarkItDown.
|
|
5
|
+
Project-URL: Homepage, https://github.com/lamguo/markitdown-plus
|
|
6
|
+
Project-URL: Repository, https://github.com/lamguo/markitdown-plus
|
|
7
|
+
Project-URL: Issues, https://github.com/lamguo/markitdown-plus/issues
|
|
8
|
+
Project-URL: Funding, https://www.paypal.me/lamguo
|
|
9
|
+
Author-email: Lam Guo <lamguo111@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: asset-extraction,batch-conversion,document-conversion,docx-to-markdown,jsonl,llm,markdown,markitdown,microsoft-markitdown,pdf-to-markdown,rag
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Requires-Dist: markitdown[all]>=0.1.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.6.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: tqdm>=4.66; extra == 'dev'
|
|
32
|
+
Provides-Extra: progress
|
|
33
|
+
Requires-Dist: tqdm>=4.66; extra == 'progress'
|
|
34
|
+
Provides-Extra: quality
|
|
35
|
+
Requires-Dist: hypothesis>=6.100; extra == 'quality'
|
|
36
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == 'quality'
|
|
37
|
+
Provides-Extra: test
|
|
38
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'test'
|
|
39
|
+
Requires-Dist: pytest>=8.0; extra == 'test'
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
|
|
42
|
+
# MarkItDown Plus
|
|
43
|
+
|
|
44
|
+
Batch conversion, asset extraction, RAG-ready Markdown, JSONL chunks, and cleaner AI document pipelines for **Microsoft MarkItDown**.
|
|
45
|
+
|
|
46
|
+
MarkItDown Plus is an enhancement toolkit built on top of Microsoft MarkItDown. It adds folder conversion, recursive processing, optional parallel workers, Markdown cleanup, multiple chunking strategies, lightweight asset extraction, conversion manifests, and JSONL output for RAG workflows.
|
|
47
|
+
|
|
48
|
+
> This project is independent and is not affiliated with Microsoft. It is designed as a companion CLI for the Microsoft MarkItDown ecosystem.
|
|
49
|
+
|
|
50
|
+
## Why MarkItDown Plus?
|
|
51
|
+
|
|
52
|
+
Microsoft MarkItDown is excellent for converting individual files to Markdown. MarkItDown Plus focuses on the next step: turning many documents into clean, AI-ready project output.
|
|
53
|
+
|
|
54
|
+
Key features:
|
|
55
|
+
|
|
56
|
+
- Batch convert files and folders
|
|
57
|
+
- Recursive directory conversion
|
|
58
|
+
- Parallel conversion with `--workers`
|
|
59
|
+
- Optional tqdm progress with `--progress`
|
|
60
|
+
- RAG-ready JSONL chunk export
|
|
61
|
+
- Chunk strategies: `heading`, `fixed`, `semantic-lite`
|
|
62
|
+
- Markdown cleanup for common PDF/document artifacts
|
|
63
|
+
- Basic asset extraction for DOCX / PPTX / XLSX / HTML
|
|
64
|
+
- `manifest.json`, `failed.json`, and large-run JSONL manifest streaming
|
|
65
|
+
- Unicode-safe output filenames
|
|
66
|
+
- PayPal funding link exposed through the repository's GitHub Sponsor button (`.github/FUNDING.yml`)
|
|
67
|
+
|
|
68
|
+
## Installation
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install markitdown-plus
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
For progress bars:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install "markitdown-plus[progress]"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
For development tests and coverage:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install -e ".[dev]"
|
|
84
|
+
pytest
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Quick Start
|
|
88
|
+
|
|
89
|
+
Convert a folder:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
markitdown-plus convert ./docs --output ./out
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Convert recursively:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
markitdown-plus convert ./docs --output ./out --recursive
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Convert only specific file types:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
markitdown-plus convert ./docs --output ./out --types pdf,docx,pptx,xlsx,html,csv
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Clean Markdown and export RAG chunks:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
markitdown-plus convert ./docs --output ./out --clean --rag
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Use parallel workers:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
markitdown-plus convert ./docs --output ./out --recursive --workers 4 --progress
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Let MarkItDown Plus choose the worker count automatically by passing `--workers 0`:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
markitdown-plus convert ./docs --output ./out --workers 0
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Extract assets when supported:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
markitdown-plus convert ./docs --output ./out --extract-assets
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Use a specific chunking strategy:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
markitdown-plus convert ./docs --output ./out --rag --chunk-strategy semantic-lite
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Output Structure
|
|
138
|
+
|
|
139
|
+
A normal batch run creates:
|
|
140
|
+
|
|
141
|
+
```text
|
|
142
|
+
out/
|
|
143
|
+
markdown/
|
|
144
|
+
report.md
|
|
145
|
+
metadata/
|
|
146
|
+
report.json
|
|
147
|
+
manifest.json
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
With RAG enabled:
|
|
151
|
+
|
|
152
|
+
```text
|
|
153
|
+
out/
|
|
154
|
+
markdown/
|
|
155
|
+
report.md
|
|
156
|
+
chunks/
|
|
157
|
+
report.jsonl
|
|
158
|
+
metadata/
|
|
159
|
+
report.json
|
|
160
|
+
manifest.json
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
With asset extraction enabled:
|
|
164
|
+
|
|
165
|
+
```text
|
|
166
|
+
out/
|
|
167
|
+
markdown/
|
|
168
|
+
report.md
|
|
169
|
+
assets/
|
|
170
|
+
report_img_001.png
|
|
171
|
+
report_img_002.jpg
|
|
172
|
+
metadata/
|
|
173
|
+
report.json
|
|
174
|
+
manifest.json
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
For very large jobs, MarkItDown Plus avoids huge `manifest.json` files by streaming per-file records to JSONL files:
|
|
178
|
+
|
|
179
|
+
```text
|
|
180
|
+
out/
|
|
181
|
+
manifest.json
|
|
182
|
+
manifest-records.jsonl
|
|
183
|
+
failed.jsonl
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Chunk Strategies
|
|
187
|
+
|
|
188
|
+
### `heading`
|
|
189
|
+
|
|
190
|
+
Default. Preserves Markdown heading paths and is best for most structured documents.
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
markitdown-plus convert ./docs -o ./out --rag --chunk-strategy heading
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
### `fixed`
|
|
197
|
+
|
|
198
|
+
Creates chunks of a consistent size and ignores heading boundaries. Useful for embedding pipelines that prefer consistent lengths.
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
markitdown-plus convert ./docs -o ./out --rag --chunk-strategy fixed
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### `semantic-lite`
|
|
205
|
+
|
|
206
|
+
Dependency-free, rule-based topical splitting. It starts a new chunk at obvious semantic cues: headings and section-like paragraphs such as summaries, conclusions, and recommendations.
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
markitdown-plus convert ./docs -o ./out --rag --chunk-strategy semantic-lite
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## Asset Extraction
|
|
213
|
+
|
|
214
|
+
`--extract-assets` currently supports lightweight extraction for:
|
|
215
|
+
|
|
216
|
+
- `.docx`
|
|
217
|
+
- `.pptx`
|
|
218
|
+
- `.xlsx`
|
|
219
|
+
- `.html` / `.htm` local image references
|
|
220
|
+
|
|
221
|
+
PDF image extraction is intentionally left for a later version because reliable PDF asset extraction requires heavier format-specific dependencies.
|
|
222
|
+
|
|
223
|
+
When assets are extracted, MarkItDown Plus appends an `Extracted Assets` section to the generated Markdown and records asset metadata in the file-level metadata JSON.
|
|
224
|
+
|
|
225
|
+
## Single File Commands
|
|
226
|
+
|
|
227
|
+
Convert one file directly:
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
markitdown-plus single report.pdf -o report.md
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Clean an existing Markdown file:
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
markitdown-plus clean dirty.md -o clean.md
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
Chunk an existing Markdown file:
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
markitdown-plus chunk clean.md -o chunks.jsonl --chunk-strategy fixed
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## Development
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
git clone https://github.com/lamguo/markitdown-plus.git
|
|
249
|
+
cd markitdown-plus
|
|
250
|
+
pip install -e ".[dev]"
|
|
251
|
+
pytest
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
The test configuration includes a coverage gate:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
pytest --cov=markitdown_plus --cov-fail-under=85
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Optional property and benchmark tests are included. They are skipped automatically if `hypothesis` or `pytest-benchmark` is not installed.
|
|
261
|
+
|
|
262
|
+
## GitHub Topics
|
|
263
|
+
|
|
264
|
+
Suggested topics for the repository:
|
|
265
|
+
|
|
266
|
+
```text
|
|
267
|
+
markitdown
|
|
268
|
+
microsoft-markitdown
|
|
269
|
+
markdown
|
|
270
|
+
rag
|
|
271
|
+
llm
|
|
272
|
+
document-conversion
|
|
273
|
+
pdf-to-markdown
|
|
274
|
+
docx-to-markdown
|
|
275
|
+
batch-conversion
|
|
276
|
+
jsonl
|
|
277
|
+
asset-extraction
|
|
278
|
+
ai-tools
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
## Support This Project
|
|
282
|
+
|
|
283
|
+
If MarkItDown Plus helps you save time or build better AI document pipelines, you can support development here:
|
|
284
|
+
|
|
285
|
+
- Star this repository
|
|
286
|
+
- Support via PayPal: https://www.paypal.me/lamguo
|
|
287
|
+
|
|
288
|
+
Thank you for supporting open-source development.
|
|
289
|
+
|
|
290
|
+
## License
|
|
291
|
+
|
|
292
|
+
MIT License.
|