markitdown-plus 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. markitdown_plus-0.2.0/.coverage +0 -0
  2. markitdown_plus-0.2.0/.github/FUNDING.yml +1 -0
  3. markitdown_plus-0.2.0/.github/workflows/ci.yml +25 -0
  4. markitdown_plus-0.2.0/.gitignore +13 -0
  5. markitdown_plus-0.2.0/.hermes/TODO.md +17 -0
  6. markitdown_plus-0.2.0/.hypothesis/.gitignore +9 -0
  7. markitdown_plus-0.2.0/.hypothesis/constants/086764ccb2d66f8a +4 -0
  8. markitdown_plus-0.2.0/.hypothesis/constants/110871bc3b4f5e27 +4 -0
  9. markitdown_plus-0.2.0/.hypothesis/constants/1d68e3aa4dbfdf79 +4 -0
  10. markitdown_plus-0.2.0/.hypothesis/constants/332dd3b2262038d8 +4 -0
  11. markitdown_plus-0.2.0/.hypothesis/constants/359268f49f691cc8 +4 -0
  12. markitdown_plus-0.2.0/.hypothesis/constants/4214944546cf5c7c +4 -0
  13. markitdown_plus-0.2.0/.hypothesis/constants/7b4b0b2b153d41d3 +4 -0
  14. markitdown_plus-0.2.0/.hypothesis/constants/8f91968cbf2d23ac +4 -0
  15. markitdown_plus-0.2.0/.hypothesis/constants/92e6eaf999a14c55 +4 -0
  16. markitdown_plus-0.2.0/.hypothesis/constants/c4ac8b71d8e3e21f +4 -0
  17. markitdown_plus-0.2.0/.hypothesis/constants/c55212a6e09c5e51 +4 -0
  18. markitdown_plus-0.2.0/.hypothesis/constants/e79837f15d416198 +4 -0
  19. markitdown_plus-0.2.0/.hypothesis/unicode_data/15.1.0/charmap.json.gz +0 -0
  20. markitdown_plus-0.2.0/.hypothesis/unicode_data/15.1.0/codec-utf-8.json.gz +0 -0
  21. markitdown_plus-0.2.0/CHANGELOG.md +48 -0
  22. markitdown_plus-0.2.0/LICENSE +21 -0
  23. markitdown_plus-0.2.0/PKG-INFO +292 -0
  24. markitdown_plus-0.2.0/README.md +251 -0
  25. markitdown_plus-0.2.0/docs/comparison-with-markitdown.md +17 -0
  26. markitdown_plus-0.2.0/docs/rag-output.md +17 -0
  27. markitdown_plus-0.2.0/docs/usage.md +25 -0
  28. markitdown_plus-0.2.0/examples/sample_docs/hello.txt +5 -0
  29. markitdown_plus-0.2.0/pyproject.toml +88 -0
  30. markitdown_plus-0.2.0/requirements.txt +5 -0
  31. markitdown_plus-0.2.0/src/markitdown_plus/__about__.py +7 -0
  32. markitdown_plus-0.2.0/src/markitdown_plus/__init__.py +12 -0
  33. markitdown_plus-0.2.0/src/markitdown_plus/assets.py +154 -0
  34. markitdown_plus-0.2.0/src/markitdown_plus/batch.py +387 -0
  35. markitdown_plus-0.2.0/src/markitdown_plus/chunker.py +433 -0
  36. markitdown_plus-0.2.0/src/markitdown_plus/cleaner.py +158 -0
  37. markitdown_plus-0.2.0/src/markitdown_plus/cli.py +205 -0
  38. markitdown_plus-0.2.0/src/markitdown_plus/converter.py +58 -0
  39. markitdown_plus-0.2.0/src/markitdown_plus/errors.py +13 -0
  40. markitdown_plus-0.2.0/src/markitdown_plus/manifest.py +164 -0
  41. markitdown_plus-0.2.0/src/markitdown_plus/metadata.py +97 -0
  42. markitdown_plus-0.2.0/src/markitdown_plus/utils.py +52 -0
  43. markitdown_plus-0.2.0/tests/test_assets.py +59 -0
  44. markitdown_plus-0.2.0/tests/test_batch.py +263 -0
  45. markitdown_plus-0.2.0/tests/test_benchmark_optional.py +18 -0
  46. markitdown_plus-0.2.0/tests/test_chunker.py +149 -0
  47. markitdown_plus-0.2.0/tests/test_cleaner.py +97 -0
  48. markitdown_plus-0.2.0/tests/test_cli.py +102 -0
  49. markitdown_plus-0.2.0/tests/test_converter.py +61 -0
  50. markitdown_plus-0.2.0/tests/test_manifest.py +49 -0
  51. markitdown_plus-0.2.0/tests/test_metadata.py +37 -0
  52. markitdown_plus-0.2.0/tests/test_property_optional.py +19 -0
  53. markitdown_plus-0.2.0/tests/test_utils.py +26 -0
Binary file
@@ -0,0 +1 @@
1
+ custom: ["https://www.paypal.me/lamguo"]
@@ -0,0 +1,25 @@
1
+ name: CI
2
+
3
+ on:
4
+   push:
5
+   pull_request:
6
+
7
+ jobs:
8
+   test:
9
+     runs-on: ubuntu-latest
10
+     strategy:
11
+       matrix:
12
+         python-version: ["3.10", "3.11", "3.12"]
13
+     steps:
14
+       - uses: actions/checkout@v4
15
+       - uses: actions/setup-python@v5
16
+         with:
17
+           python-version: ${{ matrix.python-version }}
18
+       - name: Install package
19
+         run: |
20
+           python -m pip install --upgrade pip
21
+           python -m pip install -e ".[dev]"
22
+       - name: Run tests with coverage
23
+         run: pytest
24
+       - name: Ruff check
25
+         run: ruff check .
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ dist/
7
+ build/
8
+ .env
9
+ .venv/
10
+ venv/
11
+ .DS_Store
12
+ out/
13
+ output/
@@ -0,0 +1,17 @@
1
+ # MarkItDown Plus Technical Debt
2
+
3
+ - [x] `--workers` parallel conversion (v0.2.0)
4
+ - [x] `--extract-assets` basic Office/HTML image extraction (v0.2.0)
5
+ - [x] multiple chunk strategies: `heading`, `fixed`, `semantic-lite` (v0.2.0)
6
+ - [x] optional tqdm progress support (v0.2.0)
7
+ - [x] coverage gate target: 85%+ (v0.2.0)
8
+ - [x] property-test scaffolding with optional Hypothesis (v0.2.0)
9
+ - [x] benchmark-test scaffolding with optional pytest-benchmark (v0.2.0)
10
+ - [ ] deeper PDF asset extraction (v0.3.0)
11
+ - [ ] incremental conversion / retry failed (v0.3.0)
12
+ - [ ] table cleanup enhancements (v0.3.0)
13
+ - [ ] MCP server (v0.3.0)
14
+ - [ ] documentation site (v0.3.0)
15
+ - [ ] release automation to PyPI (v0.3.0)
16
+ - [ ] front matter output (v0.3.0)
17
+ - [ ] type-checking CI with pyright or mypy (long-term)
@@ -0,0 +1,9 @@
1
+ # This .gitignore file was automatically created by Hypothesis. Hypothesis gitignores
2
+ # .hypothesis by default, because we generally recommend that .hypothesis not be checked
3
+ # into version control.
4
+ #
5
+ # If you *would* like to check .hypothesis into version control, you should delete this
6
+ # file. Hypothesis will not re-create this .gitignore unless .hypothesis is deleted (and
7
+ # if it does, that's a bug - please report it!)
8
+
9
+ *
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\__init__.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ ['PlusConverter', '__version__', 'chunk_markdown', 'clean_markdown']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\metadata.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ [0.0, 'assets', 'heading', 'markitdown', 'markitdown_version', 'utf-8']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\errors.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ []
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\cli.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ [800, '--chunk-size', '--chunk-strategy', '--clean', '--dry-run', '--extract-assets', '--fail-fast', '--model', '--output', '--overlap', '--plugins', '--progress', '--quiet', '--rag', '--recursive', '--types', '--verbose', '--version', '--workers', '-o', '-o output/clean.md', '-o output/report.md', '-r', '-v', 'Input Markdown file.', 'Input file.', 'Output JSONL file.', 'Output directory.', '__main__', 'chunk', 'clean', 'command', 'convert', 'gpt4', 'heading', 'input', 'markitdown-plus', 'single', 'store_true', 'utf-8', 'verbose', 'version']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\manifest.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ ['a', 'created_at', 'failed', 'failed.json', 'failed_records_path', 'files', 'files_truncated', 'manifest.json', 'markitdown-plus', 'output', 'records_path', 'source', 'success', 'tool', 'total', 'utf-8', 'version']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\converter.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ ['Empty file: %s', 'markdown', 'text_content']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\__about__.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ ['0.2.0']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\utils.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ ['-', '-+', '-._', 'document', 'ignore', 'utf-8']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\chunker.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ [0.75, 3.5, 4.0, 5.0, 100, 800, 'Apr.', 'Aug.', 'Co.', 'Dec.', 'Dr.', 'Eq.', 'Feb.', 'Fig.', 'Inc.', 'Jan.', 'Jr.', 'Jul.', 'Jun.', 'Ltd.', 'M.D.', 'Mar.', 'Mr.', 'Mrs.', 'Ms.', 'No.', 'Nov.', 'Oct.', 'Ph.D.', 'Prof.', 'Sep.', 'Sept.', 'Sr.', 'St.', 'U.K.', 'U.S.', '[\\u4e00-\\u9fff]', '[^a-z\\s-]', '\\S+', '\\b\\w+\\b', '\\s+', '^#{1,6}\\s+', '^(#{1,6})\\s+(.+?)$', '```', 'a.m.', 'background', 'claude', 'conclusion', 'deepseek', 'document.md', 'e.g.', 'etc.', 'fixed', 'gemini', 'gpt-4', 'gpt-4o', 'gpt4', 'heading', 'https?://\\S+', 'i.e.', 'key takeaway', 'key takeaways', 'methodology', 'next steps', 'p.m.', 'recommendation', 'recommendations', 'results', 'semantic-lite', 'summary', 'utf-8', 'vs.', 'w', '~~~', '。!?']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\batch.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ [800, 10000, '*', '**/*', ',', '.', '.bmp', '.csv', '.doc', '.docx', '.epub', '.gif', '.htm', '.html', '.jpeg', '.jpg', '.json', '.jsonl', '.m4a', '.md', '.mp3', '.pdf', '.png', '.ppt', '.pptx', '.txt', '.wav', '.webp', '.xls', '.xlsx', '.xml', '.zip', '1', 'CI', 'Converting', 'DRY_RUN', 'assets', 'chunks', 'failed', 'failed.jsonl', 'file', 'gpt4', 'heading', 'markdown', 'metadata', 'on', 'true', 'unknown error', 'utf-8', 'workers', 'yes']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\assets.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ ['## Extracted Assets', '.', '.bin', '.bmp', '.docx', '.gif', '.htm', '.html', '.jpeg', '.jpg', '.png', '.pptx', '.svg', '.tiff', '.webp', '.xlsx', '/media/', 'data', 'http', 'https', 'ignore', 'image', 'utf-8', 'wb']
@@ -0,0 +1,4 @@
1
+ # file: E:\GIT\markitdown-plus\markitdown-plus-v0.2.0\src\markitdown_plus\cleaner.py
2
+ # hypothesis_version: 6.155.2
3
+
4
+ ['!', '#', '* ', '-', '- ', '.', '/', '> ', '?', '[ \\t]+$', '\\(cid:\\s*\\d+\\)', '\\1 \\2', '\\n{3,}', '\\r\\n?|\\n', '\\s+', '^(#{1,6})([^#\\s].*)$', '^\\d+[.)]\\s+', '^\\d{1,5}$', '```', '|', '~~~', '。', '!', '?']
@@ -0,0 +1,48 @@
1
+ # Changelog
2
+
3
+ ## v0.2.0 - 2026-06-07
4
+
5
+ ### Added
6
+
7
+ - Added `--workers` for parallel batch conversion.
8
+ - Added `--progress` with optional tqdm support and CI-safe fallback.
9
+ - Added `--extract-assets` for basic DOCX/PPTX/XLSX/HTML image extraction.
10
+ - Added `assets.py` with asset records and Markdown asset link appending.
11
+ - Added `--chunk-strategy` with `heading`, `fixed`, and `semantic-lite` modes.
12
+ - Added a test coverage gate with an 85%+ target for core modules.
13
+ - Added optional property-test scaffolding using Hypothesis.
14
+ - Added optional benchmark-test scaffolding using pytest-benchmark.
15
+ - Added `.hermes/TODO.md` technical-debt tracker.
16
+
17
+ ### Changed
18
+
19
+ - Batch conversion now uses a worker-safe per-file processing path.
20
+ - Metadata now records asset extraction status, asset count, chunk strategy, and asset records.
21
+ - README now documents v0.2.0 features and the roadmap-aligned workflow.
22
+
23
+ ### Notes
24
+
25
+ - PDF asset extraction is not enabled yet. This remains a v0.3.0+ task because robust PDF image extraction requires heavier dependencies.
26
+
27
+ ## v0.1.2 - 2026-06-07
28
+
29
+ ### Added
30
+
31
+ - Optional tqdm progress fallback.
32
+ - Large manifest JSONL streaming support.
33
+ - Improved sentence splitting and code-fence-aware paragraph splitting.
34
+
35
+ ## v0.1.1 - 2026-06-07
36
+
37
+ ### Fixed
38
+
39
+ - Fixed O(n²) manifest counting.
40
+ - Improved Unicode filename safety.
41
+ - Improved page number cleanup and chunk ID uniqueness.
42
+ - Added checkpoint writing and clearer CLI errors.
43
+
44
+ ## v0.1.0 - 2026-06-07
45
+
46
+ ### Added
47
+
48
+ - Initial MarkItDown Plus alpha release.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Lam Guo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,292 @@
1
+ Metadata-Version: 2.4
2
+ Name: markitdown-plus
3
+ Version: 0.2.0
4
+ Summary: Batch conversion, asset extraction, and RAG-ready output toolkit for Microsoft MarkItDown.
5
+ Project-URL: Homepage, https://github.com/lamguo/markitdown-plus
6
+ Project-URL: Repository, https://github.com/lamguo/markitdown-plus
7
+ Project-URL: Issues, https://github.com/lamguo/markitdown-plus/issues
8
+ Project-URL: Funding, https://www.paypal.me/lamguo
9
+ Author-email: Lam Guo <lamguo111@gmail.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: asset-extraction,batch-conversion,document-conversion,docx-to-markdown,jsonl,llm,markdown,markitdown,microsoft-markitdown,pdf-to-markdown,rag
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: markitdown[all]>=0.1.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: hypothesis>=6.100; extra == 'dev'
27
+ Requires-Dist: pytest-benchmark>=4.0; extra == 'dev'
28
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
29
+ Requires-Dist: pytest>=8.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.6.0; extra == 'dev'
31
+ Requires-Dist: tqdm>=4.66; extra == 'dev'
32
+ Provides-Extra: progress
33
+ Requires-Dist: tqdm>=4.66; extra == 'progress'
34
+ Provides-Extra: quality
35
+ Requires-Dist: hypothesis>=6.100; extra == 'quality'
36
+ Requires-Dist: pytest-benchmark>=4.0; extra == 'quality'
37
+ Provides-Extra: test
38
+ Requires-Dist: pytest-cov>=5.0; extra == 'test'
39
+ Requires-Dist: pytest>=8.0; extra == 'test'
40
+ Description-Content-Type: text/markdown
41
+
42
+ # MarkItDown Plus
43
+
44
+ Batch conversion, asset extraction, RAG-ready Markdown, JSONL chunks, and cleaner AI document pipelines for **Microsoft MarkItDown**.
45
+
46
+ MarkItDown Plus is an enhancement toolkit built on top of Microsoft MarkItDown. It adds folder conversion, recursive processing, optional parallel workers, Markdown cleanup, multiple chunking strategies, lightweight asset extraction, conversion manifests, and JSONL output for RAG workflows.
47
+
48
+ > This project is independent and is not affiliated with Microsoft. It is designed as a companion CLI for the Microsoft MarkItDown ecosystem.
49
+
50
+ ## Why MarkItDown Plus?
51
+
52
+ Microsoft MarkItDown is excellent for converting individual files to Markdown. MarkItDown Plus focuses on the next step: turning many documents into clean, AI-ready project output.
53
+
54
+ Key features:
55
+
56
+ - Batch convert files and folders
57
+ - Recursive directory conversion
58
+ - Parallel conversion with `--workers`
59
+ - Optional tqdm progress with `--progress`
60
+ - RAG-ready JSONL chunk export
61
+ - Chunk strategies: `heading`, `fixed`, `semantic-lite`
62
+ - Markdown cleanup for common PDF/document artifacts
63
+ - Basic asset extraction for DOCX / PPTX / XLSX / HTML
64
+ - `manifest.json`, `failed.json`, and large-run JSONL manifest streaming
65
+ - Unicode-safe output filenames
66
+ - PayPal funding link exposed through GitHub's funding button (`.github/FUNDING.yml`)
67
+
68
+ ## Installation
69
+
70
+ ```bash
71
+ pip install markitdown-plus
72
+ ```
73
+
74
+ For progress bars:
75
+
76
+ ```bash
77
+ pip install "markitdown-plus[progress]"
78
+ ```
79
+
80
+ For development tests and coverage:
81
+
82
+ ```bash
83
+ pip install -e ".[dev]"
84
+ pytest
85
+ ```
86
+
87
+ ## Quick Start
88
+
89
+ Convert a folder:
90
+
91
+ ```bash
92
+ markitdown-plus convert ./docs --output ./out
93
+ ```
94
+
95
+ Convert recursively:
96
+
97
+ ```bash
98
+ markitdown-plus convert ./docs --output ./out --recursive
99
+ ```
100
+
101
+ Convert only specific file types:
102
+
103
+ ```bash
104
+ markitdown-plus convert ./docs --output ./out --types pdf,docx,pptx,xlsx,html,csv
105
+ ```
106
+
107
+ Clean Markdown and export RAG chunks:
108
+
109
+ ```bash
110
+ markitdown-plus convert ./docs --output ./out --clean --rag
111
+ ```
112
+
113
+ Use parallel workers:
114
+
115
+ ```bash
116
+ markitdown-plus convert ./docs --output ./out --recursive --workers 4 --progress
117
+ ```
118
+
119
+ Use auto worker count:
120
+
121
+ ```bash
122
+ markitdown-plus convert ./docs --output ./out --workers 0
123
+ ```
124
+
125
+ Extract assets when supported:
126
+
127
+ ```bash
128
+ markitdown-plus convert ./docs --output ./out --extract-assets
129
+ ```
130
+
131
+ Use a specific chunking strategy:
132
+
133
+ ```bash
134
+ markitdown-plus convert ./docs --output ./out --rag --chunk-strategy semantic-lite
135
+ ```
136
+
137
+ ## Output Structure
138
+
139
+ A normal batch run creates:
140
+
141
+ ```text
142
+ out/
143
+   markdown/
144
+     report.md
145
+   metadata/
146
+     report.json
147
+   manifest.json
148
+ ```
149
+
150
+ With RAG enabled:
151
+
152
+ ```text
153
+ out/
154
+   markdown/
154
+     report.md
155
+   chunks/
156
+     report.jsonl
157
+   metadata/
158
+     report.json
159
+   manifest.json
161
+ ```
162
+
163
+ With asset extraction enabled:
164
+
165
+ ```text
166
+ out/
167
+   markdown/
167
+     report.md
168
+   assets/
169
+     report_img_001.png
170
+     report_img_002.jpg
171
+   metadata/
172
+     report.json
173
+   manifest.json
175
+ ```
176
+
177
+ For very large jobs, MarkItDown Plus avoids huge `manifest.json` files by streaming records:
178
+
179
+ ```text
180
+ out/
181
+   manifest.json
181
+   manifest-records.jsonl
182
+   failed.jsonl
184
+ ```
185
+
186
+ ## Chunk Strategies
187
+
188
+ ### `heading`
189
+
190
+ Default. Preserves Markdown heading paths and is best for most structured documents.
191
+
192
+ ```bash
193
+ markitdown-plus convert ./docs -o ./out --rag --chunk-strategy heading
194
+ ```
195
+
196
+ ### `fixed`
197
+
198
+ Creates stable chunk sizes and ignores heading boundaries. Useful for embedding pipelines that prefer consistent lengths.
199
+
200
+ ```bash
201
+ markitdown-plus convert ./docs -o ./out --rag --chunk-strategy fixed
202
+ ```
203
+
204
+ ### `semantic-lite`
205
+
206
+ Dependency-free, rule-based topical splitting. It starts a new chunk at obvious semantic cues: Markdown headings and section-like paragraphs such as a summary, conclusion, or recommendations.
207
+
208
+ ```bash
209
+ markitdown-plus convert ./docs -o ./out --rag --chunk-strategy semantic-lite
210
+ ```
211
+
212
+ ## Asset Extraction
213
+
214
+ `--extract-assets` currently supports lightweight extraction for:
215
+
216
+ - `.docx`
217
+ - `.pptx`
218
+ - `.xlsx`
219
+ - `.html` / `.htm` local image references
220
+
221
+ PDF image extraction is intentionally left for a later version because reliable PDF asset extraction requires heavier format-specific dependencies.
222
+
223
+ When assets are extracted, MarkItDown Plus appends an `Extracted Assets` section to the generated Markdown and records asset metadata in the file-level metadata JSON.
224
+
225
+ ## Single File Commands
226
+
227
+ Convert one file directly:
228
+
229
+ ```bash
230
+ markitdown-plus single report.pdf -o report.md
231
+ ```
232
+
233
+ Clean an existing Markdown file:
234
+
235
+ ```bash
236
+ markitdown-plus clean dirty.md -o clean.md
237
+ ```
238
+
239
+ Chunk an existing Markdown file:
240
+
241
+ ```bash
242
+ markitdown-plus chunk clean.md -o chunks.jsonl --chunk-strategy fixed
243
+ ```
244
+
245
+ ## Development
246
+
247
+ ```bash
248
+ git clone https://github.com/lamguo/markitdown-plus.git
249
+ cd markitdown-plus
250
+ pip install -e ".[dev]"
251
+ pytest
252
+ ```
253
+
254
+ The test configuration includes a coverage gate:
255
+
256
+ ```bash
257
+ pytest --cov=markitdown_plus --cov-fail-under=85
258
+ ```
259
+
260
+ Optional property and benchmark tests are included. They are skipped automatically if `hypothesis` or `pytest-benchmark` is not installed.
261
+
262
+ ## GitHub Topics
263
+
264
+ Suggested topics for the repository:
265
+
266
+ ```text
267
+ markitdown
268
+ microsoft-markitdown
269
+ markdown
270
+ rag
271
+ llm
272
+ document-conversion
273
+ pdf-to-markdown
274
+ docx-to-markdown
275
+ batch-conversion
276
+ jsonl
277
+ asset-extraction
278
+ ai-tools
279
+ ```
280
+
281
+ ## Support This Project
282
+
283
+ If MarkItDown Plus helps you save time or build better AI document pipelines, you can support its development in the following ways:
284
+
285
+ - Star this repository
286
+ - Support via PayPal: https://www.paypal.me/lamguo
287
+
288
+ Thank you for supporting open-source development.
289
+
290
+ ## License
291
+
292
+ MIT License.