repulp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- repulp-0.1.0/.github/workflows/ci.yml +31 -0
- repulp-0.1.0/.github/workflows/publish.yml +72 -0
- repulp-0.1.0/.gitignore +30 -0
- repulp-0.1.0/.python-version +1 -0
- repulp-0.1.0/.repulp.example.toml +21 -0
- repulp-0.1.0/LICENSE +21 -0
- repulp-0.1.0/PKG-INFO +366 -0
- repulp-0.1.0/README.md +332 -0
- repulp-0.1.0/pyproject.toml +79 -0
- repulp-0.1.0/samples/api-docs.html +224 -0
- repulp-0.1.0/samples/architecture.html +175 -0
- repulp-0.1.0/samples/changelog.html +93 -0
- repulp-0.1.0/samples/converted/api-docs.md +85 -0
- repulp-0.1.0/samples/converted/architecture.md +103 -0
- repulp-0.1.0/samples/converted/changelog.md +54 -0
- repulp-0.1.0/samples/converted/employee-directory.md +19 -0
- repulp-0.1.0/samples/converted/inventory.md +17 -0
- repulp-0.1.0/samples/converted/meeting-notes.md +75 -0
- repulp-0.1.0/samples/converted/report.md +58 -0
- repulp-0.1.0/samples/converted/research-paper.md +94 -0
- repulp-0.1.0/samples/converted/sales-data.md +26 -0
- repulp-0.1.0/samples/employee-directory.csv +18 -0
- repulp-0.1.0/samples/inventory.csv +16 -0
- repulp-0.1.0/samples/meeting-notes.html +125 -0
- repulp-0.1.0/samples/report.html +105 -0
- repulp-0.1.0/samples/research-paper.html +123 -0
- repulp-0.1.0/samples/sales-data.csv +25 -0
- repulp-0.1.0/src/repulp/__init__.py +172 -0
- repulp-0.1.0/src/repulp/cache.py +88 -0
- repulp-0.1.0/src/repulp/cleaner.py +99 -0
- repulp-0.1.0/src/repulp/cli.py +492 -0
- repulp-0.1.0/src/repulp/config.py +69 -0
- repulp-0.1.0/src/repulp/converter.py +151 -0
- repulp-0.1.0/src/repulp/engine.py +243 -0
- repulp-0.1.0/src/repulp/extractor.py +168 -0
- repulp-0.1.0/src/repulp/fetcher.py +62 -0
- repulp-0.1.0/src/repulp/formatter.py +56 -0
- repulp-0.1.0/src/repulp/frontmatter.py +67 -0
- repulp-0.1.0/src/repulp/py.typed +0 -0
- repulp-0.1.0/src/repulp/watcher.py +168 -0
- repulp-0.1.0/tests/__init__.py +0 -0
- repulp-0.1.0/tests/fixtures/sample.csv +3 -0
- repulp-0.1.0/tests/fixtures/sample.html +1 -0
- repulp-0.1.0/tests/fixtures/sample.md +0 -0
- repulp-0.1.0/tests/fixtures/sample.txt +1 -0
- repulp-0.1.0/tests/test_api.py +83 -0
- repulp-0.1.0/tests/test_cache.py +127 -0
- repulp-0.1.0/tests/test_cleaner.py +45 -0
- repulp-0.1.0/tests/test_cli.py +63 -0
- repulp-0.1.0/tests/test_cli_batch.py +49 -0
- repulp-0.1.0/tests/test_config.py +49 -0
- repulp-0.1.0/tests/test_converter.py +75 -0
- repulp-0.1.0/tests/test_engine.py +225 -0
- repulp-0.1.0/tests/test_extractor.py +112 -0
- repulp-0.1.0/tests/test_fetcher.py +32 -0
- repulp-0.1.0/tests/test_formatter.py +55 -0
- repulp-0.1.0/tests/test_frontmatter.py +76 -0
- repulp-0.1.0/tests/test_integration.py +67 -0
- repulp-0.1.0/tests/test_table_extraction.py +98 -0
- repulp-0.1.0/tests/test_watcher.py +198 -0
- repulp-0.1.0/uv.lock +1965 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install uv
|
|
20
|
+
uses: astral-sh/setup-uv@v5
|
|
21
|
+
|
|
22
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: ${{ matrix.python-version }}
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: uv sync --group dev
|
|
29
|
+
|
|
30
|
+
- name: Run tests
|
|
31
|
+
run: uv run pytest --cov=repulp
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build:
|
|
12
|
+
name: Build distribution
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Install uv
|
|
18
|
+
uses: astral-sh/setup-uv@v5
|
|
19
|
+
|
|
20
|
+
- name: Set up Python
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: "3.12"
|
|
24
|
+
|
|
25
|
+
- name: Build package
|
|
26
|
+
run: uv build
|
|
27
|
+
|
|
28
|
+
- name: Upload distribution artifacts
|
|
29
|
+
uses: actions/upload-artifact@v4
|
|
30
|
+
with:
|
|
31
|
+
name: dist
|
|
32
|
+
path: dist/
|
|
33
|
+
|
|
34
|
+
publish-pypi:
|
|
35
|
+
name: Publish to PyPI
|
|
36
|
+
needs: build
|
|
37
|
+
runs-on: ubuntu-latest
|
|
38
|
+
environment:
|
|
39
|
+
name: pypi
|
|
40
|
+
url: https://pypi.org/p/repulp
|
|
41
|
+
permissions:
|
|
42
|
+
id-token: write
|
|
43
|
+
steps:
|
|
44
|
+
- name: Download distribution artifacts
|
|
45
|
+
uses: actions/download-artifact@v4
|
|
46
|
+
with:
|
|
47
|
+
name: dist
|
|
48
|
+
path: dist/
|
|
49
|
+
|
|
50
|
+
- name: Publish to PyPI
|
|
51
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
52
|
+
|
|
53
|
+
publish-testpypi:
|
|
54
|
+
name: Publish to TestPyPI
|
|
55
|
+
needs: build
|
|
56
|
+
runs-on: ubuntu-latest
|
|
57
|
+
environment:
|
|
58
|
+
name: testpypi
|
|
59
|
+
url: https://test.pypi.org/p/repulp
|
|
60
|
+
permissions:
|
|
61
|
+
id-token: write
|
|
62
|
+
steps:
|
|
63
|
+
- name: Download distribution artifacts
|
|
64
|
+
uses: actions/download-artifact@v4
|
|
65
|
+
with:
|
|
66
|
+
name: dist
|
|
67
|
+
path: dist/
|
|
68
|
+
|
|
69
|
+
- name: Publish to TestPyPI
|
|
70
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
71
|
+
with:
|
|
72
|
+
repository-url: https://test.pypi.org/legacy/
|
repulp-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
|
|
12
|
+
# Testing
|
|
13
|
+
.coverage
|
|
14
|
+
htmlcov/
|
|
15
|
+
.pytest_cache/
|
|
16
|
+
|
|
17
|
+
# repulp cache
|
|
18
|
+
.repulp.cache
|
|
19
|
+
|
|
20
|
+
# docs
|
|
21
|
+
docs/
|
|
22
|
+
|
|
23
|
+
# OS
|
|
24
|
+
.DS_Store
|
|
25
|
+
|
|
26
|
+
# Editor
|
|
27
|
+
.idea/
|
|
28
|
+
.vscode/
|
|
29
|
+
*.swp
|
|
30
|
+
*.swo
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# .repulp.toml — Example repulp configuration
|
|
2
|
+
# Copy this file to .repulp.toml in your project root.
|
|
3
|
+
|
|
4
|
+
[repulp]
|
|
5
|
+
# Output directory for converted markdown files (empty = alongside source files)
|
|
6
|
+
output_dir = "./markdown"
|
|
7
|
+
|
|
8
|
+
# Recursively scan directories
|
|
9
|
+
recursive = true
|
|
10
|
+
|
|
11
|
+
# Post-process markdown (normalize headings, fix tables, strip artifacts)
|
|
12
|
+
clean = true
|
|
13
|
+
|
|
14
|
+
# Number of parallel workers (0 = auto: CPU count - 1)
|
|
15
|
+
workers = 0
|
|
16
|
+
|
|
17
|
+
# Only convert files matching these patterns (empty = all supported formats)
|
|
18
|
+
include = ["*.pdf", "*.docx", "*.pptx"]
|
|
19
|
+
|
|
20
|
+
# Skip files matching these patterns
|
|
21
|
+
exclude = ["*.tmp"]
|
repulp-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sunny Kumar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
repulp-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: repulp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Parallel batch document conversion, watch mode, and structured extraction — powered by MarkItDown.
|
|
5
|
+
Project-URL: Homepage, https://github.com/5unnyKu/repulp
|
|
6
|
+
Project-URL: Repository, https://github.com/5unnyKu/repulp
|
|
7
|
+
Project-URL: Issues, https://github.com/5unnyKu/repulp/issues
|
|
8
|
+
Author-email: Sunny Kumar <5unnykum4r@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: batch,cli,converter,dataframe,document,docx,extraction,markdown,markitdown,parallel,pdf,watch
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
22
|
+
Classifier: Topic :: Utilities
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: httpx>=0.27.0
|
|
26
|
+
Requires-Dist: markitdown[all]>=0.1.5
|
|
27
|
+
Requires-Dist: rich>=13.0.0
|
|
28
|
+
Requires-Dist: tomli>=2.0.0; python_version < '3.11'
|
|
29
|
+
Requires-Dist: typer>=0.15.0
|
|
30
|
+
Requires-Dist: watchfiles>=1.0.0
|
|
31
|
+
Provides-Extra: tables
|
|
32
|
+
Requires-Dist: pandas>=2.0.0; extra == 'tables'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# repulp
|
|
36
|
+
|
|
37
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
38
|
+
[License: MIT](LICENSE)
|
|
39
|
+
[]()
|
|
40
|
+
|
|
41
|
+
Parallel batch document conversion, watch mode, and structured extraction — powered by [MarkItDown](https://github.com/microsoft/markitdown).
|
|
42
|
+
|
|
43
|
+
repulp wraps Microsoft's MarkItDown with a production workflow layer: parallel batch processing, incremental caching, file watching, table extraction, and a rich CLI.
|
|
44
|
+
|
|
45
|
+
## Why repulp?
|
|
46
|
+
|
|
47
|
+
MarkItDown converts files one at a time. repulp adds everything you need for real-world document pipelines:
|
|
48
|
+
|
|
49
|
+
| Feature | MarkItDown | repulp |
|
|
50
|
+
|---------|-----------|--------|
|
|
51
|
+
| Single file conversion | Yes | Yes |
|
|
52
|
+
| Parallel batch conversion | No | Yes (ProcessPoolExecutor) |
|
|
53
|
+
| Incremental cache (skip unchanged) | No | Yes (SHA256 hashing) |
|
|
54
|
+
| Watch mode (auto-convert on save) | No | Yes (watchfiles) |
|
|
55
|
+
| Extract tables as DataFrames/CSV | No | Yes |
|
|
56
|
+
| CLI with progress bars | No | Yes (Rich + Typer) |
|
|
57
|
+
| Config files (`.repulp.toml`) | No | Yes |
|
|
58
|
+
|
|
59
|
+
## Supported Formats
|
|
60
|
+
|
|
61
|
+
PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, CSV, HTML, TXT, MD, RST, JSON, XML, YAML, images (JPEG, PNG, GIF, BMP, TIFF, WEBP), audio (MP3, WAV, FLAC), and more via MarkItDown.
|
|
62
|
+
|
|
63
|
+
> **Note:** Formats like HTML, CSV, PDF, DOCX, PPTX, and XLSX produce the richest Markdown output. Plain text formats (TXT, JSON, YAML, XML, RST) are passed through with minimal transformation by the underlying MarkItDown engine.
|
|
64
|
+
|
|
65
|
+
## Quick Start
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install repulp
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Or with [uv](https://docs.astral.sh/uv/):
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
uv add repulp
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**Convert a directory with parallel workers:**
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
repulp convert ./documents --workers 4 --output ./markdown
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
**Watch a folder and auto-convert on changes:**
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
repulp watch ./incoming --output ./converted
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
**Extract tables from a PDF as CSV:**
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
repulp extract tables report.pdf --format csv --output ./tables
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## CLI Reference
|
|
96
|
+
|
|
97
|
+
### `repulp convert`
|
|
98
|
+
|
|
99
|
+
Convert files, directories, or URLs to Markdown.
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Single file
|
|
103
|
+
repulp convert report.pdf
|
|
104
|
+
|
|
105
|
+
# Directory with parallel workers
|
|
106
|
+
repulp convert ./docs --workers 4 --output ./markdown
|
|
107
|
+
|
|
108
|
+
# Recursive with filters
|
|
109
|
+
repulp convert ./docs -r --include "*.pdf,*.docx" --exclude "*.tmp"
|
|
110
|
+
|
|
111
|
+
# Incremental (skip unchanged files, enabled by default)
|
|
112
|
+
repulp convert ./docs
|
|
113
|
+
repulp convert ./docs # second run skips unchanged files
|
|
114
|
+
|
|
115
|
+
# Force reconvert all
|
|
116
|
+
repulp convert ./docs --no-cache
|
|
117
|
+
|
|
118
|
+
# URL
|
|
119
|
+
repulp convert https://example.com/page
|
|
120
|
+
|
|
121
|
+
# Stdin
|
|
122
|
+
cat file.html | repulp convert -
|
|
123
|
+
|
|
124
|
+
# Output to stdout
|
|
125
|
+
repulp convert report.pdf --stdout
|
|
126
|
+
|
|
127
|
+
# With frontmatter metadata
|
|
128
|
+
repulp convert report.pdf --frontmatter
|
|
129
|
+
|
|
130
|
+
# Different output formats
|
|
131
|
+
repulp convert report.pdf --format text
|
|
132
|
+
repulp convert report.pdf --format json
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
| Option | Short | Description |
|
|
136
|
+
|--------|-------|-------------|
|
|
137
|
+
| `--output` | `-o` | Output directory |
|
|
138
|
+
| `--recursive` | `-r` | Scan subdirectories |
|
|
139
|
+
| `--workers` | `-w` | Parallel workers (0 = auto) |
|
|
140
|
+
| `--no-cache` | | Disable incremental cache |
|
|
141
|
+
| `--include` | `-I` | Glob patterns to include |
|
|
142
|
+
| `--exclude` | `-E` | Glob patterns to exclude |
|
|
143
|
+
| `--stdout` | `-s` | Print to stdout |
|
|
144
|
+
| `--frontmatter` | `-f` | Add YAML frontmatter |
|
|
145
|
+
| `--format` | `-F` | Output format: md, text, json |
|
|
146
|
+
| `--no-clean` | | Skip markdown post-processing |
|
|
147
|
+
|
|
148
|
+
### `repulp watch`
|
|
149
|
+
|
|
150
|
+
Watch a directory and auto-convert on file changes.
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
repulp watch ./incoming --output ./converted
|
|
154
|
+
repulp watch ./docs --include "*.pdf" --debounce 1000
|
|
155
|
+
repulp watch ./docs --on-change "echo converted"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
| Option | Description |
|
|
159
|
+
|--------|-------------|
|
|
160
|
+
| `--output` / `-o` | Output directory |
|
|
161
|
+
| `--include` / `-I` | Glob patterns to include |
|
|
162
|
+
| `--exclude` / `-E` | Glob patterns to exclude |
|
|
163
|
+
| `--no-clean` | Skip markdown cleanup |
|
|
164
|
+
| `--debounce` | Debounce interval in ms (default: 500) |
|
|
165
|
+
| `--on-change` | Shell command after each conversion |
|
|
166
|
+
|
|
167
|
+
### `repulp extract`
|
|
168
|
+
|
|
169
|
+
Extract structured elements from documents.
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
# Tables as CSV
|
|
173
|
+
repulp extract tables report.pdf --format csv
|
|
174
|
+
|
|
175
|
+
# Tables as JSON
|
|
176
|
+
repulp extract tables report.pdf --format json
|
|
177
|
+
|
|
178
|
+
# Save tables to files
|
|
179
|
+
repulp extract tables report.pdf --format csv --output ./tables
|
|
180
|
+
|
|
181
|
+
# Links
|
|
182
|
+
repulp extract links page.html
|
|
183
|
+
repulp extract links page.html --format json
|
|
184
|
+
|
|
185
|
+
# Headings
|
|
186
|
+
repulp extract headings report.pdf
|
|
187
|
+
|
|
188
|
+
# Images
|
|
189
|
+
repulp extract images document.docx
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Python API
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
import repulp
|
|
196
|
+
|
|
197
|
+
# Convert a single file
|
|
198
|
+
result = repulp.convert("report.pdf")
|
|
199
|
+
print(result.markdown)
|
|
200
|
+
|
|
201
|
+
# Convert with options
|
|
202
|
+
result = repulp.convert("report.pdf", frontmatter=True, format="json")
|
|
203
|
+
|
|
204
|
+
# Batch convert a directory
|
|
205
|
+
result = repulp.batch("./documents", workers=4, recursive=True)
|
|
206
|
+
print(f"{result.succeeded}/{result.total} converted in {result.elapsed:.1f}s")
|
|
207
|
+
|
|
208
|
+
# Incremental batch (skip unchanged)
|
|
209
|
+
result = repulp.batch("./documents", incremental=True)
|
|
210
|
+
print(f"{result.skipped} skipped, {result.succeeded} converted")
|
|
211
|
+
|
|
212
|
+
# Extract tables as list of dicts
|
|
213
|
+
tables = repulp.extract_tables("report.pdf")
|
|
214
|
+
for table in tables:
|
|
215
|
+
for row in table:
|
|
216
|
+
print(row)
|
|
217
|
+
|
|
218
|
+
# Extract tables as pandas DataFrames
|
|
219
|
+
tables = repulp.extract_tables("report.pdf", format="dataframe")
|
|
220
|
+
df = tables[0]
|
|
221
|
+
|
|
222
|
+
# Extract tables as CSV strings
|
|
223
|
+
tables = repulp.extract_tables("report.pdf", format="csv")
|
|
224
|
+
|
|
225
|
+
# Watch a directory
|
|
226
|
+
repulp.watch("./incoming", output_dir="./converted")
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### DataFrame Support
|
|
230
|
+
|
|
231
|
+
Install with the `tables` extra for pandas DataFrame support:
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
pip install repulp[tables]
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
import repulp
|
|
239
|
+
|
|
240
|
+
tables = repulp.extract_tables("financials.xlsx", format="dataframe")
|
|
241
|
+
df = tables[0]
|
|
242
|
+
print(df.describe())
|
|
243
|
+
df.to_csv("output.csv", index=False)
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Configuration
|
|
247
|
+
|
|
248
|
+
Create `.repulp.toml` in your project root:
|
|
249
|
+
|
|
250
|
+
```toml
|
|
251
|
+
[repulp]
|
|
252
|
+
output_dir = "./markdown"
|
|
253
|
+
recursive = true
|
|
254
|
+
clean = true
|
|
255
|
+
workers = 0 # 0 = auto (CPU count - 1)
|
|
256
|
+
include = ["*.pdf", "*.docx", "*.pptx"]
|
|
257
|
+
exclude = ["*.tmp"]
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Or use `[tool.repulp]` in `pyproject.toml`:
|
|
261
|
+
|
|
262
|
+
```toml
|
|
263
|
+
[tool.repulp]
|
|
264
|
+
output_dir = "./markdown"
|
|
265
|
+
recursive = true
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
CLI flags override config file values.
|
|
269
|
+
|
|
270
|
+
## Architecture
|
|
271
|
+
|
|
272
|
+
```
|
|
273
|
+
src/repulp/
|
|
274
|
+
├── __init__.py # Public API: convert(), batch(), extract_tables(), watch()
|
|
275
|
+
├── cli.py # Typer CLI with convert, watch, extract subcommands
|
|
276
|
+
├── converter.py # MarkItDown wrapper for single-file conversion
|
|
277
|
+
├── engine.py # Parallel batch engine (ProcessPoolExecutor)
|
|
278
|
+
├── cache.py # Incremental build cache (SHA256 file hashing)
|
|
279
|
+
├── watcher.py # File watcher (watchfiles) for auto-conversion
|
|
280
|
+
├── extractor.py # Table, link, heading, image extraction from Markdown
|
|
281
|
+
├── cleaner.py # Markdown post-processing and cleanup
|
|
282
|
+
├── config.py # TOML config file loading (.repulp.toml / pyproject.toml)
|
|
283
|
+
├── fetcher.py # URL fetching via httpx
|
|
284
|
+
├── frontmatter.py # YAML frontmatter injection
|
|
285
|
+
└── formatter.py # Output format handling (md, text, json)
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Libraries Used
|
|
289
|
+
|
|
290
|
+
repulp is built on top of these libraries:
|
|
291
|
+
|
|
292
|
+
| Library | Purpose |
|
|
293
|
+
|---------|---------|
|
|
294
|
+
| [MarkItDown](https://github.com/microsoft/markitdown) | Core document-to-Markdown conversion engine by Microsoft. Handles PDF, DOCX, PPTX, XLSX, HTML, CSV, images, audio, and more. |
|
|
295
|
+
| [Typer](https://typer.tiangolo.com/) | CLI framework built on Click. Provides argument parsing, help generation, and shell completion. |
|
|
296
|
+
| [Rich](https://rich.readthedocs.io/) | Terminal formatting — progress bars, tables, panels, colored output. |
|
|
297
|
+
| [watchfiles](https://watchfiles.helpmanual.io/) | Rust-backed file watcher. Used for the `watch` command to detect file changes with low latency. |
|
|
298
|
+
| [httpx](https://www.python-httpx.org/) | HTTP client for URL fetching. Used when converting URLs to Markdown. |
|
|
299
|
+
| [pandas](https://pandas.pydata.org/) | *(optional)* DataFrame support for structured table extraction. Install with `pip install repulp[tables]`. |
|
|
300
|
+
| [tomli](https://github.com/hukkin/tomli) | TOML parser for `.repulp.toml` config files. Only needed on Python < 3.11 (3.11+ has `tomllib` in stdlib). |
|
|
301
|
+
|
|
302
|
+
### Build & Dev Tools
|
|
303
|
+
|
|
304
|
+
| Tool | Purpose |
|
|
305
|
+
|------|---------|
|
|
306
|
+
| [hatchling](https://hatch.pypa.io/) | Build backend for packaging |
|
|
307
|
+
| [uv](https://docs.astral.sh/uv/) | Fast Python package manager |
|
|
308
|
+
| [pytest](https://docs.pytest.org/) | Test framework (162 tests) |
|
|
309
|
+
|
|
310
|
+
## Contributing
|
|
311
|
+
|
|
312
|
+
Contributions are welcome! Here's how to get started:
|
|
313
|
+
|
|
314
|
+
### Setup
|
|
315
|
+
|
|
316
|
+
```bash
|
|
317
|
+
git clone https://github.com/5unnykum4r/repulp.git
|
|
318
|
+
cd repulp
|
|
319
|
+
uv sync --group dev
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### Running Tests
|
|
323
|
+
|
|
324
|
+
```bash
|
|
325
|
+
uv run pytest tests/ -v
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
### Project Conventions
|
|
329
|
+
|
|
330
|
+
- **Python 3.10+** — uses `from __future__ import annotations` for modern type hints
|
|
331
|
+
- **No vague comments** — code should be self-documenting; comments explain *why*, not *what*
|
|
332
|
+
- **Tests live in `tests/`** — mirror the source structure (e.g., `test_engine.py` tests `engine.py`)
|
|
333
|
+
- **Incremental commits** — one logical change per commit
|
|
334
|
+
|
|
335
|
+
### How to Contribute
|
|
336
|
+
|
|
337
|
+
1. Fork the repository
|
|
338
|
+
2. Create a feature branch (`git checkout -b feat/my-feature`)
|
|
339
|
+
3. Write tests for your changes
|
|
340
|
+
4. Make sure all tests pass (`uv run pytest tests/ -v`)
|
|
341
|
+
5. Commit your changes with a descriptive message
|
|
342
|
+
6. Push to your fork and open a Pull Request
|
|
343
|
+
|
|
344
|
+
### Areas for Contribution
|
|
345
|
+
|
|
346
|
+
- Adding support for new output formats
|
|
347
|
+
- Performance improvements to the batch engine
|
|
348
|
+
- Better error messages and diagnostics
|
|
349
|
+
- Documentation improvements
|
|
350
|
+
- New extraction types (e.g., code blocks, footnotes)
|
|
351
|
+
|
|
352
|
+
## Samples
|
|
353
|
+
|
|
354
|
+
The `samples/` directory contains example files (HTML, CSV) that demonstrate repulp's conversion capabilities:
|
|
355
|
+
|
|
356
|
+
```bash
|
|
357
|
+
# Convert all samples
|
|
358
|
+
repulp convert samples/ --output samples/converted --workers 4 --no-cache
|
|
359
|
+
|
|
360
|
+
# Extract tables from a sample
|
|
361
|
+
repulp extract tables samples/architecture.html --format json
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
## License
|
|
365
|
+
|
|
366
|
+
[MIT](LICENSE) — Sunny Kumar
|