repulp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. repulp-0.1.0/.github/workflows/ci.yml +31 -0
  2. repulp-0.1.0/.github/workflows/publish.yml +72 -0
  3. repulp-0.1.0/.gitignore +30 -0
  4. repulp-0.1.0/.python-version +1 -0
  5. repulp-0.1.0/.repulp.example.toml +21 -0
  6. repulp-0.1.0/LICENSE +21 -0
  7. repulp-0.1.0/PKG-INFO +366 -0
  8. repulp-0.1.0/README.md +332 -0
  9. repulp-0.1.0/pyproject.toml +79 -0
  10. repulp-0.1.0/samples/api-docs.html +224 -0
  11. repulp-0.1.0/samples/architecture.html +175 -0
  12. repulp-0.1.0/samples/changelog.html +93 -0
  13. repulp-0.1.0/samples/converted/api-docs.md +85 -0
  14. repulp-0.1.0/samples/converted/architecture.md +103 -0
  15. repulp-0.1.0/samples/converted/changelog.md +54 -0
  16. repulp-0.1.0/samples/converted/employee-directory.md +19 -0
  17. repulp-0.1.0/samples/converted/inventory.md +17 -0
  18. repulp-0.1.0/samples/converted/meeting-notes.md +75 -0
  19. repulp-0.1.0/samples/converted/report.md +58 -0
  20. repulp-0.1.0/samples/converted/research-paper.md +94 -0
  21. repulp-0.1.0/samples/converted/sales-data.md +26 -0
  22. repulp-0.1.0/samples/employee-directory.csv +18 -0
  23. repulp-0.1.0/samples/inventory.csv +16 -0
  24. repulp-0.1.0/samples/meeting-notes.html +125 -0
  25. repulp-0.1.0/samples/report.html +105 -0
  26. repulp-0.1.0/samples/research-paper.html +123 -0
  27. repulp-0.1.0/samples/sales-data.csv +25 -0
  28. repulp-0.1.0/src/repulp/__init__.py +172 -0
  29. repulp-0.1.0/src/repulp/cache.py +88 -0
  30. repulp-0.1.0/src/repulp/cleaner.py +99 -0
  31. repulp-0.1.0/src/repulp/cli.py +492 -0
  32. repulp-0.1.0/src/repulp/config.py +69 -0
  33. repulp-0.1.0/src/repulp/converter.py +151 -0
  34. repulp-0.1.0/src/repulp/engine.py +243 -0
  35. repulp-0.1.0/src/repulp/extractor.py +168 -0
  36. repulp-0.1.0/src/repulp/fetcher.py +62 -0
  37. repulp-0.1.0/src/repulp/formatter.py +56 -0
  38. repulp-0.1.0/src/repulp/frontmatter.py +67 -0
  39. repulp-0.1.0/src/repulp/py.typed +0 -0
  40. repulp-0.1.0/src/repulp/watcher.py +168 -0
  41. repulp-0.1.0/tests/__init__.py +0 -0
  42. repulp-0.1.0/tests/fixtures/sample.csv +3 -0
  43. repulp-0.1.0/tests/fixtures/sample.html +1 -0
  44. repulp-0.1.0/tests/fixtures/sample.md +0 -0
  45. repulp-0.1.0/tests/fixtures/sample.txt +1 -0
  46. repulp-0.1.0/tests/test_api.py +83 -0
  47. repulp-0.1.0/tests/test_cache.py +127 -0
  48. repulp-0.1.0/tests/test_cleaner.py +45 -0
  49. repulp-0.1.0/tests/test_cli.py +63 -0
  50. repulp-0.1.0/tests/test_cli_batch.py +49 -0
  51. repulp-0.1.0/tests/test_config.py +49 -0
  52. repulp-0.1.0/tests/test_converter.py +75 -0
  53. repulp-0.1.0/tests/test_engine.py +225 -0
  54. repulp-0.1.0/tests/test_extractor.py +112 -0
  55. repulp-0.1.0/tests/test_fetcher.py +32 -0
  56. repulp-0.1.0/tests/test_formatter.py +55 -0
  57. repulp-0.1.0/tests/test_frontmatter.py +76 -0
  58. repulp-0.1.0/tests/test_integration.py +67 -0
  59. repulp-0.1.0/tests/test_table_extraction.py +98 -0
  60. repulp-0.1.0/tests/test_watcher.py +198 -0
  61. repulp-0.1.0/uv.lock +1965 -0
@@ -0,0 +1,31 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ name: Test (Python ${{ matrix.python-version }})
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v5
21
+
22
+ - name: Set up Python ${{ matrix.python-version }}
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: ${{ matrix.python-version }}
26
+
27
+ - name: Install dependencies  # --python pins the matrix interpreter; the repo's .python-version (3.12) would otherwise take precedence in uv
28
+ run: uv sync --group dev --python ${{ matrix.python-version }}
29
+
30
+ - name: Run tests  # same --python here, or uv run would rebuild the env for the .python-version interpreter
31
+ run: uv run --python ${{ matrix.python-version }} pytest --cov=repulp
@@ -0,0 +1,72 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ contents: read  # workflow default: checkout only needs read access; id-token: write is granted per publish job for OIDC
9
+
10
+ jobs:
11
+ build:
12
+ name: Build distribution
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Install uv
18
+ uses: astral-sh/setup-uv@v5
19
+
20
+ - name: Set up Python
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: "3.12"
24
+
25
+ - name: Build package
26
+ run: uv build
27
+
28
+ - name: Upload distribution artifacts
29
+ uses: actions/upload-artifact@v4
30
+ with:
31
+ name: dist
32
+ path: dist/
33
+
34
+ publish-pypi:
35
+ name: Publish to PyPI
36
+ needs: build
37
+ runs-on: ubuntu-latest
38
+ environment:
39
+ name: pypi
40
+ url: https://pypi.org/p/repulp
41
+ permissions:
42
+ id-token: write
43
+ steps:
44
+ - name: Download distribution artifacts
45
+ uses: actions/download-artifact@v4
46
+ with:
47
+ name: dist
48
+ path: dist/
49
+
50
+ - name: Publish to PyPI
51
+ uses: pypa/gh-action-pypi-publish@release/v1
52
+
53
+ publish-testpypi:
54
+ name: Publish to TestPyPI
55
+ needs: build
56
+ runs-on: ubuntu-latest
57
+ environment:
58
+ name: testpypi
59
+ url: https://test.pypi.org/p/repulp
60
+ permissions:
61
+ id-token: write
62
+ steps:
63
+ - name: Download distribution artifacts
64
+ uses: actions/download-artifact@v4
65
+ with:
66
+ name: dist
67
+ path: dist/
68
+
69
+ - name: Publish to TestPyPI
70
+ uses: pypa/gh-action-pypi-publish@release/v1
71
+ with:
72
+ repository-url: https://test.pypi.org/legacy/
@@ -0,0 +1,30 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Testing
13
+ .coverage
14
+ htmlcov/
15
+ .pytest_cache/
16
+
17
+ # repulp cache
18
+ .repulp.cache
19
+
20
+ # docs
21
+ docs/
22
+
23
+ # OS
24
+ .DS_Store
25
+
26
+ # Editor
27
+ .idea/
28
+ .vscode/
29
+ *.swp
30
+ *.swo
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,21 @@
1
+ # .repulp.toml — Example repulp configuration
2
+ # Copy this file to .repulp.toml in your project root.
3
+
4
+ [repulp]
5
+ # Output directory for converted markdown files (empty = alongside source files)
6
+ output_dir = "./markdown"
7
+
8
+ # Recursively scan directories
9
+ recursive = true
10
+
11
+ # Post-process markdown (normalize headings, fix tables, strip artifacts)
12
+ clean = true
13
+
14
+ # Number of parallel workers (0 = auto, based on CPU count)
15
+ workers = 0
16
+
17
+ # Only convert files matching these patterns (empty = all supported formats)
18
+ include = ["*.pdf", "*.docx", "*.pptx"]
19
+
20
+ # Skip files matching these patterns
21
+ exclude = ["*.tmp"]
repulp-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sunny Kumar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
repulp-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,366 @@
1
+ Metadata-Version: 2.4
2
+ Name: repulp
3
+ Version: 0.1.0
4
+ Summary: Parallel batch document conversion, watch mode, and structured extraction — powered by MarkItDown.
5
+ Project-URL: Homepage, https://github.com/5unnyKu/repulp
6
+ Project-URL: Repository, https://github.com/5unnyKu/repulp
7
+ Project-URL: Issues, https://github.com/5unnyKu/repulp/issues
8
+ Author-email: Sunny Kumar <5unnykum4r@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: batch,cli,converter,dataframe,document,docx,extraction,markdown,markitdown,parallel,pdf,watch
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: httpx>=0.27.0
26
+ Requires-Dist: markitdown[all]>=0.1.5
27
+ Requires-Dist: rich>=13.0.0
28
+ Requires-Dist: tomli>=2.0.0; python_version < '3.11'
29
+ Requires-Dist: typer>=0.15.0
30
+ Requires-Dist: watchfiles>=1.0.0
31
+ Provides-Extra: tables
32
+ Requires-Dist: pandas>=2.0.0; extra == 'tables'
33
+ Description-Content-Type: text/markdown
34
+
35
+ # repulp
36
+
37
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/downloads/)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
39
+ ![Tests](https://img.shields.io/badge/tests-162%20passing-brightgreen)
40
+
41
+ Parallel batch document conversion, watch mode, and structured extraction — powered by [MarkItDown](https://github.com/microsoft/markitdown).
42
+
43
+ repulp wraps Microsoft's MarkItDown with a production workflow layer: parallel batch processing, incremental caching, file watching, table extraction, and a rich CLI.
44
+
45
+ ## Why repulp?
46
+
47
+ MarkItDown converts files one at a time. repulp adds everything you need for real-world document pipelines:
48
+
49
+ | Feature | MarkItDown | repulp |
50
+ |---------|-----------|--------|
51
+ | Single file conversion | Yes | Yes |
52
+ | Parallel batch conversion | No | Yes (ProcessPoolExecutor) |
53
+ | Incremental cache (skip unchanged) | No | Yes (SHA256 hashing) |
54
+ | Watch mode (auto-convert on save) | No | Yes (watchfiles) |
55
+ | Extract tables as DataFrames/CSV | No | Yes |
56
+ | CLI with progress bars | No | Yes (Rich + Typer) |
57
+ | Config files (`.repulp.toml`) | No | Yes |
58
+
59
+ ## Supported Formats
60
+
61
+ PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, CSV, HTML, TXT, MD, RST, JSON, XML, YAML, images (JPEG, PNG, GIF, BMP, TIFF, WEBP), audio (MP3, WAV, FLAC), and more via MarkItDown.
62
+
63
+ > **Note:** Formats like HTML, CSV, PDF, DOCX, PPTX, and XLSX produce the richest Markdown output. Plain text formats (TXT, JSON, YAML, XML, RST) are passed through with minimal transformation by the underlying MarkItDown engine.
64
+
65
+ ## Quick Start
66
+
67
+ ```bash
68
+ pip install repulp
69
+ ```
70
+
71
+ Or with [uv](https://docs.astral.sh/uv/):
72
+
73
+ ```bash
74
+ uv add repulp
75
+ ```
76
+
77
+ **Convert a directory with parallel workers:**
78
+
79
+ ```bash
80
+ repulp convert ./documents --workers 4 --output ./markdown
81
+ ```
82
+
83
+ **Watch a folder and auto-convert on changes:**
84
+
85
+ ```bash
86
+ repulp watch ./incoming --output ./converted
87
+ ```
88
+
89
+ **Extract tables from a PDF as CSV:**
90
+
91
+ ```bash
92
+ repulp extract tables report.pdf --format csv --output ./tables
93
+ ```
94
+
95
+ ## CLI Reference
96
+
97
+ ### `repulp convert`
98
+
99
+ Convert files, directories, or URLs to Markdown.
100
+
101
+ ```bash
102
+ # Single file
103
+ repulp convert report.pdf
104
+
105
+ # Directory with parallel workers
106
+ repulp convert ./docs --workers 4 --output ./markdown
107
+
108
+ # Recursive with filters
109
+ repulp convert ./docs -r --include "*.pdf,*.docx" --exclude "*.tmp"
110
+
111
+ # Incremental (skip unchanged files, enabled by default)
112
+ repulp convert ./docs
113
+ repulp convert ./docs # second run skips unchanged files
114
+
115
+ # Force reconvert all
116
+ repulp convert ./docs --no-cache
117
+
118
+ # URL
119
+ repulp convert https://example.com/page
120
+
121
+ # Stdin
122
+ cat file.html | repulp convert -
123
+
124
+ # Output to stdout
125
+ repulp convert report.pdf --stdout
126
+
127
+ # With frontmatter metadata
128
+ repulp convert report.pdf --frontmatter
129
+
130
+ # Different output formats
131
+ repulp convert report.pdf --format text
132
+ repulp convert report.pdf --format json
133
+ ```
134
+
135
+ | Option | Short | Description |
136
+ |--------|-------|-------------|
137
+ | `--output` | `-o` | Output directory |
138
+ | `--recursive` | `-r` | Scan subdirectories |
139
+ | `--workers` | `-w` | Parallel workers (0 = auto) |
140
+ | `--no-cache` | | Disable incremental cache |
141
+ | `--include` | `-I` | Glob patterns to include |
142
+ | `--exclude` | `-E` | Glob patterns to exclude |
143
+ | `--stdout` | `-s` | Print to stdout |
144
+ | `--frontmatter` | `-f` | Add YAML frontmatter |
145
+ | `--format` | `-F` | Output format: md, text, json |
146
+ | `--no-clean` | | Skip markdown post-processing |
147
+
148
+ ### `repulp watch`
149
+
150
+ Watch a directory and auto-convert on file changes.
151
+
152
+ ```bash
153
+ repulp watch ./incoming --output ./converted
154
+ repulp watch ./docs --include "*.pdf" --debounce 1000
155
+ repulp watch ./docs --on-change "echo converted"
156
+ ```
157
+
158
+ | Option | Description |
159
+ |--------|-------------|
160
+ | `--output` / `-o` | Output directory |
161
+ | `--include` / `-I` | Glob patterns to include |
162
+ | `--exclude` / `-E` | Glob patterns to exclude |
163
+ | `--no-clean` | Skip markdown cleanup |
164
+ | `--debounce` | Debounce interval in ms (default: 500) |
165
+ | `--on-change` | Shell command after each conversion |
166
+
167
+ ### `repulp extract`
168
+
169
+ Extract structured elements from documents.
170
+
171
+ ```bash
172
+ # Tables as CSV
173
+ repulp extract tables report.pdf --format csv
174
+
175
+ # Tables as JSON
176
+ repulp extract tables report.pdf --format json
177
+
178
+ # Save tables to files
179
+ repulp extract tables report.pdf --format csv --output ./tables
180
+
181
+ # Links
182
+ repulp extract links page.html
183
+ repulp extract links page.html --format json
184
+
185
+ # Headings
186
+ repulp extract headings report.pdf
187
+
188
+ # Images
189
+ repulp extract images document.docx
190
+ ```
191
+
192
+ ## Python API
193
+
194
+ ```python
195
+ import repulp
196
+
197
+ # Convert a single file
198
+ result = repulp.convert("report.pdf")
199
+ print(result.markdown)
200
+
201
+ # Convert with options
202
+ result = repulp.convert("report.pdf", frontmatter=True, format="json")
203
+
204
+ # Batch convert a directory
205
+ result = repulp.batch("./documents", workers=4, recursive=True)
206
+ print(f"{result.succeeded}/{result.total} converted in {result.elapsed:.1f}s")
207
+
208
+ # Incremental batch (skip unchanged)
209
+ result = repulp.batch("./documents", incremental=True)
210
+ print(f"{result.skipped} skipped, {result.succeeded} converted")
211
+
212
+ # Extract tables as list of dicts
213
+ tables = repulp.extract_tables("report.pdf")
214
+ for table in tables:
215
+ for row in table:
216
+ print(row)
217
+
218
+ # Extract tables as pandas DataFrames
219
+ tables = repulp.extract_tables("report.pdf", format="dataframe")
220
+ df = tables[0]
221
+
222
+ # Extract tables as CSV strings
223
+ tables = repulp.extract_tables("report.pdf", format="csv")
224
+
225
+ # Watch a directory
226
+ repulp.watch("./incoming", output_dir="./converted")
227
+ ```
228
+
229
+ ### DataFrame Support
230
+
231
+ Install with the `tables` extra for pandas DataFrame support:
232
+
233
+ ```bash
234
+ pip install repulp[tables]
235
+ ```
236
+
237
+ ```python
238
+ import repulp
239
+
240
+ tables = repulp.extract_tables("financials.xlsx", format="dataframe")
241
+ df = tables[0]
242
+ print(df.describe())
243
+ df.to_csv("output.csv", index=False)
244
+ ```
245
+
246
+ ## Configuration
247
+
248
+ Create `.repulp.toml` in your project root:
249
+
250
+ ```toml
251
+ [repulp]
252
+ output_dir = "./markdown"
253
+ recursive = true
254
+ clean = true
255
+ workers = 0 # 0 = auto (CPU count - 1)
256
+ include = ["*.pdf", "*.docx", "*.pptx"]
257
+ exclude = ["*.tmp"]
258
+ ```
259
+
260
+ Or use `[tool.repulp]` in `pyproject.toml`:
261
+
262
+ ```toml
263
+ [tool.repulp]
264
+ output_dir = "./markdown"
265
+ recursive = true
266
+ ```
267
+
268
+ CLI flags override config file values.
269
+
270
+ ## Architecture
271
+
272
+ ```
273
+ src/repulp/
274
+ ├── __init__.py # Public API: convert(), batch(), extract_tables(), watch()
275
+ ├── cli.py # Typer CLI with convert, watch, extract subcommands
276
+ ├── converter.py # MarkItDown wrapper for single-file conversion
277
+ ├── engine.py # Parallel batch engine (ProcessPoolExecutor)
278
+ ├── cache.py # Incremental build cache (SHA256 file hashing)
279
+ ├── watcher.py # File watcher (watchfiles) for auto-conversion
280
+ ├── extractor.py # Table, link, heading, image extraction from Markdown
281
+ ├── cleaner.py # Markdown post-processing and cleanup
282
+ ├── config.py # TOML config file loading (.repulp.toml / pyproject.toml)
283
+ ├── fetcher.py # URL fetching via httpx
284
+ ├── frontmatter.py # YAML frontmatter injection
285
+ └── formatter.py # Output format handling (md, text, json)
286
+ ```
287
+
288
+ ## Libraries Used
289
+
290
+ repulp is built on top of these libraries:
291
+
292
+ | Library | Purpose |
293
+ |---------|---------|
294
+ | [MarkItDown](https://github.com/microsoft/markitdown) | Core document-to-Markdown conversion engine by Microsoft. Handles PDF, DOCX, PPTX, XLSX, HTML, CSV, images, audio, and more. |
295
+ | [Typer](https://typer.tiangolo.com/) | CLI framework built on Click. Provides argument parsing, help generation, and shell completion. |
296
+ | [Rich](https://rich.readthedocs.io/) | Terminal formatting — progress bars, tables, panels, colored output. |
297
+ | [watchfiles](https://watchfiles.helpmanual.io/) | Rust-backed file watcher. Used for the `watch` command to detect file changes with low latency. |
298
+ | [httpx](https://www.python-httpx.org/) | HTTP client for URL fetching. Used when converting URLs to Markdown. |
299
+ | [pandas](https://pandas.pydata.org/) | *(optional)* DataFrame support for structured table extraction. Install with `pip install repulp[tables]`. |
300
+ | [tomli](https://github.com/hukkin/tomli) | TOML parser for `.repulp.toml` config files. Only needed on Python < 3.11 (3.11+ has `tomllib` in stdlib). |
301
+
302
+ ### Build & Dev Tools
303
+
304
+ | Tool | Purpose |
305
+ |------|---------|
306
+ | [hatchling](https://hatch.pypa.io/) | Build backend for packaging |
307
+ | [uv](https://docs.astral.sh/uv/) | Fast Python package manager |
308
+ | [pytest](https://docs.pytest.org/) | Test framework (162 tests) |
309
+
310
+ ## Contributing
311
+
312
+ Contributions are welcome! Here's how to get started:
313
+
314
+ ### Setup
315
+
316
+ ```bash
317
+ git clone https://github.com/5unnykum4r/repulp.git
318
+ cd repulp
319
+ uv sync --group dev
320
+ ```
321
+
322
+ ### Running Tests
323
+
324
+ ```bash
325
+ uv run pytest tests/ -v
326
+ ```
327
+
328
+ ### Project Conventions
329
+
330
+ - **Python 3.10+** — uses `from __future__ import annotations` for modern type hints
331
+ - **No vague comments** — code should be self-documenting; comments explain *why*, not *what*
332
+ - **Tests live in `tests/`** — mirror the source structure (e.g., `test_engine.py` tests `engine.py`)
333
+ - **Incremental commits** — one logical change per commit
334
+
335
+ ### How to Contribute
336
+
337
+ 1. Fork the repository
338
+ 2. Create a feature branch (`git checkout -b feat/my-feature`)
339
+ 3. Write tests for your changes
340
+ 4. Make sure all tests pass (`uv run pytest tests/ -v`)
341
+ 5. Commit your changes with a descriptive message
342
+ 6. Push to your fork and open a Pull Request
343
+
344
+ ### Areas for Contribution
345
+
346
+ - Adding support for new output formats
347
+ - Performance improvements to the batch engine
348
+ - Better error messages and diagnostics
349
+ - Documentation improvements
350
+ - New extraction types (e.g., code blocks, footnotes)
351
+
352
+ ## Samples
353
+
354
+ The `samples/` directory contains example files (HTML, CSV) that demonstrate repulp's conversion capabilities:
355
+
356
+ ```bash
357
+ # Convert all samples
358
+ repulp convert samples/ --output samples/converted --workers 4 --no-cache
359
+
360
+ # Extract tables from a sample
361
+ repulp extract tables samples/architecture.html --format json
362
+ ```
363
+
364
+ ## License
365
+
366
+ [MIT](LICENSE) — Sunny Kumar