rc-docparser 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/CHANGELOG.md +16 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/PKG-INFO +25 -39
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/README.md +20 -15
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/pyproject.toml +17 -7
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/__init__.py +1 -2
- rc_docparser-0.2.2/src/docparser/_version.py +1 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/cli.py +1 -4
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/docx.py +1 -3
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/epub.py +2 -2
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/html.py +2 -2
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/image.py +1 -1
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/localvlm.py +2 -2
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/ocr.py +2 -2
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/pdf.py +3 -3
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/pdf_backends.py +6 -6
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/pptx.py +2 -2
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/xlsx.py +3 -6
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/conftest.py +2 -3
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/.gitignore +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/LICENSE +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/common.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/csvtab.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/orchestrator.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/py.typed +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/src/docparser/text.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_cli.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_common.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_csv.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_docx.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_epub.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_html.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_image.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_localvlm.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_orchestrator.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_pdf.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_pdf_backends.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_pptx.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_text.py +0 -0
- {rc_docparser-0.2.0 → rc_docparser-0.2.2}/tests/test_xlsx.py +0 -0
|
@@ -6,6 +6,19 @@ follows [Semantic Versioning](https://semver.org/).
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.2.2] - 2026-06-19
|
|
10
|
+
### Changed
|
|
11
|
+
- README/PyPI page now leads with the `rc-docparser` distribution name and
|
|
12
|
+
clarifies the `docparser` import name (docs-only release).
|
|
13
|
+
|
|
14
|
+
## [0.2.1] - 2026-06-19
|
|
15
|
+
### Changed
|
|
16
|
+
- Packaging aligned with Research Commons conventions: PEP 639 SPDX
|
|
17
|
+
`license = "MIT"` (with `LICENSE` shipped via `license-files`), single-source
|
|
18
|
+
version in `src/docparser/_version.py`, `authors = ["rc-docparser contributors"]`,
|
|
19
|
+
a `Typing :: Typed` classifier, and a `Research Commons` project URL.
|
|
20
|
+
- Tooling: stricter ruff rule set (added `SIM`) and a coverage report config.
|
|
21
|
+
|
|
9
22
|
## [0.2.0] - 2026-06-16
|
|
10
23
|
### Added
|
|
11
24
|
- PPTX parser (extra `[pptx]`): walks slides in order; emits per-slide
|
|
@@ -36,6 +49,9 @@ follows [Semantic Versioning](https://semver.org/).
|
|
|
36
49
|
### Changed
|
|
37
50
|
- `parse_path` and `run_all` accept the new PDF and captioning options;
|
|
38
51
|
non-PDF parsers ignore PDF-only keyword arguments.
|
|
52
|
+
- Published on PyPI as `rc-docparser` (the bare `docparser` name is blocked by
|
|
53
|
+
PyPI's project-name similarity guard). The Python import name is unchanged:
|
|
54
|
+
`import docparser`.
|
|
39
55
|
|
|
40
56
|
## [0.1.0] - 2026-06-12
|
|
41
57
|
### Added
|
|
@@ -1,39 +1,19 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rc-docparser
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Convert research literature (.docx, .xlsx, .pdf, .html, .pptx, .epub, .txt, .md, .csv) into structured Markdown + JSON corpora, with optional VLM image semantic captioning.
|
|
5
5
|
Project-URL: Homepage, https://github.com/Research-Commons/docparser
|
|
6
6
|
Project-URL: Repository, https://github.com/Research-Commons/docparser
|
|
7
7
|
Project-URL: Issues, https://github.com/Research-Commons/docparser/issues
|
|
8
8
|
Project-URL: Changelog, https://github.com/Research-Commons/docparser/blob/main/CHANGELOG.md
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
Copyright (c) 2026 Research Commons
|
|
13
|
-
|
|
14
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
-
in the Software without restriction, including without limitation the rights
|
|
17
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
-
furnished to do so, subject to the following conditions:
|
|
20
|
-
|
|
21
|
-
The above copyright notice and this permission notice shall be included in all
|
|
22
|
-
copies or substantial portions of the Software.
|
|
23
|
-
|
|
24
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
-
SOFTWARE.
|
|
9
|
+
Project-URL: Research Commons, https://lab.researchcommons.ai/
|
|
10
|
+
Author: rc-docparser contributors
|
|
11
|
+
License-Expression: MIT
|
|
31
12
|
License-File: LICENSE
|
|
32
13
|
Keywords: corpus,csv,docx,epub,html,literature,markdown,ocr,parser,pdf,pptx,rag,vlm,xlsx
|
|
33
14
|
Classifier: Development Status :: 4 - Beta
|
|
34
15
|
Classifier: Intended Audience :: Developers
|
|
35
16
|
Classifier: Intended Audience :: Science/Research
|
|
36
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
37
17
|
Classifier: Operating System :: OS Independent
|
|
38
18
|
Classifier: Programming Language :: Python :: 3
|
|
39
19
|
Classifier: Programming Language :: Python :: 3.10
|
|
@@ -41,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
41
21
|
Classifier: Programming Language :: Python :: 3.12
|
|
42
22
|
Classifier: Topic :: Scientific/Engineering
|
|
43
23
|
Classifier: Topic :: Text Processing :: Markup
|
|
24
|
+
Classifier: Typing :: Typed
|
|
44
25
|
Requires-Python: >=3.10
|
|
45
26
|
Requires-Dist: lxml>=5.3.0
|
|
46
27
|
Requires-Dist: openpyxl>=3.1.5
|
|
@@ -99,7 +80,9 @@ Provides-Extra: vlm
|
|
|
99
80
|
Requires-Dist: requests>=2.32.3; extra == 'vlm'
|
|
100
81
|
Description-Content-Type: text/markdown
|
|
101
82
|
|
|
102
|
-
# docparser
|
|
83
|
+
# rc-docparser
|
|
84
|
+
|
|
85
|
+
_Published on PyPI as **`rc-docparser`**; import it as `docparser` (`import docparser`)._
|
|
103
86
|
|
|
104
87
|
Convert research literature (`.docx`, `.xlsx`, `.pdf`, `.html`, `.pptx`,
|
|
105
88
|
`.epub`, `.txt`, `.md`, `.csv`) into a clean, reproducible **Markdown + JSON**
|
|
@@ -121,29 +104,32 @@ via OpenRouter, OpenAI, Gemini, a local server, or a fully-local model.
|
|
|
121
104
|
|
|
122
105
|
## Install
|
|
123
106
|
|
|
107
|
+
The package is published on PyPI as **`rc-docparser`**; the Python import name
|
|
108
|
+
is `docparser` (i.e. `import docparser`).
|
|
109
|
+
|
|
124
110
|
```bash
|
|
125
|
-
pip install docparser # core: docx + xlsx + txt/md + csv/tsv
|
|
126
|
-
pip install 'docparser[pdf]' # + PyMuPDF for PDFs
|
|
127
|
-
pip install 'docparser[html]' # + trafilatura + bs4 for HTML
|
|
128
|
-
pip install 'docparser[pptx]' # + python-pptx for PowerPoint
|
|
129
|
-
pip install 'docparser[epub]' # + EbookLib + bs4 for EPUB
|
|
130
|
-
pip install 'docparser[vlm]' # + requests for API VLM captions
|
|
131
|
-
pip install 'docparser[all]' # everything above (recommended)
|
|
111
|
+
pip install rc-docparser # core: docx + xlsx + txt/md + csv/tsv
|
|
112
|
+
pip install 'rc-docparser[pdf]' # + PyMuPDF for PDFs
|
|
113
|
+
pip install 'rc-docparser[html]' # + trafilatura + bs4 for HTML
|
|
114
|
+
pip install 'rc-docparser[pptx]' # + python-pptx for PowerPoint
|
|
115
|
+
pip install 'rc-docparser[epub]' # + EbookLib + bs4 for EPUB
|
|
116
|
+
pip install 'rc-docparser[vlm]' # + requests for API VLM captions
|
|
117
|
+
pip install 'rc-docparser[all]' # everything above (recommended)
|
|
132
118
|
```
|
|
133
119
|
|
|
134
120
|
Higher-fidelity / heavier features are separate opt-in extras (so the core
|
|
135
121
|
install stays small and MIT):
|
|
136
122
|
|
|
137
123
|
```bash
|
|
138
|
-
pip install 'docparser[tables]' # + pdfplumber for PDF table extraction
|
|
139
|
-
pip install 'docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
|
|
140
|
-
pip install 'docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
|
|
141
|
-
pip install 'docparser[docling]' # IBM Docling PDF backend (MIT)
|
|
142
|
-
pip install 'docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
|
|
143
|
-
pip install 'docparser[localvlm]' # transformers/torch local captioning
|
|
124
|
+
pip install 'rc-docparser[tables]' # + pdfplumber for PDF table extraction
|
|
125
|
+
pip install 'rc-docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
|
|
126
|
+
pip install 'rc-docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
|
|
127
|
+
pip install 'rc-docparser[docling]' # IBM Docling PDF backend (MIT)
|
|
128
|
+
pip install 'rc-docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
|
|
129
|
+
pip install 'rc-docparser[localvlm]' # transformers/torch local captioning
|
|
144
130
|
```
|
|
145
131
|
|
|
146
|
-
`docparser` requires Python 3.10+.
|
|
132
|
+
`rc-docparser` requires Python 3.10+.
|
|
147
133
|
|
|
148
134
|
## Quick start (library)
|
|
149
135
|
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
# docparser
|
|
1
|
+
# rc-docparser
|
|
2
|
+
|
|
3
|
+
_Published on PyPI as **`rc-docparser`**; import it as `docparser` (`import docparser`)._
|
|
2
4
|
|
|
3
5
|
Convert research literature (`.docx`, `.xlsx`, `.pdf`, `.html`, `.pptx`,
|
|
4
6
|
`.epub`, `.txt`, `.md`, `.csv`) into a clean, reproducible **Markdown + JSON**
|
|
@@ -20,29 +22,32 @@ via OpenRouter, OpenAI, Gemini, a local server, or a fully-local model.
|
|
|
20
22
|
|
|
21
23
|
## Install
|
|
22
24
|
|
|
25
|
+
The package is published on PyPI as **`rc-docparser`**; the Python import name
|
|
26
|
+
is `docparser` (i.e. `import docparser`).
|
|
27
|
+
|
|
23
28
|
```bash
|
|
24
|
-
pip install docparser # core: docx + xlsx + txt/md + csv/tsv
|
|
25
|
-
pip install 'docparser[pdf]' # + PyMuPDF for PDFs
|
|
26
|
-
pip install 'docparser[html]' # + trafilatura + bs4 for HTML
|
|
27
|
-
pip install 'docparser[pptx]' # + python-pptx for PowerPoint
|
|
28
|
-
pip install 'docparser[epub]' # + EbookLib + bs4 for EPUB
|
|
29
|
-
pip install 'docparser[vlm]' # + requests for API VLM captions
|
|
30
|
-
pip install 'docparser[all]' # everything above (recommended)
|
|
29
|
+
pip install rc-docparser # core: docx + xlsx + txt/md + csv/tsv
|
|
30
|
+
pip install 'rc-docparser[pdf]' # + PyMuPDF for PDFs
|
|
31
|
+
pip install 'rc-docparser[html]' # + trafilatura + bs4 for HTML
|
|
32
|
+
pip install 'rc-docparser[pptx]' # + python-pptx for PowerPoint
|
|
33
|
+
pip install 'rc-docparser[epub]' # + EbookLib + bs4 for EPUB
|
|
34
|
+
pip install 'rc-docparser[vlm]' # + requests for API VLM captions
|
|
35
|
+
pip install 'rc-docparser[all]' # everything above (recommended)
|
|
31
36
|
```
|
|
32
37
|
|
|
33
38
|
Higher-fidelity / heavier features are separate opt-in extras (so the core
|
|
34
39
|
install stays small and MIT):
|
|
35
40
|
|
|
36
41
|
```bash
|
|
37
|
-
pip install 'docparser[tables]' # + pdfplumber for PDF table extraction
|
|
38
|
-
pip install 'docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
|
|
39
|
-
pip install 'docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
|
|
40
|
-
pip install 'docparser[docling]' # IBM Docling PDF backend (MIT)
|
|
41
|
-
pip install 'docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
|
|
42
|
-
pip install 'docparser[localvlm]' # transformers/torch local captioning
|
|
42
|
+
pip install 'rc-docparser[tables]' # + pdfplumber for PDF table extraction
|
|
43
|
+
pip install 'rc-docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
|
|
44
|
+
pip install 'rc-docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
|
|
45
|
+
pip install 'rc-docparser[docling]' # IBM Docling PDF backend (MIT)
|
|
46
|
+
pip install 'rc-docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
|
|
47
|
+
pip install 'rc-docparser[localvlm]' # transformers/torch local captioning
|
|
43
48
|
```
|
|
44
49
|
|
|
45
|
-
`docparser` requires Python 3.10+.
|
|
50
|
+
`rc-docparser` requires Python 3.10+.
|
|
46
51
|
|
|
47
52
|
## Quick start (library)
|
|
48
53
|
|
|
@@ -7,11 +7,10 @@ name = "rc-docparser"
|
|
|
7
7
|
dynamic = ["version"]
|
|
8
8
|
description = "Convert research literature (.docx, .xlsx, .pdf, .html, .pptx, .epub, .txt, .md, .csv) into structured Markdown + JSON corpora, with optional VLM image semantic captioning."
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
license = { file = "LICENSE" }
|
|
10
|
+
license = "MIT"
|
|
11
|
+
license-files = ["LICENSE"]
|
|
11
12
|
requires-python = ">=3.10"
|
|
12
|
-
authors = [
|
|
13
|
-
{ name = "Research Commons", email = "shubhankitsingh@researchcommons.ai" },
|
|
14
|
-
]
|
|
13
|
+
authors = [{ name = "rc-docparser contributors" }]
|
|
15
14
|
keywords = [
|
|
16
15
|
"docx",
|
|
17
16
|
"xlsx",
|
|
@@ -32,7 +31,6 @@ classifiers = [
|
|
|
32
31
|
"Development Status :: 4 - Beta",
|
|
33
32
|
"Intended Audience :: Developers",
|
|
34
33
|
"Intended Audience :: Science/Research",
|
|
35
|
-
"License :: OSI Approved :: MIT License",
|
|
36
34
|
"Operating System :: OS Independent",
|
|
37
35
|
"Programming Language :: Python :: 3",
|
|
38
36
|
"Programming Language :: Python :: 3.10",
|
|
@@ -40,6 +38,7 @@ classifiers = [
|
|
|
40
38
|
"Programming Language :: Python :: 3.12",
|
|
41
39
|
"Topic :: Scientific/Engineering",
|
|
42
40
|
"Topic :: Text Processing :: Markup",
|
|
41
|
+
"Typing :: Typed",
|
|
43
42
|
]
|
|
44
43
|
dependencies = [
|
|
45
44
|
"python-docx>=1.1.2",
|
|
@@ -97,9 +96,10 @@ Homepage = "https://github.com/Research-Commons/docparser"
|
|
|
97
96
|
Repository = "https://github.com/Research-Commons/docparser"
|
|
98
97
|
Issues = "https://github.com/Research-Commons/docparser/issues"
|
|
99
98
|
Changelog = "https://github.com/Research-Commons/docparser/blob/main/CHANGELOG.md"
|
|
99
|
+
"Research Commons" = "https://lab.researchcommons.ai/"
|
|
100
100
|
|
|
101
101
|
[tool.hatch.version]
|
|
102
|
-
path = "src/docparser/__init__.py"
|
|
102
|
+
path = "src/docparser/_version.py"
|
|
103
103
|
|
|
104
104
|
[tool.hatch.build.targets.wheel]
|
|
105
105
|
packages = ["src/docparser"]
|
|
@@ -121,7 +121,7 @@ line-length = 100
|
|
|
121
121
|
target-version = "py310"
|
|
122
122
|
|
|
123
123
|
[tool.ruff.lint]
|
|
124
|
-
select = ["E", "F", "W", "I", "B", "UP", "RUF"]
|
|
124
|
+
select = ["E", "F", "W", "I", "B", "UP", "SIM", "RUF"]
|
|
125
125
|
ignore = ["E501"]
|
|
126
126
|
|
|
127
127
|
[tool.pytest.ini_options]
|
|
@@ -135,6 +135,16 @@ markers = [
|
|
|
135
135
|
source = ["src/docparser"]
|
|
136
136
|
branch = true
|
|
137
137
|
|
|
138
|
+
[tool.coverage.report]
|
|
139
|
+
show_missing = true
|
|
140
|
+
exclude_lines = [
|
|
141
|
+
"pragma: no cover",
|
|
142
|
+
"if TYPE_CHECKING:",
|
|
143
|
+
"@overload",
|
|
144
|
+
"raise NotImplementedError",
|
|
145
|
+
"\\.\\.\\.",
|
|
146
|
+
]
|
|
147
|
+
|
|
138
148
|
[tool.mypy]
|
|
139
149
|
python_version = "3.10"
|
|
140
150
|
files = ["src/docparser"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.2"
|
|
@@ -21,10 +21,7 @@ from .orchestrator import SUPPORTED_EXTENSIONS, parse_path, run_all
|
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def _layout_from_args(args: argparse.Namespace) -> WorkspaceLayout:
|
|
24
|
-
if args.workspace:
|
|
25
|
-
layout = WorkspaceLayout.under(args.workspace)
|
|
26
|
-
else:
|
|
27
|
-
layout = WorkspaceLayout()
|
|
24
|
+
layout = WorkspaceLayout.under(args.workspace) if args.workspace else WorkspaceLayout()
|
|
28
25
|
if getattr(args, "raw_dir", None):
|
|
29
26
|
layout.raw_dir = Path(args.raw_dir)
|
|
30
27
|
if getattr(args, "parsed_dir", None):
|
|
@@ -140,9 +140,7 @@ def _is_caption(p: Paragraph) -> bool:
|
|
|
140
140
|
if style in CAPTION_STYLE_NAMES:
|
|
141
141
|
return True
|
|
142
142
|
text = (p.text or "").strip()
|
|
143
|
-
if CAPTION_RE.match(text):
|
|
144
|
-
return True
|
|
145
|
-
return False
|
|
143
|
+
return bool(CAPTION_RE.match(text))
|
|
146
144
|
|
|
147
145
|
|
|
148
146
|
@dataclass
|
|
@@ -5,7 +5,7 @@ over each chapter (headings / paragraphs / lists / tables / images), and
|
|
|
5
5
|
extracts embedded images to the asset directory. Embedded images can be
|
|
6
6
|
captioned via a ``captioner`` callable.
|
|
7
7
|
|
|
8
|
-
Requires the ``[epub]`` extra: ``pip install 'docparser[epub]'`` (which also
|
|
8
|
+
Requires the ``[epub]`` extra: ``pip install 'rc-docparser[epub]'`` (which also
|
|
9
9
|
pulls in BeautifulSoup from the ``[html]`` extra).
|
|
10
10
|
"""
|
|
11
11
|
from __future__ import annotations
|
|
@@ -36,7 +36,7 @@ def _import_deps():
|
|
|
36
36
|
except ImportError as exc: # pragma: no cover - optional dep guard
|
|
37
37
|
raise ImportError(
|
|
38
38
|
"docparser.epub.parse_epub requires the [epub] extra. "
|
|
39
|
-
"Install with: pip install 'docparser[epub]'"
|
|
39
|
+
"Install with: pip install 'rc-docparser[epub]'"
|
|
40
40
|
) from exc
|
|
41
41
|
return ebooklib, epub, BeautifulSoup
|
|
42
42
|
|
|
@@ -8,7 +8,7 @@ Two-tier strategy:
|
|
|
8
8
|
typed blocks (heading/paragraph/list/table/image) when trafilatura returns
|
|
9
9
|
nothing useful or when the caller wants the full structure.
|
|
10
10
|
|
|
11
|
-
Requires the ``[html]`` extra: ``pip install 'docparser[html]'``.
|
|
11
|
+
Requires the ``[html]`` extra: ``pip install 'rc-docparser[html]'``.
|
|
12
12
|
"""
|
|
13
13
|
from __future__ import annotations
|
|
14
14
|
|
|
@@ -37,7 +37,7 @@ def _import_deps():
|
|
|
37
37
|
except ImportError as exc: # pragma: no cover
|
|
38
38
|
raise ImportError(
|
|
39
39
|
"docparser.html.parse_html requires the [html] extra. "
|
|
40
|
-
"Install with: pip install 'docparser[html]'"
|
|
40
|
+
"Install with: pip install 'rc-docparser[html]'"
|
|
41
41
|
) from exc
|
|
42
42
|
return trafilatura, BeautifulSoup
|
|
43
43
|
|
|
@@ -234,7 +234,7 @@ def caption_image(
|
|
|
234
234
|
if requests is None: # pragma: no cover - optional dep guard
|
|
235
235
|
raise ImportError(
|
|
236
236
|
"docparser.image.caption_image requires the [vlm] extra. "
|
|
237
|
-
"Install with: pip install 'docparser[vlm]'"
|
|
237
|
+
"Install with: pip install 'rc-docparser[vlm]'"
|
|
238
238
|
)
|
|
239
239
|
|
|
240
240
|
provider_name, preset = _resolve_provider(provider)
|
|
@@ -8,7 +8,7 @@ image-to-text model such as BLIP.
|
|
|
8
8
|
It honors the same ``VLMResult`` shape and the same ``SHA1(image) x model``
|
|
9
9
|
on-disk cache as the API captioner, so output is interchangeable.
|
|
10
10
|
|
|
11
|
-
Requires the ``[localvlm]`` extra: ``pip install 'docparser[localvlm]'``.
|
|
11
|
+
Requires the ``[localvlm]`` extra: ``pip install 'rc-docparser[localvlm]'``.
|
|
12
12
|
"""
|
|
13
13
|
from __future__ import annotations
|
|
14
14
|
|
|
@@ -32,7 +32,7 @@ def _load_pipeline(model: str):
|
|
|
32
32
|
except ImportError as exc: # pragma: no cover - optional dep
|
|
33
33
|
raise ImportError(
|
|
34
34
|
"docparser.localvlm requires the [localvlm] extra. "
|
|
35
|
-
"Install with: pip install 'docparser[localvlm]'"
|
|
35
|
+
"Install with: pip install 'rc-docparser[localvlm]'"
|
|
36
36
|
) from exc
|
|
37
37
|
return pipeline("image-to-text", model=model)
|
|
38
38
|
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
Uses ``rapidocr-onnxruntime`` by default: a pure-pip ONNX OCR engine that needs
|
|
4
4
|
no system binaries (unlike Tesseract). The engine is created once and reused.
|
|
5
5
|
|
|
6
|
-
Requires the ``[ocr]`` extra: ``pip install 'docparser[ocr]'``.
|
|
6
|
+
Requires the ``[ocr]`` extra: ``pip install 'rc-docparser[ocr]'``.
|
|
7
7
|
"""
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
@@ -12,7 +12,7 @@ from functools import lru_cache
|
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
14
|
_NO_OCR_MSG = (
|
|
15
|
-
"OCR requires the [ocr] extra. Install with: pip install 'docparser[ocr]'"
|
|
15
|
+
"OCR requires the [ocr] extra. Install with: pip install 'rc-docparser[ocr]'"
|
|
16
16
|
)
|
|
17
17
|
|
|
18
18
|
|
|
@@ -12,7 +12,7 @@ heading classifier based on font sizing. On top of that it offers:
|
|
|
12
12
|
- ``extract_tables`` use ``pdfplumber`` (the ``[tables]`` extra) to emit real
|
|
13
13
|
table blocks instead of flattened text.
|
|
14
14
|
|
|
15
|
-
Requires the ``[pdf]`` extra: ``pip install 'docparser[pdf]'``.
|
|
15
|
+
Requires the ``[pdf]`` extra: ``pip install 'rc-docparser[pdf]'``.
|
|
16
16
|
"""
|
|
17
17
|
from __future__ import annotations
|
|
18
18
|
|
|
@@ -104,7 +104,7 @@ def parse_pdf(
|
|
|
104
104
|
except ImportError as exc: # pragma: no cover
|
|
105
105
|
raise ImportError(
|
|
106
106
|
"docparser.pdf.parse_pdf requires the [pdf] extra. "
|
|
107
|
-
"Install with: pip install 'docparser[pdf]'"
|
|
107
|
+
"Install with: pip install 'rc-docparser[pdf]'"
|
|
108
108
|
) from exc
|
|
109
109
|
|
|
110
110
|
if ocr not in {"off", "auto", "force"}:
|
|
@@ -218,7 +218,7 @@ def parse_pdf(
|
|
|
218
218
|
except ImportError as exc: # pragma: no cover - optional dep
|
|
219
219
|
raise ImportError(
|
|
220
220
|
"extract_tables=True requires the [tables] extra. "
|
|
221
|
-
"Install with: pip install 'docparser[tables]'"
|
|
221
|
+
"Install with: pip install 'rc-docparser[tables]'"
|
|
222
222
|
) from exc
|
|
223
223
|
out: list[list[list[str]]] = []
|
|
224
224
|
with pdfplumber.open(str(real_source)) as pdf:
|
|
@@ -8,9 +8,9 @@ schema via :func:`docparser.text._blocks_from_markdown`.
|
|
|
8
8
|
|
|
9
9
|
All backends are optional extras and lazily imported:
|
|
10
10
|
|
|
11
|
-
- ``pymupdf4llm`` -> ``pip install 'docparser[pymupdf4llm]'`` (note: AGPL/commercial)
|
|
12
|
-
- ``docling`` -> ``pip install 'docparser[docling]'`` (MIT)
|
|
13
|
-
- ``marker`` -> ``pip install 'docparser[marker]'`` (GPL-3.0)
|
|
11
|
+
- ``pymupdf4llm`` -> ``pip install 'rc-docparser[pymupdf4llm]'`` (note: AGPL/commercial)
|
|
12
|
+
- ``docling`` -> ``pip install 'rc-docparser[docling]'`` (MIT)
|
|
13
|
+
- ``marker`` -> ``pip install 'rc-docparser[marker]'`` (GPL-3.0)
|
|
14
14
|
"""
|
|
15
15
|
from __future__ import annotations
|
|
16
16
|
|
|
@@ -28,7 +28,7 @@ def _markdown_pymupdf4llm(path: Path) -> str:
|
|
|
28
28
|
except ImportError as exc: # pragma: no cover - optional dep
|
|
29
29
|
raise ImportError(
|
|
30
30
|
"backend='pymupdf4llm' requires the [pymupdf4llm] extra. "
|
|
31
|
-
"Install with: pip install 'docparser[pymupdf4llm]'"
|
|
31
|
+
"Install with: pip install 'rc-docparser[pymupdf4llm]'"
|
|
32
32
|
) from exc
|
|
33
33
|
return pymupdf4llm.to_markdown(str(path))
|
|
34
34
|
|
|
@@ -39,7 +39,7 @@ def _markdown_docling(path: Path) -> str:
|
|
|
39
39
|
except ImportError as exc: # pragma: no cover - optional dep
|
|
40
40
|
raise ImportError(
|
|
41
41
|
"backend='docling' requires the [docling] extra. "
|
|
42
|
-
"Install with: pip install 'docparser[docling]'"
|
|
42
|
+
"Install with: pip install 'rc-docparser[docling]'"
|
|
43
43
|
) from exc
|
|
44
44
|
converter = DocumentConverter()
|
|
45
45
|
result = converter.convert(str(path))
|
|
@@ -55,7 +55,7 @@ def _markdown_marker(path: Path) -> str:
|
|
|
55
55
|
except ImportError as exc: # pragma: no cover - optional dep
|
|
56
56
|
raise ImportError(
|
|
57
57
|
"backend='marker' requires the [marker] extra. "
|
|
58
|
-
"Install with: pip install 'docparser[marker]'"
|
|
58
|
+
"Install with: pip install 'rc-docparser[marker]'"
|
|
59
59
|
) from exc
|
|
60
60
|
config_parser = ConfigParser({"output_format": "markdown"})
|
|
61
61
|
converter = PdfConverter(
|
|
@@ -6,7 +6,7 @@ pictures are emitted in shape order. Speaker notes are captured per slide.
|
|
|
6
6
|
Embedded pictures are written to ``layout.assets_dir_for(source)`` and may be
|
|
7
7
|
captioned via a ``captioner`` callable (same contract as the other parsers).
|
|
8
8
|
|
|
9
|
-
Requires the ``[pptx]`` extra: ``pip install 'docparser[pptx]'``.
|
|
9
|
+
Requires the ``[pptx]`` extra: ``pip install 'rc-docparser[pptx]'``.
|
|
10
10
|
"""
|
|
11
11
|
from __future__ import annotations
|
|
12
12
|
|
|
@@ -33,7 +33,7 @@ def _import_pptx():
|
|
|
33
33
|
except ImportError as exc: # pragma: no cover - optional dep guard
|
|
34
34
|
raise ImportError(
|
|
35
35
|
"docparser.pptx.parse_pptx requires the [pptx] extra. "
|
|
36
|
-
"Install with: pip install 'docparser[pptx]'"
|
|
36
|
+
"Install with: pip install 'rc-docparser[pptx]'"
|
|
37
37
|
) from exc
|
|
38
38
|
return Presentation, MSO_SHAPE_TYPE
|
|
39
39
|
|
|
@@ -6,6 +6,7 @@ embedded images.
|
|
|
6
6
|
"""
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
import contextlib
|
|
9
10
|
import datetime as _dt
|
|
10
11
|
import io
|
|
11
12
|
from collections.abc import Callable
|
|
@@ -66,10 +67,8 @@ def _cell_record(cell: Cell, formulas_ws: Worksheet | None) -> dict[str, Any]:
|
|
|
66
67
|
except Exception:
|
|
67
68
|
pass
|
|
68
69
|
if cell.comment is not None:
|
|
69
|
-
try:
|
|
70
|
+
with contextlib.suppress(Exception):
|
|
70
71
|
rec["comment"] = str(cell.comment.text)
|
|
71
|
-
except Exception:
|
|
72
|
-
pass
|
|
73
72
|
return rec
|
|
74
73
|
|
|
75
74
|
|
|
@@ -84,10 +83,8 @@ def _extract_images(
|
|
|
84
83
|
data: bytes | None = None
|
|
85
84
|
if hasattr(ref, "read"):
|
|
86
85
|
data = ref.read()
|
|
87
|
-
try:
|
|
86
|
+
with contextlib.suppress(Exception):
|
|
88
87
|
ref.seek(0)
|
|
89
|
-
except Exception:
|
|
90
|
-
pass
|
|
91
88
|
elif isinstance(ref, (bytes, bytearray)):
|
|
92
89
|
data = bytes(ref)
|
|
93
90
|
else:
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Shared pytest fixtures: synthetic docx, xlsx, pdf, html files."""
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
+
import contextlib
|
|
4
5
|
import io
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
|
|
@@ -51,10 +52,8 @@ def sample_docx(tmp_path: Path) -> Path:
|
|
|
51
52
|
img_path.write_bytes(_png_bytes())
|
|
52
53
|
doc.add_picture(str(img_path))
|
|
53
54
|
cap_para = doc.add_paragraph("Figure 1: a red square example.")
|
|
54
|
-
try:
|
|
55
|
+
with contextlib.suppress(KeyError):
|
|
55
56
|
cap_para.style = doc.styles["Caption"]
|
|
56
|
-
except KeyError:
|
|
57
|
-
pass
|
|
58
57
|
|
|
59
58
|
doc.add_heading("Section B", level=2)
|
|
60
59
|
doc.add_paragraph("Conclusion paragraph.")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|