rc-docparser 0.2.0 (tar.gz) → 0.2.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39):
  1. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/CHANGELOG.md +11 -0
  2. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/PKG-INFO +22 -38
  3. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/README.md +17 -14
  4. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/pyproject.toml +17 -7
  5. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/__init__.py +1 -2
  6. rc_docparser-0.2.1/src/docparser/_version.py +1 -0
  7. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/cli.py +1 -4
  8. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/docx.py +1 -3
  9. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/epub.py +2 -2
  10. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/html.py +2 -2
  11. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/image.py +1 -1
  12. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/localvlm.py +2 -2
  13. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/ocr.py +2 -2
  14. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/pdf.py +3 -3
  15. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/pdf_backends.py +6 -6
  16. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/pptx.py +2 -2
  17. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/xlsx.py +3 -6
  18. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/conftest.py +2 -3
  19. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/.gitignore +0 -0
  20. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/LICENSE +0 -0
  21. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/common.py +0 -0
  22. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/csvtab.py +0 -0
  23. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/orchestrator.py +0 -0
  24. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/py.typed +0 -0
  25. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/src/docparser/text.py +0 -0
  26. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_cli.py +0 -0
  27. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_common.py +0 -0
  28. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_csv.py +0 -0
  29. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_docx.py +0 -0
  30. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_epub.py +0 -0
  31. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_html.py +0 -0
  32. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_image.py +0 -0
  33. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_localvlm.py +0 -0
  34. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_orchestrator.py +0 -0
  35. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_pdf.py +0 -0
  36. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_pdf_backends.py +0 -0
  37. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_pptx.py +0 -0
  38. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_text.py +0 -0
  39. {rc_docparser-0.2.0 → rc_docparser-0.2.1}/tests/test_xlsx.py +0 -0
@@ -6,6 +6,14 @@ follows [Semantic Versioning](https://semver.org/).
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.2.1] - 2026-06-19
10
+ ### Changed
11
+ - Packaging aligned with Research Commons conventions: PEP 639 SPDX
12
+ `license = "MIT"` (with `LICENSE` shipped via `license-files`), single-source
13
+ version in `src/docparser/_version.py`, `authors = ["rc-docparser contributors"]`,
14
+ a `Typing :: Typed` classifier, and a `Research Commons` project URL.
15
+ - Tooling: stricter ruff rule set (added `SIM`) and a coverage report config.
16
+
9
17
  ## [0.2.0] - 2026-06-16
10
18
  ### Added
11
19
  - PPTX parser (extra `[pptx]`): walks slides in order; emits per-slide
@@ -36,6 +44,9 @@ follows [Semantic Versioning](https://semver.org/).
36
44
  ### Changed
37
45
  - `parse_path` and `run_all` accept the new PDF and captioning options;
38
46
  non-PDF parsers ignore PDF-only keyword arguments.
47
+ - Published on PyPI as `rc-docparser` (the bare `docparser` name is blocked by
48
+ PyPI's project-name similarity guard). The Python import name is unchanged:
49
+ `import docparser`.
39
50
 
40
51
  ## [0.1.0] - 2026-06-12
41
52
  ### Added
@@ -1,39 +1,19 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rc-docparser
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Convert research literature (.docx, .xlsx, .pdf, .html, .pptx, .epub, .txt, .md, .csv) into structured Markdown + JSON corpora, with optional VLM image semantic captioning.
5
5
  Project-URL: Homepage, https://github.com/Research-Commons/docparser
6
6
  Project-URL: Repository, https://github.com/Research-Commons/docparser
7
7
  Project-URL: Issues, https://github.com/Research-Commons/docparser/issues
8
8
  Project-URL: Changelog, https://github.com/Research-Commons/docparser/blob/main/CHANGELOG.md
9
- Author-email: Research Commons <shubhankitsingh@researchcommons.ai>
10
- License: MIT License
11
-
12
- Copyright (c) 2026 Research Commons
13
-
14
- Permission is hereby granted, free of charge, to any person obtaining a copy
15
- of this software and associated documentation files (the "Software"), to deal
16
- in the Software without restriction, including without limitation the rights
17
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
- copies of the Software, and to permit persons to whom the Software is
19
- furnished to do so, subject to the following conditions:
20
-
21
- The above copyright notice and this permission notice shall be included in all
22
- copies or substantial portions of the Software.
23
-
24
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
- SOFTWARE.
9
+ Project-URL: Research Commons, https://lab.researchcommons.ai/
10
+ Author: rc-docparser contributors
11
+ License-Expression: MIT
31
12
  License-File: LICENSE
32
13
  Keywords: corpus,csv,docx,epub,html,literature,markdown,ocr,parser,pdf,pptx,rag,vlm,xlsx
33
14
  Classifier: Development Status :: 4 - Beta
34
15
  Classifier: Intended Audience :: Developers
35
16
  Classifier: Intended Audience :: Science/Research
36
- Classifier: License :: OSI Approved :: MIT License
37
17
  Classifier: Operating System :: OS Independent
38
18
  Classifier: Programming Language :: Python :: 3
39
19
  Classifier: Programming Language :: Python :: 3.10
@@ -41,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.11
41
21
  Classifier: Programming Language :: Python :: 3.12
42
22
  Classifier: Topic :: Scientific/Engineering
43
23
  Classifier: Topic :: Text Processing :: Markup
24
+ Classifier: Typing :: Typed
44
25
  Requires-Python: >=3.10
45
26
  Requires-Dist: lxml>=5.3.0
46
27
  Requires-Dist: openpyxl>=3.1.5
@@ -121,29 +102,32 @@ via OpenRouter, OpenAI, Gemini, a local server, or a fully-local model.
121
102
 
122
103
  ## Install
123
104
 
105
+ The package is published on PyPI as **`rc-docparser`**; the Python import name
106
+ is `docparser` (i.e. `import docparser`).
107
+
124
108
  ```bash
125
- pip install docparser # core: docx + xlsx + txt/md + csv/tsv
126
- pip install 'docparser[pdf]' # + PyMuPDF for PDFs
127
- pip install 'docparser[html]' # + trafilatura + bs4 for HTML
128
- pip install 'docparser[pptx]' # + python-pptx for PowerPoint
129
- pip install 'docparser[epub]' # + EbookLib + bs4 for EPUB
130
- pip install 'docparser[vlm]' # + requests for API VLM captions
131
- pip install 'docparser[all]' # everything above (recommended)
109
+ pip install rc-docparser # core: docx + xlsx + txt/md + csv/tsv
110
+ pip install 'rc-docparser[pdf]' # + PyMuPDF for PDFs
111
+ pip install 'rc-docparser[html]' # + trafilatura + bs4 for HTML
112
+ pip install 'rc-docparser[pptx]' # + python-pptx for PowerPoint
113
+ pip install 'rc-docparser[epub]' # + EbookLib + bs4 for EPUB
114
+ pip install 'rc-docparser[vlm]' # + requests for API VLM captions
115
+ pip install 'rc-docparser[all]' # everything above (recommended)
132
116
  ```
133
117
 
134
118
  Higher-fidelity / heavier features are separate opt-in extras (so the core
135
119
  install stays small and MIT):
136
120
 
137
121
  ```bash
138
- pip install 'docparser[tables]' # + pdfplumber for PDF table extraction
139
- pip install 'docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
140
- pip install 'docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
141
- pip install 'docparser[docling]' # IBM Docling PDF backend (MIT)
142
- pip install 'docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
143
- pip install 'docparser[localvlm]' # transformers/torch local captioning
122
+ pip install 'rc-docparser[tables]' # + pdfplumber for PDF table extraction
123
+ pip install 'rc-docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
124
+ pip install 'rc-docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
125
+ pip install 'rc-docparser[docling]' # IBM Docling PDF backend (MIT)
126
+ pip install 'rc-docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
127
+ pip install 'rc-docparser[localvlm]' # transformers/torch local captioning
144
128
  ```
145
129
 
146
- `docparser` requires Python 3.10+.
130
+ `rc-docparser` requires Python 3.10+.
147
131
 
148
132
  ## Quick start (library)
149
133
 
@@ -20,29 +20,32 @@ via OpenRouter, OpenAI, Gemini, a local server, or a fully-local model.
20
20
 
21
21
  ## Install
22
22
 
23
+ The package is published on PyPI as **`rc-docparser`**; the Python import name
24
+ is `docparser` (i.e. `import docparser`).
25
+
23
26
  ```bash
24
- pip install docparser # core: docx + xlsx + txt/md + csv/tsv
25
- pip install 'docparser[pdf]' # + PyMuPDF for PDFs
26
- pip install 'docparser[html]' # + trafilatura + bs4 for HTML
27
- pip install 'docparser[pptx]' # + python-pptx for PowerPoint
28
- pip install 'docparser[epub]' # + EbookLib + bs4 for EPUB
29
- pip install 'docparser[vlm]' # + requests for API VLM captions
30
- pip install 'docparser[all]' # everything above (recommended)
27
+ pip install rc-docparser # core: docx + xlsx + txt/md + csv/tsv
28
+ pip install 'rc-docparser[pdf]' # + PyMuPDF for PDFs
29
+ pip install 'rc-docparser[html]' # + trafilatura + bs4 for HTML
30
+ pip install 'rc-docparser[pptx]' # + python-pptx for PowerPoint
31
+ pip install 'rc-docparser[epub]' # + EbookLib + bs4 for EPUB
32
+ pip install 'rc-docparser[vlm]' # + requests for API VLM captions
33
+ pip install 'rc-docparser[all]' # everything above (recommended)
31
34
  ```
32
35
 
33
36
  Higher-fidelity / heavier features are separate opt-in extras (so the core
34
37
  install stays small and MIT):
35
38
 
36
39
  ```bash
37
- pip install 'docparser[tables]' # + pdfplumber for PDF table extraction
38
- pip install 'docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
39
- pip install 'docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
40
- pip install 'docparser[docling]' # IBM Docling PDF backend (MIT)
41
- pip install 'docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
42
- pip install 'docparser[localvlm]' # transformers/torch local captioning
40
+ pip install 'rc-docparser[tables]' # + pdfplumber for PDF table extraction
41
+ pip install 'rc-docparser[ocr]' # + rapidocr-onnxruntime for scanned PDFs
42
+ pip install 'rc-docparser[pymupdf4llm]' # PyMuPDF4LLM PDF backend (AGPL/commercial)
43
+ pip install 'rc-docparser[docling]' # IBM Docling PDF backend (MIT)
44
+ pip install 'rc-docparser[marker]' # Datalab Marker PDF backend (GPL-3.0)
45
+ pip install 'rc-docparser[localvlm]' # transformers/torch local captioning
43
46
  ```
44
47
 
45
- `docparser` requires Python 3.10+.
48
+ `rc-docparser` requires Python 3.10+.
46
49
 
47
50
  ## Quick start (library)
48
51
 
@@ -7,11 +7,10 @@ name = "rc-docparser"
7
7
  dynamic = ["version"]
8
8
  description = "Convert research literature (.docx, .xlsx, .pdf, .html, .pptx, .epub, .txt, .md, .csv) into structured Markdown + JSON corpora, with optional VLM image semantic captioning."
9
9
  readme = "README.md"
10
- license = { file = "LICENSE" }
10
+ license = "MIT"
11
+ license-files = ["LICENSE"]
11
12
  requires-python = ">=3.10"
12
- authors = [
13
- { name = "Research Commons", email = "shubhankitsingh@researchcommons.ai" },
14
- ]
13
+ authors = [{ name = "rc-docparser contributors" }]
15
14
  keywords = [
16
15
  "docx",
17
16
  "xlsx",
@@ -32,7 +31,6 @@ classifiers = [
32
31
  "Development Status :: 4 - Beta",
33
32
  "Intended Audience :: Developers",
34
33
  "Intended Audience :: Science/Research",
35
- "License :: OSI Approved :: MIT License",
36
34
  "Operating System :: OS Independent",
37
35
  "Programming Language :: Python :: 3",
38
36
  "Programming Language :: Python :: 3.10",
@@ -40,6 +38,7 @@ classifiers = [
40
38
  "Programming Language :: Python :: 3.12",
41
39
  "Topic :: Scientific/Engineering",
42
40
  "Topic :: Text Processing :: Markup",
41
+ "Typing :: Typed",
43
42
  ]
44
43
  dependencies = [
45
44
  "python-docx>=1.1.2",
@@ -97,9 +96,10 @@ Homepage = "https://github.com/Research-Commons/docparser"
97
96
  Repository = "https://github.com/Research-Commons/docparser"
98
97
  Issues = "https://github.com/Research-Commons/docparser/issues"
99
98
  Changelog = "https://github.com/Research-Commons/docparser/blob/main/CHANGELOG.md"
99
+ "Research Commons" = "https://lab.researchcommons.ai/"
100
100
 
101
101
  [tool.hatch.version]
102
- path = "src/docparser/__init__.py"
102
+ path = "src/docparser/_version.py"
103
103
 
104
104
  [tool.hatch.build.targets.wheel]
105
105
  packages = ["src/docparser"]
@@ -121,7 +121,7 @@ line-length = 100
121
121
  target-version = "py310"
122
122
 
123
123
  [tool.ruff.lint]
124
- select = ["E", "F", "W", "I", "B", "UP", "RUF"]
124
+ select = ["E", "F", "W", "I", "B", "UP", "SIM", "RUF"]
125
125
  ignore = ["E501"]
126
126
 
127
127
  [tool.pytest.ini_options]
@@ -135,6 +135,16 @@ markers = [
135
135
  source = ["src/docparser"]
136
136
  branch = true
137
137
 
138
+ [tool.coverage.report]
139
+ show_missing = true
140
+ exclude_lines = [
141
+ "pragma: no cover",
142
+ "if TYPE_CHECKING:",
143
+ "@overload",
144
+ "raise NotImplementedError",
145
+ "\\.\\.\\.",
146
+ ]
147
+
138
148
  [tool.mypy]
139
149
  python_version = "3.10"
140
150
  files = ["src/docparser"]
@@ -19,8 +19,7 @@ Public API
19
19
  """
20
20
  from __future__ import annotations
21
21
 
22
- __version__ = "0.2.0"
23
-
22
+ from ._version import __version__
24
23
  from .common import (
25
24
  WorkspaceLayout,
26
25
  bytes_sha1,
@@ -0,0 +1 @@
1
+ __version__ = "0.2.1"
@@ -21,10 +21,7 @@ from .orchestrator import SUPPORTED_EXTENSIONS, parse_path, run_all
21
21
 
22
22
 
23
23
  def _layout_from_args(args: argparse.Namespace) -> WorkspaceLayout:
24
- if args.workspace:
25
- layout = WorkspaceLayout.under(args.workspace)
26
- else:
27
- layout = WorkspaceLayout()
24
+ layout = WorkspaceLayout.under(args.workspace) if args.workspace else WorkspaceLayout()
28
25
  if getattr(args, "raw_dir", None):
29
26
  layout.raw_dir = Path(args.raw_dir)
30
27
  if getattr(args, "parsed_dir", None):
@@ -140,9 +140,7 @@ def _is_caption(p: Paragraph) -> bool:
140
140
  if style in CAPTION_STYLE_NAMES:
141
141
  return True
142
142
  text = (p.text or "").strip()
143
- if CAPTION_RE.match(text):
144
- return True
145
- return False
143
+ return bool(CAPTION_RE.match(text))
146
144
 
147
145
 
148
146
  @dataclass
@@ -5,7 +5,7 @@ over each chapter (headings / paragraphs / lists / tables / images), and
5
5
  extracts embedded images to the asset directory. Embedded images can be
6
6
  captioned via a ``captioner`` callable.
7
7
 
8
- Requires the ``[epub]`` extra: ``pip install 'docparser[epub]'`` (which also
8
+ Requires the ``[epub]`` extra: ``pip install 'rc-docparser[epub]'`` (which also
9
9
  pulls in BeautifulSoup from the ``[html]`` extra).
10
10
  """
11
11
  from __future__ import annotations
@@ -36,7 +36,7 @@ def _import_deps():
36
36
  except ImportError as exc: # pragma: no cover - optional dep guard
37
37
  raise ImportError(
38
38
  "docparser.epub.parse_epub requires the [epub] extra. "
39
- "Install with: pip install 'docparser[epub]'"
39
+ "Install with: pip install 'rc-docparser[epub]'"
40
40
  ) from exc
41
41
  return ebooklib, epub, BeautifulSoup
42
42
 
@@ -8,7 +8,7 @@ Two-tier strategy:
8
8
  typed blocks (heading/paragraph/list/table/image) when trafilatura returns
9
9
  nothing useful or when the caller wants the full structure.
10
10
 
11
- Requires the ``[html]`` extra: ``pip install 'docparser[html]'``.
11
+ Requires the ``[html]`` extra: ``pip install 'rc-docparser[html]'``.
12
12
  """
13
13
  from __future__ import annotations
14
14
 
@@ -37,7 +37,7 @@ def _import_deps():
37
37
  except ImportError as exc: # pragma: no cover
38
38
  raise ImportError(
39
39
  "docparser.html.parse_html requires the [html] extra. "
40
- "Install with: pip install 'docparser[html]'"
40
+ "Install with: pip install 'rc-docparser[html]'"
41
41
  ) from exc
42
42
  return trafilatura, BeautifulSoup
43
43
 
@@ -234,7 +234,7 @@ def caption_image(
234
234
  if requests is None: # pragma: no cover - optional dep guard
235
235
  raise ImportError(
236
236
  "docparser.image.caption_image requires the [vlm] extra. "
237
- "Install with: pip install 'docparser[vlm]'"
237
+ "Install with: pip install 'rc-docparser[vlm]'"
238
238
  )
239
239
 
240
240
  provider_name, preset = _resolve_provider(provider)
@@ -8,7 +8,7 @@ image-to-text model such as BLIP.
8
8
  It honors the same ``VLMResult`` shape and the same ``SHA1(image) x model``
9
9
  on-disk cache as the API captioner, so output is interchangeable.
10
10
 
11
- Requires the ``[localvlm]`` extra: ``pip install 'docparser[localvlm]'``.
11
+ Requires the ``[localvlm]`` extra: ``pip install 'rc-docparser[localvlm]'``.
12
12
  """
13
13
  from __future__ import annotations
14
14
 
@@ -32,7 +32,7 @@ def _load_pipeline(model: str):
32
32
  except ImportError as exc: # pragma: no cover - optional dep
33
33
  raise ImportError(
34
34
  "docparser.localvlm requires the [localvlm] extra. "
35
- "Install with: pip install 'docparser[localvlm]'"
35
+ "Install with: pip install 'rc-docparser[localvlm]'"
36
36
  ) from exc
37
37
  return pipeline("image-to-text", model=model)
38
38
 
@@ -3,7 +3,7 @@
3
3
  Uses ``rapidocr-onnxruntime`` by default: a pure-pip ONNX OCR engine that needs
4
4
  no system binaries (unlike Tesseract). The engine is created once and reused.
5
5
 
6
- Requires the ``[ocr]`` extra: ``pip install 'docparser[ocr]'``.
6
+ Requires the ``[ocr]`` extra: ``pip install 'rc-docparser[ocr]'``.
7
7
  """
8
8
  from __future__ import annotations
9
9
 
@@ -12,7 +12,7 @@ from functools import lru_cache
12
12
  from typing import Any
13
13
 
14
14
  _NO_OCR_MSG = (
15
- "OCR requires the [ocr] extra. Install with: pip install 'docparser[ocr]'"
15
+ "OCR requires the [ocr] extra. Install with: pip install 'rc-docparser[ocr]'"
16
16
  )
17
17
 
18
18
 
@@ -12,7 +12,7 @@ heading classifier based on font sizing. On top of that it offers:
12
12
  - ``extract_tables`` use ``pdfplumber`` (the ``[tables]`` extra) to emit real
13
13
  table blocks instead of flattened text.
14
14
 
15
- Requires the ``[pdf]`` extra: ``pip install 'docparser[pdf]'``.
15
+ Requires the ``[pdf]`` extra: ``pip install 'rc-docparser[pdf]'``.
16
16
  """
17
17
  from __future__ import annotations
18
18
 
@@ -104,7 +104,7 @@ def parse_pdf(
104
104
  except ImportError as exc: # pragma: no cover
105
105
  raise ImportError(
106
106
  "docparser.pdf.parse_pdf requires the [pdf] extra. "
107
- "Install with: pip install 'docparser[pdf]'"
107
+ "Install with: pip install 'rc-docparser[pdf]'"
108
108
  ) from exc
109
109
 
110
110
  if ocr not in {"off", "auto", "force"}:
@@ -218,7 +218,7 @@ def parse_pdf(
218
218
  except ImportError as exc: # pragma: no cover - optional dep
219
219
  raise ImportError(
220
220
  "extract_tables=True requires the [tables] extra. "
221
- "Install with: pip install 'docparser[tables]'"
221
+ "Install with: pip install 'rc-docparser[tables]'"
222
222
  ) from exc
223
223
  out: list[list[list[str]]] = []
224
224
  with pdfplumber.open(str(real_source)) as pdf:
@@ -8,9 +8,9 @@ schema via :func:`docparser.text._blocks_from_markdown`.
8
8
 
9
9
  All backends are optional extras and lazily imported:
10
10
 
11
- - ``pymupdf4llm`` -> ``pip install 'docparser[pymupdf4llm]'`` (note: AGPL/commercial)
12
- - ``docling`` -> ``pip install 'docparser[docling]'`` (MIT)
13
- - ``marker`` -> ``pip install 'docparser[marker]'`` (GPL-3.0)
11
+ - ``pymupdf4llm`` -> ``pip install 'rc-docparser[pymupdf4llm]'`` (note: AGPL/commercial)
12
+ - ``docling`` -> ``pip install 'rc-docparser[docling]'`` (MIT)
13
+ - ``marker`` -> ``pip install 'rc-docparser[marker]'`` (GPL-3.0)
14
14
  """
15
15
  from __future__ import annotations
16
16
 
@@ -28,7 +28,7 @@ def _markdown_pymupdf4llm(path: Path) -> str:
28
28
  except ImportError as exc: # pragma: no cover - optional dep
29
29
  raise ImportError(
30
30
  "backend='pymupdf4llm' requires the [pymupdf4llm] extra. "
31
- "Install with: pip install 'docparser[pymupdf4llm]'"
31
+ "Install with: pip install 'rc-docparser[pymupdf4llm]'"
32
32
  ) from exc
33
33
  return pymupdf4llm.to_markdown(str(path))
34
34
 
@@ -39,7 +39,7 @@ def _markdown_docling(path: Path) -> str:
39
39
  except ImportError as exc: # pragma: no cover - optional dep
40
40
  raise ImportError(
41
41
  "backend='docling' requires the [docling] extra. "
42
- "Install with: pip install 'docparser[docling]'"
42
+ "Install with: pip install 'rc-docparser[docling]'"
43
43
  ) from exc
44
44
  converter = DocumentConverter()
45
45
  result = converter.convert(str(path))
@@ -55,7 +55,7 @@ def _markdown_marker(path: Path) -> str:
55
55
  except ImportError as exc: # pragma: no cover - optional dep
56
56
  raise ImportError(
57
57
  "backend='marker' requires the [marker] extra. "
58
- "Install with: pip install 'docparser[marker]'"
58
+ "Install with: pip install 'rc-docparser[marker]'"
59
59
  ) from exc
60
60
  config_parser = ConfigParser({"output_format": "markdown"})
61
61
  converter = PdfConverter(
@@ -6,7 +6,7 @@ pictures are emitted in shape order. Speaker notes are captured per slide.
6
6
  Embedded pictures are written to ``layout.assets_dir_for(source)`` and may be
7
7
  captioned via a ``captioner`` callable (same contract as the other parsers).
8
8
 
9
- Requires the ``[pptx]`` extra: ``pip install 'docparser[pptx]'``.
9
+ Requires the ``[pptx]`` extra: ``pip install 'rc-docparser[pptx]'``.
10
10
  """
11
11
  from __future__ import annotations
12
12
 
@@ -33,7 +33,7 @@ def _import_pptx():
33
33
  except ImportError as exc: # pragma: no cover - optional dep guard
34
34
  raise ImportError(
35
35
  "docparser.pptx.parse_pptx requires the [pptx] extra. "
36
- "Install with: pip install 'docparser[pptx]'"
36
+ "Install with: pip install 'rc-docparser[pptx]'"
37
37
  ) from exc
38
38
  return Presentation, MSO_SHAPE_TYPE
39
39
 
@@ -6,6 +6,7 @@ embedded images.
6
6
  """
7
7
  from __future__ import annotations
8
8
 
9
+ import contextlib
9
10
  import datetime as _dt
10
11
  import io
11
12
  from collections.abc import Callable
@@ -66,10 +67,8 @@ def _cell_record(cell: Cell, formulas_ws: Worksheet | None) -> dict[str, Any]:
66
67
  except Exception:
67
68
  pass
68
69
  if cell.comment is not None:
69
- try:
70
+ with contextlib.suppress(Exception):
70
71
  rec["comment"] = str(cell.comment.text)
71
- except Exception:
72
- pass
73
72
  return rec
74
73
 
75
74
 
@@ -84,10 +83,8 @@ def _extract_images(
84
83
  data: bytes | None = None
85
84
  if hasattr(ref, "read"):
86
85
  data = ref.read()
87
- try:
86
+ with contextlib.suppress(Exception):
88
87
  ref.seek(0)
89
- except Exception:
90
- pass
91
88
  elif isinstance(ref, (bytes, bytearray)):
92
89
  data = bytes(ref)
93
90
  else:
@@ -1,6 +1,7 @@
1
1
  """Shared pytest fixtures: synthetic docx, xlsx, pdf, html files."""
2
2
  from __future__ import annotations
3
3
 
4
+ import contextlib
4
5
  import io
5
6
  from pathlib import Path
6
7
 
@@ -51,10 +52,8 @@ def sample_docx(tmp_path: Path) -> Path:
51
52
  img_path.write_bytes(_png_bytes())
52
53
  doc.add_picture(str(img_path))
53
54
  cap_para = doc.add_paragraph("Figure 1: a red square example.")
54
- try:
55
+ with contextlib.suppress(KeyError):
55
56
  cap_para.style = doc.styles["Caption"]
56
- except KeyError:
57
- pass
58
57
 
59
58
  doc.add_heading("Section B", level=2)
60
59
  doc.add_paragraph("Conclusion paragraph.")
File without changes
File without changes