ocrmypdf-rapidocr 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Adrian Mazur
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.3
2
+ Name: ocrmypdf-rapidocr
3
+ Version: 1.0.0
4
+ Author: Adrian Mazur
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 Adrian Mazur
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+ Classifier: Programming Language :: Python :: 3.12
27
+ Classifier: Programming Language :: Python :: 3.13
28
+ Classifier: Programming Language :: Python :: 3.14
29
+ Requires-Dist: ocrmypdf>=17.3.0
30
+ Requires-Dist: onnxruntime>=1.24.3
31
+ Requires-Dist: rapidocr>=3.7.0
32
+ Requires-Python: >=3.12
33
+ Project-URL: Homepage, https://github.com/adrianmazur-dev/ocrmypdf-rapidocr
34
+ Project-URL: Repository, https://github.com/adrianmazur-dev/ocrmypdf-rapidocr.git
35
+ Description-Content-Type: text/markdown
36
+
37
+ # ocrmypdf-rapidocr
38
+
39
+ `ocrmypdf-rapidocr` is an OCRmyPDF plugin that uses [RapidOCR](https://github.com/RapidAI/RapidOCR) as an OCR engine.
40
+
41
+ ## Status
42
+
43
+ Supported:
44
+
45
+ - OCR engine integration via OCRmyPDF plugin hooks
46
+ - `hOCR` output path (`--pdf-renderer auto` or `--pdf-renderer fpdf2`)
47
+ - ONNXRuntime backend only
48
+ - Single language selection from `-l/--language`
49
+
50
+ Not supported:
51
+
52
+ - `--pdf-renderer sandwich`
53
+ - multi-language combinations such as `-l eng+fra`
54
+
55
+ ## Installation
56
+
57
+ ```bash
58
+ pip install ocrmypdf-rapidocr
59
+ ```
60
+
61
+ Or from source:
62
+
63
+ ```bash
64
+ pip install .
65
+ ```
66
+
67
+ ## Usage
68
+
69
+ Load the plugin explicitly with `--plugin`:
70
+
71
+ ```bash
72
+ ocrmypdf --plugin ocrmypdf_rapidocr -l eng input.pdf output.pdf
73
+ ```
74
+
75
+ Optional plugin arguments:
76
+
77
+ - `--rapidocr-config-path PATH`: use a custom RapidOCR YAML config
78
+
79
+ Example:
80
+
81
+ ```bash
82
+ ocrmypdf \
83
+ --plugin ocrmypdf_rapidocr \
84
+ -l deu \
85
+ input.pdf output.pdf
86
+ ```
87
+
88
+ ## Language behavior
89
+
90
+ The plugin accepts exactly one OCRmyPDF language code and maps it to a RapidOCR language family; passing more than one language is rejected with an error.
91
+
92
+ - direct mappings: `eng`, `chi_sim`, `chi_tra`, `jpn`, `kor`, `ara`, `tha`, `tam`, `tel`, `ell`/`gre`; the Cyrillic-script codes `bel`, `bul`, `mkd`, `rus`, `srp`, `ukr` map to RapidOCR `CYRILLIC`
93
+ - selected Latin-script codes map to RapidOCR `LATIN`
94
+
95
+ If a language code is unsupported, OCRmyPDF exits with a clear error message.
96
+
97
+ ## Runtime model downloads
98
+
99
+ RapidOCR downloads model files on first use when model paths are not pinned in config.
100
+ For offline or restricted environments, provide a custom config via
101
+ `--rapidocr-config-path` that points to local model files.
102
+
103
+ ## References
104
+
105
+ - OCRmyPDF plugin API docs: <https://github.com/ocrmypdf/OCRmyPDF/blob/main/docs/plugins.md>
106
+ - OCRmyPDF EasyOCR reference plugin: <https://github.com/ocrmypdf/OCRmyPDF-EasyOCR>
107
+ - OCRmyPDF AppleOCR reference plugin: <https://github.com/mkyt/OCRmyPDF-AppleOCR>
108
+ - OCRmyPDF PaddleOCR reference plugin: <https://github.com/clefru/ocrmypdf-paddleocr>
109
+ - RapidOCR project: <https://github.com/RapidAI/RapidOCR>
@@ -0,0 +1,73 @@
1
+ # ocrmypdf-rapidocr
2
+
3
+ `ocrmypdf-rapidocr` is an OCRmyPDF plugin that uses [RapidOCR](https://github.com/RapidAI/RapidOCR) as an OCR engine.
4
+
5
+ ## Status
6
+
7
+ Supported:
8
+
9
+ - OCR engine integration via OCRmyPDF plugin hooks
10
+ - `hOCR` output path (`--pdf-renderer auto` or `--pdf-renderer fpdf2`)
11
+ - ONNXRuntime backend only
12
+ - Single language selection from `-l/--language`
13
+
14
+ Not supported:
15
+
16
+ - `--pdf-renderer sandwich`
17
+ - multi-language combinations such as `-l eng+fra`
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ pip install ocrmypdf-rapidocr
23
+ ```
24
+
25
+ Or from source:
26
+
27
+ ```bash
28
+ pip install .
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ Load the plugin explicitly with `--plugin`:
34
+
35
+ ```bash
36
+ ocrmypdf --plugin ocrmypdf_rapidocr -l eng input.pdf output.pdf
37
+ ```
38
+
39
+ Optional plugin arguments:
40
+
41
+ - `--rapidocr-config-path PATH`: use a custom RapidOCR YAML config
42
+
43
+ Example:
44
+
45
+ ```bash
46
+ ocrmypdf \
47
+ --plugin ocrmypdf_rapidocr \
48
+ -l deu \
49
+ input.pdf output.pdf
50
+ ```
51
+
52
+ ## Language behavior
53
+
54
+ The plugin accepts exactly one OCRmyPDF language code and maps it to a RapidOCR language family; passing more than one language is rejected with an error.
55
+
56
+ - direct mappings: `eng`, `chi_sim`, `chi_tra`, `jpn`, `kor`, `ara`, `tha`, `tam`, `tel`, `ell`/`gre`; the Cyrillic-script codes `bel`, `bul`, `mkd`, `rus`, `srp`, `ukr` map to RapidOCR `CYRILLIC`
57
+ - selected Latin-script codes map to RapidOCR `LATIN`
58
+
59
+ If a language code is unsupported, OCRmyPDF exits with a clear error message.
60
+
61
+ ## Runtime model downloads
62
+
63
+ RapidOCR downloads model files on first use when model paths are not pinned in config.
64
+ For offline or restricted environments, provide a custom config via
65
+ `--rapidocr-config-path` that points to local model files.
66
+
67
+ ## References
68
+
69
+ - OCRmyPDF plugin API docs: <https://github.com/ocrmypdf/OCRmyPDF/blob/main/docs/plugins.md>
70
+ - OCRmyPDF EasyOCR reference plugin: <https://github.com/ocrmypdf/OCRmyPDF-EasyOCR>
71
+ - OCRmyPDF AppleOCR reference plugin: <https://github.com/mkyt/OCRmyPDF-AppleOCR>
72
+ - OCRmyPDF PaddleOCR reference plugin: <https://github.com/clefru/ocrmypdf-paddleocr>
73
+ - RapidOCR project: <https://github.com/RapidAI/RapidOCR>
@@ -0,0 +1,36 @@
1
+ [project]
2
+ name = "ocrmypdf-rapidocr"
3
+ readme = "README.md"
4
+ license = { file = "LICENSE" }
5
+ authors = [{ name = "Adrian Mazur" }]
6
+ version = "1.0.0"
7
+ requires-python = ">=3.12"
8
+ classifiers = [
9
+ "Programming Language :: Python :: 3.12",
10
+ "Programming Language :: Python :: 3.13",
11
+ "Programming Language :: Python :: 3.14",
12
+ ]
13
+ dependencies = ["ocrmypdf>=17.3.0", "onnxruntime>=1.24.3", "rapidocr>=3.7.0"]
14
+
15
+ [dependency-groups]
16
+ dev = [
17
+ "pytest>=9.0.2",
18
+ "pytest-cov>=7.0.0",
19
+ "rapidfuzz>=3.14.1",
20
+ "ruff>=0.15.5",
21
+ "ty>=0.0.21",
22
+ ]
23
+
24
+ [tool.ruff]
25
+ exclude = ["tests"]
26
+
27
+ [tool.ty.src]
28
+ exclude = ["tests"]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/adrianmazur-dev/ocrmypdf-rapidocr"
32
+ Repository = "https://github.com/adrianmazur-dev/ocrmypdf-rapidocr.git"
33
+
34
+ [build-system]
35
+ requires = ["uv_build>=0.10.8,<0.11.0"]
36
+ build-backend = "uv_build"
@@ -0,0 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ from ocrmypdf_rapidocr.plugin import add_options, check_options, get_ocr_engine
4
+ from ocrmypdf_rapidocr.version import __version__
5
+
6
# Names exported by the package: the version string and the three names
# imported above from ocrmypdf_rapidocr.plugin.
__all__ = [
    "__version__",
    "add_options",
    "check_options",
    "get_ocr_engine",
]
@@ -0,0 +1,103 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib.metadata
4
+ from functools import lru_cache
5
+ from pathlib import Path
6
+
7
+ from ocrmypdf.pluginspec import OcrEngine, OrientationConfidence
8
+ from PIL import Image
9
+
10
+ from ocrmypdf_rapidocr.hocr import build_hocr_document, extract_hocr_lines
11
+ from ocrmypdf_rapidocr.languages import (
12
+ SUPPORTED_LANGUAGE_CODES,
13
+ map_language_to_langrec_name,
14
+ select_single_language,
15
+ )
16
+ from ocrmypdf_rapidocr.options import get_option_config_path
17
+ from ocrmypdf_rapidocr.version import __version__
18
+
19
+
20
def _import_rapidocr_symbols():
    """Import RapidOCR on demand and return ``(RapidOCR, LangRec, EngineType)``.

    The import lives inside the function body, so the ``rapidocr`` package is
    only loaded when this function is actually called.
    """
    import rapidocr

    return rapidocr.RapidOCR, rapidocr.LangRec, rapidocr.EngineType
24
+
25
+
26
@lru_cache(maxsize=16)
def get_rapidocr_engine(language: str, config_path: str | None):
    """Build a RapidOCR instance for ``language``, memoised per argument pair.

    Args:
        language: Language code handed to ``map_language_to_langrec_name``.
        config_path: Path to a RapidOCR config file, or ``None``.

    Returns:
        A ``RapidOCR`` object whose detection, classification and recognition
        stages are all set to the ONNX Runtime engine type.
    """
    rapidocr_cls, lang_enum, engine_enum = _import_rapidocr_symbols()

    # The language table yields an attribute *name*; resolve it on LangRec.
    recognition_language = getattr(lang_enum, map_language_to_langrec_name(language))
    onnx_engine = engine_enum.ONNXRUNTIME

    return rapidocr_cls(
        config_path=config_path,
        params={
            "Det.engine_type": onnx_engine,
            "Cls.engine_type": onnx_engine,
            "Rec.engine_type": onnx_engine,
            "Rec.lang_type": recognition_language,
        },
    )
40
+
41
+
42
class RapidOCREngine(OcrEngine):
    """OCR engine implementation that delegates recognition to RapidOCR.

    Only the hOCR path is implemented; ``generate_pdf`` always raises.
    """

    @staticmethod
    def version() -> str:
        """Return the installed ``rapidocr`` version, or ``"unknown"``."""
        try:
            installed = importlib.metadata.version("rapidocr")
        except importlib.metadata.PackageNotFoundError:
            installed = "unknown"
        return installed

    @staticmethod
    def creator_tag(options) -> str:
        """Return a tag naming the RapidOCR version and this plugin's version."""
        engine_version = RapidOCREngine.version()
        return f"RapidOCR {engine_version} via ocrmypdf-rapidocr {__version__}"

    def __str__(self) -> str:
        """Return ``"RapidOCR <version>"``."""
        engine_version = RapidOCREngine.version()
        return f"RapidOCR {engine_version}"

    @staticmethod
    def languages(options) -> set[str]:
        """Return a fresh mutable set of every supported language code."""
        return {*SUPPORTED_LANGUAGE_CODES}

    @staticmethod
    def get_orientation(input_file, options) -> OrientationConfidence:
        """Always report angle 0 with confidence 0.0; no detection is performed."""
        return OrientationConfidence(angle=0, confidence=0.0)

    @staticmethod
    def get_deskew(input_file, options) -> float:
        """Always report a deskew angle of 0.0; no detection is performed."""
        return 0.0

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options) -> None:
        """Run RapidOCR on ``input_file`` and write hOCR and plain-text outputs.

        Args:
            input_file: Path of the page image to recognise.
            output_hocr: Destination path for the hOCR document (UTF-8).
            output_text: Destination path for the newline-joined text (UTF-8).
            options: Options object supplying the language and config path.
        """
        language = select_single_language(options)
        engine = get_rapidocr_engine(language, get_option_config_path(options))

        # Only the pixel dimensions are needed here; the image is closed
        # again before the engine is handed the file path.
        with Image.open(input_file) as page_image:
            page_width, page_height = page_image.size

        ocr_result = engine(str(input_file))
        recognised = extract_hocr_lines(
            ocr_result, page_width=page_width, page_height=page_height
        )
        hocr_markup = build_hocr_document(
            page_width=page_width,
            page_height=page_height,
            language=language,
            lines=recognised,
        )
        text_only = "\n".join([text for text, _bbox, _confidence in recognised])

        Path(output_hocr).write_text(hocr_markup, encoding="utf-8")
        Path(output_text).write_text(text_only, encoding="utf-8")

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options) -> None:
        """Unsupported: always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "ocrmypdf-rapidocr does not support sandwich renderer. "
            "Use --pdf-renderer auto or --pdf-renderer fpdf2."
        )
@@ -0,0 +1,118 @@
1
+ from __future__ import annotations
2
+
3
+ from html import escape
4
+ from typing import Any
5
+
6
# One recognised line: (text, (x0, y0, x1, y1) bounding box, confidence 0-100).
HocrLine = tuple[str, tuple[int, int, int, int], int]
7
+
8
+
9
def confidence_to_percent(score: Any) -> int:
    """Convert a recognition score to an integer percentage in ``0..100``.

    Values less than or equal to 1.0 are treated as fractions and multiplied
    by 100; larger values are taken as percentages already. The result is
    rounded and clamped to the ``0..100`` range.

    Args:
        score: Anything ``float()`` accepts, or ``None``.

    Returns:
        The clamped percentage. ``0`` is returned for ``None``, for values
        that cannot be converted to ``float``, and for NaN or infinite values.
    """
    import math  # function-scope import keeps this fix self-contained

    if score is None:
        return 0
    try:
        value = float(score)
    except (TypeError, ValueError, OverflowError):
        return 0
    # round() raises ValueError on NaN and OverflowError on +/-inf, so a
    # non-finite score is reported as "no confidence" instead of crashing.
    if not math.isfinite(value):
        return 0
    if value <= 1.0:
        value *= 100.0
    return max(0, min(100, int(round(value))))
19
+
20
+
21
def bbox_from_polygon(polygon: Any, width: int, height: int) -> tuple[int, int, int, int]:
    """Return the axis-aligned box ``(x0, y0, x1, y1)`` enclosing ``polygon``.

    Coordinates are rounded to integers and clamped to the page. The box is
    never empty: ``x1`` is at least ``x0 + 1`` and ``y1`` at least ``y0 + 1``.

    Args:
        polygon: Iterable of points, each indexable as ``point[0]`` (x) and
            ``point[1]`` (y).
        width: Page width used for clamping.
        height: Page height used for clamping.

    Returns:
        The clamped bounding box, or the whole page ``(0, 0, width, height)``
        when ``polygon`` is not iterable or contains no usable point.
    """
    import math  # function-scope import keeps this fix self-contained

    whole_page = (0, 0, width, height)
    try:
        points = list(polygon)
    except TypeError:
        return whole_page

    xs: list[float] = []
    ys: list[float] = []
    for point in points:
        try:
            x = float(point[0])
            y = float(point[1])
        except (TypeError, ValueError, IndexError):
            continue
        # round() raises on NaN/inf, so treat such points like any other
        # malformed point and skip them.
        if not (math.isfinite(x) and math.isfinite(y)):
            continue
        xs.append(x)
        ys.append(y)

    if not xs:
        return whole_page

    x0 = max(0, min(width - 1, int(round(min(xs)))))
    y0 = max(0, min(height - 1, int(round(min(ys)))))
    x1 = max(x0 + 1, min(width, int(round(max(xs)))))
    y1 = max(y0 + 1, min(height, int(round(max(ys)))))
    return (x0, y0, x1, y1)
46
+
47
+
48
def extract_hocr_lines(
    result: Any,
    *,
    page_width: int,
    page_height: int,
) -> list[HocrLine]:
    """Turn an OCR result object into ``(text, bbox, confidence)`` tuples.

    Reads the ``boxes``, ``txts`` and ``scores`` attributes of ``result``; a
    missing or ``None`` attribute counts as empty. Boxes and texts are paired
    by position up to the shorter of the two, entries whose text is blank
    after stripping are dropped, and a missing score is passed on as ``None``.

    Args:
        result: Object exposing the attributes described above.
        page_width: Page width forwarded to ``bbox_from_polygon``.
        page_height: Page height forwarded to ``bbox_from_polygon``.

    Returns:
        The extracted lines in their original order.
    """

    def _attribute_as_list(attribute: str) -> list:
        # Compare against None explicitly rather than relying on truthiness.
        raw = getattr(result, attribute, None)
        return [] if raw is None else list(raw)

    boxes = _attribute_as_list("boxes")
    texts = _attribute_as_list("txts")
    scores = _attribute_as_list("scores")
    score_count = len(scores)

    rows: list[HocrLine] = []
    # zip() stops at the shorter sequence, matching min(len(boxes), len(texts)).
    for position, (polygon, raw_text) in enumerate(zip(boxes, texts)):
        cleaned = str(raw_text).strip()
        if not cleaned:
            continue
        raw_score = scores[position] if position < score_count else None
        rows.append(
            (
                cleaned,
                bbox_from_polygon(polygon, page_width, page_height),
                confidence_to_percent(raw_score),
            )
        )
    return rows
74
+
75
+
76
def build_hocr_document(
    *,
    page_width: int,
    page_height: int,
    language: str,
    lines: list[HocrLine],
) -> str:
    """Render recognised lines as a single-page hOCR (XHTML) document.

    Each entry in ``lines`` becomes its own ``ocr_carea`` / ``ocr_par`` /
    ``ocr_line`` / ``ocrx_word`` nest, with all four elements sharing the
    entry's bounding box. Element ids are numbered from 1, and the text is
    HTML-escaped without escaping quotes.

    Args:
        page_width: Width written into the page bounding box.
        page_height: Height written into the page bounding box.
        language: Value written to each paragraph's ``lang`` attribute.
        lines: ``(text, (x0, y0, x1, y1), confidence)`` tuples.

    Returns:
        The document as one string with rows joined by ``"\\n"``.
    """
    document = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
        ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
        "<head>",
        "<title></title>",
        '<meta http-equiv="content-type" content="text/html; charset=utf-8" />',
        '<meta name="ocr-system" content="RapidOCR via ocrmypdf-rapidocr" />',
        '<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />',
        "</head>",
        "<body>",
        f'<div class="ocr_page" id="page_1" title="bbox 0 0 {page_width} {page_height}">',
    ]

    for number, (text, (x0, y0, x1, y1), confidence) in enumerate(lines, start=1):
        box = f"bbox {x0} {y0} {x1} {y1}"
        content = escape(text, quote=False)
        document += [
            f'<div class="ocr_carea" id="carea_{number}" title="{box}">',
            f'<p class="ocr_par" id="par_{number}" lang="{language}" title="{box}">',
            f'<span class="ocr_line" id="line_{number}" '
            f'title="{box}; baseline 0 0; x_wconf {confidence}">',
            f'<span class="ocrx_word" id="word_{number}" '
            f'title="{box}; x_wconf {confidence}">{content}</span>',
            "</span>",
            "</p>",
            "</div>",
        ]

    document += ["</div>", "</body>", "</html>"]
    return "\n".join(document)
@@ -0,0 +1,105 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Sequence
4
+
5
+ from ocrmypdf.exceptions import BadArgsError
6
+
7
# Map OCRmyPDF/Tesseract language codes to RapidOCR LangRec values.
# The values are attribute *names*: the engine module resolves them with
# getattr(LangRec, name). Several codes may share one value (e.g. every
# Cyrillic-script code maps to "CYRILLIC").
_DIRECT_LANGUAGE_MAP: dict[str, str] = {
    "ara": "ARABIC",
    "chi_sim": "CH",
    "chi_tra": "CHINESE_CHT",
    "eng": "EN",
    "ell": "EL",
    "gre": "EL",
    "jpn": "JAPAN",
    "kor": "KOREAN",
    "tha": "TH",
    "tam": "TA",
    "tel": "TE",
    "bel": "CYRILLIC",
    "bul": "CYRILLIC",
    "mkd": "CYRILLIC",
    "rus": "CYRILLIC",
    "srp": "CYRILLIC",
    "ukr": "CYRILLIC",
}

# Language codes without an entry of their own above;
# map_language_to_langrec_name() resolves each of them to "LATIN".
_LATIN_LANGUAGE_CODES: set[str] = {
    "afr",
    "cat",
    "ces",
    "dan",
    "deu",
    "est",
    "eus",
    "fin",
    "fra",
    "gle",
    "hrv",
    "hun",
    "ind",
    "isl",
    "ita",
    "lav",
    "lit",
    "mlt",
    "msa",
    "nld",
    "nor",
    "pol",
    "por",
    "ron",
    "slk",
    "slv",
    "spa",
    "sqi",
    "swe",
    "tgl",
    "tur",
    "vie",
}

# Every accepted language code: the union of the two tables above, frozen so
# it cannot be modified at runtime.
SUPPORTED_LANGUAGE_CODES: frozenset[str] = frozenset(
    set(_DIRECT_LANGUAGE_MAP) | _LATIN_LANGUAGE_CODES
)
66
+
67
+
68
def normalize_languages(languages: Sequence[str] | None) -> list[str]:
    """Return the given language codes stripped and lower-cased.

    Args:
        languages: Language codes, or ``None``.

    Returns:
        ``["eng"]`` when ``languages`` is ``None`` or empty; otherwise the
        cleaned codes in their original order, with blank entries removed
        (which can leave an empty list).
    """
    if not languages:
        return ["eng"]
    stripped = (str(entry).strip() for entry in languages)
    return [code.lower() for code in stripped if code]
74
+
75
+
76
def select_single_language(options: Any) -> str:
    """Return the one language requested in ``options``, validated.

    Args:
        options: Object whose optional ``languages`` attribute holds the
            requested language codes.

    Returns:
        The normalised language code.

    Raises:
        BadArgsError: If the normalised list does not contain exactly one
            entry, if that entry contains ``+``, or if it is not in
            ``SUPPORTED_LANGUAGE_CODES``.
    """
    candidates = normalize_languages(getattr(options, "languages", None))
    if len(candidates) != 1:
        raise BadArgsError(
            "RapidOCR supports exactly one language. "
            "Pass a single language to -l/--language."
        )

    (language,) = candidates
    if "+" in language:
        raise BadArgsError(
            "RapidOCR does not support language combinations like eng+fra."
        )

    if language in SUPPORTED_LANGUAGE_CODES:
        return language

    supported = ", ".join(sorted(SUPPORTED_LANGUAGE_CODES))
    raise BadArgsError(
        f"Language '{language}' is not supported by ocrmypdf-rapidocr. "
        f"Supported values: {supported}"
    )
97
+
98
+
99
def map_language_to_langrec_name(language: str) -> str:
    """Return the LangRec attribute name for a language code.

    The lookup is case-insensitive. Codes in ``_DIRECT_LANGUAGE_MAP`` return
    their mapped name; codes in ``_LATIN_LANGUAGE_CODES`` return ``"LATIN"``.

    Raises:
        KeyError: If the lower-cased code is in neither table.
    """
    code = language.lower()
    direct_name = _DIRECT_LANGUAGE_MAP.get(code)
    if direct_name is not None:
        return direct_name
    if code not in _LATIN_LANGUAGE_CODES:
        raise KeyError(code)
    return "LATIN"
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib.util
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
8
+
9
+
10
def is_rapidocr_selected(options: Any) -> bool:
    """Return whether ``options.ocr_engine`` is ``"auto"`` or ``"rapidocr"``.

    A missing ``ocr_engine`` attribute is treated as ``"auto"``.
    """
    accepted_engines = ("auto", "rapidocr")
    requested_engine = getattr(options, "ocr_engine", "auto")
    return requested_engine in accepted_engines
12
+
13
+
14
def add_plugin_options(parser) -> None:
    """Add a "RapidOCR" argument group with ``--rapidocr-config-path``.

    Args:
        parser: Parser object providing ``add_argument_group``.
    """
    help_text = (
        "Path to a RapidOCR YAML config file. "
        "If omitted, RapidOCR defaults are used."
    )
    group = parser.add_argument_group("RapidOCR", "RapidOCR engine options")
    group.add_argument("--rapidocr-config-path", default=None, help=help_text)
24
+
25
+
26
def check_runtime_dependencies() -> None:
    """Verify that ``rapidocr`` and ``onnxruntime`` can be located.

    Raises:
        MissingDependencyError: For the first of the two modules (checked in
            that order) for which ``find_spec`` returns ``None``.
    """
    requirements = (
        (
            "rapidocr",
            "RapidOCR is not installed. Install it with: pip install rapidocr",
        ),
        (
            "onnxruntime",
            "onnxruntime is not installed. Install it with: pip install onnxruntime",
        ),
    )
    for module_name, message in requirements:
        if importlib.util.find_spec(module_name) is None:
            raise MissingDependencyError(message)
35
+
36
+
37
def get_option_config_path(options: Any) -> str | None:
    """Return ``options.rapidocr_config_path`` as a string.

    Returns:
        ``None`` when the attribute is missing or ``None``; otherwise
        ``str()`` of its value.
    """
    configured = getattr(options, "rapidocr_config_path", None)
    return None if configured is None else str(configured)
42
+
43
+
44
def validate_plugin_options(options: Any) -> None:
    """Reject option combinations this plugin cannot honour.

    Args:
        options: Object whose ``pdf_renderer`` attribute (treated as
            ``"auto"`` when missing) and RapidOCR config path are checked.

    Raises:
        BadArgsError: If ``pdf_renderer`` is ``"sandwich"``, or if a config
            path is given that does not exist or is not a regular file.
    """
    if getattr(options, "pdf_renderer", "auto") == "sandwich":
        # Name the renderers that do work, as the engine's generate_pdf()
        # error does, instead of ending the message mid-sentence.
        raise BadArgsError(
            "ocrmypdf-rapidocr only supports hOCR/fpdf2 flow. "
            "Use --pdf-renderer auto or --pdf-renderer fpdf2."
        )

    config_path = get_option_config_path(options)
    if config_path is None:
        return

    path = Path(config_path)
    if not path.exists():
        raise BadArgsError(f"--rapidocr-config-path does not exist: {path}")
    if not path.is_file():
        raise BadArgsError(f"--rapidocr-config-path is not a file: {path}")
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ from ocrmypdf import hookimpl
6
+
7
+ from ocrmypdf_rapidocr.engine import RapidOCREngine
8
+ from ocrmypdf_rapidocr.languages import select_single_language
9
+ from ocrmypdf_rapidocr.options import (
10
+ add_plugin_options,
11
+ check_runtime_dependencies,
12
+ is_rapidocr_selected,
13
+ validate_plugin_options,
14
+ )
15
+
16
+ log = logging.getLogger(__name__)
17
+
18
+
19
@hookimpl
def add_options(parser) -> None:
    """Hook implementation: register this plugin's options on ``parser``."""
    add_plugin_options(parser)
22
+
23
+
24
@hookimpl
def check_options(options) -> None:
    """Hook implementation: validate ``options`` when RapidOCR is selected.

    Does nothing unless ``is_rapidocr_selected(options)`` is true. Otherwise
    it checks, in order, the runtime dependencies, the plugin options and
    the language selection; each check raises on failure.
    """
    if is_rapidocr_selected(options):
        check_runtime_dependencies()
        validate_plugin_options(options)
        select_single_language(options)
32
+
33
+
34
@hookimpl
def get_ocr_engine(options=None):
    """Hook implementation: return the RapidOCR engine when it is selected.

    Args:
        options: Options object, or ``None``.

    Returns:
        A ``RapidOCREngine`` when ``options`` is ``None`` or selects
        RapidOCR (``ocr_engine`` is ``"auto"``, ``"rapidocr"`` or missing);
        otherwise ``None``.
    """
    # Use the same predicate as check_options() so both hooks always agree
    # on which ocr_engine values select this plugin.
    if options is not None and not is_rapidocr_selected(options):
        return None
    return RapidOCREngine()
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib.metadata
4
+
5
# Look the version up by distribution name. ``__name__`` is this module's
# dotted import path, which is not a distribution name, so looking that up
# always raised PackageNotFoundError and left the "0.0.0" fallback.
try:
    __version__ = importlib.metadata.version("ocrmypdf-rapidocr")
except importlib.metadata.PackageNotFoundError:
    # No installed distribution metadata was found for this project.
    __version__ = "0.0.0"