ocrmypdf-rapidocr 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrmypdf_rapidocr-1.0.0/LICENSE +21 -0
- ocrmypdf_rapidocr-1.0.0/PKG-INFO +109 -0
- ocrmypdf_rapidocr-1.0.0/README.md +73 -0
- ocrmypdf_rapidocr-1.0.0/pyproject.toml +36 -0
- ocrmypdf_rapidocr-1.0.0/src/ocrmypdf_rapidocr/__init__.py +11 -0
- ocrmypdf_rapidocr-1.0.0/src/ocrmypdf_rapidocr/engine.py +103 -0
- ocrmypdf_rapidocr-1.0.0/src/ocrmypdf_rapidocr/hocr.py +118 -0
- ocrmypdf_rapidocr-1.0.0/src/ocrmypdf_rapidocr/languages.py +105 -0
- ocrmypdf_rapidocr-1.0.0/src/ocrmypdf_rapidocr/options.py +56 -0
- ocrmypdf_rapidocr-1.0.0/src/ocrmypdf_rapidocr/plugin.py +40 -0
- ocrmypdf_rapidocr-1.0.0/src/ocrmypdf_rapidocr/version.py +8 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Adrian Mazur
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: ocrmypdf-rapidocr
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Author: Adrian Mazur
|
|
5
|
+
License: MIT License
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2026 Adrian Mazur
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
in the Software without restriction, including without limitation the rights
|
|
12
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
14
|
+
furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
The above copyright notice and this permission notice shall be included in all
|
|
17
|
+
copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
22
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
23
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
24
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
25
|
+
SOFTWARE.
|
|
26
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
27
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
28
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
29
|
+
Requires-Dist: ocrmypdf>=17.3.0
|
|
30
|
+
Requires-Dist: onnxruntime>=1.24.3
|
|
31
|
+
Requires-Dist: rapidocr>=3.7.0
|
|
32
|
+
Requires-Python: >=3.12
|
|
33
|
+
Project-URL: Homepage, https://github.com/adrianmazur-dev/ocrmypdf-rapidocr
|
|
34
|
+
Project-URL: Repository, https://github.com/adrianmazur-dev/ocrmypdf-rapidocr.git
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# ocrmypdf-rapidocr
|
|
38
|
+
|
|
39
|
+
`ocrmypdf-rapidocr` is an OCRmyPDF plugin that uses [RapidOCR](https://github.com/RapidAI/RapidOCR) as an OCR engine.
|
|
40
|
+
|
|
41
|
+
## Status
|
|
42
|
+
|
|
43
|
+
Supported:
|
|
44
|
+
|
|
45
|
+
- OCR engine integration via OCRmyPDF plugin hooks
|
|
46
|
+
- `hOCR` output path (`--pdf-renderer auto` or `--pdf-renderer fpdf2`)
|
|
47
|
+
- ONNXRuntime backend only
|
|
48
|
+
- Single language selection from `-l/--language`
|
|
49
|
+
|
|
50
|
+
Not supported:
|
|
51
|
+
|
|
52
|
+
- `--pdf-renderer sandwich`
|
|
53
|
+
- multi-language combinations such as `-l eng+fra`
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install ocrmypdf-rapidocr
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Or from source:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install .
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Usage
|
|
68
|
+
|
|
69
|
+
Load the plugin explicitly with `--plugin`:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
ocrmypdf --plugin ocrmypdf_rapidocr -l eng input.pdf output.pdf
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Optional plugin arguments:
|
|
76
|
+
|
|
77
|
+
- `--rapidocr-config-path PATH`: use a custom RapidOCR YAML config
|
|
78
|
+
|
|
79
|
+
Example:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
ocrmypdf \
|
|
83
|
+
--plugin ocrmypdf_rapidocr \
|
|
84
|
+
-l deu \
|
|
85
|
+
input.pdf output.pdf
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Language behavior
|
|
89
|
+
|
|
90
|
+
The plugin accepts exactly one OCRmyPDF language code and maps it to a RapidOCR language family.
|
|
91
|
+
|
|
92
|
+
- direct mappings: `eng`, `chi_sim`, `chi_tra`, `jpn`, `kor`, `ara`, `rus`, `ukr`, `bel`, `bul`, `mkd`, `srp`, `tha`, `tam`, `tel`, `ell`/`gre`
|
|
93
|
+
- selected Latin-script codes map to RapidOCR `LATIN`
|
|
94
|
+
|
|
95
|
+
If a language code is unsupported, OCRmyPDF exits with a clear error message.
|
|
96
|
+
|
|
97
|
+
## Runtime model downloads
|
|
98
|
+
|
|
99
|
+
RapidOCR downloads model files on first use when model paths are not pinned in config.
|
|
100
|
+
For offline or restricted environments, provide a custom config via
|
|
101
|
+
`--rapidocr-config-path` that points to local model files.
|
|
102
|
+
|
|
103
|
+
## References
|
|
104
|
+
|
|
105
|
+
- OCRmyPDF plugin API docs: <https://github.com/ocrmypdf/OCRmyPDF/blob/main/docs/plugins.md>
|
|
106
|
+
- OCRmyPDF EasyOCR reference plugin: <https://github.com/ocrmypdf/OCRmyPDF-EasyOCR>
|
|
107
|
+
- OCRmyPDF AppleOCR reference plugin: <https://github.com/mkyt/OCRmyPDF-AppleOCR>
|
|
108
|
+
- OCRmyPDF PaddleOCR reference plugin: <https://github.com/clefru/ocrmypdf-paddleocr>
|
|
109
|
+
- RapidOCR project: <https://github.com/RapidAI/RapidOCR>
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# ocrmypdf-rapidocr
|
|
2
|
+
|
|
3
|
+
`ocrmypdf-rapidocr` is an OCRmyPDF plugin that uses [RapidOCR](https://github.com/RapidAI/RapidOCR) as an OCR engine.
|
|
4
|
+
|
|
5
|
+
## Status
|
|
6
|
+
|
|
7
|
+
Supported:
|
|
8
|
+
|
|
9
|
+
- OCR engine integration via OCRmyPDF plugin hooks
|
|
10
|
+
- `hOCR` output path (`--pdf-renderer auto` or `--pdf-renderer fpdf2`)
|
|
11
|
+
- ONNXRuntime backend only
|
|
12
|
+
- Single language selection from `-l/--language`
|
|
13
|
+
|
|
14
|
+
Not supported:
|
|
15
|
+
|
|
16
|
+
- `--pdf-renderer sandwich`
|
|
17
|
+
- multi-language combinations such as `-l eng+fra`
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install ocrmypdf-rapidocr
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or from source:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install .
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
Load the plugin explicitly with `--plugin`:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
ocrmypdf --plugin ocrmypdf_rapidocr -l eng input.pdf output.pdf
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Optional plugin arguments:
|
|
40
|
+
|
|
41
|
+
- `--rapidocr-config-path PATH`: use a custom RapidOCR YAML config
|
|
42
|
+
|
|
43
|
+
Example:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
ocrmypdf \
|
|
47
|
+
--plugin ocrmypdf_rapidocr \
|
|
48
|
+
-l deu \
|
|
49
|
+
input.pdf output.pdf
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Language behavior
|
|
53
|
+
|
|
54
|
+
The plugin accepts exactly one OCRmyPDF language code and maps it to a RapidOCR language family.
|
|
55
|
+
|
|
56
|
+
- direct mappings: `eng`, `chi_sim`, `chi_tra`, `jpn`, `kor`, `ara`, `rus`, `ukr`, `bel`, `bul`, `mkd`, `srp`, `tha`, `tam`, `tel`, `ell`/`gre`
|
|
57
|
+
- selected Latin-script codes map to RapidOCR `LATIN`
|
|
58
|
+
|
|
59
|
+
If a language code is unsupported, OCRmyPDF exits with a clear error message.
|
|
60
|
+
|
|
61
|
+
## Runtime model downloads
|
|
62
|
+
|
|
63
|
+
RapidOCR downloads model files on first use when model paths are not pinned in config.
|
|
64
|
+
For offline or restricted environments, provide a custom config via
|
|
65
|
+
`--rapidocr-config-path` that points to local model files.
|
|
66
|
+
|
|
67
|
+
## References
|
|
68
|
+
|
|
69
|
+
- OCRmyPDF plugin API docs: <https://github.com/ocrmypdf/OCRmyPDF/blob/main/docs/plugins.md>
|
|
70
|
+
- OCRmyPDF EasyOCR reference plugin: <https://github.com/ocrmypdf/OCRmyPDF-EasyOCR>
|
|
71
|
+
- OCRmyPDF AppleOCR reference plugin: <https://github.com/mkyt/OCRmyPDF-AppleOCR>
|
|
72
|
+
- OCRmyPDF PaddleOCR reference plugin: <https://github.com/clefru/ocrmypdf-paddleocr>
|
|
73
|
+
- RapidOCR project: <https://github.com/RapidAI/RapidOCR>
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "ocrmypdf-rapidocr"
|
|
3
|
+
readme = "README.md"
|
|
4
|
+
license = { file = "LICENSE" }
|
|
5
|
+
authors = [{ name = "Adrian Mazur" }]
|
|
6
|
+
version = "1.0.0"
|
|
7
|
+
requires-python = ">=3.12"
|
|
8
|
+
classifiers = [
|
|
9
|
+
"Programming Language :: Python :: 3.12",
|
|
10
|
+
"Programming Language :: Python :: 3.13",
|
|
11
|
+
"Programming Language :: Python :: 3.14",
|
|
12
|
+
]
|
|
13
|
+
dependencies = ["ocrmypdf>=17.3.0", "onnxruntime>=1.24.3", "rapidocr>=3.7.0"]
|
|
14
|
+
|
|
15
|
+
[dependency-groups]
|
|
16
|
+
dev = [
|
|
17
|
+
"pytest>=9.0.2",
|
|
18
|
+
"pytest-cov>=7.0.0",
|
|
19
|
+
"rapidfuzz>=3.14.1",
|
|
20
|
+
"ruff>=0.15.5",
|
|
21
|
+
"ty>=0.0.21",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[tool.ruff]
|
|
25
|
+
exclude = ["tests"]
|
|
26
|
+
|
|
27
|
+
[tool.ty.src]
|
|
28
|
+
exclude = ["tests"]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/adrianmazur-dev/ocrmypdf-rapidocr"
|
|
32
|
+
Repository = "https://github.com/adrianmazur-dev/ocrmypdf-rapidocr.git"
|
|
33
|
+
|
|
34
|
+
[build-system]
|
|
35
|
+
requires = ["uv_build>=0.10.8,<0.11.0"]
|
|
36
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from ocrmypdf_rapidocr.plugin import add_options, check_options, get_ocr_engine
|
|
4
|
+
from ocrmypdf_rapidocr.version import __version__
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"__version__",
|
|
8
|
+
"add_options",
|
|
9
|
+
"check_options",
|
|
10
|
+
"get_ocr_engine",
|
|
11
|
+
]
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib.metadata
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from ocrmypdf.pluginspec import OcrEngine, OrientationConfidence
|
|
8
|
+
from PIL import Image
|
|
9
|
+
|
|
10
|
+
from ocrmypdf_rapidocr.hocr import build_hocr_document, extract_hocr_lines
|
|
11
|
+
from ocrmypdf_rapidocr.languages import (
|
|
12
|
+
SUPPORTED_LANGUAGE_CODES,
|
|
13
|
+
map_language_to_langrec_name,
|
|
14
|
+
select_single_language,
|
|
15
|
+
)
|
|
16
|
+
from ocrmypdf_rapidocr.options import get_option_config_path
|
|
17
|
+
from ocrmypdf_rapidocr.version import __version__
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _import_rapidocr_symbols():
    """Import ``rapidocr`` on demand and return ``(RapidOCR, LangRec, EngineType)``.

    The import is performed inside the function body, so merely importing
    this module does not import ``rapidocr``.
    """
    from rapidocr import EngineType as engine_type
    from rapidocr import LangRec as lang_rec
    from rapidocr import RapidOCR as rapid_ocr

    return rapid_ocr, lang_rec, engine_type
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@lru_cache(maxsize=16)
def get_rapidocr_engine(language: str, config_path: str | None):
    """Create a ``RapidOCR`` instance for ``language``.

    Results are memoized by ``lru_cache`` (up to 16 distinct
    ``(language, config_path)`` pairs), so repeated calls with the same
    arguments return the same instance.

    Args:
        language: Language code understood by ``map_language_to_langrec_name``.
        config_path: Optional path passed through as RapidOCR's ``config_path``.

    Returns:
        The constructed ``RapidOCR`` object, with the ``Det``, ``Cls`` and
        ``Rec`` engine types set to ``EngineType.ONNXRUNTIME`` and
        ``Rec.lang_type`` set to the mapped ``LangRec`` member.
    """
    rapid_ocr_cls, lang_rec_enum, engine_types = _import_rapidocr_symbols()

    recognition_language = getattr(
        lang_rec_enum, map_language_to_langrec_name(language)
    )

    overrides = {
        f"{stage}.engine_type": engine_types.ONNXRUNTIME
        for stage in ("Det", "Cls", "Rec")
    }
    overrides["Rec.lang_type"] = recognition_language
    return rapid_ocr_cls(config_path=config_path, params=overrides)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class RapidOCREngine(OcrEngine):
    """OCRmyPDF ``OcrEngine`` implementation that delegates to RapidOCR.

    Only hOCR generation is implemented; ``generate_pdf`` always raises.
    """

    @staticmethod
    def version() -> str:
        """Return the installed ``rapidocr`` version, or ``"unknown"`` if absent."""
        try:
            installed = importlib.metadata.version("rapidocr")
        except importlib.metadata.PackageNotFoundError:
            installed = "unknown"
        return installed

    @staticmethod
    def creator_tag(options) -> str:
        """Return a tag naming both RapidOCR and this plugin with their versions."""
        engine_version = RapidOCREngine.version()
        return f"RapidOCR {engine_version} via ocrmypdf-rapidocr {__version__}"

    def __str__(self) -> str:
        """Return ``"RapidOCR <version>"``."""
        return "RapidOCR " + RapidOCREngine.version()

    @staticmethod
    def languages(options) -> set[str]:
        """Return a fresh mutable set of every supported language code."""
        return {*SUPPORTED_LANGUAGE_CODES}

    @staticmethod
    def get_orientation(input_file, options) -> OrientationConfidence:
        """Always report angle 0 with confidence 0.0; the input is not inspected."""
        return OrientationConfidence(angle=0, confidence=0.0)

    @staticmethod
    def get_deskew(input_file, options) -> float:
        """Always report a deskew angle of 0.0; the input is not inspected."""
        return 0.0

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options) -> None:
        """Run RapidOCR on ``input_file`` and write hOCR plus plain text.

        Args:
            input_file: Path of the page image to recognize.
            output_hocr: Destination path for the hOCR document (UTF-8).
            output_text: Destination path for the newline-joined text (UTF-8).
            options: OCRmyPDF options; the language and the optional RapidOCR
                config path are read from it.
        """
        language = select_single_language(options)
        engine = get_rapidocr_engine(language, get_option_config_path(options))

        # The image is opened only to learn the page dimensions; RapidOCR
        # itself is handed the file path below.
        with Image.open(input_file) as page_image:
            width, height = page_image.size

        ocr_result = engine(str(input_file))
        recognized = extract_hocr_lines(
            ocr_result, page_width=width, page_height=height
        )
        document = build_hocr_document(
            page_width=width,
            page_height=height,
            language=language,
            lines=recognized,
        )
        text_only = "\n".join(entry[0] for entry in recognized)

        Path(output_hocr).write_text(document, encoding="utf-8")
        Path(output_text).write_text(text_only, encoding="utf-8")

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options) -> None:
        """Unsupported: always raises ``NotImplementedError``."""
        message = (
            "ocrmypdf-rapidocr does not support sandwich renderer. "
            "Use --pdf-renderer auto or --pdf-renderer fpdf2."
        )
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from html import escape
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
# One recognized line as (text, (x0, y0, x1, y1) bounding box, confidence as an
# integer percentage), in the shape produced by extract_hocr_lines.
HocrLine = tuple[str, tuple[int, int, int, int], int]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def confidence_to_percent(score: Any) -> int:
    """Convert a recognition score to an integer percentage in ``[0, 100]``.

    Values at or below 1.0 are treated as fractions and multiplied by 100;
    larger values are taken to be percentages already.

    Args:
        score: Anything ``float()`` accepts, or ``None``.

    Returns:
        The rounded, clamped percentage. ``None``, values that cannot be
        converted to ``float`` and NaN give 0; positive and negative infinity
        are clamped to 100 and 0 respectively.
    """
    if score is None:
        return 0
    try:
        value = float(score)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large to be represented as a float.
        return 0
    if value != value:
        # NaN is the only float that is not equal to itself; rounding it
        # would raise ValueError.
        return 0
    if value <= 1.0:
        value *= 100.0
    # Clamp before rounding so that +/-inf never reaches int(), which would
    # raise OverflowError.
    return int(round(min(100.0, max(0.0, value))))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def bbox_from_polygon(polygon: Any, width: int, height: int) -> tuple[int, int, int, int]:
    """Return the axis-aligned bounding box of ``polygon`` clamped to the page.

    Args:
        polygon: Iterable of points, each indexable as ``point[0]`` (x) and
            ``point[1]`` (y). Points that cannot be read as two finite
            floats are ignored.
        width: Page width used as the upper bound for x.
        height: Page height used as the upper bound for y.

    Returns:
        ``(x0, y0, x1, y1)`` with ``x1 > x0`` and ``y1 > y0``. When
        ``polygon`` is not iterable or contains no usable point, the whole
        page ``(0, 0, width, height)`` is returned.
    """
    # Function-scope import so the module's import block stays untouched.
    from math import isfinite

    full_page = (0, 0, width, height)
    try:
        points = list(polygon)
    except TypeError:
        return full_page

    xs: list[float] = []
    ys: list[float] = []
    for point in points:
        try:
            x = float(point[0])
            y = float(point[1])
        except (TypeError, ValueError, IndexError, OverflowError):
            continue
        if not (isfinite(x) and isfinite(y)):
            # NaN/inf would make int(round(...)) below raise.
            continue
        xs.append(x)
        ys.append(y)

    if not xs:
        return full_page

    # The origin corner is kept strictly inside the page and the far corner
    # at least one pixel beyond it, so the box is never empty.
    x0 = max(0, min(width - 1, int(round(min(xs)))))
    y0 = max(0, min(height - 1, int(round(min(ys)))))
    x1 = max(x0 + 1, min(width, int(round(max(xs)))))
    y1 = max(y0 + 1, min(height, int(round(max(ys)))))
    return (x0, y0, x1, y1)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def extract_hocr_lines(
    result: Any,
    *,
    page_width: int,
    page_height: int,
) -> list[HocrLine]:
    """Turn a RapidOCR result object into a list of ``HocrLine`` tuples.

    The ``boxes``, ``txts`` and ``scores`` attributes of ``result`` are read;
    a missing or ``None`` attribute counts as empty. Boxes and texts are
    paired positionally up to the shorter of the two, entries whose text is
    blank after stripping are dropped, and a missing score is passed on as
    ``None``.

    Args:
        result: Object returned by the RapidOCR call (may lack attributes).
        page_width: Page width forwarded to ``bbox_from_polygon``.
        page_height: Page height forwarded to ``bbox_from_polygon``.

    Returns:
        ``(text, bbox, confidence)`` tuples in result order.
    """

    def _attribute_as_list(name: str) -> list:
        raw = getattr(result, name, None)
        return [] if raw is None else list(raw)

    boxes = _attribute_as_list("boxes")
    texts = _attribute_as_list("txts")
    scores = _attribute_as_list("scores")

    # Pad the scores so zip() is limited only by boxes/texts, never by scores.
    pair_count = min(len(boxes), len(texts))
    scores.extend([None] * max(0, pair_count - len(scores)))

    lines: list[HocrLine] = []
    for raw_text, polygon, score in zip(texts, boxes, scores):
        text = str(raw_text).strip()
        if text:
            bbox = bbox_from_polygon(polygon, page_width, page_height)
            lines.append((text, bbox, confidence_to_percent(score)))
    return lines
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def build_hocr_document(
    *,
    page_width: int,
    page_height: int,
    language: str,
    lines: list[HocrLine],
) -> str:
    """Render recognized lines as a single-page hOCR (XHTML) document.

    Every entry of ``lines`` becomes its own ``ocr_carea`` / ``ocr_par`` /
    ``ocr_line`` / ``ocrx_word`` nest; all four levels share the entry's
    bounding box, and the whole text of the entry is emitted as one word.

    Args:
        page_width: Page width written into the ``ocr_page`` bbox.
        page_height: Page height written into the ``ocr_page`` bbox.
        language: Value for the ``lang`` attribute of each paragraph.
        lines: ``(text, (x0, y0, x1, y1), confidence)`` tuples.

    Returns:
        The document as a newline-joined string.
    """
    # The language ends up inside a double-quoted attribute, so quotes must be
    # escaped as well as &, < and >.
    lang_attr = escape(str(language), quote=True)

    hocr_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
        ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
        "<head>",
        "<title></title>",
        '<meta http-equiv="content-type" content="text/html; charset=utf-8" />',
        '<meta name="ocr-system" content="RapidOCR via ocrmypdf-rapidocr" />',
        '<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />',
        "</head>",
        "<body>",
        f'<div class="ocr_page" id="page_1" title="bbox 0 0 {page_width} {page_height}">',
    ]

    for line_id, (text, bbox, confidence) in enumerate(lines, start=1):
        x0, y0, x1, y1 = bbox
        bbox_title = f"bbox {x0} {y0} {x1} {y1}"
        # Element content: only &, < and > need escaping.
        text_escaped = escape(text, quote=False)
        hocr_lines.extend(
            [
                f'<div class="ocr_carea" id="carea_{line_id}" title="{bbox_title}">',
                f'<p class="ocr_par" id="par_{line_id}" lang="{lang_attr}" title="{bbox_title}">',
                f'<span class="ocr_line" id="line_{line_id}" '
                f'title="{bbox_title}; baseline 0 0; x_wconf {confidence}">',
                f'<span class="ocrx_word" id="word_{line_id}" '
                f'title="{bbox_title}; x_wconf {confidence}">{text_escaped}</span>',
                "</span>",
                "</p>",
                "</div>",
            ]
        )

    hocr_lines.extend(["</div>", "</body>", "</html>"])
    return "\n".join(hocr_lines)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Sequence
|
|
4
|
+
|
|
5
|
+
from ocrmypdf.exceptions import BadArgsError
|
|
6
|
+
|
|
7
|
+
# OCRmyPDF/Tesseract language codes that map to a specific RapidOCR LangRec
# member name.
_DIRECT_LANGUAGE_MAP: dict[str, str] = {
    "ara": "ARABIC",
    "chi_sim": "CH",
    "chi_tra": "CHINESE_CHT",
    "eng": "EN",
    # Both Greek codes map to the same LangRec name.
    "ell": "EL",
    "gre": "EL",
    "jpn": "JAPAN",
    "kor": "KOREAN",
    "tha": "TH",
    "tam": "TA",
    "tel": "TE",
    # Codes that all map to the CYRILLIC LangRec name.
    "bel": "CYRILLIC",
    "bul": "CYRILLIC",
    "mkd": "CYRILLIC",
    "rus": "CYRILLIC",
    "srp": "CYRILLIC",
    "ukr": "CYRILLIC",
}

# Language codes without an entry of their own above; kept as a plain set.
_LATIN_LANGUAGE_CODES: set[str] = set(
    (
        "afr cat ces dan deu est eus fin fra gle hrv hun ind isl ita lav "
        "lit mlt msa nld nor pol por ron slk slv spa sqi swe tgl tur vie"
    ).split()
)

# Immutable union of both tables: every code the plugin accepts.
SUPPORTED_LANGUAGE_CODES: frozenset[str] = frozenset(
    _LATIN_LANGUAGE_CODES.union(_DIRECT_LANGUAGE_MAP)
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def normalize_languages(languages: Sequence[str] | None) -> list[str]:
    """Return the language codes stripped and lower-cased, dropping blanks.

    Args:
        languages: Language codes as supplied in the options. A single bare
            string is accepted and treated as one code.

    Returns:
        ``["eng"]`` when ``languages`` is ``None`` or empty; otherwise the
        cleaned codes in their original order. The result is empty when
        every supplied entry is blank.
    """
    if not languages:
        return ["eng"]
    if isinstance(languages, str):
        # A str is itself a sequence of one-character strings; without this
        # guard "eng" would be split into "e", "n", "g".
        languages = [languages]
    stripped = (str(language).strip() for language in languages)
    return [code.lower() for code in stripped if code]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def select_single_language(options: Any) -> str:
    """Return the one supported language code configured in ``options``.

    Reads ``options.languages`` (absent counts as ``None``) and normalizes it
    with ``normalize_languages``.

    Raises:
        BadArgsError: If the normalized list does not hold exactly one code,
            if that code contains ``+``, or if it is not in
            ``SUPPORTED_LANGUAGE_CODES``.
    """
    candidates = normalize_languages(getattr(options, "languages", None))
    if len(candidates) != 1:
        raise BadArgsError(
            "RapidOCR supports exactly one language. "
            "Pass a single language to -l/--language."
        )

    (language,) = candidates
    if "+" in language:
        raise BadArgsError(
            "RapidOCR does not support language combinations like eng+fra."
        )

    if language in SUPPORTED_LANGUAGE_CODES:
        return language

    supported = ", ".join(sorted(SUPPORTED_LANGUAGE_CODES))
    raise BadArgsError(
        f"Language '{language}' is not supported by ocrmypdf-rapidocr. "
        f"Supported values: {supported}"
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def map_language_to_langrec_name(language: str) -> str:
    """Return the RapidOCR ``LangRec`` member name for a language code.

    The lookup is case-insensitive. Codes listed in ``_DIRECT_LANGUAGE_MAP``
    use their mapped name; codes in ``_LATIN_LANGUAGE_CODES`` give
    ``"LATIN"``.

    Raises:
        KeyError: If the lower-cased code is in neither table.
    """
    code = language.lower()
    direct_name = _DIRECT_LANGUAGE_MAP.get(code)
    if direct_name is not None:
        return direct_name
    if code not in _LATIN_LANGUAGE_CODES:
        raise KeyError(code)
    return "LATIN"
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib.util
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def is_rapidocr_selected(options: Any) -> bool:
    """Return whether ``options`` selects this engine.

    ``options.ocr_engine`` is read, defaulting to ``"auto"`` when the
    attribute is absent; both ``"auto"`` and ``"rapidocr"`` count as selected.
    """
    selected_engine = getattr(options, "ocr_engine", "auto")
    return selected_engine == "auto" or selected_engine == "rapidocr"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def add_plugin_options(parser) -> None:
    """Add the ``RapidOCR`` argument group to ``parser``.

    The group holds a single option, ``--rapidocr-config-path``, which
    defaults to ``None``.
    """
    config_help = (
        "Path to a RapidOCR YAML config file. "
        "If omitted, RapidOCR defaults are used."
    )
    group = parser.add_argument_group("RapidOCR", "RapidOCR engine options")
    group.add_argument("--rapidocr-config-path", default=None, help=config_help)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def check_runtime_dependencies() -> None:
    """Verify that ``rapidocr`` and ``onnxruntime`` can be located.

    The modules are looked up with ``importlib.util.find_spec`` (not
    imported), in that order.

    Raises:
        MissingDependencyError: For the first module that cannot be found.
    """
    requirements = (
        (
            "rapidocr",
            "RapidOCR is not installed. Install it with: pip install rapidocr",
        ),
        (
            "onnxruntime",
            "onnxruntime is not installed. Install it with: pip install onnxruntime",
        ),
    )
    for module_name, message in requirements:
        if importlib.util.find_spec(module_name) is None:
            raise MissingDependencyError(message)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_option_config_path(options: Any) -> str | None:
    """Return ``options.rapidocr_config_path`` as a string.

    ``None`` is returned when the attribute is absent or set to ``None``;
    any other value is converted with ``str()``.
    """
    configured = getattr(options, "rapidocr_config_path", None)
    return None if configured is None else str(configured)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def validate_plugin_options(options: Any) -> None:
    """Reject option combinations the plugin cannot honor.

    Args:
        options: OCRmyPDF options; ``pdf_renderer`` (default ``"auto"``) and
            the RapidOCR config path are inspected.

    Raises:
        BadArgsError: If the sandwich renderer was requested, or if a config
            path was given that does not exist or is not a regular file.
    """
    if getattr(options, "pdf_renderer", "auto") == "sandwich":
        # Same remediation hint as RapidOCREngine.generate_pdf, so the user is
        # told which renderers to use whichever check fires first.
        raise BadArgsError(
            "ocrmypdf-rapidocr only supports hOCR/fpdf2 flow. "
            "Use --pdf-renderer auto or --pdf-renderer fpdf2."
        )

    config_path = get_option_config_path(options)
    if config_path is None:
        # No custom config requested: nothing further to validate.
        return

    path = Path(config_path)
    if not path.exists():
        raise BadArgsError(f"--rapidocr-config-path does not exist: {path}")
    if not path.is_file():
        raise BadArgsError(f"--rapidocr-config-path is not a file: {path}")
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from ocrmypdf import hookimpl
|
|
6
|
+
|
|
7
|
+
from ocrmypdf_rapidocr.engine import RapidOCREngine
|
|
8
|
+
from ocrmypdf_rapidocr.languages import select_single_language
|
|
9
|
+
from ocrmypdf_rapidocr.options import (
|
|
10
|
+
add_plugin_options,
|
|
11
|
+
check_runtime_dependencies,
|
|
12
|
+
is_rapidocr_selected,
|
|
13
|
+
validate_plugin_options,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
log = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@hookimpl
def add_options(parser) -> None:
    """OCRmyPDF ``add_options`` hook: delegate to ``add_plugin_options``."""
    add_plugin_options(parser)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@hookimpl
def check_options(options) -> None:
    """OCRmyPDF ``check_options`` hook: validate options when this engine is selected.

    Nothing is checked when ``is_rapidocr_selected`` is false. Otherwise the
    runtime dependencies, the plugin options and the language selection are
    validated in that order; each helper raises on failure.
    """
    if is_rapidocr_selected(options):
        check_runtime_dependencies()
        validate_plugin_options(options)
        # Called for its validation side effect; the returned code is unused.
        select_single_language(options)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@hookimpl
def get_ocr_engine(options=None):
    """OCRmyPDF ``get_ocr_engine`` hook: return the RapidOCR engine when applicable.

    Args:
        options: OCRmyPDF options, or ``None`` when the hook is invoked
            without them.

    Returns:
        A ``RapidOCREngine`` when ``options`` is ``None`` or selects this
        engine, otherwise ``None``.
    """
    # Reuse the same selection predicate as check_options so both hooks always
    # agree on when the plugin is active.
    if options is not None and not is_rapidocr_selected(options):
        return None
    return RapidOCREngine()
|