trivision-ocr 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trivision_ocr-1.0.0/.env.example +18 -0
- trivision_ocr-1.0.0/CHANGELOG.md +19 -0
- trivision_ocr-1.0.0/MANIFEST.in +5 -0
- trivision_ocr-1.0.0/PKG-INFO +143 -0
- trivision_ocr-1.0.0/README.md +109 -0
- trivision_ocr-1.0.0/models/.gitkeep +0 -0
- trivision_ocr-1.0.0/models/EDSR_x2.pb +0 -0
- trivision_ocr-1.0.0/ocr_pipeline/__init__.py +20 -0
- trivision_ocr-1.0.0/ocr_pipeline/config.py +97 -0
- trivision_ocr-1.0.0/ocr_pipeline/corrector.py +91 -0
- trivision_ocr-1.0.0/ocr_pipeline/engines.py +342 -0
- trivision_ocr-1.0.0/ocr_pipeline/logger.py +36 -0
- trivision_ocr-1.0.0/ocr_pipeline/pipeline.py +90 -0
- trivision_ocr-1.0.0/ocr_pipeline/preprocessor.py +324 -0
- trivision_ocr-1.0.0/ocr_pipeline/run.py +94 -0
- trivision_ocr-1.0.0/ocr_pipeline/voter.py +145 -0
- trivision_ocr-1.0.0/pyproject.toml +55 -0
- trivision_ocr-1.0.0/setup.cfg +4 -0
- trivision_ocr-1.0.0/tests/sample.png +0 -0
- trivision_ocr-1.0.0/tests/test_voter.py +122 -0
- trivision_ocr-1.0.0/trivision_ocr.egg-info/PKG-INFO +143 -0
- trivision_ocr-1.0.0/trivision_ocr.egg-info/SOURCES.txt +24 -0
- trivision_ocr-1.0.0/trivision_ocr.egg-info/dependency_links.txt +1 -0
- trivision_ocr-1.0.0/trivision_ocr.egg-info/entry_points.txt +2 -0
- trivision_ocr-1.0.0/trivision_ocr.egg-info/requires.txt +25 -0
- trivision_ocr-1.0.0/trivision_ocr.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# ─── Tesseract binary path ───────────────────────────────────────────────────
|
|
2
|
+
# Windows default after winget install UB-Mannheim.TesseractOCR
|
|
3
|
+
TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
|
|
4
|
+
|
|
5
|
+
# ─── GPU ─────────────────────────────────────────────────────────────────────
|
|
6
|
+
# Set to "true" only if you have a CUDA-capable GPU + paddlepaddle-gpu installed
|
|
7
|
+
USE_GPU=false
|
|
8
|
+
|
|
9
|
+
# ─── Super-resolution ────────────────────────────────────────────────────────
|
|
10
|
+
# EDSR 2× upscale — requires models/EDSR_x2.pb (run download_models.py first)
|
|
11
|
+
# Adds ~3–8 seconds on CPU; set to "true" only on GPU or when accuracy is critical
|
|
12
|
+
USE_SUPER_RESOLVE=false
|
|
13
|
+
|
|
14
|
+
# ─── Models directory ────────────────────────────────────────────────────────
|
|
15
|
+
MODELS_DIR=models
|
|
16
|
+
|
|
17
|
+
# ─── Optional: pin GPU device ────────────────────────────────────────────────
|
|
18
|
+
# CUDA_VISIBLE_DEVICES=0
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented here.
|
|
4
|
+
|
|
5
|
+
## [1.0.0] — 2026-03-22
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
- **PyPI Release**: Rebranded and packaged for PyPI deployment as `trivision-ocr`.
|
|
9
|
+
- **Stage 0 — Adaptive Preprocessor**: Hough-line median deskew (robust on sparse engineering diagrams), CLAHE contrast enhancement, Otsu + adaptive binarization, optional EDSR 2× super-resolution.
|
|
10
|
+
- **Stage 1 — Three-Engine OCR**: Tesseract (LSTM+CTC), PaddleOCR (PP-OCRv5 DBNet+SVTR), EasyOCR (CRAFT+CRNN). GPU engines run sequentially to avoid CUDA context conflicts.
|
|
11
|
+
- **Stage 2 — Confidence-Weighted Voter**: IoU spatial grouping, per-engine weighting, agreement bonus (+15% per agreeing engine).
|
|
12
|
+
- **Stage 3 — Domain Post-Corrector**: SymSpell spell correction with domain vocabulary protection (PRV, FCV, GV, BFP, HDPE, …).
|
|
13
|
+
- Vision API-compatible JSON output with bonus `engine_count` and `confidence` fields.
|
|
14
|
+
- `ocr-pipeline` CLI entry point.
|
|
15
|
+
- `download_models.py` for one-time model file downloads.
|
|
16
|
+
- `setup_venv.bat` for Windows venv bootstrap (CPU default, `gpu` argument for CUDA).
|
|
17
|
+
- `MANIFEST.in` for proper package distribution.
|
|
18
|
+
- Centralised `logger.py` for structured timestamped logging.
|
|
19
|
+
- Unit test suite (`tests/test_voter.py`) — 7 tests, no GPU required.
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trivision-ocr
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Multi-engine OCR pipeline — beats Google Vision API
|
|
5
|
+
Author-email: Parthasarathy G <parthasarathyg693@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: ocr,tesseract,paddleocr,easyocr,vision-api
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: opencv-contrib-python-headless
|
|
14
|
+
Requires-Dist: pytesseract
|
|
15
|
+
Requires-Dist: Pillow
|
|
16
|
+
Requires-Dist: paddleocr
|
|
17
|
+
Requires-Dist: easyocr
|
|
18
|
+
Requires-Dist: symspellpy
|
|
19
|
+
Requires-Dist: numpy
|
|
20
|
+
Requires-Dist: python-dotenv
|
|
21
|
+
Requires-Dist: requests
|
|
22
|
+
Provides-Extra: cpu
|
|
23
|
+
Requires-Dist: paddlepaddle; extra == "cpu"
|
|
24
|
+
Provides-Extra: gpu
|
|
25
|
+
Requires-Dist: paddlepaddle-gpu; extra == "gpu"
|
|
26
|
+
Provides-Extra: api
|
|
27
|
+
Requires-Dist: fastapi; extra == "api"
|
|
28
|
+
Requires-Dist: uvicorn[standard]; extra == "api"
|
|
29
|
+
Requires-Dist: python-multipart; extra == "api"
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest; extra == "dev"
|
|
32
|
+
Requires-Dist: black; extra == "dev"
|
|
33
|
+
Requires-Dist: isort; extra == "dev"
|
|
34
|
+
|
|
35
|
+
# ocr-pipeline
|
|
36
|
+
|
|
37
|
+
Multi-engine OCR pipeline combining **Tesseract**, **PaddleOCR**, and **EasyOCR** with confidence-weighted voting and domain-aware spell correction. Output is compatible with the Google Vision API JSON schema.
|
|
38
|
+
|
|
39
|
+
## Architecture
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
Image → [Preprocessor] → Tesseract ┐
|
|
43
|
+
→ PaddleOCR ├─→ [Voter] → [Corrector] → Vision API JSON
|
|
44
|
+
→ EasyOCR ┘
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
| Stage | What it does | Your edge vs Vision API |
|
|
48
|
+
|---|---|---|
|
|
49
|
+
| 0 — Preprocessor | Hough deskew, CLAHE, binarize, optional 2× EDSR | Domain-specific prep; Vision API gets raw images |
|
|
50
|
+
| 1 — Three engines | Tesseract (LSTM), PaddleOCR (DBNet+SVTR), EasyOCR (CRAFT+CRNN) | Different failure modes → ensemble eliminates each |
|
|
51
|
+
| 2 — Voter | IoU spatial grouping + weighted confidence + agreement bonus | Cross-engine consensus exposed; Vision API hides this |
|
|
52
|
+
| 3 — Corrector | SymSpell + domain vocabulary protection | Tuned to your label vocabulary; Vision API uses general LM |
|
|
53
|
+
|
|
54
|
+
## Prerequisites
|
|
55
|
+
|
|
56
|
+
**Tesseract binary (Windows — required):**
|
|
57
|
+
```powershell
|
|
58
|
+
winget install UB-Mannheim.TesseractOCR
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Python 3.10+** must be installed and on PATH.
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
```powershell
|
|
66
|
+
# 1. Clone / copy the project
|
|
67
|
+
cd ocr-pipeline
|
|
68
|
+
|
|
69
|
+
# 2. Create virtual environment and install (CPU default)
|
|
70
|
+
setup_venv.bat
|
|
71
|
+
|
|
72
|
+
# For GPU (CUDA + paddlepaddle-gpu):
|
|
73
|
+
setup_venv.bat gpu
|
|
74
|
+
|
|
75
|
+
# 3. Activate
|
|
76
|
+
.venv\Scripts\activate
|
|
77
|
+
|
|
78
|
+
# 4. Run on an image
|
|
79
|
+
ocr-pipeline path\to\image.jpg --pretty
|
|
80
|
+
|
|
81
|
+
# Save output to file
|
|
82
|
+
ocr-pipeline path\to\image.jpg --output result.json
|
|
83
|
+
|
|
84
|
+
# Text only
|
|
85
|
+
ocr-pipeline path\to\image.jpg --text-only
|
|
86
|
+
|
|
87
|
+
# Skip super-resolution (faster, no EDSR model needed)
|
|
88
|
+
ocr-pipeline path\to\image.jpg --no-super-resolve
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Python API
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from ocr_pipeline import beat_vision_api
|
|
95
|
+
|
|
96
|
+
result = beat_vision_api("path/to/image.jpg")
|
|
97
|
+
print(result["responses"][0]["fullTextAnnotation"]["text"])
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Configuration
|
|
101
|
+
|
|
102
|
+
Copy `.env.example` to `.env` and edit:
|
|
103
|
+
|
|
104
|
+
```env
|
|
105
|
+
TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
|
|
106
|
+
USE_GPU=false
|
|
107
|
+
USE_SUPER_RESOLVE=false
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Add project-specific label codes to `DOMAIN_TERMS` in `ocr_pipeline/config.py`.
|
|
111
|
+
|
|
112
|
+
## Running Tests
|
|
113
|
+
|
|
114
|
+
```powershell
|
|
115
|
+
pytest tests/test_voter.py -v
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
All 7 tests run without GPU or model downloads.
|
|
119
|
+
|
|
120
|
+
## Optional: Super-Resolution
|
|
121
|
+
|
|
122
|
+
EDSR 2× upscale (~143 MB model) significantly improves small text on diagram scans. Enable it:
|
|
123
|
+
|
|
124
|
+
1. Run `python download_models.py` and choose `y` when prompted for EDSR
|
|
125
|
+
2. Set `USE_SUPER_RESOLVE=true` in `.env`
|
|
126
|
+
|
|
127
|
+
## Project Structure
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
ocr-pipeline/
|
|
131
|
+
├── ocr_pipeline/ ← Python package
|
|
132
|
+
│ ├── config.py ← All tunable parameters
|
|
133
|
+
│ ├── preprocessor.py ← Stage 0
|
|
134
|
+
│ ├── engines.py ← Stage 1
|
|
135
|
+
│ ├── voter.py ← Stage 2
|
|
136
|
+
│ ├── corrector.py ← Stage 3
|
|
137
|
+
│ └── pipeline.py ← Orchestration
|
|
138
|
+
├── tests/
|
|
139
|
+
│ └── test_voter.py ← Unit tests (no GPU)
|
|
140
|
+
├── models/ ← Downloaded model files
|
|
141
|
+
├── setup_venv.bat ← Windows setup
|
|
142
|
+
└── download_models.py ← Model downloader
|
|
143
|
+
```
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# ocr-pipeline
|
|
2
|
+
|
|
3
|
+
Multi-engine OCR pipeline combining **Tesseract**, **PaddleOCR**, and **EasyOCR** with confidence-weighted voting and domain-aware spell correction. Output is compatible with the Google Vision API JSON schema.
|
|
4
|
+
|
|
5
|
+
## Architecture
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
Image → [Preprocessor] → Tesseract ┐
|
|
9
|
+
→ PaddleOCR ├─→ [Voter] → [Corrector] → Vision API JSON
|
|
10
|
+
→ EasyOCR ┘
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
| Stage | What it does | Your edge vs Vision API |
|
|
14
|
+
|---|---|---|
|
|
15
|
+
| 0 — Preprocessor | Hough deskew, CLAHE, binarize, optional 2× EDSR | Domain-specific prep; Vision API gets raw images |
|
|
16
|
+
| 1 — Three engines | Tesseract (LSTM), PaddleOCR (DBNet+SVTR), EasyOCR (CRAFT+CRNN) | Different failure modes → ensemble eliminates each |
|
|
17
|
+
| 2 — Voter | IoU spatial grouping + weighted confidence + agreement bonus | Cross-engine consensus exposed; Vision API hides this |
|
|
18
|
+
| 3 — Corrector | SymSpell + domain vocabulary protection | Tuned to your label vocabulary; Vision API uses general LM |
|
|
19
|
+
|
|
20
|
+
## Prerequisites
|
|
21
|
+
|
|
22
|
+
**Tesseract binary (Windows — required):**
|
|
23
|
+
```powershell
|
|
24
|
+
winget install UB-Mannheim.TesseractOCR
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**Python 3.10+** must be installed and on PATH.
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
```powershell
|
|
32
|
+
# 1. Clone / copy the project
|
|
33
|
+
cd ocr-pipeline
|
|
34
|
+
|
|
35
|
+
# 2. Create virtual environment and install (CPU default)
|
|
36
|
+
setup_venv.bat
|
|
37
|
+
|
|
38
|
+
# For GPU (CUDA + paddlepaddle-gpu):
|
|
39
|
+
setup_venv.bat gpu
|
|
40
|
+
|
|
41
|
+
# 3. Activate
|
|
42
|
+
.venv\Scripts\activate
|
|
43
|
+
|
|
44
|
+
# 4. Run on an image
|
|
45
|
+
ocr-pipeline path\to\image.jpg --pretty
|
|
46
|
+
|
|
47
|
+
# Save output to file
|
|
48
|
+
ocr-pipeline path\to\image.jpg --output result.json
|
|
49
|
+
|
|
50
|
+
# Text only
|
|
51
|
+
ocr-pipeline path\to\image.jpg --text-only
|
|
52
|
+
|
|
53
|
+
# Skip super-resolution (faster, no EDSR model needed)
|
|
54
|
+
ocr-pipeline path\to\image.jpg --no-super-resolve
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Python API
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from ocr_pipeline import beat_vision_api
|
|
61
|
+
|
|
62
|
+
result = beat_vision_api("path/to/image.jpg")
|
|
63
|
+
print(result["responses"][0]["fullTextAnnotation"]["text"])
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Configuration
|
|
67
|
+
|
|
68
|
+
Copy `.env.example` to `.env` and edit:
|
|
69
|
+
|
|
70
|
+
```env
|
|
71
|
+
TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
|
|
72
|
+
USE_GPU=false
|
|
73
|
+
USE_SUPER_RESOLVE=false
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Add project-specific label codes to `DOMAIN_TERMS` in `ocr_pipeline/config.py`.
|
|
77
|
+
|
|
78
|
+
## Running Tests
|
|
79
|
+
|
|
80
|
+
```powershell
|
|
81
|
+
pytest tests/test_voter.py -v
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
All 7 tests run without GPU or model downloads.
|
|
85
|
+
|
|
86
|
+
## Optional: Super-Resolution
|
|
87
|
+
|
|
88
|
+
EDSR 2× upscale (~143 MB model) significantly improves small text on diagram scans. Enable it:
|
|
89
|
+
|
|
90
|
+
1. Run `python download_models.py` and choose `y` when prompted for EDSR
|
|
91
|
+
2. Set `USE_SUPER_RESOLVE=true` in `.env`
|
|
92
|
+
|
|
93
|
+
## Project Structure
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
ocr-pipeline/
|
|
97
|
+
├── ocr_pipeline/ ← Python package
|
|
98
|
+
│ ├── config.py ← All tunable parameters
|
|
99
|
+
│ ├── preprocessor.py ← Stage 0
|
|
100
|
+
│ ├── engines.py ← Stage 1
|
|
101
|
+
│ ├── voter.py ← Stage 2
|
|
102
|
+
│ ├── corrector.py ← Stage 3
|
|
103
|
+
│ └── pipeline.py ← Orchestration
|
|
104
|
+
├── tests/
|
|
105
|
+
│ └── test_voter.py ← Unit tests (no GPU)
|
|
106
|
+
├── models/ ← Downloaded model files
|
|
107
|
+
├── setup_venv.bat ← Windows setup
|
|
108
|
+
└── download_models.py ← Model downloader
|
|
109
|
+
```
|
|
File without changes
|
|
Binary file
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ocr_pipeline — Multi-engine OCR pipeline.
|
|
3
|
+
|
|
4
|
+
Top-level public API:
|
|
5
|
+
from ocr_pipeline import beat_vision_api
|
|
6
|
+
result = beat_vision_api("path/to/image.jpg")
|
|
7
|
+
|
|
8
|
+
The pipeline import is lazy so that lightweight submodules (voter, config)
|
|
9
|
+
can be imported in test environments without requiring all heavy ML deps.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
__version__ = "1.0.0"
|
|
13
|
+
__all__ = ["beat_vision_api"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def __getattr__(name: str):
    """PEP 562 lazy-attribute hook: import the heavy pipeline on first access."""
    if name != "beat_vision_api":
        raise AttributeError(f"module 'ocr_pipeline' has no attribute {name!r}")
    from .pipeline import beat_vision_api  # noqa: PLC0415
    return beat_vision_api
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
config.py — Central configuration for the OCR pipeline.
|
|
3
|
+
|
|
4
|
+
Loads .env file automatically (via python-dotenv) so that TESSERACT_CMD and
|
|
5
|
+
other settings work reliably on a fresh machine without needing system env vars.
|
|
6
|
+
Import this module first in any other module — it also sets the Tesseract
|
|
7
|
+
binary path so pytesseract works on Windows without extra setup.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import pytesseract
|
|
16
|
+
from dotenv import load_dotenv
|
|
17
|
+
|
|
18
|
+
# Load .env from the project root (two levels up from this file).
# override=False means real environment variables win over .env entries.
_PROJECT_ROOT = Path(__file__).parent.parent
load_dotenv(_PROJECT_ROOT / ".env", override=False)

# ─── Tesseract binary (Windows) ──────────────────────────────────────────────
# Set TESSERACT_CMD in .env or as a system env var to override.
# Default is the UB-Mannheim winget install location.
TESSERACT_CMD: str = os.environ.get(
    "TESSERACT_CMD",
    r"C:\Program Files\Tesseract-OCR\tesseract.exe",
)
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD  # applied at import time

# ─── Runtime flags ───────────────────────────────────────────────────────────
# Only the exact string "true" (case-insensitive) enables a flag; anything
# else, including an unset variable, leaves it off.
USE_GPU: bool = os.environ.get("USE_GPU", "false").lower() == "true"
USE_SUPER_RESOLVE: bool = os.environ.get("USE_SUPER_RESOLVE", "false").lower() == "true"

# ─── Paths (pathlib — no hardcoded strings) ──────────────────────────────────
MODELS_DIR: Path = Path(os.environ.get("MODELS_DIR", str(_PROJECT_ROOT / "models")))
# EDSR 2x super-resolution model, used when USE_SUPER_RESOLVE is enabled.
EDSR_MODEL_PATH: str = str(MODELS_DIR / "EDSR_x2.pb")
# SymSpell English frequency dictionary (one "term count" pair per line).
SYMSPELL_DICT_PATH: str = str(MODELS_DIR / "frequency_dictionary_en_82_765.txt")

# ─── Engine weights ──────────────────────────────────────────────────────────
# Relative trust placed in each OCR engine when voting (weights sum to 1.0).
ENGINE_WEIGHTS: dict[str, float] = {
    "paddleocr": 0.45,
    "easyocr": 0.35,
    "tesseract": 0.20,
}

# ─── Voter parameters ────────────────────────────────────────────────────────
MIN_CONFIDENCE: float = 0.05  # confidence floor for keeping a detection
IOU_THRESHOLD: float = 0.20  # spatial-grouping IoU cutoff
TEXT_SIMILARITY_THRESHOLD: float = 0.80  # text-match ratio counted as agreement
AGREEMENT_BONUS: float = 0.15  # confidence bonus per agreeing engine
MIN_TESSERACT_CONF: int = 0  # per-word Tesseract confidence floor

# ─── Preprocessing parameters ────────────────────────────────────────────────
CLAHE_CLIP_LIMIT: float = 3.0  # CLAHE contrast clip limit
CLAHE_TILE_GRID: tuple[int, int] = (8, 8)  # CLAHE tile grid size
ADAPTIVE_BLOCK_SIZE: int = 31  # adaptive-threshold window (must stay odd)
ADAPTIVE_C: int = 10  # adaptive-threshold constant offset

# Hough-line deskew parameters (see preprocessor.py).
HOUGH_THRESHOLD: int = 80
HOUGH_MIN_LINE_LENGTH: int = 50
HOUGH_MAX_LINE_GAP: int = 10
HOUGH_MAX_ANGLE: float = 45.0  # ignore lines steeper than this (degrees)

# ─── Domain vocabulary ───────────────────────────────────────────────────────
# Tokens the spell corrector must never alter (valve/pipe/fitting codes).
DOMAIN_TERMS: set[str] = {
    "PRV", "FCV", "GV", "BFP", "MH", "DMH", "SV", "WM",
    "HDPE", "DI", "MDIA", "NRV", "ARV", "PSV", "RFV",
    "DN", "PN", "ID", "OD", "PVC", "GRP", "PE",
    "GATE", "VALVE", "SLUICE", "BUTTERFLY", "CHECK", "BALL", "GLOBE",
    "MANHOLE", "CHAMBER", "JUNCTION", "TEE", "BEND", "REDUCER",
}

# ─── SymSpell parameters ─────────────────────────────────────────────────────
SYMSPELL_MAX_EDIT_DISTANCE: int = 2  # maximum edit distance for lookups
SYMSPELL_MIN_FREQUENCY: int = 50000  # minimum corpus count to accept a fix

# ─── OCR confusion aliases ───────────────────────────────────────────────────
# Known engine misreads mapped to the intended domain token; keys are matched
# against the upper-cased token in corrector.correct_token.
DOMAIN_MISREADS: dict[str, str] = {
    "HOPE": "HDPE",
    "HDFE": "HDPE",
    "PNI6": "PN16",
    "PNI": "PN1",
    "BFP-S": "BFP-3",
    "MHS": "MH-5",
    "MH5": "MH-5",
    "DN3OO": "DN300",
}

# ─── API auth ────────────────────────────────────────────────────────────────
# Comma-separated API_KEYS env var → set of non-empty, stripped keys.
# Empty env var yields an empty set.
API_KEYS: set[str] = set(
    k.strip()
    for k in os.environ.get("API_KEYS", "").split(",")
    if k.strip()
)

# ─── Benchmark / logging ─────────────────────────────────────────────────────
LOG_LEVEL: str = os.environ.get("LOG_LEVEL", "INFO").upper()
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
corrector.py — Stage 3: Domain-aware SymSpell post-corrector.
|
|
3
|
+
|
|
4
|
+
Post-processes voted OCR results by spell-correcting general English tokens
|
|
5
|
+
while NEVER modifying domain-specific terms (valve codes, pipe labels, etc.).
|
|
6
|
+
|
|
7
|
+
Logic per token:
|
|
8
|
+
1. Skip if the token is in DOMAIN_TERMS (any case normalisation applied).
|
|
9
|
+
2. Skip if the token looks like a numeric/alphanumeric code (e.g. "DN300").
|
|
10
|
+
3. Apply SymSpell with edit-distance ≤ 2; only accept suggestions with
|
|
11
|
+
high corpus frequency (avoids over-correcting rare but valid words).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
|
|
18
|
+
from symspellpy import SymSpell, Verbosity
|
|
19
|
+
|
|
20
|
+
from . import config
|
|
21
|
+
|
|
22
|
+
# ── Initialise SymSpell once at module level ─────────────────────────────────
# Lazily created singleton; populated on first use by _get_sym_spell().
_sym_spell: SymSpell | None = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _get_sym_spell() -> SymSpell:
    """Return the cached module-level SymSpell instance, building it on first call.

    If the frequency dictionary file is missing, a warning is printed and the
    (empty) SymSpell object is returned anyway, so spell correction degrades
    to a no-op rather than crashing callers.
    """
    global _sym_spell
    if _sym_spell is not None:
        return _sym_spell

    _sym_spell = SymSpell(max_dictionary_edit_distance=config.SYMSPELL_MAX_EDIT_DISTANCE)
    loaded = _sym_spell.load_dictionary(
        config.SYMSPELL_DICT_PATH,
        term_index=0,
        count_index=1,
    )
    if not loaded:
        print(
            f"[WARN] SymSpell dictionary not found at {config.SYMSPELL_DICT_PATH}. "
            "Run download_models.py. Spell correction disabled."
        )
    return _sym_spell
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ── Numeric / code pattern — never correct these tokens ──────────────────────
# Matches any token containing at least one digit, optionally mixed with word
# characters, "/" or "-" (e.g. "DN300", "2B-PRV").
# NOTE(review): the first alternative appears subsumed by the second
# (`[\w/\-]*` may be empty and `\d` ⊂ `\w`) and could likely be dropped.
_CODE_PATTERN = re.compile(r"^\d+[\w/\-]*$|^[\w/\-]*\d+[\w/\-]*$")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def correct_token(token: str) -> str:
    """Spell-correct one OCR token while respecting the domain vocabulary.

    The token is returned unchanged when:
    - its upper-cased form is a protected domain term,
    - it looks like a numeric/alphanumeric code (e.g. "DN300", "2B-PRV"),
    - SymSpell offers no sufficiently frequent correction.
    Known misreads in DOMAIN_MISREADS are replaced by their canonical form.
    """
    normalized = token.upper()

    # Protected vocabulary: never touch these.
    if normalized in config.DOMAIN_TERMS:
        return token

    # Known engine misread → canonical domain token.
    alias = config.DOMAIN_MISREADS.get(normalized)
    if alias is not None:
        return alias

    # Codes with digits are identifiers, not English words.
    if _CODE_PATTERN.match(token):
        return token

    candidates = _get_sym_spell().lookup(
        token,
        Verbosity.CLOSEST,
        max_edit_distance=config.SYMSPELL_MAX_EDIT_DISTANCE,
        include_unknown=True,
    )

    # Accept only corrections well attested in general English; otherwise keep
    # the original token (avoids over-correcting rare but valid words).
    if candidates and candidates[0].count >= config.SYMSPELL_MIN_FREQUENCY:
        return candidates[0].term
    return token
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def post_correct(voted_results: list[dict]) -> list[dict]:
    """Run ``correct_token`` over every whitespace-separated token of each entry.

    Each entry's ``"text"`` field is rewritten in place; the same list object
    is returned for convenience.
    """
    for entry in voted_results:
        corrected = [correct_token(tok) for tok in entry["text"].split()]
        entry["text"] = " ".join(corrected)
    return voted_results
|