trivision-ocr 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ # ─── Tesseract binary path ───────────────────────────────────────────────────
2
+ # Windows default after winget install UB-Mannheim.TesseractOCR
3
+ TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
4
+
5
+ # ─── GPU ─────────────────────────────────────────────────────────────────────
6
+ # Set to "true" only if you have a CUDA-capable GPU + paddlepaddle-gpu installed
7
+ USE_GPU=false
8
+
9
+ # ─── Super-resolution ────────────────────────────────────────────────────────
10
+ # EDSR 2× upscale — requires models/EDSR_x2.pb (run download_models.py first)
11
+ # Adds ~3–8 seconds on CPU; set to "true" only on GPU or when accuracy is critical
12
+ USE_SUPER_RESOLVE=false
13
+
14
+ # ─── Models directory ────────────────────────────────────────────────────────
15
+ MODELS_DIR=models
16
+
17
+ # ─── Optional: pin GPU device ────────────────────────────────────────────────
18
+ # CUDA_VISIBLE_DEVICES=0
@@ -0,0 +1,19 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented here.
4
+
5
+ ## [1.0.0] — 2026-03-22
6
+
7
+ ### Added
8
+ - **PyPI Release**: Rebranded and packaged for PyPI deployment as `trivision-ocr`.
9
+ - **Stage 0 — Adaptive Preprocessor**: Hough-line median deskew (robust on sparse engineering diagrams), CLAHE contrast enhancement, Otsu + adaptive binarization, optional EDSR 2× super-resolution.
10
+ - **Stage 1 — Three-Engine OCR**: Tesseract (LSTM+CTC), PaddleOCR (PP-OCRv5 DBNet+SVTR), EasyOCR (CRAFT+CRNN). GPU engines run sequentially to avoid CUDA context conflicts.
11
+ - **Stage 2 — Confidence-Weighted Voter**: IoU spatial grouping, per-engine weighting, agreement bonus (+15% per agreeing engine).
12
+ - **Stage 3 — Domain Post-Corrector**: SymSpell spell correction with domain vocabulary protection (PRV, FCV, GV, BFP, HDPE, …).
13
+ - Vision API-compatible JSON output with bonus `engine_count` and `confidence` fields.
14
+ - `ocr-pipeline` CLI entry point.
15
+ - `download_models.py` for one-time model file downloads.
16
+ - `setup_venv.bat` for Windows venv bootstrap (CPU default, `gpu` argument for CUDA).
17
+ - `MANIFEST.in` for proper package distribution.
18
+ - Centralised `logger.py` for structured timestamped logging.
19
+ - Unit test suite (`tests/test_voter.py`) — 7 tests, no GPU required.
@@ -0,0 +1,5 @@
1
+ include .env.example
2
+ include CHANGELOG.md
3
+ include README.md
4
+ recursive-include models *
5
+ recursive-include tests *.png *.jpg *.jpeg *.tiff
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: trivision-ocr
3
+ Version: 1.0.0
4
+ Summary: Multi-engine OCR pipeline — beats Google Vision API
5
+ Author-email: Parthasarathy G <parthasarathyg693@gmail.com>
6
+ License: MIT
7
+ Keywords: ocr,tesseract,paddleocr,easyocr,vision-api
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: opencv-contrib-python-headless
14
+ Requires-Dist: pytesseract
15
+ Requires-Dist: Pillow
16
+ Requires-Dist: paddleocr
17
+ Requires-Dist: easyocr
18
+ Requires-Dist: symspellpy
19
+ Requires-Dist: numpy
20
+ Requires-Dist: python-dotenv
21
+ Requires-Dist: requests
22
+ Provides-Extra: cpu
23
+ Requires-Dist: paddlepaddle; extra == "cpu"
24
+ Provides-Extra: gpu
25
+ Requires-Dist: paddlepaddle-gpu; extra == "gpu"
26
+ Provides-Extra: api
27
+ Requires-Dist: fastapi; extra == "api"
28
+ Requires-Dist: uvicorn[standard]; extra == "api"
29
+ Requires-Dist: python-multipart; extra == "api"
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest; extra == "dev"
32
+ Requires-Dist: black; extra == "dev"
33
+ Requires-Dist: isort; extra == "dev"
34
+
35
+ # ocr-pipeline
36
+
37
+ Multi-engine OCR pipeline combining **Tesseract**, **PaddleOCR**, and **EasyOCR** with confidence-weighted voting and domain-aware spell correction. Output is compatible with the Google Vision API JSON schema.
38
+
39
+ ## Architecture
40
+
41
+ ```
42
+ Image → [Preprocessor] → Tesseract ┐
43
+ → PaddleOCR ├─→ [Voter] → [Corrector] → Vision API JSON
44
+ → EasyOCR ┘
45
+ ```
46
+
47
+ | Stage | What it does | Your edge vs Vision API |
48
+ |---|---|---|
49
+ | 0 — Preprocessor | Hough deskew, CLAHE, binarize, optional 2× EDSR | Domain-specific prep; Vision API gets raw images |
50
+ | 1 — Three engines | Tesseract (LSTM), PaddleOCR (DBNet+SVTR), EasyOCR (CRAFT+CRNN) | Different failure modes → the ensemble cancels out each engine's individual errors |
51
+ | 2 — Voter | IoU spatial grouping + weighted confidence + agreement bonus | Cross-engine consensus exposed; Vision API hides this |
52
+ | 3 — Corrector | SymSpell + domain vocabulary protection | Tuned to your label vocabulary; Vision API uses general LM |
53
+
54
+ ## Prerequisites
55
+
56
+ **Tesseract binary (Windows — required):**
57
+ ```powershell
58
+ winget install UB-Mannheim.TesseractOCR
59
+ ```
60
+
61
+ **Python 3.10+** must be installed and on PATH.
62
+
63
+ ## Quick Start
64
+
65
+ ```powershell
66
+ # 1. Clone / copy the project
67
+ cd ocr-pipeline
68
+
69
+ # 2. Create virtual environment and install (CPU default)
70
+ setup_venv.bat
71
+
72
+ # For GPU (CUDA + paddlepaddle-gpu):
73
+ setup_venv.bat gpu
74
+
75
+ # 3. Activate
76
+ .venv\Scripts\activate
77
+
78
+ # 4. Run on an image
79
+ ocr-pipeline path\to\image.jpg --pretty
80
+
81
+ # Save output to file
82
+ ocr-pipeline path\to\image.jpg --output result.json
83
+
84
+ # Text only
85
+ ocr-pipeline path\to\image.jpg --text-only
86
+
87
+ # Skip super-resolution (faster, no EDSR model needed)
88
+ ocr-pipeline path\to\image.jpg --no-super-resolve
89
+ ```
90
+
91
+ ## Python API
92
+
93
+ ```python
94
+ from ocr_pipeline import beat_vision_api
95
+
96
+ result = beat_vision_api("path/to/image.jpg")
97
+ print(result["responses"][0]["fullTextAnnotation"]["text"])
98
+ ```
99
+
100
+ ## Configuration
101
+
102
+ Copy `.env.example` to `.env` and edit:
103
+
104
+ ```env
105
+ TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
106
+ USE_GPU=false
107
+ USE_SUPER_RESOLVE=false
108
+ ```
109
+
110
+ Add project-specific label codes to `DOMAIN_TERMS` in `ocr_pipeline/config.py`.
111
+
112
+ ## Running Tests
113
+
114
+ ```powershell
115
+ pytest tests/test_voter.py -v
116
+ ```
117
+
118
+ All 7 tests run without GPU or model downloads.
119
+
120
+ ## Optional: Super-Resolution
121
+
122
+ EDSR 2× upscale (~143 MB model) significantly improves small text on diagram scans. Enable it:
123
+
124
+ 1. Run `python download_models.py` and choose `y` when prompted for EDSR
125
+ 2. Set `USE_SUPER_RESOLVE=true` in `.env`
126
+
127
+ ## Project Structure
128
+
129
+ ```
130
+ ocr-pipeline/
131
+ ├── ocr_pipeline/ ← Python package
132
+ │ ├── config.py ← All tunable parameters
133
+ │ ├── preprocessor.py ← Stage 0
134
+ │ ├── engines.py ← Stage 1
135
+ │ ├── voter.py ← Stage 2
136
+ │ ├── corrector.py ← Stage 3
137
+ │ └── pipeline.py ← Orchestration
138
+ ├── tests/
139
+ │ └── test_voter.py ← Unit tests (no GPU)
140
+ ├── models/ ← Downloaded model files
141
+ ├── setup_venv.bat ← Windows setup
142
+ └── download_models.py ← Model downloader
143
+ ```
@@ -0,0 +1,109 @@
1
+ # ocr-pipeline
2
+
3
+ Multi-engine OCR pipeline combining **Tesseract**, **PaddleOCR**, and **EasyOCR** with confidence-weighted voting and domain-aware spell correction. Output is compatible with the Google Vision API JSON schema.
4
+
5
+ ## Architecture
6
+
7
+ ```
8
+ Image → [Preprocessor] → Tesseract ┐
9
+ → PaddleOCR ├─→ [Voter] → [Corrector] → Vision API JSON
10
+ → EasyOCR ┘
11
+ ```
12
+
13
+ | Stage | What it does | Your edge vs Vision API |
14
+ |---|---|---|
15
+ | 0 — Preprocessor | Hough deskew, CLAHE, binarize, optional 2× EDSR | Domain-specific prep; Vision API gets raw images |
16
+ | 1 — Three engines | Tesseract (LSTM), PaddleOCR (DBNet+SVTR), EasyOCR (CRAFT+CRNN) | Different failure modes → ensemble eliminates each |
17
+ | 2 — Voter | IoU spatial grouping + weighted confidence + agreement bonus | Cross-engine consensus exposed; Vision API hides this |
18
+ | 3 — Corrector | SymSpell + domain vocabulary protection | Tuned to your label vocabulary; Vision API uses general LM |
19
+
20
+ ## Prerequisites
21
+
22
+ **Tesseract binary (Windows — required):**
23
+ ```powershell
24
+ winget install UB-Mannheim.TesseractOCR
25
+ ```
26
+
27
+ **Python 3.10+** must be installed and on PATH.
28
+
29
+ ## Quick Start
30
+
31
+ ```powershell
32
+ # 1. Clone / copy the project
33
+ cd ocr-pipeline
34
+
35
+ # 2. Create virtual environment and install (CPU default)
36
+ setup_venv.bat
37
+
38
+ # For GPU (CUDA + paddlepaddle-gpu):
39
+ setup_venv.bat gpu
40
+
41
+ # 3. Activate
42
+ .venv\Scripts\activate
43
+
44
+ # 4. Run on an image
45
+ ocr-pipeline path\to\image.jpg --pretty
46
+
47
+ # Save output to file
48
+ ocr-pipeline path\to\image.jpg --output result.json
49
+
50
+ # Text only
51
+ ocr-pipeline path\to\image.jpg --text-only
52
+
53
+ # Skip super-resolution (faster, no EDSR model needed)
54
+ ocr-pipeline path\to\image.jpg --no-super-resolve
55
+ ```
56
+
57
+ ## Python API
58
+
59
+ ```python
60
+ from ocr_pipeline import beat_vision_api
61
+
62
+ result = beat_vision_api("path/to/image.jpg")
63
+ print(result["responses"][0]["fullTextAnnotation"]["text"])
64
+ ```
65
+
66
+ ## Configuration
67
+
68
+ Copy `.env.example` to `.env` and edit:
69
+
70
+ ```env
71
+ TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
72
+ USE_GPU=false
73
+ USE_SUPER_RESOLVE=false
74
+ ```
75
+
76
+ Add project-specific label codes to `DOMAIN_TERMS` in `ocr_pipeline/config.py`.
77
+
78
+ ## Running Tests
79
+
80
+ ```powershell
81
+ pytest tests/test_voter.py -v
82
+ ```
83
+
84
+ All 7 tests run without GPU or model downloads.
85
+
86
+ ## Optional: Super-Resolution
87
+
88
+ EDSR 2× upscale (~143 MB model) significantly improves small text on diagram scans. Enable it:
89
+
90
+ 1. Run `python download_models.py` and choose `y` when prompted for EDSR
91
+ 2. Set `USE_SUPER_RESOLVE=true` in `.env`
92
+
93
+ ## Project Structure
94
+
95
+ ```
96
+ ocr-pipeline/
97
+ ├── ocr_pipeline/ ← Python package
98
+ │ ├── config.py ← All tunable parameters
99
+ │ ├── preprocessor.py ← Stage 0
100
+ │ ├── engines.py ← Stage 1
101
+ │ ├── voter.py ← Stage 2
102
+ │ ├── corrector.py ← Stage 3
103
+ │ └── pipeline.py ← Orchestration
104
+ ├── tests/
105
+ │ └── test_voter.py ← Unit tests (no GPU)
106
+ ├── models/ ← Downloaded model files
107
+ ├── setup_venv.bat ← Windows setup
108
+ └── download_models.py ← Model downloader
109
+ ```
File without changes
Binary file
@@ -0,0 +1,20 @@
1
+ """
2
+ ocr_pipeline — Multi-engine OCR pipeline.
3
+
4
+ Top-level public API:
5
+ from ocr_pipeline import beat_vision_api
6
+ result = beat_vision_api("path/to/image.jpg")
7
+
8
+ The pipeline import is lazy so that lightweight submodules (voter, config)
9
+ can be imported in test environments without requiring all heavy ML deps.
10
+ """
11
+
12
+ __version__ = "1.0.0"
13
+ __all__ = ["beat_vision_api"]
14
+
15
+
16
+ def __getattr__(name: str):
17
+ if name == "beat_vision_api":
18
+ from .pipeline import beat_vision_api # noqa: PLC0415
19
+ return beat_vision_api
20
+ raise AttributeError(f"module 'ocr_pipeline' has no attribute {name!r}")
@@ -0,0 +1,97 @@
1
+ """
2
+ config.py — Central configuration for the OCR pipeline.
3
+
4
+ Loads .env file automatically (via python-dotenv) so that TESSERACT_CMD and
5
+ other settings work reliably on a fresh machine without needing system env vars.
6
+ Import this module first in any other module — it also sets the Tesseract
7
+ binary path so pytesseract works on Windows without extra setup.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ from pathlib import Path
14
+
15
+ import pytesseract
16
+ from dotenv import load_dotenv
17
+
18
+ # Load .env from the project root (two levels up from this file)
19
+ _PROJECT_ROOT = Path(__file__).parent.parent
20
+ load_dotenv(_PROJECT_ROOT / ".env", override=False)
21
+
22
+ # ─── Tesseract binary (Windows) ──────────────────────────────────────────────
23
+ # Set TESSERACT_CMD in .env or as a system env var to override.
24
+ TESSERACT_CMD: str = os.environ.get(
25
+ "TESSERACT_CMD",
26
+ r"C:\Program Files\Tesseract-OCR\tesseract.exe",
27
+ )
28
+ pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD # applied at import time
29
+
30
+ # ─── Runtime flags ───────────────────────────────────────────────────────────
31
+ USE_GPU: bool = os.environ.get("USE_GPU", "false").lower() == "true"
32
+ USE_SUPER_RESOLVE: bool = os.environ.get("USE_SUPER_RESOLVE", "false").lower() == "true"
33
+
34
+ # ─── Paths (pathlib — no hardcoded strings) ──────────────────────────────────
35
+ MODELS_DIR: Path = Path(os.environ.get("MODELS_DIR", str(_PROJECT_ROOT / "models")))
36
+ EDSR_MODEL_PATH: str = str(MODELS_DIR / "EDSR_x2.pb")
37
+ SYMSPELL_DICT_PATH: str = str(MODELS_DIR / "frequency_dictionary_en_82_765.txt")
38
+
39
+ # ─── Engine weights ──────────────────────────────────────────────────────────
40
+ ENGINE_WEIGHTS: dict[str, float] = {
41
+ "paddleocr": 0.45,
42
+ "easyocr": 0.35,
43
+ "tesseract": 0.20,
44
+ }
45
+
46
+ # ─── Voter parameters ────────────────────────────────────────────────────────
47
+ MIN_CONFIDENCE: float = 0.05
48
+ IOU_THRESHOLD: float = 0.20
49
+ TEXT_SIMILARITY_THRESHOLD: float = 0.80
50
+ AGREEMENT_BONUS: float = 0.15
51
+ MIN_TESSERACT_CONF: int = 0
52
+
53
+ # ─── Preprocessing parameters ────────────────────────────────────────────────
54
+ CLAHE_CLIP_LIMIT: float = 3.0
55
+ CLAHE_TILE_GRID: tuple[int, int] = (8, 8)
56
+ ADAPTIVE_BLOCK_SIZE: int = 31
57
+ ADAPTIVE_C: int = 10
58
+
59
+ HOUGH_THRESHOLD: int = 80
60
+ HOUGH_MIN_LINE_LENGTH: int = 50
61
+ HOUGH_MAX_LINE_GAP: int = 10
62
+ HOUGH_MAX_ANGLE: float = 45.0
63
+
64
+ # ─── Domain vocabulary ───────────────────────────────────────────────────────
65
+ DOMAIN_TERMS: set[str] = {
66
+ "PRV", "FCV", "GV", "BFP", "MH", "DMH", "SV", "WM",
67
+ "HDPE", "DI", "MDIA", "NRV", "ARV", "PSV", "RFV",
68
+ "DN", "PN", "ID", "OD", "PVC", "GRP", "PE",
69
+ "GATE", "VALVE", "SLUICE", "BUTTERFLY", "CHECK", "BALL", "GLOBE",
70
+ "MANHOLE", "CHAMBER", "JUNCTION", "TEE", "BEND", "REDUCER",
71
+ }
72
+
73
+ # ─── SymSpell parameters ─────────────────────────────────────────────────────
74
+ SYMSPELL_MAX_EDIT_DISTANCE: int = 2
75
+ SYMSPELL_MIN_FREQUENCY: int = 50000
76
+
77
+ # ─── OCR confusion aliases ───────────────────────────────────────────────────
78
+ DOMAIN_MISREADS: dict[str, str] = {
79
+ "HOPE": "HDPE",
80
+ "HDFE": "HDPE",
81
+ "PNI6": "PN16",
82
+ "PNI": "PN1",
83
+ "BFP-S": "BFP-3",
84
+ "MHS": "MH-5",
85
+ "MH5": "MH-5",
86
+ "DN3OO": "DN300",
87
+ }
88
+
89
+ # ─── API auth ────────────────────────────────────────────────────────────────
90
+ API_KEYS: set[str] = set(
91
+ k.strip()
92
+ for k in os.environ.get("API_KEYS", "").split(",")
93
+ if k.strip()
94
+ )
95
+
96
+ # ─── Benchmark / logging ─────────────────────────────────────────────────────
97
+ LOG_LEVEL: str = os.environ.get("LOG_LEVEL", "INFO").upper()
@@ -0,0 +1,91 @@
1
+ """
2
+ corrector.py — Stage 3: Domain-aware SymSpell post-corrector.
3
+
4
+ Post-processes voted OCR results by spell-correcting general English tokens
5
+ while NEVER modifying domain-specific terms (valve codes, pipe labels, etc.).
6
+
7
+ Logic per token:
8
+ 1. Skip if the token is in DOMAIN_TERMS (any case normalisation applied).
9
+ 2. Skip if the token looks like a numeric/alphanumeric code (e.g. "DN300").
10
+ 3. Apply SymSpell with edit-distance ≤ 2; only accept suggestions with
11
+ high corpus frequency (avoids over-correcting rare but valid words).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+
18
+ from symspellpy import SymSpell, Verbosity
19
+
20
+ from . import config
21
+
22
# ── Initialise SymSpell once at module level ─────────────────────────────────
_sym_spell: SymSpell | None = None


def _get_sym_spell() -> SymSpell:
    """Return the module-cached SymSpell instance, building it on first call."""
    global _sym_spell
    if _sym_spell is not None:
        return _sym_spell

    _sym_spell = SymSpell(max_dictionary_edit_distance=config.SYMSPELL_MAX_EDIT_DISTANCE)
    loaded = _sym_spell.load_dictionary(
        config.SYMSPELL_DICT_PATH,
        term_index=0,
        count_index=1,
    )
    if not loaded:
        print(
            f"[WARN] SymSpell dictionary not found at {config.SYMSPELL_DICT_PATH}. "
            "Run download_models.py. Spell correction disabled."
        )
    # An empty SymSpell is still safe to query, so callers never crash.
    return _sym_spell
41
+
42
+
43
# ── Numeric / code pattern — never correct these tokens ──────────────────────
# Any token built from word chars / "/" / "-" that contains at least one digit
# (e.g. "DN300", "PN16", "2B-PRV"). The previous form had a second alternative
# "^\d+[\w/\-]*$" which this pattern fully subsumes (\d is a subset of \w).
_CODE_PATTERN = re.compile(r"^[\w/\-]*\d+[\w/\-]*$")
45
+
46
+
47
def correct_token(token: str) -> str:
    """
    Correct a single OCR token, respecting domain vocabulary.

    Returns the original token unchanged if:
    - It matches a domain term (case-insensitive look-up)
    - It looks like a numeric/alphanumeric code (e.g. "DN300", "2B-PRV")
    """
    folded = token.upper()

    # Protected vocabulary passes through untouched.
    if folded in config.DOMAIN_TERMS:
        return token

    # Known OCR confusions map directly to their canonical spelling.
    alias = config.DOMAIN_MISREADS.get(folded)
    if alias is not None:
        return alias

    # Codes such as "DN300" must never be spell-corrected.
    if _CODE_PATTERN.match(token):
        return token

    candidates = _get_sym_spell().lookup(
        token,
        Verbosity.CLOSEST,
        max_edit_distance=config.SYMSPELL_MAX_EDIT_DISTANCE,
        include_unknown=True,
    )
    if candidates:
        top = candidates[0]
        # Accept only corrections that are well attested in general English.
        if top.count >= config.SYMSPELL_MIN_FREQUENCY:
            return top.term

    return token
80
+
81
+
82
def post_correct(voted_results: list[dict]) -> list[dict]:
    """
    Apply ``correct_token`` to every token in every result entry.

    Modifies results in-place and also returns the list.
    """
    for entry in voted_results:
        corrected = [correct_token(tok) for tok in entry["text"].split()]
        entry["text"] = " ".join(corrected)
    return voted_results