monocr-0.1.0.tar.gz
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release.
This version of monocr might be problematic.
- monocr-0.1.0/PKG-INFO +67 -0
- monocr-0.1.0/README.md +38 -0
- monocr-0.1.0/pyproject.toml +50 -0
- monocr-0.1.0/src/monocr/__init__.py +90 -0
- monocr-0.1.0/src/monocr/cli.py +105 -0
- monocr-0.1.0/src/monocr/crnn_model.py +82 -0
- monocr-0.1.0/src/monocr/inference.py +117 -0
- monocr-0.1.0/src/monocr/models/monocr_v1_best.pt +0 -0
- monocr-0.1.0/src/monocr/ocr.py +268 -0
monocr-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,67 @@
+Metadata-Version: 2.3
+Name: monocr
+Version: 0.1.0
+Summary: Optical Character Recognition for Mon text
+Keywords: mon,ocr,text-recognition
+Author: janakhpon
+Author-email: janakhpon <jnovaxer@gmail.com>
+License: MIT
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Image Recognition
+Classifier: Topic :: Text Processing :: Linguistic
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Dist: torch>=2.0.0
+Requires-Dist: torchvision>=0.15.0
+Requires-Dist: pillow>=9.0.0
+Requires-Dist: numpy>=1.21.0
+Requires-Dist: click>=8.0.0
+Requires-Python: >=3.11
+Project-URL: Repository, https://github.com/janakhpon/monocr
+Description-Content-Type: text/markdown
+
+# Mon OCR
+
+Optical Character Recognition for Mon (mnw) text.
+
+## Installation
+
+```bash
+pip install monocr  # or: uv add monocr
+```
+
+## Quick Start
+
+```python
+from monocr import read_text, read_folder
+
+# Read text from a single image
+text = read_text("image.png")
+print(text)
+
+# Read all images in a folder
+results = read_folder("images/")
+for filename, text in results.items():
+    print(f"{filename}: {text}")
+```
+
+## Command Line
+
+```bash
+# Read single image
+monocr read image.png
+
+# Process folder
+monocr batch images/ --output results.json
+```
+
+## License
+
+MIT License
monocr-0.1.0/README.md
ADDED
@@ -0,0 +1,38 @@
+# Mon OCR
+
+Optical Character Recognition for Mon (mnw) text.
+
+## Installation
+
+```bash
+pip install monocr  # or: uv add monocr
+```
+
+## Quick Start
+
+```python
+from monocr import read_text, read_folder
+
+# Read text from a single image
+text = read_text("image.png")
+print(text)
+
+# Read all images in a folder
+results = read_folder("images/")
+for filename, text in results.items():
+    print(f"{filename}: {text}")
+```
+
+## Command Line
+
+```bash
+# Read single image
+monocr read image.png
+
+# Process folder
+monocr batch images/ --output results.json
+```
+
+## License
+
+MIT License
monocr-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,50 @@
+[project]
+name = "monocr"
+version = "0.1.0"
+description = "Optical Character Recognition for Mon text"
+readme = "README.md"
+requires-python = ">=3.11"
+authors = [
+    { name = "janakhpon", email = "jnovaxer@gmail.com" }
+]
+license = { text = "MIT" }
+keywords = ["mon", "ocr", "text-recognition"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Image Recognition",
+    "Topic :: Text Processing :: Linguistic",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "torch>=2.0.0",
+    "torchvision>=0.15.0",
+    "pillow>=9.0.0",
+    "numpy>=1.21.0",
+    "click>=8.0.0",
+]
+
+
+[project.urls]
+Repository = "https://github.com/janakhpon/monocr"
+
+[project.scripts]
+monocr = "monocr.cli:main"
+
+[build-system]
+requires = ["uv_build>=0.8.13,<0.9.0"]
+build-backend = "uv_build"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+monocr = ["models/*"]
+
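Note: the build backend above is `uv_build`, while the trailing `[tool.setuptools.packages.find]` and `[tool.setuptools.package-data]` tables are setuptools-specific and look like leftovers from an earlier build setup, so it is worth verifying that the bundled weights actually land in the wheel. A minimal post-install check, sketched here and not part of the release, assuming `monocr` is importable:

```python
# Sanity check that the bundled CRNN weights were packaged into the wheel.
# A sketch, not part of the release; assumes `monocr` is installed.
from pathlib import Path

import monocr

weights = Path(monocr.__file__).parent / "models" / "monocr_v1_best.pt"
print(f"{weights} exists: {weights.exists()}")
```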
monocr-0.1.0/src/monocr/__init__.py
ADDED
@@ -0,0 +1,90 @@
+"""
+Mon OCR - Optical Character Recognition for Mon text
+A production-ready OCR package for Mon script text recognition
+"""
+
+import os
+from pathlib import Path
+from .ocr import MonOCR
+from .inference import MonOCRInference
+
+__version__ = "0.1.0"
+__author__ = "janakhpon"
+__email__ = "jnovaxer@gmail.com"
+
+__all__ = ["MonOCR", "MonOCRInference", "read_text", "read_image", "read_folder"]
+
+
+def get_default_model_path():
+    """Get the path to the bundled default model"""
+    package_dir = Path(__file__).parent
+    model_path = package_dir / "models" / "monocr_v1_best.pt"
+    return str(model_path)
+
+
+# Global OCR instance for simple API
+_ocr_instance = None
+
+def _get_ocr():
+    """Get or create the global OCR instance"""
+    global _ocr_instance
+    if _ocr_instance is None:
+        _ocr_instance = MonOCR()
+    return _ocr_instance
+
+
+def read_text(image_path):
+    """
+    Read text from a single image - Simple API
+
+    Args:
+        image_path: Path to image file
+
+    Returns:
+        Extracted text string
+    """
+    return _get_ocr().read_text(image_path)
+
+
+def read_image(image_path):
+    """
+    Alias for read_text - Read text from a single image
+
+    Args:
+        image_path: Path to image file
+
+    Returns:
+        Extracted text string
+    """
+    return read_text(image_path)
+
+
+def read_folder(folder_path, extensions=None):
+    """
+    Read text from all images in a folder - Simple API
+
+    Args:
+        folder_path: Path to folder containing images
+        extensions: List of file extensions to process (default: ['.png', '.jpg', '.jpeg'])
+
+    Returns:
+        Dictionary mapping filename to extracted text
+    """
+    return _get_ocr().read_from_folder(folder_path, extensions)
+
+
+def load_ocr(model_path=None, model_type="crnn"):
+    """
+    Load OCR model with default settings (Advanced API)
+
+    Args:
+        model_path: Path to trained model file (if None, uses bundled model)
+        model_type: Type of model ("crnn" or "trocr")
+
+    Returns:
+        MonOCR instance
+    """
+    if model_path is None:
+        model_path = get_default_model_path()
+
+    return MonOCR(model_path, model_type)
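Note: the module above wires the simple API (`read_text`, `read_image`, `read_folder`) to a lazily created global `MonOCR`, while `load_ocr` builds an explicit instance; `load_ocr` is defined but not listed in `__all__`. A minimal usage sketch of both levels, assuming the package and bundled model are installed and a hypothetical `sample.png` exists:

```python
# Sketch of both API levels exposed by __init__.py; "sample.png" is a
# hypothetical input image, not shipped with the package.
from monocr import read_text, load_ocr

print(read_text("sample.png"))      # simple API: shared global MonOCR instance

ocr = load_ocr(model_type="crnn")   # advanced API: explicit instance
print(ocr.read_text("sample.png"))
```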
monocr-0.1.0/src/monocr/cli.py
ADDED
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+Command Line Interface for Mon OCR
+"""
+
+import click
+import json
+from pathlib import Path
+from typing import List
+
+from .ocr import MonOCR
+from .inference import MonOCRInference
+from . import get_default_model_path
+
+@click.group()
+@click.version_option()
+def main():
+    """Mon OCR - Optical Character Recognition for Mon text"""
+    pass
+
+@main.command()
+@click.argument('image_path', type=click.Path(exists=True))
+@click.option('--model', '-m', help='Path to trained model file (default: uses bundled model)')
+@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='Type of model to use')
+@click.option('--output', '-o', help='Output file to save results')
+def read(image_path: str, model: str, model_type: str, output: str):
+    """Read text from a single image"""
+    try:
+        if model is None:
+            model = get_default_model_path()
+        ocr = MonOCR(model, model_type)
+
+        print("Processing image...")
+        text = ocr.read_text(image_path)
+
+        print(f"\nExtracted text:")
+        print(text)
+
+        if output:
+            result = {
+                'image_path': image_path,
+                'extracted_text': text,
+                'model_type': model_type
+            }
+            with open(output, 'w', encoding='utf-8') as f:
+                json.dump(result, f, ensure_ascii=False, indent=2)
+            print(f"\nResults saved to: {output}")
+
+    except Exception as e:
+        print(f"Error: {e}")
+        raise click.Abort()
+
+@main.command()
+@click.argument('folder_path', type=click.Path(exists=True, file_okay=False))
+@click.option('--model', '-m', help='Path to trained model file (default: uses bundled model)')
+@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='Type of model to use')
+@click.option('--output', '-o', help='Output file to save results')
+@click.option('--extensions', default='png,jpg,jpeg', help='File extensions to process (comma-separated)')
+def batch(folder_path: str, model: str, model_type: str, output: str, extensions: str):
+    """Read text from all images in a folder"""
+    try:
+        if model is None:
+            model = get_default_model_path()
+        ocr = MonOCR(model, model_type)
+        ext_list = [f'.{ext.strip()}' for ext in extensions.split(',')]
+
+        print("Processing folder...")
+        results = ocr.read_from_folder(folder_path, ext_list)
+
+        print("\nOCR Results:")
+        print("-" * 40)
+        for filename, text in results.items():
+            print(f"{filename}: {text}")
+
+        if output:
+            with open(output, 'w', encoding='utf-8') as f:
+                json.dump(results, f, ensure_ascii=False, indent=2)
+            print(f"\nResults saved to: {output}")
+
+    except Exception as e:
+        print(f"Error: {e}")
+        raise click.Abort()
+
+@main.command()
+@click.argument('image_path', type=click.Path(exists=True))
+@click.option('--model', '-m', help='Path to trained model file (default: uses bundled model)')
+@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='Type of model to use')
+def confidence(image_path: str, model: str, model_type: str):
+    """Read text with confidence score"""
+    try:
+        ocr = MonOCRInference(model, model_type)
+
+        print("Processing image...")
+        result = ocr.predict_with_confidence(image_path)
+
+        print(f"\nExtracted text:")
+        print(result['text'])
+        print(f"\nConfidence: {result['confidence']:.2%}")
+
+    except Exception as e:
+        print(f"Error: {e}")
+        raise click.Abort()
+
+if __name__ == '__main__':
+    main()
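Note: besides `read` and `batch` from the README, the CLI above registers a third subcommand, `confidence`. One way to exercise the command group without installing the console script is click's built-in test runner; a sketch assuming a local `sample.png` exists (the `click.Path(exists=True)` check fails otherwise):

```python
# Smoke test for the click group above via click's test runner; "sample.png"
# is a hypothetical file that must exist for the Path(exists=True) check.
from click.testing import CliRunner

from monocr.cli import main

runner = CliRunner()
result = runner.invoke(main, ["read", "sample.png"])
print(result.exit_code, result.output)
```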
monocr-0.1.0/src/monocr/crnn_model.py
ADDED
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+CRNN Model Architecture for Mon OCR
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import glob
+import os
+from typing import List
+
+class CRNN(nn.Module):
+    """CRNN model for Mon OCR - matches the trained model architecture"""
+
+    def __init__(self, num_classes):
+        super(CRNN, self).__init__()
+        # Enhanced CNN architecture for better capacity
+        self.cnn = nn.Sequential(
+            nn.Conv2d(1, 64, 3, 1, 1),
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2),  # 64->32
+            nn.Conv2d(64, 128, 3, 1, 1),
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2),  # 32->16
+            nn.Conv2d(128, 256, 3, 1, 1),
+            nn.BatchNorm2d(256),
+            nn.ReLU(),
+            nn.Conv2d(256, 256, 3, 1, 1),
+            nn.ReLU(),
+            nn.MaxPool2d((2, 1), (2, 1)),  # 16->8
+            nn.Conv2d(256, 512, 3, 1, 1),
+            nn.BatchNorm2d(512),
+            nn.ReLU(),
+            nn.Conv2d(512, 512, 3, 1, 1),
+            nn.ReLU(),
+            nn.MaxPool2d((2, 1), (2, 1)),  # 8->4
+            nn.Conv2d(512, 512, 3, 1, 1),
+            nn.BatchNorm2d(512),
+            nn.ReLU(),
+            nn.Conv2d(512, 512, (4, 1), 1, 0),  # 4->1
+            nn.ReLU(),
+        )
+        # Two LSTM layers for better sequence modeling
+        self.lstm1 = nn.LSTM(512, 256, bidirectional=True, batch_first=True)
+        self.lstm2 = nn.LSTM(512, 256, bidirectional=True, batch_first=True)
+        self.dropout = nn.Dropout(0.1)  # add dropout to prevent overfitting
+        self.fc = nn.Linear(512, num_classes)
+
+    def forward(self, x):
+        conv = self.cnn(x)
+        b, c, h, w = conv.size()
+        assert h == 1, "CNN height must be 1"
+        conv = conv.squeeze(2).permute(0, 2, 1)  # [B, W, C]
+
+        # Two LSTM layers for better sequence modeling
+        recurrent, _ = self.lstm1(conv)
+        recurrent, _ = self.lstm2(recurrent)
+
+        # Apply dropout before final classification
+        recurrent = self.dropout(recurrent)
+        out = self.fc(recurrent)
+        return out  # [B, W, num_classes]
+
+def build_charset(corpus_dir: str) -> str:
+    """Build charset from corpus files"""
+    charset = set()
+    txt_files = glob.glob(os.path.join(corpus_dir, "**/*.txt"), recursive=True)
+
+    for fpath in txt_files:
+        if os.path.getsize(fpath) == 0:
+            continue
+        try:
+            with open(fpath, encoding="utf-8") as f:
+                for line in f:
+                    charset.update(line.strip())
+        except Exception:
+            continue
+
+    charset_str = "".join(sorted(list(charset)))
+    return charset_str
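Note: the pooling schedule in the CNN stack (64→32→16→8→4, closed by the (4, 1) convolution) collapses a 64-pixel-high line image to height 1, while only the two (2, 2) pools halve the width, so a 64×256 input yields 64 timesteps. A dummy forward pass verifies this geometry; `num_classes=100` below is an arbitrary placeholder:

```python
# Shape check for the CRNN above; num_classes=100 is an arbitrary placeholder,
# and 64x256 matches the resize used later in ocr.py.
import torch

from monocr.crnn_model import CRNN

model = CRNN(num_classes=100).eval()
x = torch.randn(1, 1, 64, 256)   # [B, C, H, W] grayscale line image
with torch.no_grad():
    out = model(x)
print(out.shape)                 # torch.Size([1, 64, 100]) -> [B, W, num_classes]
```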
monocr-0.1.0/src/monocr/inference.py
ADDED
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Advanced inference utilities for Mon OCR
+"""
+
+import os
+import torch
+import numpy as np
+from PIL import Image
+from pathlib import Path
+import json
+import logging
+from typing import List, Dict, Optional, Union
+
+from .ocr import MonOCR
+
+class MonOCRInference:
+    """Advanced Mon OCR inference with additional utilities"""
+
+    def __init__(self, model_path: Optional[str] = None, model_type: str = "crnn"):
+        """
+        Initialize advanced Mon OCR inference
+
+        Args:
+            model_path: Path to trained model file
+            model_type: Type of model ("crnn" or "trocr")
+        """
+        self.ocr = MonOCR(model_path, model_type)
+        self.logger = logging.getLogger(__name__)
+
+    def predict_with_confidence(self, image: Union[str, Image.Image]) -> Dict[str, Union[str, float]]:
+        """
+        Predict text with confidence score
+
+        Args:
+            image: Path to image file or PIL Image object
+
+        Returns:
+            Dictionary with 'text' and 'confidence' keys
+        """
+        try:
+            text = self.ocr.predict(image)
+            # For now, return a placeholder confidence score
+            # In a full implementation, you'd calculate actual confidence
+            confidence = 0.95  # Placeholder
+
+            return {
+                'text': text,
+                'confidence': confidence
+            }
+        except Exception as e:
+            self.logger.error(f"Error in prediction: {e}")
+            return {
+                'text': "",
+                'confidence': 0.0
+            }
+
+    def batch_predict_with_confidence(self, images: List[Union[str, Image.Image]]) -> List[Dict[str, Union[str, float]]]:
+        """
+        Predict text with confidence for multiple images
+
+        Args:
+            images: List of image paths or PIL Image objects
+
+        Returns:
+            List of dictionaries with 'text' and 'confidence' keys
+        """
+        results = []
+        for image in images:
+            result = self.predict_with_confidence(image)
+            results.append(result)
+
+        return results
+
+    def process_document(self, image_path: str, output_path: Optional[str] = None) -> Dict[str, str]:
+        """
+        Process a document image and save results
+
+        Args:
+            image_path: Path to document image
+            output_path: Path to save results (optional)
+
+        Returns:
+            Dictionary with processing results
+        """
+        try:
+            # Load and process image
+            image = Image.open(image_path)
+            text = self.ocr.predict(image)
+
+            results = {
+                'image_path': image_path,
+                'extracted_text': text,
+                'status': 'success'
+            }
+
+            # Save results if output path provided
+            if output_path:
+                with open(output_path, 'w', encoding='utf-8') as f:
+                    json.dump(results, f, ensure_ascii=False, indent=2)
+
+            return results
+
+        except Exception as e:
+            error_result = {
+                'image_path': image_path,
+                'extracted_text': "",
+                'status': 'error',
+                'error': str(e)
+            }
+
+            if output_path:
+                with open(output_path, 'w', encoding='utf-8') as f:
+                    json.dump(error_result, f, ensure_ascii=False, indent=2)
+
+            return error_result
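Note: as the comments in the hunk admit, the 0.95 confidence is a hard-coded placeholder, so the `confidence` CLI command always reports 95%. One plausible replacement, sketched under the assumption that the raw CRNN logits of shape [B, W, num_classes] are made accessible, is the mean of the per-timestep top softmax probabilities:

```python
# Hypothetical confidence estimate from CRNN logits; not part of the release.
import torch

def ctc_confidence(logits: torch.Tensor) -> float:
    """Mean top-class probability over timesteps of [B, W, num_classes] logits."""
    probs = logits.softmax(dim=2)    # per-timestep class distributions
    best = probs.max(dim=2).values   # [B, W] probability of the argmax class
    return best.mean().item()
```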
monocr-0.1.0/src/monocr/models/monocr_v1_best.pt
ADDED
Binary file
monocr-0.1.0/src/monocr/ocr.py
ADDED
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Main Mon OCR class - Production-ready OCR for Mon text
+Supports both CRNN and TrOCR models
+"""
+
+import os
+import torch
+import torch.nn as nn
+import numpy as np
+from PIL import Image
+from pathlib import Path
+import json
+import logging
+from typing import List, Dict, Optional, Union
+from torchvision import transforms
+
+# TrOCR imports (optional)
+try:
+    from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+    TROCR_AVAILABLE = True
+except ImportError:
+    TROCR_AVAILABLE = False
+
+class MonOCR:
+    """Production-ready Mon OCR class supporting both CRNN and TrOCR models"""
+
+    def __init__(self, model_path: Optional[str] = None, model_type: str = "crnn"):
+        """
+        Initialize Mon OCR
+
+        Args:
+            model_path: Path to trained model file (if None, uses bundled model)
+            model_type: Type of model ("crnn" or "trocr")
+        """
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model_type = model_type.lower()
+        self.model = None
+        self.processor = None
+        self.charset = None
+
+        # Load model - use bundled model if no path provided
+        if model_path is None:
+            from . import get_default_model_path
+            model_path = get_default_model_path()
+
+        self.load_model(model_path)
+
+    def load_model(self, model_path: str):
+        """Load trained model from file"""
+        if not os.path.exists(model_path):
+            raise FileNotFoundError(f"Model file not found: {model_path}")
+
+        if self.model_type == "crnn":
+            self._load_crnn_model(model_path)
+        elif self.model_type == "trocr":
+            self._load_trocr_model(model_path)
+        else:
+            raise ValueError(f"Unsupported model type: {self.model_type}")
+
+    def _load_crnn_model(self, model_path: str):
+        """Load CRNN model"""
+        # Import CRNN model class (this would need to be included in the package)
+        from .crnn_model import CRNN, build_charset
+
+        # Load model state
+        checkpoint = torch.load(model_path, map_location=self.device)
+
+        # Extract charset from checkpoint or build from corpus
+        if 'charset' in checkpoint:
+            self.charset = checkpoint['charset']
+        else:
+            # Fallback: build charset from default corpus
+            self.charset = build_charset("data/raw/corpus")
+
+        # Initialize model (add 1 for blank token)
+        self.model = CRNN(num_classes=len(self.charset) + 1)
+
+        # Load weights
+        if 'model_state_dict' in checkpoint:
+            self.model.load_state_dict(checkpoint['model_state_dict'])
+        else:
+            self.model.load_state_dict(checkpoint)
+
+        self.model.to(self.device)
+        self.model.eval()
+
+    def _load_trocr_model(self, model_path: str):
+        """Load TrOCR model"""
+        if not TROCR_AVAILABLE:
+            raise ImportError("TrOCR dependencies not available. Install with: pip install transformers")
+
+        self.model = VisionEncoderDecoderModel.from_pretrained(model_path)
+        self.processor = TrOCRProcessor.from_pretrained(model_path)
+        self.model.to(self.device)
+        self.model.eval()
+
+    def predict(self, image: Union[str, Image.Image]) -> str:
+        """
+        Predict text from image
+
+        Args:
+            image: Path to image file or PIL Image object
+
+        Returns:
+            Predicted text string
+        """
+        if isinstance(image, str):
+            image = Image.open(image).convert("L")
+        elif not isinstance(image, Image.Image):
+            raise ValueError("Image must be a file path or PIL Image")
+
+        if self.model_type == "crnn":
+            return self._predict_crnn(image)
+        elif self.model_type == "trocr":
+            return self._predict_trocr(image)
+
+    def _predict_crnn(self, image: Image.Image) -> str:
+        """Predict using CRNN model"""
+        if self.model is None:
+            raise ValueError("Model not loaded. Call load_model() first.")
+
+        # Preprocess image - match simple_inference.py exactly
+        if isinstance(image, str):
+            image = Image.open(image).convert('L')
+        elif isinstance(image, Image.Image):
+            image = image.convert('L')
+
+        # Resize image - target_size is (height, width) for the model
+        # PIL resize expects (width, height), so we need to swap
+        image = image.resize((256, 64), Image.Resampling.LANCZOS)
+
+        # Convert to tensor and normalize
+        image_array = np.array(image, dtype=np.float32) / 255.0
+        image_tensor = torch.from_numpy(image_array).unsqueeze(0).unsqueeze(0)  # [1, 1, H, W]
+
+        # Apply the same transform as training
+        transform = transforms.Compose([
+            transforms.Normalize(mean=[0.5], std=[0.5])
+        ])
+        image_tensor = transform(image_tensor)
+        image_tensor = image_tensor.to(self.device)
+
+        # Predict
+        with torch.no_grad():
+            outputs = self.model(image_tensor)
+            predicted_text = self._decode_crnn_output(outputs)
+
+        return predicted_text
+
+    def _predict_trocr(self, image: Image.Image) -> str:
+        """Predict using TrOCR model"""
+        if self.model is None or self.processor is None:
+            raise ValueError("Model not loaded. Call load_model() first.")
+
+        # Preprocess image
+        pixel_values = self.processor(image, return_tensors="pt").pixel_values.to(self.device)
+
+        # Predict
+        with torch.no_grad():
+            generated_ids = self.model.generate(pixel_values)
+            predicted_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        return predicted_text
+
+    def _decode_crnn_output(self, output: torch.Tensor) -> str:
+        """Decode CRNN output to text - match simple_inference.py exactly"""
+        if self.charset is None:
+            raise ValueError("Charset not loaded")
+
+        # Get predictions - same as working version
+        preds = output.softmax(2).argmax(2).squeeze(0)  # [seq_len]
+
+        # CTC decoding - exact same logic as working simple_inference.py
+        decoded = []
+        prev_char = None
+
+        for idx in preds:
+            idx = idx.item()
+            if idx == 0:  # blank token
+                prev_char = None
+            elif idx != prev_char:  # new character
+                if idx <= len(self.charset):  # idx is 1-based, charset is 0-based
+                    decoded.append(self.charset[idx - 1])
+                prev_char = idx
+
+        return ''.join(decoded)
+
+    def batch_predict(self, images: List[Union[str, Image.Image]]) -> List[str]:
+        """
+        Predict text from multiple images
+
+        Args:
+            images: List of image paths or PIL Image objects
+
+        Returns:
+            List of predicted text strings
+        """
+        results = []
+        for image in images:
+            try:
+                result = self.predict(image)
+                results.append(result)
+            except Exception as e:
+                logging.warning(f"Error processing image: {e}")
+                results.append("")
+
+        return results
+
+    def read_text(self, image: Union[str, Image.Image]) -> str:
+        """
+        Read text from image (alias for predict method)
+
+        Args:
+            image: Path to image file or PIL Image object
+
+        Returns:
+            Extracted text string
+        """
+        return self.predict(image)
+
+    def read_multiple(self, images: List[Union[str, Image.Image]]) -> List[str]:
+        """
+        Read text from multiple images (alias for batch_predict method)
+
+        Args:
+            images: List of image paths or PIL Image objects
+
+        Returns:
+            List of extracted text strings
+        """
+        return self.batch_predict(images)
+
+    def read_from_folder(self, folder_path: str, extensions: List[str] = None) -> dict:
+        """
+        Read text from all images in a folder
+
+        Args:
+            folder_path: Path to folder containing images
+            extensions: List of file extensions to process (default: ['.png', '.jpg', '.jpeg'])
+
+        Returns:
+            Dictionary mapping filename to extracted text
+        """
+        if extensions is None:
+            extensions = ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']
+
+        folder = Path(folder_path)
+        if not folder.exists():
+            raise FileNotFoundError(f"Folder not found: {folder_path}")
+
+        results = {}
+        image_files = []
+
+        for ext in extensions:
+            image_files.extend(folder.glob(f"*{ext}"))
+            image_files.extend(folder.glob(f"*{ext.upper()}"))
+
+        for image_file in image_files:
+            try:
+                text = self.read_text(str(image_file))
+                results[image_file.name] = text
+            except Exception as e:
+                print(f"Error processing {image_file.name}: {e}")
+                results[image_file.name] = ""
+
+        return results
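Note: the CTC decode in `_decode_crnn_output` treats index 0 as the blank and maps index i to `charset[i - 1]`. A tiny worked example with a made-up charset shows how repeated indices collapse and how a blank resets the collapse, which is what permits genuine double letters:

```python
# Worked CTC-collapse example mirroring _decode_crnn_output; the charset
# and index sequence are made up for illustration.
charset = "abc"
preds = [1, 1, 0, 1, 2, 2, 0, 3]   # 0 = CTC blank

decoded, prev_char = [], None
for idx in preds:
    if idx == 0:            # blank resets, so the next 'a' counts as new
        prev_char = None
    elif idx != prev_char:  # repeated non-blank indices collapse to one char
        if idx <= len(charset):
            decoded.append(charset[idx - 1])
        prev_char = idx
print("".join(decoded))     # -> "aabc"
```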