monocr 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monocr/__init__.py +35 -0
- monocr/assets/valid_chars.txt +1 -0
- monocr/cli.py +54 -0
- monocr/config.py +18 -0
- monocr/exceptions.py +15 -0
- monocr/model.py +69 -0
- monocr/models/monocr.ckpt +3 -0
- monocr/ocr.py +264 -0
- monocr-0.1.10.dist-info/METADATA +89 -0
- monocr-0.1.10.dist-info/RECORD +14 -0
- monocr-0.1.10.dist-info/WHEEL +5 -0
- monocr-0.1.10.dist-info/entry_points.txt +2 -0
- monocr-0.1.10.dist-info/licenses/LICENSE +21 -0
- monocr-0.1.10.dist-info/top_level.txt +1 -0
monocr/__init__.py
ADDED
@@ -0,0 +1,35 @@
+"""
+mon ocr - optical character recognition for mon text
+"""
+
+import logging
+from pathlib import Path
+from .ocr import MonOCR
+from .config import DEFAULT_MODEL_PATH
+from .exceptions import MonOCRError, ModelNotFoundError, ImageLoadError
+
+__version__ = "0.1.5"
+
+# Set up null handler to prevent "No handler found" warnings
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
+def get_default_model_path():
+    """get bundled v3 model path"""
+    return str(DEFAULT_MODEL_PATH)
+
+# Global instance for easy access
+_ocr = None
+
+def _get_ocr():
+    global _ocr
+    if _ocr is None:
+        _ocr = MonOCR(get_default_model_path())
+    return _ocr
+
+def read_text(image):
+    """Recognize text from an image (supports single/multi-line)"""
+    return _get_ocr().predict(image)
+
+def read_folder(folder_path):
+    """Recognize text from all images in a folder"""
+    return _get_ocr().read_from_folder(folder_path)
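The module-level helpers above wrap a lazily constructed singleton: the first call builds a `MonOCR` against the bundled checkpoint, and every later call reuses it. A minimal usage sketch (the file and folder names are hypothetical; assumes the wheel and its bundled model are installed):

```python
import monocr

# First call constructs the shared MonOCR instance and loads the
# bundled checkpoint once; subsequent calls reuse the same object.
text = monocr.read_text("sample.png")
pages = monocr.read_folder("scans/")

print(monocr.get_default_model_path())  # path to the bundled monocr.ckpt
```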
monocr/assets/valid_chars.txt
ADDED
@@ -0,0 +1 @@
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~£¥¦§©«¬°±²³´µ·¸¹º»¾ÀÁÂÄÅÆÇÉÊÌÍÑÓÖרÜÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüþĀāăćčĐđĒēėěğġĦħīİıņŋŌōőŒœŚśşŠšţũŪūŻŽžƒơƭưƿǎǐǔșɐɑɒɓɔɕɖɗəɛɜɡɣɦɨɪɬɰɲɳɴɹɾʁʃʄʈʊʋʑʒʔʰʲʷʻʾʿˀˈˌː˥˦̟̥̩̪̰̀́̂̃̄̋̏̚͡ΆΒΔΕΜΝΠΣΤΧάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώАБДЗИМНОРСТабвгдежзийклмнопрстухцчшщыьэюяүӨөְֱִֵֶַָֹּׁׂאבדהוחיכלםמןסעפקרשתءأإئابةتثجحخدرسفقكلمنهويَِْٹپکگیंअआईकखगङचजटठडढणतथदधनपफबभमयरलळवशषसह़ािीुूृेैोौ्२ংআউকখগঙচজঝঠঢণতদধনপবভমযরলশষহ়ািীুূেৈো্ৰਖਚਤਨਮਰਲਸਿੀੂੇੰੱੴકચતરાૈ્ଆକଖଢଣତଦଧପବଭମରଶଷାିୁୂୈ୍அஇகஙசஞடணதநனபமயரலளழவாிுூெேை்ంకఖగచణతదధపబమరవశాిీుైౌ్ಂಅಆಕಖಗಚಣತದಧನಪಬಮಯರಲಳವಶಾಿುೆೇೈೌ್ംകചടണതപമയരറലളവഷസിൂെൈ്අඕකතදධනබමයරවසහ්ාිුෙෝกขคฆงจฉชซญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืฺุูเแโใไๆ็่้๊์ํ๑๒๕๖๘ຂງຊດທບພຣວສຫະາິູຼ་།༥ཀགཆདནཔབམཛཟའརཤསིེོུྟྱྲླကခဂဃငစဆဇဈဉညဋဌဍဎဏတထဒဓနပဖဗဘမယရလဝသဟဠအဢဣဤဥဦဧဨဩဪါာိီုူေဲဳဴဵံ့း္်ျြွှဿ၀၁၂၃၄၅၆၇၈၉၊။၌၍၎၏ၐၑၓၚၛၜၝၞၟၠၡၢၣၤၥၨၪၰၱၲၳၴၵၷၸၹၺၻၼၾၿႀႄႅႆႇႈႉႊႏ႐႒႓႔႕႘႙ႜႝ႟ὴ–‘’‚“”•⇒−
monocr/cli.py
ADDED
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+import click
+import logging
+import sys
+from .ocr import MonOCR
+from .config import DEFAULT_MODEL_PATH
+from .exceptions import MonOCRError
+
+@click.group()
+@click.version_option()
+@click.option('--verbose', '-v', is_flag=True, help='Enable verbose logging')
+def main(verbose):
+    """mon ocr - simple and effective text recognition for Mon language"""
+    log_level = logging.DEBUG if verbose else logging.INFO
+    logging.basicConfig(
+        level=log_level,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+
+@main.command()
+@click.argument('image_path', type=click.Path(exists=True))
+@click.option('--confidence', '-c', is_flag=True, help='Show confidence score')
+def read(image_path, confidence):
+    """Read text from an image."""
+    try:
+        ocr = MonOCR(str(DEFAULT_MODEL_PATH))
+        if confidence:
+            res = ocr.predict_with_confidence(image_path)
+            click.echo(f"Text:\n{res['text']}")
+            click.echo(f"Confidence: {res['confidence']:.2%}")
+        else:
+            click.echo(ocr.predict(image_path))
+    except MonOCRError as e:
+        logging.error(f"OCR Error: {e}")
+        sys.exit(1)
+    except Exception as e:
+        logging.critical(f"Unexpected error: {e}")
+        sys.exit(1)
+
+@main.command()
+@click.argument('folder_path', type=click.Path(exists=True, file_okay=False))
+def batch(folder_path):
+    """Process a folder of images."""
+    try:
+        ocr = MonOCR(str(DEFAULT_MODEL_PATH))
+        results = ocr.read_from_folder(folder_path)
+        for path, text in sorted(results.items()):
+            click.echo(f"{path:30}: {text}")
+    except MonOCRError as e:
+        logging.error(f"Batch processing error: {e}")
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
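Given the Click wiring above, `--verbose` belongs to the group, `--confidence` to the `read` subcommand, and `batch` takes only a folder (note that `batch` accepts no `--output` option, despite the README example further down). Hypothetical invocations matching these signatures:

```bash
monocr --verbose read page.png --confidence   # prints text plus a confidence score
monocr batch scans/                           # one "filename: text" line per image
```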
monocr/config.py
ADDED
@@ -0,0 +1,18 @@
+from pathlib import Path
+
+# Image processing constants
+TARGET_WIDTH = 1024
+TARGET_HEIGHT = 64
+IMAGE_NORM_MEAN = 127.5
+IMAGE_NORM_STD = 127.5
+
+# Segmentation constants
+PROJECTION_THRESHOLD = 2
+MIN_LINE_GAP = 5
+BINARY_THRESHOLD = 200
+
+# Paths
+PACKAGE_ROOT = Path(__file__).parent
+ASSETS_DIR = PACKAGE_ROOT / "assets"
+DEFAULT_MODEL_PATH = PACKAGE_ROOT / "models" / "monocr.ckpt"
+CHARSET_PATH = ASSETS_DIR / "valid_chars.txt"
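`IMAGE_NORM_MEAN` and `IMAGE_NORM_STD` of 127.5 map 8-bit pixel values into [-1, 1], the same `x / 127.5 - 1.0` normalization that `ocr.py` applies before inference. A quick check:

```python
import numpy as np

pixels = np.array([0.0, 127.5, 255.0], dtype=np.float32)
normed = (pixels - 127.5) / 127.5  # IMAGE_NORM_MEAN, IMAGE_NORM_STD
print(normed)                      # [-1.  0.  1.]
```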
monocr/exceptions.py
ADDED
@@ -0,0 +1,15 @@
+class MonOCRError(Exception):
+    """Base exception for MonOCR errors."""
+    pass
+
+class ModelNotFoundError(MonOCRError):
+    """Raised when the model file cannot be found."""
+    pass
+
+class CharsetNotFoundError(MonOCRError):
+    """Raised when the charset file cannot be found."""
+    pass
+
+class ImageLoadError(MonOCRError):
+    """Raised when an image cannot be loaded or processed."""
+    pass
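Because every error derives from `MonOCRError`, callers can catch the base class for blanket handling or the subclasses to tell a missing checkpoint apart from an unreadable image. A small sketch (paths are hypothetical):

```python
from monocr import MonOCR
from monocr.exceptions import MonOCRError, ModelNotFoundError, ImageLoadError

try:
    ocr = MonOCR("models/monocr.ckpt")  # hypothetical checkpoint path
    print(ocr.predict("page.png"))      # hypothetical image
except ModelNotFoundError as e:
    print(f"checkpoint problem: {e}")
except ImageLoadError as e:
    print(f"image problem: {e}")
except MonOCRError as e:
    print(f"other OCR failure: {e}")
```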
monocr/model.py
ADDED
@@ -0,0 +1,69 @@
+import torch
+import torch.nn as nn
+import torchvision.models as models
+from torchvision.models.resnet import ResNet18_Weights
+
+class ResNetFeatureExtractor(nn.Module):
+    """
+    Modified ResNet-18 for OCR feature extraction.
+    Handles grayscale input and preserves width for sequence modeling.
+    """
+    def __init__(self):
+        super(ResNetFeatureExtractor, self).__init__()
+        backbone = models.resnet18(weights=ResNet18_Weights.DEFAULT)
+
+        # Grayscale input
+        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=(2, 1), padding=3, bias=False)
+        with torch.no_grad():
+            self.conv1.weight[:] = backbone.conv1.weight.sum(dim=1, keepdim=True)
+
+        self.bn1 = backbone.bn1
+        self.relu = backbone.relu
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        self.layer1 = backbone.layer1
+        self.layer2 = backbone.layer2
+        self.layer3 = backbone.layer3
+        self.layer4 = backbone.layer4
+
+        # Preserve width in later layers
+        for layer in [self.layer2, self.layer3, self.layer4]:
+            layer[0].conv1.stride = (2, 1)
+            layer[0].downsample[0].stride = (2, 1)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        return x
+
+class MonOCRModel(nn.Module):
+    """
+    CRNN architecture: ResNet + Bi-LSTM + FC.
+    """
+    def __init__(self, num_classes, rnn_hidden_size=256, rnn_layers=2):
+        super(MonOCRModel, self).__init__()
+        self.feature_extractor = ResNetFeatureExtractor()
+        self.avg_pool = nn.AdaptiveAvgPool2d((1, None))
+
+        self.rnn = nn.LSTM(
+            input_size=512,
+            hidden_size=rnn_hidden_size,
+            num_layers=rnn_layers,
+            bidirectional=True,
+            batch_first=True,
+            dropout=0.1 if rnn_layers > 1 else 0
+        )
+        self.fc = nn.Linear(rnn_hidden_size * 2, num_classes)
+
+    def forward(self, x):
+        features = self.feature_extractor(x)
+        features = self.avg_pool(features).squeeze(2).permute(0, 2, 1)
+        self.rnn.flatten_parameters()
+        recurrent, _ = self.rnn(features)
+        return self.fc(recurrent)
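Since layers 2-4 stride only the height (and `conv1` halves height but not width), the sole width reduction is the stride-2 max-pool, so a 64x1024 line collapses to a height-1, width-512 feature map: 512 timesteps for the Bi-LSTM. A shape sanity check (loading `ResNet18_Weights.DEFAULT` downloads ImageNet weights on first use; `num_classes=100` is an arbitrary stand-in):

```python
import torch
from monocr.model import MonOCRModel

model = MonOCRModel(num_classes=100).eval()

# One grayscale line at the package's TARGET_HEIGHT x TARGET_WIDTH.
x = torch.zeros(1, 1, 64, 1024)
with torch.no_grad():
    out = model(x)

print(out.shape)  # torch.Size([1, 512, 100]) -> (batch, timesteps, classes)
```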
monocr/ocr.py
ADDED
@@ -0,0 +1,264 @@
+#!/usr/bin/env python3
+import os
+import torch
+import numpy as np
+import logging
+from PIL import Image, UnidentifiedImageError
+from typing import List, Optional, Union, Dict
+from pathlib import Path
+
+from .model import MonOCRModel
+from .config import (
+    TARGET_WIDTH, TARGET_HEIGHT,
+    IMAGE_NORM_MEAN, IMAGE_NORM_STD,
+    PROJECTION_THRESHOLD, MIN_LINE_GAP, BINARY_THRESHOLD,
+    CHARSET_PATH, DEFAULT_MODEL_PATH
+)
+from .exceptions import (
+    ModelNotFoundError, CharsetNotFoundError, ImageLoadError
+)
+
+logger = logging.getLogger(__name__)
+
+class MonOCR:
+    """
+    Mon OCR Inference Class.
+    Supports single-line and multi-line (paragraph) Mon text recognition.
+    """
+
+    def __init__(self, model_path: Optional[str] = None, model_type: str = "crnn", device: str = None):
+        """
+        Initialize Mon OCR.
+
+        Args:
+            model_path: Path to the .pt model file. If None, tries to load bundled default model.
+            model_type: Type of model (defaults to 'crnn').
+            device: Computing device ('cuda', 'cpu').
+        """
+        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
+        logger.info(f"Using device: {self.device}")
+
+        self.model_type = model_type.lower()
+        self.model = None
+        self.charset = None
+
+        if model_path is None:
+            # Fallback to default bundled model
+            if os.path.exists(DEFAULT_MODEL_PATH):
+                logger.info(f"No model path provided. Loading default model from {DEFAULT_MODEL_PATH}")
+                model_path = str(DEFAULT_MODEL_PATH)
+            else:
+                logger.warning(f"Default model not found at {DEFAULT_MODEL_PATH}. Initialized without model.")
+
+        if model_path:
+            self.load_model(model_path)
+
+    def load_model(self, model_path: str):
+        """Load a trained model from disk."""
+        if not os.path.exists(model_path):
+            raise ModelNotFoundError(f"Model file not found: {model_path}")
+
+        logger.info(f"Loading model from {model_path}")
+        try:
+            checkpoint = torch.load(model_path, map_location=self.device)
+        except Exception as e:
+            raise ModelNotFoundError(f"Failed to load checkpoint: {e}")
+
+        if isinstance(checkpoint, dict):
+            state_dict = checkpoint.get('state_dict', checkpoint.get('model_state_dict', checkpoint))
+            self.charset = checkpoint.get('charset') or checkpoint.get('hyper_parameters', {}).get('charset')
+        else:
+            state_dict = checkpoint
+            self.charset = None
+
+        if self.charset is None:
+            logger.warning("Charset not found in checkpoint. Attempting fallback.")
+            if os.path.exists(CHARSET_PATH):
+                try:
+                    with open(CHARSET_PATH, "r", encoding="utf-8") as f:
+                        self.charset = f.read().strip()
+                    logger.info(f"Loaded charset from {CHARSET_PATH}")
+                except Exception as e:
+                    logger.error(f"Failed to read charset file: {e}")
+
+        if self.charset is None:
+            raise CharsetNotFoundError("Model checkpoint is missing charset information and valid_chars.txt not found.")
+
+        # Versatile size handling
+        num_classes = len(self.charset) + 1
+        if 'fc.weight' in state_dict:
+            ckpt_classes = state_dict['fc.weight'].size(0)
+            if ckpt_classes != num_classes:
+                logger.warning(f"Model checkpoint has {ckpt_classes} classes, but charset has {len(self.charset)} (+1={num_classes}).")
+                if ckpt_classes < num_classes:
+                    logger.warning(f"Adjusting charset to match checkpoint size. {num_classes - ckpt_classes} characters will be ignored.")
+                    self.charset = self.charset[:ckpt_classes-1]
+                    num_classes = ckpt_classes
+                else:
+                    logger.warning("Checkpoint has MORE classes than charset. Unknown characters will be ignored during decoding.")
+                    num_classes = ckpt_classes
+
+        self.model = MonOCRModel(num_classes=num_classes)
+        self.model.load_state_dict(state_dict)
+        self.model.to(self.device).eval()
+        logger.debug("Model loaded and ready.")
+
+    def predict(self, image: Union[str, Image.Image, Path]) -> str:
+        """Extract text from an image. Handles single and multi-line images."""
+        if self.model is None:
+            raise RuntimeError("Model used before loading. Call load_model() first.")
+
+        try:
+            img = self._prepare_image(image)
+        except Exception as e:
+            logger.error(f"Prediction failed during image preparation: {e}")
+            raise ImageLoadError(str(e))
+
+        # Simple vertical check: if image is tall, try segmentation
+        if img.height > 100:
+            lines = self._segment_lines(img)
+        else:
+            lines = [img]
+
+        results = []
+        for line_img in lines:
+            text = self._predict_single_line(line_img)
+            if text.strip():
+                results.append(text)
+
+        return "\n".join(results)
+
+    def predict_with_confidence(self, image: Union[str, Image.Image, Path]) -> Dict[str, Union[str, float]]:
+        """Predict text and return alongside a confidence score."""
+        if self.model is None:
+            raise RuntimeError("Model used before loading.")
+
+        try:
+            img = self._prepare_image(image)
+        except Exception as e:
+            raise ImageLoadError(str(e))
+
+        if img.height > 100:
+            lines = self._segment_lines(img)
+        else:
+            lines = [img]
+
+        all_text = []
+        confs = []
+
+        for line_img in lines:
+            text, conf = self._predict_single_line(line_img, return_confidence=True)
+            if text.strip():
+                all_text.append(text)
+                confs.append(conf)
+
+        return {
+            'text': "\n".join(all_text),
+            'confidence': sum(confs)/len(confs) if confs else 0.0
+        }
+
+    # API Aliases and Batch Methods
+    def read_text(self, image: Union[str, Image.Image, Path]) -> str:
+        return self.predict(image)
+
+    def read_from_folder(self, folder_path: str, extensions: Optional[List[str]] = None) -> Dict[str, str]:
+        import glob
+        if extensions is None:
+            extensions = ['*.png', '*.jpg', '*.jpeg']
+
+        results = {}
+        for ext in extensions:
+            for img_path in glob.glob(os.path.join(folder_path, ext)):
+                try:
+                    results[os.path.basename(img_path)] = self.predict(img_path)
+                except Exception as e:
+                    logger.warning(f"Failed to process {img_path}: {e}")
+                    results[os.path.basename(img_path)] = ""
+        return results
+
+    def predict_batch(self, images: List[Union[str, Image.Image, Path]]) -> List[str]:
+        return [self.predict(img) for img in images]
+
+    def _prepare_image(self, image: Union[str, Image.Image, Path]) -> Image.Image:
+        """Standardize image to grayscale."""
+        if isinstance(image, (str, Path)):
+            try:
+                image = Image.open(str(image))
+            except (FileNotFoundError, UnidentifiedImageError) as e:
+                raise ImageLoadError(f"Could not open image file: {e}")
+        return image.convert("L")
+
+    def _segment_lines(self, image: Image.Image) -> List[Image.Image]:
+        """Split multi-line images using horizontal projection."""
+        img_arr = np.array(image)
+        binary = (img_arr < BINARY_THRESHOLD).astype(np.uint8)
+
+        projection = np.sum(binary, axis=1)
+        is_line = projection > PROJECTION_THRESHOLD
+
+        lines = []
+        start = None
+        for i, val in enumerate(is_line):
+            if val and start is None:
+                start = i
+            elif not val and start is not None:
+                if i - start > MIN_LINE_GAP:
+                    lines.append(image.crop((0, max(0, start-2), image.width, min(image.height, i+2))))
+                start = None
+
+        if start is not None:
+            lines.append(image.crop((0, start, image.width, image.height)))
+
+        return lines if lines else [image]
+
+    def _predict_single_line(self, image: Image.Image, return_confidence=False) -> Union[str, tuple]:
+        """Core CRNN inference for a single line."""
+        target_w, target_h = TARGET_WIDTH, TARGET_HEIGHT
+
+        # Aspect-ratio preserving resize
+        w, h = image.size
+        ratio = w / h
+        new_w = int(target_h * ratio)
+
+        # Resize
+        if new_w > target_w:
+            new_w = target_w
+        pil_img = image.resize((new_w, target_h), Image.Resampling.BILINEAR)
+
+        # Create canvas of target size (fixed width) and paste
+        # Training code uses 255 (white) background
+        new_img = Image.new("L", (target_w, target_h), 255)
+        new_img.paste(pil_img, (0, 0))
+        pil_img = new_img
+
+        # Normalize to [-1, 1] as per training logic (utils.resize_and_pad)
+        # canvas is 0..255
+        img_arr = np.array(pil_img).astype(np.float32)
+        img_norm = img_arr / 127.5 - 1.0
+
+        tensor = torch.from_numpy(img_norm).unsqueeze(0).unsqueeze(0).to(self.device)
+
+        with torch.no_grad():
+            preds = self.model(tensor)
+            probs = preds.softmax(2).squeeze(0)
+
+        best_path = probs.argmax(1)
+        text = self._decode(best_path)
+
+        if return_confidence:
+            conf = probs.max(1).values.mean().item()
+            return text, conf
+
+        return text
+
+    def _decode(self, indices: torch.Tensor) -> str:
+        """Greedy CTC decoding."""
+        text = []
+        prev_idx = 0
+        for idx in indices:
+            val = idx.item()
+            if val != 0 and val != prev_idx:
+                if 0 < val <= len(self.charset):
+                    text.append(self.charset[val-1])
+            prev_idx = val
+        return "".join(text)
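`_decode` is greedy (best-path) CTC decoding: index 0 is the blank, consecutive repeats collapse into one emission, and index `i` maps to `charset[i-1]`. A self-contained replica of that loop on a toy charset shows why adjacent 3s emit one character while a blank-separated 3 emits a second:

```python
def ctc_greedy_decode(indices, charset):
    # Mirrors MonOCR._decode: skip blanks (0), collapse repeats,
    # ignore indices beyond the charset.
    out, prev = [], 0
    for val in indices:
        if val != 0 and val != prev and 0 < val <= len(charset):
            out.append(charset[val - 1])
        prev = val
    return "".join(out)

print(ctc_greedy_decode([0, 3, 3, 0, 3, 1], "abc"))  # -> "cca"
```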
monocr-0.1.10.dist-info/METADATA
ADDED
@@ -0,0 +1,89 @@
+Metadata-Version: 2.4
+Name: monocr
+Version: 0.1.10
+Summary: Optical Character Recognition for Mon text
+Author-email: janakhpon <jnovaxer@gmail.com>
+License-Expression: MIT
+Project-URL: Repository, https://github.com/janakhpon/monocr
+Keywords: mon,ocr,text-recognition
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Image Recognition
+Classifier: Topic :: Text Processing :: Linguistic
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=2.0.0
+Requires-Dist: torchvision>=0.15.0
+Requires-Dist: pillow>=9.0.0
+Requires-Dist: numpy>=1.21.0
+Requires-Dist: click>=8.0.0
+Requires-Dist: pytest>=9.0.2
+Dynamic: license-file
+
+# Mon OCR
+
+Optical Character Recognition for Mon (mnw) text.
+
+## Installation
+
+```bash
+pip install monocr | uv add monocr
+```
+
+## Quick Start
+
+```python
+from monocr import read_text, read_folder
+
+# Read text from a single image
+text = read_text("image.png")
+print(text)
+
+# Read all images in a folder
+results = read_folder("images/")
+for filename, text in results.items():
+    print(f"{filename}: {text}")
+```
+
+## Command Line
+
+```bash
+# Read single image
+monocr read image.png
+
+# Process folder
+monocr batch images/ --output results.json
+```
+
+## Dev Setup
+
+```bash
+git clone git@github.com:janakhpon/monocr.git
+cd monocr
+uv sync --dev
+
+# Release workflow
+uv version --bump patch
+git add .
+git commit -m "bump version"
+git tag v0.1.11
+git push origin main --tags
+```
+
+## Related tools
+
+- [mon_tokenizer](https://github.com/Code-Yay-Mal/mon_tokenizer)
+- [hugging face mon_tokenizer model](https://huggingface.co/janakhpon/mon_tokenizer)
+- [Mon corpus collection in unicode](https://github.com/MonDevHub/MonCorpusCollection)
+
+## License
+
+MIT - do whatever you want with it.
monocr-0.1.10.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+monocr/__init__.py,sha256=mN5523-Iv70DQNhkYvp0ARrnPXRp9Dr9TUFOUPsAGqE,914
+monocr/cli.py,sha256=1cvPW40fnkCzYXa-dYMJU7kqnZX_W0gwtFeZe5rs-pQ,1782
+monocr/config.py,sha256=fgRIWnVpuzPw8mefkrNnUz9wLQ-M3O4sYJfedcWin_o,422
+monocr/exceptions.py,sha256=HI0dxDLSHbsL09tobBdWFgO2UtUySFaxN4P1-DkGg4o,400
+monocr/model.py,sha256=wQWqgwdxSoYARTym26O1kB-KjauDZDmcWnbjYODuM1w,2357
+monocr/ocr.py,sha256=0KKvQCW0sRNqv5CUvl10_kc8RP6RYXfBVvkSTjYtJck,10236
+monocr/assets/valid_chars.txt,sha256=LJT7cj412XtHOn-J_DuaR2gxmHgguePO1f4896jKwT0,2232
+monocr/models/monocr.ckpt,sha256=OcLWLdlkPUqIonInzzdiuB4a8eb4HFUu8k808_6F2U0,134
+monocr-0.1.10.dist-info/licenses/LICENSE,sha256=Ry1WWVVW5wg-eC_zub35afE1GTk7DgMQan35yii1Akk,1064
+monocr-0.1.10.dist-info/METADATA,sha256=GYU_wCmlnsfXOK6AN3fdf1VTnD0kmBr8fYSqDL8zd4g,2221
+monocr-0.1.10.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+monocr-0.1.10.dist-info/entry_points.txt,sha256=I1b4Nfr70Hk6ZY9qu93pRxPqQgoXy1RtNRjG22N9_90,43
+monocr-0.1.10.dist-info/top_level.txt,sha256=ucVTqR0gd6Wwv1ytwD5bh8mfcwnfRYe_67nWPz94TIc,7
+monocr-0.1.10.dist-info/RECORD,,
monocr-0.1.10.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Ja Nakh
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
monocr-0.1.10.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+monocr