monocr 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of monocr might be problematic. Click here for more details.
- {monocr-0.1.0 → monocr-0.1.1}/PKG-INFO +22 -2
- {monocr-0.1.0 → monocr-0.1.1}/README.md +21 -1
- {monocr-0.1.0 → monocr-0.1.1}/pyproject.toml +1 -1
- monocr-0.1.1/src/monocr/__init__.py +55 -0
- {monocr-0.1.0 → monocr-0.1.1}/src/monocr/cli.py +26 -26
- {monocr-0.1.0 → monocr-0.1.1}/src/monocr/crnn_model.py +25 -20
- monocr-0.1.1/src/monocr/inference.py +80 -0
- {monocr-0.1.0 → monocr-0.1.1}/src/monocr/ocr.py +32 -84
- monocr-0.1.0/src/monocr/__init__.py +0 -90
- monocr-0.1.0/src/monocr/inference.py +0 -117
- {monocr-0.1.0 → monocr-0.1.1}/src/monocr/models/monocr_v1_best.pt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: monocr
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Optical Character Recognition for Mon text
|
|
5
5
|
Keywords: mon,ocr,text-recognition
|
|
6
6
|
Author: janakhpon
|
|
@@ -62,6 +62,26 @@ monocr read image.png
|
|
|
62
62
|
monocr batch images/ --output results.json
|
|
63
63
|
```
|
|
64
64
|
|
|
65
|
+
## Dev Setup
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
git clone git@github.com:janakhpon/monocr.git
|
|
69
|
+
cd monocr
|
|
70
|
+
uv sync --dev
|
|
71
|
+
|
|
72
|
+
# Release workflow
|
|
73
|
+
uv version --bump patch
|
|
74
|
+
git add .
|
|
75
|
+
git commit -m "bump version"
|
|
76
|
+
git tag v0.1.5
|
|
77
|
+
git push origin main --tags
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Related tools
|
|
81
|
+
- [mon_tokenizer](https://github.com/Code-Yay-Mal/mon_tokenizer)
|
|
82
|
+
- [hugging face mon_tokenizer model](https://huggingface.co/janakhpon/mon_tokenizer)
|
|
83
|
+
- [Mon corpus collection in unicode](https://github.com/MonDevHub/MonCorpusCollection)
|
|
84
|
+
|
|
65
85
|
## License
|
|
66
86
|
|
|
67
|
-
MIT
|
|
87
|
+
MIT - do whatever you want with it.
|
|
@@ -33,6 +33,26 @@ monocr read image.png
|
|
|
33
33
|
monocr batch images/ --output results.json
|
|
34
34
|
```
|
|
35
35
|
|
|
36
|
+
## Dev Setup
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
git clone git@github.com:janakhpon/monocr.git
|
|
40
|
+
cd monocr
|
|
41
|
+
uv sync --dev
|
|
42
|
+
|
|
43
|
+
# Release workflow
|
|
44
|
+
uv version --bump patch
|
|
45
|
+
git add .
|
|
46
|
+
git commit -m "bump version"
|
|
47
|
+
git tag v0.1.5
|
|
48
|
+
git push origin main --tags
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Related tools
|
|
52
|
+
- [mon_tokenizer](https://github.com/Code-Yay-Mal/mon_tokenizer)
|
|
53
|
+
- [hugging face mon_tokenizer model](https://huggingface.co/janakhpon/mon_tokenizer)
|
|
54
|
+
- [Mon corpus collection in unicode](https://github.com/MonDevHub/MonCorpusCollection)
|
|
55
|
+
|
|
36
56
|
## License
|
|
37
57
|
|
|
38
|
-
MIT
|
|
58
|
+
MIT - do whatever you want with it.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""
|
|
2
|
+
mon ocr - optical character recognition for mon text
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from .ocr import MonOCR
|
|
8
|
+
from .inference import MonOCRInference
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.0"
|
|
11
|
+
__author__ = "janakhpon"
|
|
12
|
+
__email__ = "jnovaxer@gmail.com"
|
|
13
|
+
|
|
14
|
+
__all__ = ["MonOCR", "MonOCRInference", "read_text", "read_image", "read_folder"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_default_model_path():
|
|
18
|
+
"""get bundled model path"""
|
|
19
|
+
package_dir = Path(__file__).parent
|
|
20
|
+
model_path = package_dir / "models" / "monocr_v1_best.pt"
|
|
21
|
+
return str(model_path)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# global ocr instance for simple api
|
|
25
|
+
_ocr_instance = None
|
|
26
|
+
|
|
27
|
+
def _get_ocr():
|
|
28
|
+
"""get or create global ocr instance"""
|
|
29
|
+
global _ocr_instance
|
|
30
|
+
if _ocr_instance is None:
|
|
31
|
+
_ocr_instance = MonOCR()
|
|
32
|
+
return _ocr_instance
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def read_text(image_path):
|
|
36
|
+
"""read text from single image"""
|
|
37
|
+
return _get_ocr().read_text(image_path)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def read_image(image_path):
|
|
41
|
+
"""alias for read_text"""
|
|
42
|
+
return read_text(image_path)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def read_folder(folder_path, extensions=None):
|
|
46
|
+
"""read text from all images in folder"""
|
|
47
|
+
return _get_ocr().read_from_folder(folder_path, extensions)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def load_ocr(model_path=None, model_type="crnn"):
|
|
51
|
+
"""load ocr model with custom settings"""
|
|
52
|
+
if model_path is None:
|
|
53
|
+
model_path = get_default_model_path()
|
|
54
|
+
|
|
55
|
+
return MonOCR(model_path, model_type)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""
|
|
3
|
-
|
|
3
|
+
command line interface for mon ocr
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
6
|
import click
|
|
@@ -15,25 +15,25 @@ from . import get_default_model_path
|
|
|
15
15
|
@click.group()
|
|
16
16
|
@click.version_option()
|
|
17
17
|
def main():
|
|
18
|
-
"""
|
|
18
|
+
"""mon ocr - optical character recognition for mon text"""
|
|
19
19
|
pass
|
|
20
20
|
|
|
21
21
|
@main.command()
|
|
22
22
|
@click.argument('image_path', type=click.Path(exists=True))
|
|
23
|
-
@click.option('--model', '-m', help='
|
|
24
|
-
@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='
|
|
25
|
-
@click.option('--output', '-o', help='
|
|
23
|
+
@click.option('--model', '-m', help='path to trained model file (default: uses bundled model)')
|
|
24
|
+
@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='type of model to use')
|
|
25
|
+
@click.option('--output', '-o', help='output file to save results')
|
|
26
26
|
def read(image_path: str, model: str, model_type: str, output: str):
|
|
27
|
-
"""
|
|
27
|
+
"""read text from a single image"""
|
|
28
28
|
try:
|
|
29
29
|
if model is None:
|
|
30
30
|
model = get_default_model_path()
|
|
31
31
|
ocr = MonOCR(model, model_type)
|
|
32
32
|
|
|
33
|
-
print("
|
|
33
|
+
print("processing image...")
|
|
34
34
|
text = ocr.read_text(image_path)
|
|
35
35
|
|
|
36
|
-
print(f"\
|
|
36
|
+
print(f"\nextracted text:")
|
|
37
37
|
print(text)
|
|
38
38
|
|
|
39
39
|
if output:
|
|
@@ -44,30 +44,30 @@ def read(image_path: str, model: str, model_type: str, output: str):
|
|
|
44
44
|
}
|
|
45
45
|
with open(output, 'w', encoding='utf-8') as f:
|
|
46
46
|
json.dump(result, f, ensure_ascii=False, indent=2)
|
|
47
|
-
print(f"\
|
|
47
|
+
print(f"\nresults saved to: {output}")
|
|
48
48
|
|
|
49
49
|
except Exception as e:
|
|
50
|
-
print(f"
|
|
50
|
+
print(f"error: {e}")
|
|
51
51
|
raise click.Abort()
|
|
52
52
|
|
|
53
53
|
@main.command()
|
|
54
54
|
@click.argument('folder_path', type=click.Path(exists=True, file_okay=False))
|
|
55
|
-
@click.option('--model', '-m', help='
|
|
56
|
-
@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='
|
|
57
|
-
@click.option('--output', '-o', help='
|
|
58
|
-
@click.option('--extensions', default='png,jpg,jpeg', help='
|
|
55
|
+
@click.option('--model', '-m', help='path to trained model file (default: uses bundled model)')
|
|
56
|
+
@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='type of model to use')
|
|
57
|
+
@click.option('--output', '-o', help='output file to save results')
|
|
58
|
+
@click.option('--extensions', default='png,jpg,jpeg', help='file extensions to process (comma-separated)')
|
|
59
59
|
def batch(folder_path: str, model: str, model_type: str, output: str, extensions: str):
|
|
60
|
-
"""
|
|
60
|
+
"""read text from all images in a folder"""
|
|
61
61
|
try:
|
|
62
62
|
if model is None:
|
|
63
63
|
model = get_default_model_path()
|
|
64
64
|
ocr = MonOCR(model, model_type)
|
|
65
65
|
ext_list = [f'.{ext.strip()}' for ext in extensions.split(',')]
|
|
66
66
|
|
|
67
|
-
print("
|
|
67
|
+
print("processing folder...")
|
|
68
68
|
results = ocr.read_from_folder(folder_path, ext_list)
|
|
69
69
|
|
|
70
|
-
print("\
|
|
70
|
+
print("\nocr results:")
|
|
71
71
|
print("-" * 40)
|
|
72
72
|
for filename, text in results.items():
|
|
73
73
|
print(f"{filename}: {text}")
|
|
@@ -75,30 +75,30 @@ def batch(folder_path: str, model: str, model_type: str, output: str, extensions
|
|
|
75
75
|
if output:
|
|
76
76
|
with open(output, 'w', encoding='utf-8') as f:
|
|
77
77
|
json.dump(results, f, ensure_ascii=False, indent=2)
|
|
78
|
-
print(f"\
|
|
78
|
+
print(f"\nresults saved to: {output}")
|
|
79
79
|
|
|
80
80
|
except Exception as e:
|
|
81
|
-
print(f"
|
|
81
|
+
print(f"error: {e}")
|
|
82
82
|
raise click.Abort()
|
|
83
83
|
|
|
84
84
|
@main.command()
|
|
85
85
|
@click.argument('image_path', type=click.Path(exists=True))
|
|
86
|
-
@click.option('--model', '-m', help='
|
|
87
|
-
@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='
|
|
86
|
+
@click.option('--model', '-m', help='path to trained model file (default: uses bundled model)')
|
|
87
|
+
@click.option('--model-type', type=click.Choice(['crnn', 'trocr']), default='crnn', help='type of model to use')
|
|
88
88
|
def confidence(image_path: str, model: str, model_type: str):
|
|
89
|
-
"""
|
|
89
|
+
"""read text with confidence score"""
|
|
90
90
|
try:
|
|
91
91
|
ocr = MonOCRInference(model, model_type)
|
|
92
92
|
|
|
93
|
-
print("
|
|
93
|
+
print("processing image...")
|
|
94
94
|
result = ocr.predict_with_confidence(image_path)
|
|
95
95
|
|
|
96
|
-
print(f"\
|
|
96
|
+
print(f"\nextracted text:")
|
|
97
97
|
print(result['text'])
|
|
98
|
-
print(f"\
|
|
98
|
+
print(f"\nconfidence: {result['confidence']:.2%}")
|
|
99
99
|
|
|
100
100
|
except Exception as e:
|
|
101
|
-
print(f"
|
|
101
|
+
print(f"error: {e}")
|
|
102
102
|
raise click.Abort()
|
|
103
103
|
|
|
104
104
|
if __name__ == '__main__':
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
"""
|
|
4
|
-
|
|
4
|
+
crnn model architecture for mon ocr
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import torch
|
|
@@ -12,11 +12,11 @@ import os
|
|
|
12
12
|
from typing import List
|
|
13
13
|
|
|
14
14
|
class CRNN(nn.Module):
|
|
15
|
-
"""
|
|
15
|
+
"""crnn model for mon ocr"""
|
|
16
16
|
|
|
17
17
|
def __init__(self, num_classes):
|
|
18
18
|
super(CRNN, self).__init__()
|
|
19
|
-
#
|
|
19
|
+
# cnn architecture
|
|
20
20
|
self.cnn = nn.Sequential(
|
|
21
21
|
nn.Conv2d(1, 64, 3, 1, 1),
|
|
22
22
|
nn.ReLU(),
|
|
@@ -42,10 +42,10 @@ class CRNN(nn.Module):
|
|
|
42
42
|
nn.Conv2d(512, 512, (4, 1), 1, 0), # 4->1
|
|
43
43
|
nn.ReLU(),
|
|
44
44
|
)
|
|
45
|
-
#
|
|
45
|
+
# lstm layers
|
|
46
46
|
self.lstm1 = nn.LSTM(512, 256, bidirectional=True, batch_first=True)
|
|
47
47
|
self.lstm2 = nn.LSTM(512, 256, bidirectional=True, batch_first=True)
|
|
48
|
-
self.dropout = nn.Dropout(0.1)
|
|
48
|
+
self.dropout = nn.Dropout(0.1)
|
|
49
49
|
self.fc = nn.Linear(512, num_classes)
|
|
50
50
|
|
|
51
51
|
def forward(self, x):
|
|
@@ -54,29 +54,34 @@ class CRNN(nn.Module):
|
|
|
54
54
|
assert h == 1, "CNN height must be 1"
|
|
55
55
|
conv = conv.squeeze(2).permute(0, 2, 1) # [B, W, C]
|
|
56
56
|
|
|
57
|
-
#
|
|
57
|
+
# lstm layers
|
|
58
58
|
recurrent, _ = self.lstm1(conv)
|
|
59
59
|
recurrent, _ = self.lstm2(recurrent)
|
|
60
60
|
|
|
61
|
-
#
|
|
61
|
+
# dropout and final classification
|
|
62
62
|
recurrent = self.dropout(recurrent)
|
|
63
63
|
out = self.fc(recurrent)
|
|
64
64
|
return out # [B, W, num_classes]
|
|
65
65
|
|
|
66
|
+
|
|
66
67
|
def build_charset(corpus_dir: str) -> str:
|
|
67
|
-
"""
|
|
68
|
+
"""build charset from corpus files"""
|
|
68
69
|
charset = set()
|
|
69
|
-
txt_files = glob.glob(os.path.join(corpus_dir, "**/*.txt"), recursive=True)
|
|
70
70
|
|
|
71
|
-
for
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
71
|
+
# search for text files in corpus directory
|
|
72
|
+
for ext in ['*.txt']:
|
|
73
|
+
pattern = os.path.join(corpus_dir, '**', ext)
|
|
74
|
+
for file_path in glob.glob(pattern, recursive=True):
|
|
75
|
+
try:
|
|
76
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
77
|
+
content = f.read()
|
|
78
|
+
charset.update(content)
|
|
79
|
+
except:
|
|
80
|
+
continue
|
|
81
|
+
|
|
82
|
+
# remove whitespace and control characters
|
|
83
|
+
charset = {c for c in charset if c.strip() and ord(c) >= 32}
|
|
80
84
|
|
|
81
|
-
|
|
82
|
-
|
|
85
|
+
# sort for consistent ordering
|
|
86
|
+
charset_str = ''.join(sorted(charset))
|
|
87
|
+
return charset_str
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
advanced inference utilities for mon ocr
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import torch
|
|
9
|
+
import numpy as np
|
|
10
|
+
from PIL import Image
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import json
|
|
13
|
+
from typing import List, Dict, Optional, Union
|
|
14
|
+
|
|
15
|
+
from .ocr import MonOCR
|
|
16
|
+
|
|
17
|
+
class MonOCRInference:
|
|
18
|
+
"""advanced mon ocr inference with additional utilities"""
|
|
19
|
+
|
|
20
|
+
def __init__(self, model_path: Optional[str] = None, model_type: str = "crnn"):
|
|
21
|
+
"""initialize advanced mon ocr inference"""
|
|
22
|
+
self.ocr = MonOCR(model_path, model_type)
|
|
23
|
+
|
|
24
|
+
def predict_with_confidence(self, image: Union[str, Image.Image]) -> Dict[str, Union[str, float]]:
|
|
25
|
+
"""predict text with confidence score"""
|
|
26
|
+
if isinstance(image, str):
|
|
27
|
+
image = Image.open(image).convert("L")
|
|
28
|
+
elif not isinstance(image, Image.Image):
|
|
29
|
+
raise ValueError("Image must be a file path or PIL Image")
|
|
30
|
+
|
|
31
|
+
# get prediction
|
|
32
|
+
predicted_text = self.ocr.predict(image)
|
|
33
|
+
|
|
34
|
+
# calculate confidence (simplified)
|
|
35
|
+
confidence = self._calculate_confidence(image, predicted_text)
|
|
36
|
+
|
|
37
|
+
return {
|
|
38
|
+
'text': predicted_text,
|
|
39
|
+
'confidence': confidence
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
def _calculate_confidence(self, image: Image.Image, text: str) -> float:
|
|
43
|
+
"""calculate confidence score (simplified implementation)"""
|
|
44
|
+
# simple confidence based on text length and image size
|
|
45
|
+
if not text:
|
|
46
|
+
return 0.0
|
|
47
|
+
|
|
48
|
+
# normalize confidence based on text length and image dimensions
|
|
49
|
+
text_length = len(text)
|
|
50
|
+
image_area = image.width * image.height
|
|
51
|
+
|
|
52
|
+
# simple heuristic: longer text on larger images = higher confidence
|
|
53
|
+
confidence = min(1.0, (text_length * 100) / image_area)
|
|
54
|
+
|
|
55
|
+
return max(0.0, min(1.0, confidence))
|
|
56
|
+
|
|
57
|
+
def batch_predict_with_confidence(self, images: List[Union[str, Image.Image]]) -> List[Dict[str, Union[str, float]]]:
|
|
58
|
+
"""predict text with confidence for multiple images"""
|
|
59
|
+
results = []
|
|
60
|
+
for image in images:
|
|
61
|
+
try:
|
|
62
|
+
result = self.predict_with_confidence(image)
|
|
63
|
+
results.append(result)
|
|
64
|
+
except Exception as e:
|
|
65
|
+
results.append({
|
|
66
|
+
'text': '',
|
|
67
|
+
'confidence': 0.0
|
|
68
|
+
})
|
|
69
|
+
|
|
70
|
+
return results
|
|
71
|
+
|
|
72
|
+
def save_results(self, results: List[Dict[str, Union[str, float]]], output_path: str):
|
|
73
|
+
"""save prediction results to json file"""
|
|
74
|
+
with open(output_path, 'w', encoding='utf-8') as f:
|
|
75
|
+
json.dump(results, f, ensure_ascii=False, indent=2)
|
|
76
|
+
|
|
77
|
+
def load_results(self, input_path: str) -> List[Dict[str, Union[str, float]]]:
|
|
78
|
+
"""load prediction results from json file"""
|
|
79
|
+
with open(input_path, 'r', encoding='utf-8') as f:
|
|
80
|
+
return json.load(f)
|
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
"""
|
|
4
|
-
|
|
5
|
-
Supports both CRNN and TrOCR models
|
|
4
|
+
main mon ocr class
|
|
6
5
|
"""
|
|
7
6
|
|
|
8
7
|
import os
|
|
@@ -12,11 +11,10 @@ import numpy as np
|
|
|
12
11
|
from PIL import Image
|
|
13
12
|
from pathlib import Path
|
|
14
13
|
import json
|
|
15
|
-
import logging
|
|
16
14
|
from typing import List, Dict, Optional, Union
|
|
17
15
|
from torchvision import transforms
|
|
18
16
|
|
|
19
|
-
#
|
|
17
|
+
# trocr imports (optional)
|
|
20
18
|
try:
|
|
21
19
|
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
|
22
20
|
TROCR_AVAILABLE = True
|
|
@@ -24,23 +22,17 @@ except ImportError:
|
|
|
24
22
|
TROCR_AVAILABLE = False
|
|
25
23
|
|
|
26
24
|
class MonOCR:
|
|
27
|
-
"""
|
|
25
|
+
"""mon ocr class supporting crnn and trocr models"""
|
|
28
26
|
|
|
29
27
|
def __init__(self, model_path: Optional[str] = None, model_type: str = "crnn"):
|
|
30
|
-
"""
|
|
31
|
-
Initialize Mon OCR
|
|
32
|
-
|
|
33
|
-
Args:
|
|
34
|
-
model_path: Path to trained model file (if None, uses bundled model)
|
|
35
|
-
model_type: Type of model ("crnn" or "trocr")
|
|
36
|
-
"""
|
|
28
|
+
"""initialize mon ocr"""
|
|
37
29
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
38
30
|
self.model_type = model_type.lower()
|
|
39
31
|
self.model = None
|
|
40
32
|
self.processor = None
|
|
41
33
|
self.charset = None
|
|
42
34
|
|
|
43
|
-
#
|
|
35
|
+
# load model - use bundled model if no path provided
|
|
44
36
|
if model_path is None:
|
|
45
37
|
from . import get_default_model_path
|
|
46
38
|
model_path = get_default_model_path()
|
|
@@ -48,7 +40,7 @@ class MonOCR:
|
|
|
48
40
|
self.load_model(model_path)
|
|
49
41
|
|
|
50
42
|
def load_model(self, model_path: str):
|
|
51
|
-
"""
|
|
43
|
+
"""load trained model from file"""
|
|
52
44
|
if not os.path.exists(model_path):
|
|
53
45
|
raise FileNotFoundError(f"Model file not found: {model_path}")
|
|
54
46
|
|
|
@@ -60,24 +52,23 @@ class MonOCR:
|
|
|
60
52
|
raise ValueError(f"Unsupported model type: {self.model_type}")
|
|
61
53
|
|
|
62
54
|
def _load_crnn_model(self, model_path: str):
|
|
63
|
-
"""
|
|
64
|
-
# Import CRNN model class (this would need to be included in the package)
|
|
55
|
+
"""load crnn model"""
|
|
65
56
|
from .crnn_model import CRNN, build_charset
|
|
66
57
|
|
|
67
|
-
#
|
|
58
|
+
# load model state
|
|
68
59
|
checkpoint = torch.load(model_path, map_location=self.device)
|
|
69
60
|
|
|
70
|
-
#
|
|
61
|
+
# extract charset from checkpoint or build from corpus
|
|
71
62
|
if 'charset' in checkpoint:
|
|
72
63
|
self.charset = checkpoint['charset']
|
|
73
64
|
else:
|
|
74
|
-
#
|
|
65
|
+
# fallback: build charset from default corpus
|
|
75
66
|
self.charset = build_charset("data/raw/corpus")
|
|
76
67
|
|
|
77
|
-
#
|
|
68
|
+
# initialize model (add 1 for blank token)
|
|
78
69
|
self.model = CRNN(num_classes=len(self.charset) + 1)
|
|
79
70
|
|
|
80
|
-
#
|
|
71
|
+
# load weights
|
|
81
72
|
if 'model_state_dict' in checkpoint:
|
|
82
73
|
self.model.load_state_dict(checkpoint['model_state_dict'])
|
|
83
74
|
else:
|
|
@@ -87,7 +78,7 @@ class MonOCR:
|
|
|
87
78
|
self.model.eval()
|
|
88
79
|
|
|
89
80
|
def _load_trocr_model(self, model_path: str):
|
|
90
|
-
"""
|
|
81
|
+
"""load trocr model"""
|
|
91
82
|
if not TROCR_AVAILABLE:
|
|
92
83
|
raise ImportError("TrOCR dependencies not available. Install with: pip install transformers")
|
|
93
84
|
|
|
@@ -97,15 +88,7 @@ class MonOCR:
|
|
|
97
88
|
self.model.eval()
|
|
98
89
|
|
|
99
90
|
def predict(self, image: Union[str, Image.Image]) -> str:
|
|
100
|
-
"""
|
|
101
|
-
Predict text from image
|
|
102
|
-
|
|
103
|
-
Args:
|
|
104
|
-
image: Path to image file or PIL Image object
|
|
105
|
-
|
|
106
|
-
Returns:
|
|
107
|
-
Predicted text string
|
|
108
|
-
"""
|
|
91
|
+
"""predict text from image"""
|
|
109
92
|
if isinstance(image, str):
|
|
110
93
|
image = Image.open(image).convert("L")
|
|
111
94
|
elif not isinstance(image, Image.Image):
|
|
@@ -117,32 +100,32 @@ class MonOCR:
|
|
|
117
100
|
return self._predict_trocr(image)
|
|
118
101
|
|
|
119
102
|
def _predict_crnn(self, image: Image.Image) -> str:
|
|
120
|
-
"""
|
|
103
|
+
"""predict using crnn model"""
|
|
121
104
|
if self.model is None:
|
|
122
105
|
raise ValueError("Model not loaded. Call load_model() first.")
|
|
123
106
|
|
|
124
|
-
#
|
|
107
|
+
# preprocess image - match simple_inference.py exactly
|
|
125
108
|
if isinstance(image, str):
|
|
126
109
|
image = Image.open(image).convert('L')
|
|
127
110
|
elif isinstance(image, Image.Image):
|
|
128
111
|
image = image.convert('L')
|
|
129
112
|
|
|
130
|
-
#
|
|
131
|
-
#
|
|
113
|
+
# resize image - target_size is (height, width) for the model
|
|
114
|
+
# pil resize expects (width, height), so we need to swap
|
|
132
115
|
image = image.resize((256, 64), Image.Resampling.LANCZOS)
|
|
133
116
|
|
|
134
|
-
#
|
|
117
|
+
# convert to tensor and normalize
|
|
135
118
|
image_array = np.array(image, dtype=np.float32) / 255.0
|
|
136
119
|
image_tensor = torch.from_numpy(image_array).unsqueeze(0).unsqueeze(0) # [1, 1, H, W]
|
|
137
120
|
|
|
138
|
-
#
|
|
121
|
+
# apply the same transform as training
|
|
139
122
|
transform = transforms.Compose([
|
|
140
123
|
transforms.Normalize(mean=[0.5], std=[0.5])
|
|
141
124
|
])
|
|
142
125
|
image_tensor = transform(image_tensor)
|
|
143
126
|
image_tensor = image_tensor.to(self.device)
|
|
144
127
|
|
|
145
|
-
#
|
|
128
|
+
# predict
|
|
146
129
|
with torch.no_grad():
|
|
147
130
|
outputs = self.model(image_tensor)
|
|
148
131
|
predicted_text = self._decode_crnn_output(outputs)
|
|
@@ -150,14 +133,14 @@ class MonOCR:
|
|
|
150
133
|
return predicted_text
|
|
151
134
|
|
|
152
135
|
def _predict_trocr(self, image: Image.Image) -> str:
|
|
153
|
-
"""
|
|
136
|
+
"""predict using trocr model"""
|
|
154
137
|
if self.model is None or self.processor is None:
|
|
155
138
|
raise ValueError("Model not loaded. Call load_model() first.")
|
|
156
139
|
|
|
157
|
-
#
|
|
140
|
+
# preprocess image
|
|
158
141
|
pixel_values = self.processor(image, return_tensors="pt").pixel_values.to(self.device)
|
|
159
142
|
|
|
160
|
-
#
|
|
143
|
+
# predict
|
|
161
144
|
with torch.no_grad():
|
|
162
145
|
generated_ids = self.model.generate(pixel_values)
|
|
163
146
|
predicted_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
|
@@ -165,14 +148,14 @@ class MonOCR:
|
|
|
165
148
|
return predicted_text
|
|
166
149
|
|
|
167
150
|
def _decode_crnn_output(self, output: torch.Tensor) -> str:
|
|
168
|
-
"""
|
|
151
|
+
"""decode crnn output to text - match simple_inference.py exactly"""
|
|
169
152
|
if self.charset is None:
|
|
170
153
|
raise ValueError("Charset not loaded")
|
|
171
154
|
|
|
172
|
-
#
|
|
155
|
+
# get predictions - same as working version
|
|
173
156
|
preds = output.softmax(2).argmax(2).squeeze(0) # [seq_len]
|
|
174
157
|
|
|
175
|
-
#
|
|
158
|
+
# ctc decoding - exact same logic as working simple_inference.py
|
|
176
159
|
decoded = []
|
|
177
160
|
prev_char = None
|
|
178
161
|
|
|
@@ -188,61 +171,27 @@ class MonOCR:
|
|
|
188
171
|
return ''.join(decoded)
|
|
189
172
|
|
|
190
173
|
def batch_predict(self, images: List[Union[str, Image.Image]]) -> List[str]:
|
|
191
|
-
"""
|
|
192
|
-
Predict text from multiple images
|
|
193
|
-
|
|
194
|
-
Args:
|
|
195
|
-
images: List of image paths or PIL Image objects
|
|
196
|
-
|
|
197
|
-
Returns:
|
|
198
|
-
List of predicted text strings
|
|
199
|
-
"""
|
|
174
|
+
"""predict text from multiple images"""
|
|
200
175
|
results = []
|
|
201
176
|
for image in images:
|
|
202
177
|
try:
|
|
203
178
|
result = self.predict(image)
|
|
204
179
|
results.append(result)
|
|
205
180
|
except Exception as e:
|
|
206
|
-
logging.warning(f"Error processing image: {e}")
|
|
207
181
|
results.append("")
|
|
208
182
|
|
|
209
183
|
return results
|
|
210
184
|
|
|
211
185
|
def read_text(self, image: Union[str, Image.Image]) -> str:
|
|
212
|
-
"""
|
|
213
|
-
Read text from image (alias for predict method)
|
|
214
|
-
|
|
215
|
-
Args:
|
|
216
|
-
image: Path to image file or PIL Image object
|
|
217
|
-
|
|
218
|
-
Returns:
|
|
219
|
-
Extracted text string
|
|
220
|
-
"""
|
|
186
|
+
"""read text from image (alias for predict method)"""
|
|
221
187
|
return self.predict(image)
|
|
222
188
|
|
|
223
189
|
def read_multiple(self, images: List[Union[str, Image.Image]]) -> List[str]:
|
|
224
|
-
"""
|
|
225
|
-
Read text from multiple images (alias for batch_predict method)
|
|
226
|
-
|
|
227
|
-
Args:
|
|
228
|
-
images: List of image paths or PIL Image objects
|
|
229
|
-
|
|
230
|
-
Returns:
|
|
231
|
-
List of extracted text strings
|
|
232
|
-
"""
|
|
190
|
+
"""read text from multiple images (alias for batch_predict method)"""
|
|
233
191
|
return self.batch_predict(images)
|
|
234
192
|
|
|
235
193
|
def read_from_folder(self, folder_path: str, extensions: List[str] = None) -> dict:
|
|
236
|
-
"""
|
|
237
|
-
Read text from all images in a folder
|
|
238
|
-
|
|
239
|
-
Args:
|
|
240
|
-
folder_path: Path to folder containing images
|
|
241
|
-
extensions: List of file extensions to process (default: ['.png', '.jpg', '.jpeg'])
|
|
242
|
-
|
|
243
|
-
Returns:
|
|
244
|
-
Dictionary mapping filename to extracted text
|
|
245
|
-
"""
|
|
194
|
+
"""read text from all images in a folder"""
|
|
246
195
|
if extensions is None:
|
|
247
196
|
extensions = ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']
|
|
248
197
|
|
|
@@ -262,7 +211,6 @@ class MonOCR:
|
|
|
262
211
|
text = self.read_text(str(image_file))
|
|
263
212
|
results[image_file.name] = text
|
|
264
213
|
except Exception as e:
|
|
265
|
-
print(f"Error processing {image_file.name}: {e}")
|
|
266
214
|
results[image_file.name] = ""
|
|
267
215
|
|
|
268
|
-
return results
|
|
216
|
+
return results
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Mon OCR - Optical Character Recognition for Mon text
|
|
3
|
-
A production-ready OCR package for Mon script text recognition
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
import os
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from .ocr import MonOCR
|
|
9
|
-
from .inference import MonOCRInference
|
|
10
|
-
|
|
11
|
-
__version__ = "0.1.0"
|
|
12
|
-
__author__ = "janakhpon"
|
|
13
|
-
__email__ = "jnovaxer@gmail.com"
|
|
14
|
-
|
|
15
|
-
__all__ = ["MonOCR", "MonOCRInference", "read_text", "read_image", "read_folder"]
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def get_default_model_path():
|
|
19
|
-
"""Get the path to the bundled default model"""
|
|
20
|
-
package_dir = Path(__file__).parent
|
|
21
|
-
model_path = package_dir / "models" / "monocr_v1_best.pt"
|
|
22
|
-
return str(model_path)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
# Global OCR instance for simple API
|
|
26
|
-
_ocr_instance = None
|
|
27
|
-
|
|
28
|
-
def _get_ocr():
|
|
29
|
-
"""Get or create the global OCR instance"""
|
|
30
|
-
global _ocr_instance
|
|
31
|
-
if _ocr_instance is None:
|
|
32
|
-
_ocr_instance = MonOCR()
|
|
33
|
-
return _ocr_instance
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def read_text(image_path):
|
|
37
|
-
"""
|
|
38
|
-
Read text from a single image - Simple API
|
|
39
|
-
|
|
40
|
-
Args:
|
|
41
|
-
image_path: Path to image file
|
|
42
|
-
|
|
43
|
-
Returns:
|
|
44
|
-
Extracted text string
|
|
45
|
-
"""
|
|
46
|
-
return _get_ocr().read_text(image_path)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def read_image(image_path):
|
|
50
|
-
"""
|
|
51
|
-
Alias for read_text - Read text from a single image
|
|
52
|
-
|
|
53
|
-
Args:
|
|
54
|
-
image_path: Path to image file
|
|
55
|
-
|
|
56
|
-
Returns:
|
|
57
|
-
Extracted text string
|
|
58
|
-
"""
|
|
59
|
-
return read_text(image_path)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def read_folder(folder_path, extensions=None):
|
|
63
|
-
"""
|
|
64
|
-
Read text from all images in a folder - Simple API
|
|
65
|
-
|
|
66
|
-
Args:
|
|
67
|
-
folder_path: Path to folder containing images
|
|
68
|
-
extensions: List of file extensions to process (default: ['.png', '.jpg', '.jpeg'])
|
|
69
|
-
|
|
70
|
-
Returns:
|
|
71
|
-
Dictionary mapping filename to extracted text
|
|
72
|
-
"""
|
|
73
|
-
return _get_ocr().read_from_folder(folder_path, extensions)
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
def load_ocr(model_path=None, model_type="crnn"):
|
|
77
|
-
"""
|
|
78
|
-
Load OCR model with default settings (Advanced API)
|
|
79
|
-
|
|
80
|
-
Args:
|
|
81
|
-
model_path: Path to trained model file (if None, uses bundled model)
|
|
82
|
-
model_type: Type of model ("crnn" or "trocr")
|
|
83
|
-
|
|
84
|
-
Returns:
|
|
85
|
-
MonOCR instance
|
|
86
|
-
"""
|
|
87
|
-
if model_path is None:
|
|
88
|
-
model_path = get_default_model_path()
|
|
89
|
-
|
|
90
|
-
return MonOCR(model_path, model_type)
|
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
Advanced inference utilities for Mon OCR
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import os
|
|
8
|
-
import torch
|
|
9
|
-
import numpy as np
|
|
10
|
-
from PIL import Image
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
import json
|
|
13
|
-
import logging
|
|
14
|
-
from typing import List, Dict, Optional, Union
|
|
15
|
-
|
|
16
|
-
from .ocr import MonOCR
|
|
17
|
-
|
|
18
|
-
class MonOCRInference:
|
|
19
|
-
"""Advanced Mon OCR inference with additional utilities"""
|
|
20
|
-
|
|
21
|
-
def __init__(self, model_path: Optional[str] = None, model_type: str = "crnn"):
|
|
22
|
-
"""
|
|
23
|
-
Initialize advanced Mon OCR inference
|
|
24
|
-
|
|
25
|
-
Args:
|
|
26
|
-
model_path: Path to trained model file
|
|
27
|
-
model_type: Type of model ("crnn" or "trocr")
|
|
28
|
-
"""
|
|
29
|
-
self.ocr = MonOCR(model_path, model_type)
|
|
30
|
-
self.logger = logging.getLogger(__name__)
|
|
31
|
-
|
|
32
|
-
def predict_with_confidence(self, image: Union[str, Image.Image]) -> Dict[str, Union[str, float]]:
|
|
33
|
-
"""
|
|
34
|
-
Predict text with confidence score
|
|
35
|
-
|
|
36
|
-
Args:
|
|
37
|
-
image: Path to image file or PIL Image object
|
|
38
|
-
|
|
39
|
-
Returns:
|
|
40
|
-
Dictionary with 'text' and 'confidence' keys
|
|
41
|
-
"""
|
|
42
|
-
try:
|
|
43
|
-
text = self.ocr.predict(image)
|
|
44
|
-
# For now, return a placeholder confidence score
|
|
45
|
-
# In a full implementation, you'd calculate actual confidence
|
|
46
|
-
confidence = 0.95 # Placeholder
|
|
47
|
-
|
|
48
|
-
return {
|
|
49
|
-
'text': text,
|
|
50
|
-
'confidence': confidence
|
|
51
|
-
}
|
|
52
|
-
except Exception as e:
|
|
53
|
-
self.logger.error(f"Error in prediction: {e}")
|
|
54
|
-
return {
|
|
55
|
-
'text': "",
|
|
56
|
-
'confidence': 0.0
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
def batch_predict_with_confidence(self, images: List[Union[str, Image.Image]]) -> List[Dict[str, Union[str, float]]]:
|
|
60
|
-
"""
|
|
61
|
-
Predict text with confidence for multiple images
|
|
62
|
-
|
|
63
|
-
Args:
|
|
64
|
-
images: List of image paths or PIL Image objects
|
|
65
|
-
|
|
66
|
-
Returns:
|
|
67
|
-
List of dictionaries with 'text' and 'confidence' keys
|
|
68
|
-
"""
|
|
69
|
-
results = []
|
|
70
|
-
for image in images:
|
|
71
|
-
result = self.predict_with_confidence(image)
|
|
72
|
-
results.append(result)
|
|
73
|
-
|
|
74
|
-
return results
|
|
75
|
-
|
|
76
|
-
def process_document(self, image_path: str, output_path: Optional[str] = None) -> Dict[str, str]:
|
|
77
|
-
"""
|
|
78
|
-
Process a document image and save results
|
|
79
|
-
|
|
80
|
-
Args:
|
|
81
|
-
image_path: Path to document image
|
|
82
|
-
output_path: Path to save results (optional)
|
|
83
|
-
|
|
84
|
-
Returns:
|
|
85
|
-
Dictionary with processing results
|
|
86
|
-
"""
|
|
87
|
-
try:
|
|
88
|
-
# Load and process image
|
|
89
|
-
image = Image.open(image_path)
|
|
90
|
-
text = self.ocr.predict(image)
|
|
91
|
-
|
|
92
|
-
results = {
|
|
93
|
-
'image_path': image_path,
|
|
94
|
-
'extracted_text': text,
|
|
95
|
-
'status': 'success'
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
# Save results if output path provided
|
|
99
|
-
if output_path:
|
|
100
|
-
with open(output_path, 'w', encoding='utf-8') as f:
|
|
101
|
-
json.dump(results, f, ensure_ascii=False, indent=2)
|
|
102
|
-
|
|
103
|
-
return results
|
|
104
|
-
|
|
105
|
-
except Exception as e:
|
|
106
|
-
error_result = {
|
|
107
|
-
'image_path': image_path,
|
|
108
|
-
'extracted_text': "",
|
|
109
|
-
'status': 'error',
|
|
110
|
-
'error': str(e)
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
if output_path:
|
|
114
|
-
with open(output_path, 'w', encoding='utf-8') as f:
|
|
115
|
-
json.dump(error_result, f, ensure_ascii=False, indent=2)
|
|
116
|
-
|
|
117
|
-
return error_result
|
|
File without changes
|