natocr 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natocr/__init__.py +21 -0
- natocr/core.py +112 -0
- natocr/macos.py +177 -0
- natocr/models.py +108 -0
- natocr/windows.py +174 -0
- natocr-1.3.3.dist-info/METADATA +203 -0
- natocr-1.3.3.dist-info/RECORD +10 -0
- natocr-1.3.3.dist-info/WHEEL +5 -0
- natocr-1.3.3.dist-info/licenses/LICENSE +21 -0
- natocr-1.3.3.dist-info/top_level.txt +1 -0
natocr/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
natocr - native ocr library using platform-specific frameworks
|
|
3
|
+
|
|
4
|
+
this package provides ocr functionality using native frameworks:
|
|
5
|
+
- macos: vision framework
|
|
6
|
+
- windows: windows runtime ocr
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .core import OCR
|
|
10
|
+
from .models import BoundingBox, OCRResult, TextElement
|
|
11
|
+
|
|
12
|
+
__version__ = "1.3.3"
|
|
13
|
+
__author__ = "alfredchiesa"
|
|
14
|
+
__email__ = "alfred.personal@icloud.com"
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"OCR",
|
|
18
|
+
"OCRResult",
|
|
19
|
+
"TextElement",
|
|
20
|
+
"BoundingBox",
|
|
21
|
+
]
|
natocr/core.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""
|
|
2
|
+
main ocr class with platform detection and delegation
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import sys
|
|
7
|
+
from typing import List, Union
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from PIL import Image
|
|
11
|
+
|
|
12
|
+
from .macos import MacOSOCR
|
|
13
|
+
from .models import OCRResult
|
|
14
|
+
from .windows import WindowsOCR
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class OCR:
    """Run OCR using the operating system's native engine.

    Picks the right backend for the current platform - the Vision framework on
    macOS, Windows Runtime OCR on Windows - and gives you one API over both.

    Example:
        ```python
        from natocr import OCR

        ocr = OCR()  # english by default
        result = ocr.recognize("invoice.png")
        print(result.text)
        ```

    Args:
        language: language code for text recognition (default: ``"en"``).

    Raises:
        RuntimeError: on an unsupported platform, or when the platform's native
            OCR dependencies aren't installed.
    """

    def __init__(self, language: str = "en"):
        self.language = language
        self._backend = None
        self._initialize_backend()

    def _initialize_backend(self):
        """Create the backend matching ``sys.platform``.

        Raises:
            RuntimeError: when the platform is neither ``darwin`` nor ``win32``,
                or when the backend reports its native dependencies as missing
                (the original ``ImportError`` is attached as the cause).
        """
        if sys.platform == "darwin":
            try:
                self._backend = MacOSOCR(self.language)
            except ImportError as err:
                # chain explicitly so the traceback shows why the backend failed
                raise RuntimeError(
                    "macos dependencies not installed. install with: pip install natocr[macos]"
                ) from err
        elif sys.platform == "win32":
            try:
                self._backend = WindowsOCR(self.language)
            except ImportError as err:
                raise RuntimeError(
                    "windows dependencies not installed. install with: pip install natocr[windows]"
                ) from err
        else:
            raise RuntimeError(f"unsupported platform: {sys.platform}")

    def recognize(self, image: Union[str, Image.Image, np.ndarray, bytes]) -> OCRResult:
        """Recognize text in an image.

        Args:
            image: what to read. One of: a file path (``str``), a
                ``PIL.Image.Image``, a ``numpy.ndarray``, or raw encoded image
                ``bytes``.

        Returns:
            An [OCRResult][natocr.OCRResult] with the detected text and
            per-element metadata.

        Raises:
            ValueError: if ``image`` isn't one of the supported types.
        """
        # normalise every accepted input to a pil image, then let the
        # platform-specific backend do the actual recognition
        pil_image = self._convert_to_pil(image)
        return self._backend.recognize(pil_image)

    def _convert_to_pil(
        self, image: Union[str, Image.Image, np.ndarray, bytes]
    ) -> Image.Image:
        """Convert any supported input into a ``PIL.Image.Image``.

        Raises:
            ValueError: if ``image`` isn't one of the supported types.
        """
        if isinstance(image, Image.Image):
            # already a pil image - hand it back untouched
            return image
        if isinstance(image, str):
            # file path
            return Image.open(image)
        if isinstance(image, np.ndarray):
            # numpy array
            return Image.fromarray(image)
        if isinstance(image, bytes):
            # raw encoded bytes - wrap in a file-like object for pillow
            return Image.open(io.BytesIO(image))
        raise ValueError(f"unsupported image type: {type(image)}")

    @property
    def supported_languages(self) -> List[str]:
        """Language codes the current platform's backend supports."""
        return self._backend.supported_languages if self._backend else []

    @property
    def platform(self) -> str:
        """The current platform identifier (e.g. ``"darwin"`` or ``"win32"``)."""
        return sys.platform
|
natocr/macos.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
macos implementation using vision framework
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from PIL import Image
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from Foundation import NSData
|
|
12
|
+
from Vision import (
|
|
13
|
+
VNImageRequestHandler,
|
|
14
|
+
VNRecognizeTextRequest,
|
|
15
|
+
VNRequestTextRecognitionLevelAccurate,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
VISION_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
VISION_AVAILABLE = False
|
|
21
|
+
|
|
22
|
+
from .models import BoundingBox, OCRResult, TextElement
|
|
23
|
+
|
|
24
|
+
# curated fallback if vision's live query fails - the accurate recognizer's set
|
|
25
|
+
# as of macos 15 (bcp-47 tags, exactly what vision returns)
|
|
26
|
+
COMMON_LANGUAGES = [
|
|
27
|
+
"en-US",
|
|
28
|
+
"fr-FR",
|
|
29
|
+
"it-IT",
|
|
30
|
+
"de-DE",
|
|
31
|
+
"es-ES",
|
|
32
|
+
"pt-BR",
|
|
33
|
+
"zh-Hans",
|
|
34
|
+
"zh-Hant",
|
|
35
|
+
"yue-Hans",
|
|
36
|
+
"yue-Hant",
|
|
37
|
+
"ko-KR",
|
|
38
|
+
"ja-JP",
|
|
39
|
+
"ru-RU",
|
|
40
|
+
"uk-UA",
|
|
41
|
+
"th-TH",
|
|
42
|
+
"vi-VT",
|
|
43
|
+
"ar-SA",
|
|
44
|
+
"ars-SA",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class MacOSOCR:
    """macos ocr implementation using vision framework"""

    def __init__(self, language: str = "en"):
        """
        initialize macos ocr

        args:
            language: language code for text recognition

        raises:
            ImportError: when the vision/foundation bindings failed to import
        """
        if not VISION_AVAILABLE:
            raise ImportError("vision framework not available")

        self.language = language
        self._setup_request()

    def _setup_request(self):
        """create and configure the reusable vision text recognition request"""
        self.request = VNRecognizeTextRequest.alloc().init()
        # pyobjc needs the objc setters, plain attribute assignment is read-only
        self.request.setRecognitionLanguages_([self.language])
        self.request.setRecognitionLevel_(VNRequestTextRecognitionLevelAccurate)
        self.request.setUsesLanguageCorrection_(True)

    def recognize(self, image: Image.Image) -> OCRResult:
        """
        perform ocr on pil image

        args:
            image: pil image to process

        returns:
            OCRResult with detected text and metadata

        raises:
            RuntimeError: when vision reports that the request failed
        """
        # vision is fed encoded image bytes wrapped in nsdata
        ns_image_data = self._pil_to_nsdata(image)
        handler = VNImageRequestHandler.alloc().initWithData_options_(ns_image_data, {})

        success, error = handler.performRequests_error_([self.request], None)
        if not success:
            raise RuntimeError(f"vision framework error: {error}")

        observations = self.request.results()
        if not observations:
            return OCRResult(text="", confidence=None, elements=[])

        return self._process_observations(observations, image.size)

    # the return annotation is quoted on purpose: NSData is only bound when the
    # Foundation import at the top of the module succeeds, and an unquoted name
    # would be evaluated while the class body runs, breaking the import of this
    # module on hosts without pyobjc
    def _pil_to_nsdata(self, image: Image.Image) -> "NSData":
        """convert pil image to nsdata (png-encoded) for vision framework"""
        if image.mode != "RGB":
            image = image.convert("RGB")

        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        image_data = buffer.getvalue()

        return NSData.dataWithBytes_length_(image_data, len(image_data))

    def _process_observations(self, observations, image_size) -> OCRResult:
        """
        process vision observations into ocr result

        args:
            observations: results of the text recognition request
            image_size: ``(width, height)`` of the source image in pixels

        returns:
            OCRResult whose text is the non-blank observations joined by spaces
            and whose confidence is their mean (``None`` when there are none)
        """
        elements = []
        full_text_parts = []
        total_confidence = 0.0
        valid_observations = 0

        for observation in observations:
            # ask for the best candidate once; skip observations without any
            # instead of failing on the [0] lookup
            candidates = observation.topCandidates_(1)
            if not candidates:
                continue
            best = candidates[0]
            text = best.string()
            confidence = best.confidence()

            if not text.strip():
                continue

            bbox = observation.boundingBox()

            # convert normalized coordinates to pixel coordinates
            x = bbox.origin.x * image_size[0]
            y = (1.0 - bbox.origin.y - bbox.size.height) * image_size[1]  # flip y
            width = bbox.size.width * image_size[0]
            height = bbox.size.height * image_size[1]

            bounds = BoundingBox(x=x, y=y, width=width, height=height)
            elements.append(TextElement(text=text, bounds=bounds, confidence=confidence))

            full_text_parts.append(text)
            total_confidence += confidence
            valid_observations += 1

        avg_confidence = (
            total_confidence / valid_observations if valid_observations > 0 else None
        )
        full_text = " ".join(full_text_parts)

        return OCRResult(text=full_text, confidence=avg_confidence, elements=elements)

    @property
    def supported_languages(self) -> List[str]:
        """Language codes Vision can recognize on this machine.

        Queried live from Vision for the request's recognition level, so it
        always matches what the installed macOS version actually supports
        (returned as BCP-47 tags like ``en-US``). Falls back to the curated
        [`COMMON_LANGUAGES`][natocr.macos.COMMON_LANGUAGES] set if the query
        fails.
        """
        # ask vision directly instead of guessing - the set changes per os version
        try:
            languages, error = (
                self.request.supportedRecognitionLanguagesAndReturnError_(None)
            )
            if error or not languages:
                return list(COMMON_LANGUAGES)
            return list(languages)
        except Exception:
            # deliberate best-effort: any bridge failure yields the curated list
            return list(COMMON_LANGUAGES)
|
natocr/models.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""
|
|
2
|
+
data models for ocr results
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class BoundingBox:
    """Rectangle, in pixels, locating a piece of detected text.

    Coordinates are measured from the image's top-left corner and ``y``
    increases towards the bottom of the image.

    Attributes:
        x: left edge, in pixels.
        y: top edge, in pixels.
        width: box width, in pixels.
        height: box height, in pixels.
    """

    x: float
    y: float
    width: float
    height: float

    @property
    def bounds(self) -> Tuple[float, float, float, float]:
        """Return the rectangle packed as ``(x, y, width, height)``."""
        packed = (self.x, self.y, self.width, self.height)
        return packed
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class TextElement:
    """One recognized piece of text together with where it was found.

    Attributes:
        text: the recognized string.
        bounds: location of the text in the image.
        confidence: recognition confidence in ``0.0..1.0``; ``None`` when the
            backend does not report one (Windows OCR doesn't).
    """

    text: str
    bounds: BoundingBox
    confidence: Optional[float] = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class OCRResult:
    """Everything an OCR pass found in one image.

    Attributes:
        text: all detected text joined into a single string.
        confidence: average confidence across detections, or ``None`` if the
            backend doesn't report confidence.
        elements: per-detection breakdown with text, bounds, and confidence.
            Omitting it (or passing ``None``) yields an empty list.
    """

    text: str
    confidence: Optional[float] = None
    elements: Optional[List["TextElement"]] = None

    def __post_init__(self):
        # give every instance its own list when none was passed
        if self.elements is None:
            self.elements = []

    @property
    def words(self) -> List["TextElement"]:
        """The elements that contain non-whitespace text."""
        return [elem for elem in self.elements if elem.text.strip()]

    @property
    def lines(self) -> List[str]:
        """Detected text grouped into lines by vertical position.

        Elements are visited top-to-bottom. An element joins the current line
        when its ``y`` is within half of its own height of the previously
        visited element's ``y``; otherwise it starts a new line. Each line is
        then joined left-to-right. Falls back to ``[text]`` (or ``[]``) when
        there are no elements.
        """
        if not self.elements:
            return [self.text] if self.text else []

        grouped: List[List["TextElement"]] = []
        previous_y = None

        for elem in sorted(self.elements, key=lambda e: (e.bounds.y, e.bounds.x)):
            same_line = (
                previous_y is not None
                and abs(elem.bounds.y - previous_y) < elem.bounds.height * 0.5
            )
            if same_line:
                grouped[-1].append(elem)
            else:
                grouped.append([elem])
            previous_y = elem.bounds.y

        # the grouping pass orders by y first, so elements sharing a line can be
        # out of horizontal order - sort each line by x before joining
        return [
            " ".join(e.text for e in sorted(line, key=lambda e: e.bounds.x))
            for line in grouped
        ]
|
natocr/windows.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
windows implementation using windows runtime ocr
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
from PIL import Image
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import asyncio
|
|
12
|
+
|
|
13
|
+
import winrt.windows.foundation as foundation
|
|
14
|
+
import winrt.windows.graphics.imaging as imaging
|
|
15
|
+
import winrt.windows.media.ocr as ocr
|
|
16
|
+
import winrt.windows.storage.streams as streams
|
|
17
|
+
|
|
18
|
+
WINDOWS_OCR_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
WINDOWS_OCR_AVAILABLE = False
|
|
21
|
+
|
|
22
|
+
from .models import BoundingBox, OCRResult, TextElement
|
|
23
|
+
|
|
24
|
+
# curated fallback if the engine's live query fails - common windows ocr packs
|
|
25
|
+
# (bcp-47 tags). actual availability depends on which packs are installed.
|
|
26
|
+
COMMON_LANGUAGES = [
|
|
27
|
+
"en-US",
|
|
28
|
+
"en-GB",
|
|
29
|
+
"fr-FR",
|
|
30
|
+
"de-DE",
|
|
31
|
+
"es-ES",
|
|
32
|
+
"it-IT",
|
|
33
|
+
"pt-BR",
|
|
34
|
+
"nl-NL",
|
|
35
|
+
"ru-RU",
|
|
36
|
+
"ja-JP",
|
|
37
|
+
"ko-KR",
|
|
38
|
+
"zh-Hans-CN",
|
|
39
|
+
"zh-Hant-TW",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class WindowsOCR:
    """windows ocr implementation using windows runtime ocr"""

    def __init__(self, language: str = "en"):
        """
        initialize windows ocr

        args:
            language: language code for text recognition

        raises:
            ImportError: when the winrt bindings failed to import
            RuntimeError: when no ocr engine could be created
        """
        if not WINDOWS_OCR_AVAILABLE:
            raise ImportError("windows runtime ocr not available")

        self.language = language
        self._setup_engine()

    def _setup_engine(self):
        """create the ocr engine, preferring the requested language"""
        # NOTE(review): WinRT documents Language under Windows.Globalization;
        # confirm `ocr.Language` resolves in the installed winrt projection
        self.engine = ocr.OcrEngine.try_create_from_language(
            ocr.Language(self.language)
        )

        if not self.engine:
            # fallback to default language if specified not available
            self.engine = ocr.OcrEngine.try_create_from_user_profile_languages()

        if not self.engine:
            raise RuntimeError("could not create ocr engine")

    def recognize(self, image: Image.Image) -> OCRResult:
        """
        perform ocr on pil image

        args:
            image: pil image to process

        returns:
            OCRResult with detected text and metadata
        """
        # the winrt api is async; drive it to completion on a fresh event loop
        return asyncio.run(self._recognize_async(image))

    async def _recognize_async(self, image: Image.Image) -> OCRResult:
        """convert the image, run the engine, and structure its result"""
        bitmap = await self._pil_to_bitmap(image)
        result = await self.engine.recognize_async(bitmap)
        return self._process_result(result, image.size)

    async def _pil_to_bitmap(self, image: Image.Image):
        """convert pil image to windows bitmap (via an in-memory png stream)"""
        if image.mode != "RGB":
            image = image.convert("RGB")

        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        image_data = buffer.getvalue()

        # push the encoded bytes into an in-memory random access stream
        stream = streams.InMemoryRandomAccessStream()
        writer = streams.DataWriter(stream)
        writer.write_bytes(image_data)
        await writer.store_async()
        await writer.flush_async()
        stream.seek(0)  # rewind so the decoder reads from the start

        decoder = await imaging.BitmapDecoder.create_async(stream)
        bitmap = await decoder.get_software_bitmap_async()

        return bitmap

    def _process_result(self, result, image_size) -> OCRResult:
        """
        process windows ocr result into ocr result

        args:
            result: engine result exposing ``lines``; each line has ``text``
                and ``words``
            image_size: size of the source image (currently unused; the
                rectangles are used as reported)

        returns:
            OCRResult with one element per non-blank line; confidence is
            always ``None``
        """
        elements = []
        full_text_parts = []

        for line in result.lines:
            line_text = line.text
            if not line_text.strip():
                continue

            if line.words:
                # an element carries the whole line's text, so its box must
                # cover every word rather than only the first one
                rects = [word.bounding_rect for word in line.words]
                x = min(rect.x for rect in rects)
                y = min(rect.y for rect in rects)
                width = max(rect.x + rect.width for rect in rects) - x
                height = max(rect.y + rect.height for rect in rects) - y
            else:
                # NOTE(review): fallback kept from the original code; confirm
                # the line object actually exposes `bounding_rect`
                rect = line.bounding_rect
                x, y, width, height = rect.x, rect.y, rect.width, rect.height

            bounds = BoundingBox(x=x, y=y, width=width, height=height)
            elements.append(TextElement(text=line_text, bounds=bounds, confidence=None))
            full_text_parts.append(line_text)

        full_text = " ".join(full_text_parts)

        return OCRResult(
            text=full_text,
            confidence=None,  # windows ocr doesn't provide confidence scores
            elements=elements,
        )

    @property
    def supported_languages(self) -> List[str]:
        """Language codes with an OCR pack installed on this machine.

        Queried live from the engine, so it reflects whatever Windows OCR
        language packs are installed (returned as BCP-47 tags like ``en-US``).
        Falls back to the curated
        [`COMMON_LANGUAGES`][natocr.windows.COMMON_LANGUAGES] set if the query
        fails.
        """
        # the set depends on installed ocr language packs, so ask the engine
        try:
            languages = self.engine.available_recognizer_languages
            return [lang.language_tag for lang in languages]
        except Exception:
            # deliberate best-effort: any failure yields the curated list
            return list(COMMON_LANGUAGES)
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: natocr
|
|
3
|
+
Version: 1.3.3
|
|
4
|
+
Summary: Native OCR library using platform-specific frameworks (macOS Vision, Windows Runtime OCR)
|
|
5
|
+
Author-email: alfredchiesa <alfred.personal@icloud.com>
|
|
6
|
+
Maintainer-email: alfredchiesa <alfred.personal@icloud.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/alfredchiesa/natocr
|
|
9
|
+
Project-URL: Documentation, https://alfredchiesa.github.io/natocr
|
|
10
|
+
Project-URL: Repository, https://github.com/alfredchiesa/natocr.git
|
|
11
|
+
Project-URL: Issues, https://github.com/alfredchiesa/natocr/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/alfredchiesa/natocr/blob/main/CHANGELOG.md
|
|
13
|
+
Keywords: ocr,text-recognition,vision,macos,windows,native
|
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: MacOS
|
|
18
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
26
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Dist: Pillow>=9.0.0
|
|
31
|
+
Requires-Dist: numpy>=1.21
|
|
32
|
+
Provides-Extra: macos
|
|
33
|
+
Requires-Dist: pyobjc-framework-Vision>=11.1; extra == "macos"
|
|
34
|
+
Requires-Dist: pyobjc-framework-Quartz>=11.1; extra == "macos"
|
|
35
|
+
Provides-Extra: windows
|
|
36
|
+
Requires-Dist: pywin32>=311; extra == "windows"
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: python-semantic-release>=8.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: mkdocs>=1.6.1; extra == "dev"
|
|
42
|
+
Requires-Dist: mkdocs-material>=9.6.21; extra == "dev"
|
|
43
|
+
Requires-Dist: mkdocstrings[python]>=0.26; extra == "dev"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# natocr
|
|
47
|
+
|
|
48
|
+
**natocr** (*native ocr*) is a small Python wrapper around the OCR engines that
|
|
49
|
+
already ship with macOS and Windows: the Vision framework on macOS and Windows
|
|
50
|
+
Runtime OCR on Windows.
|
|
51
|
+
|
|
52
|
+
These built-in engines are generally faster, more efficient, and more accurate
|
|
53
|
+
than third-party alternatives like Tesseract. **natocr** makes reaching for them
|
|
54
|
+
painless via one clean Python API instead of wrangling with Objective-C bridges
|
|
55
|
+
or WinRT async plumbing.
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install natocr[macos] # on macOS
|
|
61
|
+
pip install natocr[windows] # on Windows
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Quick start
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from natocr import OCR
|
|
68
|
+
|
|
69
|
+
ocr = OCR() # defaults to english
|
|
70
|
+
result = ocr.recognize("invoice.png")
|
|
71
|
+
|
|
72
|
+
print(result.text)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
```text
|
|
76
|
+
Invoice #1042 Total $58.20 Thank you!
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Confidence Scores and Bounding Boxes
|
|
80
|
+
|
|
81
|
+
`recognize()` returns an `OCRResult`. Beyond the flat `.text`, you get a
|
|
82
|
+
per-detection breakdown with bounding boxes and (*on macOS*) confidence scores:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
result = ocr.recognize("receipt.png")
|
|
86
|
+
|
|
87
|
+
print(result.confidence) # average confidence, or None if unavailable
|
|
88
|
+
|
|
89
|
+
for element in result.elements:
|
|
90
|
+
box = element.bounds.bounds # (x, y, width, height) in pixels
|
|
91
|
+
print(f"{element.text!r} @ {box} conf={element.confidence}")
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
```text
|
|
95
|
+
0.93
|
|
96
|
+
'Acme Coffee' @ (24.0, 18.0, 180.0, 32.0) conf=0.97
|
|
97
|
+
'Latte' @ (24.0, 70.0, 96.0, 28.0) conf=0.95
|
|
98
|
+
'$4.50' @ (220.0, 70.0, 80.0, 28.0) conf=0.88
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Lines and Words
|
|
102
|
+
|
|
103
|
+
There are also convenience views for grouping results by reading order:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
result.lines # ['Acme Coffee', 'Latte $4.50'] - elements grouped into lines
|
|
107
|
+
result.words # list of TextElement with non-empty text
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Detection Language
|
|
111
|
+
|
|
112
|
+
Pick a different recognition language, and inspect what the current platform
|
|
113
|
+
supports:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
ocr = OCR(language="fr")
|
|
117
|
+
print(ocr.platform) # 'darwin' or 'win32'
|
|
118
|
+
print(ocr.supported_languages) # ['en-US', 'fr-FR', 'de-DE', ...]
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
The supported set is decided by the OS and queried live, so
|
|
122
|
+
`supported_languages` always reflects the current machine. On macOS it's
|
|
123
|
+
Vision's built-in set for your macOS version; on Windows it's whatever OCR
|
|
124
|
+
language packs are installed. See the [Usage guide](https://alfredchiesa.github.io/natocr/usage/#supported-languages)
|
|
125
|
+
for the full list and how to add Windows language packs.
|
|
126
|
+
|
|
127
|
+
### Alternative Inputs
|
|
128
|
+
|
|
129
|
+
`recognize()` accepts more than file paths - hand it whatever you already have
|
|
130
|
+
in memory:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from PIL import Image
|
|
134
|
+
import numpy as np
|
|
135
|
+
|
|
136
|
+
ocr.recognize("page.png") # a file path
|
|
137
|
+
ocr.recognize(Image.open("page.png")) # a PIL image
|
|
138
|
+
ocr.recognize(np.array(image)) # a numpy array (e.g. from OpenCV)
|
|
139
|
+
ocr.recognize(open("page.png", "rb").read()) # raw image bytes
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Supported File Types
|
|
143
|
+
|
|
144
|
+
Images are decoded with [Pillow](https://python-pillow.org/), so any raster
|
|
145
|
+
format Pillow can open works as an input file or byte string.
|
|
146
|
+
|
|
147
|
+
| Format | Extensions | Notes |
|
|
148
|
+
| --- | --- | --- |
|
|
149
|
+
| PNG | `.png` | recommended - lossless |
|
|
150
|
+
| JPEG | `.jpg`, `.jpeg` | great for photos of documents |
|
|
151
|
+
| TIFF | `.tif`, `.tiff` | common for scans |
|
|
152
|
+
| BMP | `.bmp` | uncompressed bitmap |
|
|
153
|
+
| GIF | `.gif` | first frame is used |
|
|
154
|
+
| WebP | `.webp` | modern lossy/lossless |
|
|
155
|
+
| PPM/PGM | `.ppm`, `.pgm` | netpbm bitmaps |
|
|
156
|
+
|
|
157
|
+
In addition to file paths, `recognize()` accepts these in-memory types:
|
|
158
|
+
|
|
159
|
+
| Input type | Example |
|
|
160
|
+
| --- | --- |
|
|
161
|
+
| `str` (file path) | `ocr.recognize("page.png")` |
|
|
162
|
+
| `PIL.Image.Image` | `ocr.recognize(Image.open("page.png"))` |
|
|
163
|
+
| `numpy.ndarray` | `ocr.recognize(np.array(image))` |
|
|
164
|
+
| `bytes` (encoded image) | `ocr.recognize(data)` |
|
|
165
|
+
|
|
166
|
+
> [!NOTE]
|
|
167
|
+
> PDFs and other multi-page documents aren't decoded directly - rasterize a page
|
|
168
|
+
> to one of the formats above first (e.g. with `pdf2image` or `pymupdf`).
|
|
169
|
+
|
|
170
|
+
## Testing
|
|
171
|
+
|
|
172
|
+
Install the dev dependencies (in a virtualenv), then run the suite. The tests
|
|
173
|
+
mock the native macOS Vision and Windows Runtime backends, so they run anywhere
|
|
174
|
+
without those frameworks installed.
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
python3 -m venv .venv
|
|
178
|
+
source .venv/bin/activate
|
|
179
|
+
pip install -e ".[dev]"
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Run everything with coverage (coverage is wired up in `pyproject.toml`, so plain
|
|
183
|
+
`pytest` already reports it):
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pytest
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Other handy invocations:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
# run a single test file
|
|
193
|
+
pytest tests/test_models.py
|
|
194
|
+
|
|
195
|
+
# run one test by name
|
|
196
|
+
pytest -k test_lines_groups_close_y_into_single_line
|
|
197
|
+
|
|
198
|
+
# verbose output
|
|
199
|
+
pytest -v
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Coverage reports land in the terminal, in `htmlcov/index.html`, and in
|
|
203
|
+
`coverage.xml`.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
natocr/__init__.py,sha256=9A3lC52yGZu7MMPd6AcG1AVdR7c8vNHC1heCvwX6kb4,446
|
|
2
|
+
natocr/core.py,sha256=V_ipHNIImTOXxwGSGSmDT9oXxi8qDatbkBGjZW596Hw,3632
|
|
3
|
+
natocr/macos.py,sha256=g6MKtNHJ0SInrkAm_nndetTEv8otfkx6Ag5RxW8LP0Q,5639
|
|
4
|
+
natocr/models.py,sha256=Em4qOpBwSUylB_SGLDmkJ5IoMAxzS-HdoA5RzPlukJI,3178
|
|
5
|
+
natocr/windows.py,sha256=EiE7xeIxbe5e3zxNcfdELAfLYUfHgLyBGldCvmNbPh4,5261
|
|
6
|
+
natocr-1.3.3.dist-info/licenses/LICENSE,sha256=lSQGjh39IyIvxFdkgGVGuvuZS0mEdXVRkRQTqogJv7g,1056
|
|
7
|
+
natocr-1.3.3.dist-info/METADATA,sha256=3GUlmZJVF0UoijHc_fNp54SKsv1G_4c2rKzhMFrWAwQ,6588
|
|
8
|
+
natocr-1.3.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
9
|
+
natocr-1.3.3.dist-info/top_level.txt,sha256=8YbBgCvaZjOLyplN9SbSI04u1zPWJO_2pnf1n8hwoNY,7
|
|
10
|
+
natocr-1.3.3.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
natocr
|