PyPI - openmlkitOCR - Versions diffs - 1.0.0__py3-none-any.whl - Mend

openmlkitOCR 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

openmlkit/__init__.py +9 -0
openmlkit/detector.py +111 -0
openmlkit/labelmap.py +105 -0
openmlkit/pipeline.py +486 -0
openmlkit/recognizer.py +108 -0
openmlkitocr-1.0.0.dist-info/METADATA +88 -0
openmlkitocr-1.0.0.dist-info/RECORD +10 -0
openmlkitocr-1.0.0.dist-info/WHEEL +5 -0
openmlkitocr-1.0.0.dist-info/licenses/LICENSE +176 -0
openmlkitocr-1.0.0.dist-info/top_level.txt +1 -0

openmlkit/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# openmlkit/__init__.py
+from .labelmap import LabelMap
+from .detector import TextDetector
+from .recognizer import TextRecognizer
+from .pipeline import OpenMLKitOCR
+__version__ = "1.0.0"
+__all__ = ["OpenMLKitOCR", "TextDetector", "TextRecognizer", "LabelMap"]

openmlkit/detector.py ADDED Viewed

@@ -0,0 +1,111 @@
+# openmlkit/detector.py
+import numpy as np
+import cv2
+import tflite_runtime.interpreter as tflite
+class TextDetector:
+    """TFLite-backed text detector that decodes a grid of per-cell predictions.
+    The model consumes a 256x256 single-channel image. Channel 0 of its output
+    is used as a text/no-text logit per grid cell; channels 1-3 are used as
+    box regression offsets, expressed in units of one 16-pixel cell.
+    """
+    def __init__(self, model_path):
+        """Load the TFLite model at ``model_path`` and cache its tensor details."""
+        self.interpreter = tflite.Interpreter(model_path=model_path)
+        self.interpreter.allocate_tensors()
+        self.input_details = self.interpreter.get_input_details()
+        self.output_details = self.interpreter.get_output_details()
+    @staticmethod
+    def _dequantize(output_data, quantization):
+        """Convert a raw output tensor to float32 using ``(scale, zero_point)``.
+        A scale of 0 is how TFLite reports a tensor that is not quantized; in
+        that case the values are returned unchanged (as float32) instead of
+        being multiplied by zero.
+        """
+        scale, zero_point = quantization
+        values = output_data.astype(np.float32)
+        if scale == 0:
+            return values
+        return (values - zero_point) * scale
+    def detect_raw(self, img_gray_256):
+        """
+        Runs the text detector model on a 256x256 grayscale image.
+        Images of any other shape are resized to 256x256 first.
+        Returns:
+            cls_probs: [16, 16] probability map
+            dequantized: [1, 16, 16, 4] bounding box regression offsets
+        """
+        if img_gray_256.shape != (256, 256):
+            img_gray_256 = cv2.resize(img_gray_256, (256, 256), interpolation=cv2.INTER_LINEAR)
+        input_data = img_gray_256.reshape((1, 256, 256, 1))
+        self.interpreter.set_tensor(self.input_details[0]['index'], input_data)
+        self.interpreter.invoke()
+        output_detail = self.output_details[0]
+        output_data = self.interpreter.get_tensor(output_detail['index'])
+        dequantized = self._dequantize(output_data, output_detail['quantization'])
+        # Channel 0 holds the classification logit; squash it with a sigmoid.
+        cls_logits = dequantized[0, :, :, 0]
+        cls_probs = 1.0 / (1.0 + np.exp(-cls_logits))
+        return cls_probs, dequantized
+    def detect(self, img_gray, score_threshold=0.35):
+        """
+        Detects text lines in a grayscale image (256x256 shape expected).
+        Returns a list of boxes [x_min, y_min, x_max, y_max] expressed in the
+        256x256 detector input space, sorted top-to-bottom.
+        """
+        cls_probs, dequantized = self.detect_raw(img_gray)
+        # Step 1: decode one local box per active cell. argwhere walks the grid
+        # in row-major order; tolist() yields plain Python ints for the maths.
+        local_boxes = []
+        for y, x in np.argwhere(cls_probs > score_threshold).tolist():
+            vals = dequantized[0, y, x, :]
+            cy = y * 16 + 8
+            cx = x * 16 + 8
+            y_center = cy + vals[2] * 16
+            # Force a tight, fixed height of 14 pixels in 256 space for text lines
+            h = 14.0
+            local_boxes.append({
+                'x_min': cx + vals[3] * 16, 'x_max': cx + vals[1] * 16,
+                'y_min': y_center - h / 2, 'y_max': y_center + h / 2,
+                'y_center': y_center, 'h': h
+            })
+        # Step 2: greedily attach each box to the first group that is both
+        # vertically aligned (< 5 px from the group's mean centre) and within
+        # 24 px horizontally of one of its members; otherwise start a new group.
+        line_groups = []
+        for box in local_boxes:
+            for group in line_groups:
+                avg_y_center = np.mean([b['y_center'] for b in group])
+                if not abs(box['y_center'] - avg_y_center) < 5.0:
+                    continue
+                if any(
+                    max(0, box['x_min'] - g_box['x_max'], g_box['x_min'] - box['x_max']) < 24
+                    for g_box in group
+                ):
+                    group.append(box)
+                    break
+            else:
+                line_groups.append([box])
+        # Step 3: one bounding box per group, still in 256 space
+        boxes = []
+        for group in line_groups:
+            # Pad horizontally slightly to prevent character clipping (pad by 8 pixels)
+            min_x = max(0, min(b['x_min'] for b in group) - 8)
+            max_x = min(256, max(b['x_max'] for b in group) + 8)
+            mean_y_min = np.mean([b['y_min'] for b in group])
+            mean_y_max = np.mean([b['y_max'] for b in group])
+            boxes.append([min_x, mean_y_min, max_x, mean_y_max])
+        # Sort boxes top-to-bottom
+        boxes.sort(key=lambda b: b[1])
+        return boxes

openmlkit/labelmap.py ADDED Viewed

@@ -0,0 +1,105 @@
+# openmlkit/labelmap.py
+class LabelMap:
+    """Class-index -> string lookup parsed from a protobuf-encoded label map.
+    The parser is hand-written (no protobuf dependency). It reads every
+    top-level field 1 of wire type 2 as an entry message whose field 1
+    (length-delimited) is the label text and whose field 2 (varint) is the
+    class index; every other field is skipped.
+    """
+    def __init__(self, pb_path):
+        """Parse the label-map file at ``pb_path`` into ``self.mapping``."""
+        self.mapping = self._parse_pb(pb_path)
+    @staticmethod
+    def _read_varint(data, idx):
+        """Decode the base-128 varint at ``data[idx]``; return ``(value, next_idx)``.
+        Raises IndexError if the data ends in the middle of the varint.
+        """
+        value = 0
+        shift = 0
+        while True:
+            byte = data[idx]
+            idx += 1
+            value |= (byte & 0x7f) << shift
+            if not (byte & 0x80):
+                return value, idx
+            shift += 7
+    def _skip_field(self, data, idx, wire_type):
+        """Return the index just past the field payload that starts at ``idx``.
+        Unknown wire types leave ``idx`` unchanged.
+        """
+        if wire_type == 0:  # Varint
+            _, idx = self._read_varint(data, idx)
+        elif wire_type == 1:  # 64-bit
+            idx += 8
+        elif wire_type == 2:  # Length-delimited
+            flen, idx = self._read_varint(data, idx)
+            idx += flen
+        elif wire_type == 5:  # 32-bit
+            idx += 4
+        return idx
+    def _parse_pb(self, path):
+        """Read ``path`` and return a dict mapping class index to label string.
+        An entry without an index field is stored under 0 and an entry without
+        a text field maps to ""; later entries overwrite earlier ones that
+        share an index.
+        """
+        with open(path, 'rb') as f:
+            data = f.read()
+        idx = 0
+        total_len = len(data)
+        label_map = {}
+        while idx < total_len:
+            # The field key is itself a varint: (field_number << 3) | wire_type.
+            key, idx = self._read_varint(data, idx)
+            tag = key >> 3
+            wire = key & 0x07
+            if not (tag == 1 and wire == 2):
+                # Skip according to the wire type. Fixed-width fields carry no
+                # length prefix, so no varint may be consumed for them.
+                idx = self._skip_field(data, idx, wire)
+                continue
+            # Length-delimited Entry message
+            entry_len, idx = self._read_varint(data, idx)
+            end_idx = idx + entry_len
+            char_str = ""
+            class_idx = 0
+            while idx < end_idx:
+                inner_key, idx = self._read_varint(data, idx)
+                inner_tag = inner_key >> 3
+                inner_wire = inner_key & 0x07
+                if inner_tag == 1 and inner_wire == 2:
+                    char_len, idx = self._read_varint(data, idx)
+                    char_str = data[idx:idx + char_len].decode('utf-8', errors='replace')
+                    idx += char_len
+                elif inner_tag == 2 and inner_wire == 0:
+                    class_idx, idx = self._read_varint(data, idx)
+                else:
+                    idx = self._skip_field(data, idx, inner_wire)
+            label_map[class_idx] = char_str
+        return label_map
+    def get(self, index, default=""):
+        """Return the label for ``index``, or ``default`` when it is unknown."""
+        return self.mapping.get(index, default)

openmlkit/pipeline.py ADDED Viewed

@@ -0,0 +1,486 @@
+# openmlkit/pipeline.py
+import os
+import cv2
+import numpy as np
+from .labelmap import LabelMap
+from .detector import TextDetector
+from .recognizer import TextRecognizer
+class OpenMLKitOCR:
+    def _ensure_model(self, models_dir, relative_path):
+        """
+        Returns a local path for the model file ``relative_path``.
+        If the file already exists under ``models_dir`` that path is returned.
+        Otherwise it is downloaded from the Hugging Face repository named by the
+        environment variable 'OPENMLKIT_MODEL_REPO' (defaults to
+        '0cve0/OpenMLKitOCR'): first through ``huggingface_hub`` (which returns
+        a path inside its own cache), then, if that fails for any reason,
+        through a plain HTTP download into ``models_dir``.
+        Raises FileNotFoundError when both download attempts fail.
+        """
+        local_path = os.path.join(models_dir, relative_path)
+        if os.path.exists(local_path):
+            return local_path
+        repo_id = os.environ.get("OPENMLKIT_MODEL_REPO", "0cve0/OpenMLKitOCR")
+        print(f"Model file '{relative_path}' not found locally at {local_path}. Downloading from Hugging Face ({repo_id})...")
+        try:
+            from huggingface_hub import hf_hub_download
+            # hf_hub_download supports subdirectories in filename
+            return hf_hub_download(repo_id=repo_id, filename=relative_path)
+        except Exception as hub_error:
+            # Deliberately broad: a missing package, an auth problem or a
+            # network error all lead to the same HTTP fallback. Report the
+            # cause instead of discarding it.
+            print(f"huggingface_hub download failed ({hub_error}); falling back to direct HTTP download.")
+        # Fallback to urllib.request
+        import urllib.request
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        url = f"https://huggingface.co/{repo_id}/resolve/main/{relative_path}"
+        # Download next to the target and rename afterwards, so an interrupted
+        # transfer never leaves a truncated file that later passes the
+        # os.path.exists() check above.
+        tmp_path = local_path + ".part"
+        try:
+            print(f"Downloading {url} to {local_path}...")
+            urllib.request.urlretrieve(url, tmp_path)
+            os.replace(tmp_path, local_path)
+            return local_path
+        except Exception as download_error:
+            try:
+                os.remove(tmp_path)
+            except OSError:
+                pass  # nothing was written, or it cannot be removed; not fatal
+            raise FileNotFoundError(
+                f"Required model file not found: {relative_path} at {local_path} and failed to download from {url}.\n"
+                f"Error: {download_error}\n"
+                f"Please verify internet connection or place the model file manually."
+            ) from download_error
+    def __init__(self, models_dir=None, lang='en'):
+        """
+        Builds the detector, recognizer and label map for one script.
+        lang: one of 'ru', 'zh', 'ko', 'ja', 'ar', 'he', 'th', 'ka', 'bn', 'ta',
+              'te', 'kn', 'ml', 'gu', 'en_translate' / 'latn_vi'; any other
+              value (including the default 'en') selects the 'en' model files.
+        models_dir: directory holding the model files; when None, the 'models'
+              subdirectory next to this module is used. Missing files are
+              fetched through ``_ensure_model``.
+        """
+        if models_dir is None:
+            models_dir = os.path.join(os.path.dirname(__file__), 'models')
+        # (language code, recognizer model, label map) - paths relative to models_dir
+        recognizer_table = (
+            ('ru', 'ru/recognizer_cyrl.tflite', 'ru/LabelMap_cyrl.pb'),
+            ('zh', 'zh/recognizer_hani.tflite', 'zh/recognizer_hani_label_map.pb'),
+            ('ko', 'ko/recognizer_kore.tflite', 'ko/recognizer_kore_label_map.pb'),
+            ('ja', 'ja/recognizer_jpan.tflite', 'ja/recognizer_jpan_label_map.pb'),
+            ('ar', 'ar/recognizer_arab_retrained.tflite', 'ar/recognizer_arab_label_map.pb'),
+            ('he', 'he/hebr.tflite', 'he/hebr_label_map.pb'),
+            ('th', 'th/recognizer_thai.tflite', 'th/recognizer_thai_label_map.pb'),
+            ('ka', 'ka/geor.tflite', 'ka/geor_label_map.pb'),
+            ('bn', 'bn/bede.tflite', 'bn/bede_label_map.pb'),
+            ('ta', 'ta/recognizer_taml.tflite', 'ta/recognizer_taml_label_map.pb'),
+            ('te', 'te/recognizer_telu.tflite', 'te/recognizer_telu_label_map.pb'),
+            ('kn', 'kn/recognizer_knda.tflite', 'kn/recognizer_knda_label_map.pb'),
+            ('ml', 'ml/recognizer_mlym.tflite', 'ml/recognizer_mlym_label_map.pb'),
+            ('gu', 'gu/gocr_tflite_recognizer_gujr.tflite', 'gu/gocr_tflite_recognizer_gujr_label_map.pb'),
+            ('en_translate', 'vi/gocr_tflite_recognizer_latn_vi.tflite', 'vi/gocr_tflite_recognizer_latn_vi_label_map.pb'),
+            ('latn_vi', 'vi/gocr_tflite_recognizer_latn_vi.tflite', 'vi/gocr_tflite_recognizer_latn_vi_label_map.pb'),
+        )
+        for code, rec_candidate, labelmap_candidate in recognizer_table:
+            if lang == code:
+                rec_rel, labelmap_rel = rec_candidate, labelmap_candidate
+                break
+        else:
+            # Unlisted codes fall back to the 'en' files
+            rec_rel, labelmap_rel = 'en/line_recognizer.fb', 'en/LabelMap.pb'
+        # Resolve (and if necessary download) the three files: detector first
+        det_model = self._ensure_model(models_dir, 'detector/rpn_detector.tflite')
+        rec_model = self._ensure_model(models_dir, rec_rel)
+        labelmap_path = self._ensure_model(models_dir, labelmap_rel)
+        self.label_map = LabelMap(labelmap_path)
+        self.detector = TextDetector(det_model)
+        self.recognizer = TextRecognizer(rec_model, self.label_map)
+        self.lang = lang
+        # Chunk-stitching parameters: (min exact/fuzzy match length, longest
+        # overlap scanned, largest offset tried). 'zh'/'ja'/'ko' get the
+        # tighter set; every other language gets the looser one.
+        if lang in ('zh', 'ja', 'ko'):
+            stitching = (2, 12, 3)
+        else:
+            stitching = (3, 22, 6)
+        self.min_match_len, self.max_scan, self.max_off = stitching
+    def _merge_overlapping_texts(self, t1, t2):
+        """Stitch two recognised chunks whose ends are expected to overlap.
+        Both inputs are stripped first; if either is empty the other one is
+        returned. Otherwise the following are tried in order:
+        1. Longest exact match between a suffix of t1 and a prefix of t2
+           (at least ``self.min_match_len`` characters).
+        2. Longest exact match between trailing words of t1 and leading words
+           of t2.
+        3. Case-insensitive fuzzy alignment: overlap windows of length L
+           (``min_match_len`` .. ``max_scan``), shifted by up to ``max_off``
+           characters from the end of t1 / start of t2, tolerating at most
+           ``max(1, L // 5)`` mismatches. Candidates are scored as
+           ``5 * matches - 6 * mismatches - 2 * (offsets)``; the first best
+           one wins if its score exceeds ``2 * min_match_len``.
+        4. Otherwise the two texts are joined with a single space.
+        """
+        left = t1.strip()
+        right = t2.strip()
+        if not left or not right:
+            return left or right
+        scan_limit = self.max_scan
+        min_len = self.min_match_len
+        offset_limit = self.max_off
+        # 1. Exact character overlap, longest first
+        for size in range(min(len(left), len(right)), min_len - 1, -1):
+            if left[-size:] == right[:size]:
+                return left + right[size:]
+        # 2. Exact word overlap, longest first
+        left_words = left.split()
+        right_words = right.split()
+        for count in reversed(range(1, min(len(left_words), len(right_words)) + 1)):
+            if left_words[-count:] == right_words[:count]:
+                return " ".join(left_words[:-count] + right_words)
+        # 3. Fuzzy overlap
+        top_score = -1
+        winner = None  # (end of kept part of left, start of kept part of right)
+        for size in range(min_len, min(len(left), len(right), scan_limit) + 1):
+            tolerance = max(1, size // 5)
+            left_shifts = min(offset_limit + 1, len(left) - size + 1)
+            right_shifts = min(offset_limit + 1, len(right) - size + 1)
+            for off1 in range(left_shifts):
+                window_end = len(left) - off1
+                tail = left[window_end - size: window_end].lower()
+                for off2 in range(right_shifts):
+                    head = right[off2: off2 + size].lower()
+                    hits = sum(1 for a, b in zip(tail, head) if a == b)
+                    misses = size - hits
+                    if misses > tolerance:
+                        continue
+                    score = hits * 5 - misses * 6 - (off1 + off2) * 2
+                    if score > top_score:
+                        top_score = score
+                        # drop the (possibly garbled) tail of left beyond the
+                        # window, and the overlapping head of right
+                        winner = (window_end, off2 + size)
+        if winner is not None and top_score > min_len * 2:
+            keep_until, resume_at = winner
+            return left[:keep_until] + right[resume_at:]
+        # 4. No usable overlap
+        return left + " " + right
+    def _get_tiles(self, orig_w, orig_h, tile_size=512, overlap=128):
+        """Cover an ``orig_w`` x ``orig_h`` image with overlapping tiles.
+        Tiles advance by ``tile_size - overlap`` pixels along each axis. The
+        last tile of a row/column is shifted back so that it ends exactly at
+        the image border and keeps the full ``tile_size`` whenever the image is
+        large enough.
+        Returns a list of ``(start_x, start_y, end_x, end_y)`` tuples in
+        row-major order (empty if either dimension is 0).
+        Raises ValueError if ``overlap >= tile_size``, which would otherwise
+        make the scan loop forever on images larger than one tile.
+        """
+        stride = tile_size - overlap
+        if stride <= 0:
+            raise ValueError("overlap must be smaller than tile_size")
+        def axis_spans(length):
+            # (start, end) spans covering [0, length) along one axis
+            spans = []
+            pos = 0
+            while pos < length:
+                end = min(length, pos + tile_size)
+                spans.append((max(0, end - tile_size), end))
+                if end == length:
+                    break
+                pos += stride
+            return spans
+        x_spans = axis_spans(orig_w)
+        return [
+            (start_x, start_y, end_x, end_y)
+            for start_y, end_y in axis_spans(orig_h)
+            for start_x, end_x in x_spans
+        ]
+    def _split_block_into_lines(self, block_gray, threshold_ratio=0.01, min_line_height=8):
+        """Split a grayscale block into text lines using a horizontal projection.
+        The block is binarised with Otsu's threshold (inverted when the mean
+        intensity is above 127, so that the minority "ink" pixels become 255).
+        Rows whose ink sum exceeds ``max(1000, width * 255 * threshold_ratio)``
+        are considered text; consecutive text rows form a line.
+        Returns a list of ``(y_start, y_end)`` row ranges (end exclusive) that
+        are at least ``min_line_height`` rows tall, each padded by 2 rows and
+        clipped to the block.
+        """
+        h_block, w_block = block_gray.shape
+        if np.mean(block_gray) > 127:
+            mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+        else:
+            mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
+        _, ink = cv2.threshold(block_gray, 0, 255, mode)
+        row_ink = np.sum(ink, axis=1)
+        cutoff = max(1000, w_block * 255 * threshold_ratio)
+        is_text_row = row_ink > cutoff
+        # Bracket the mask with False so every run of text rows has both a
+        # rising and a falling edge; edges then alternate start, end, start, ...
+        bracketed = np.concatenate(([False], is_text_row, [False]))
+        edges = np.flatnonzero(bracketed[1:] != bracketed[:-1])
+        lines = []
+        for first, stop in zip(edges[0::2].tolist(), edges[1::2].tolist()):
+            if (stop - first) < min_line_height:
+                continue
+            lines.append((max(0, first - 2), min(h_block, stop + 2)))
+        return lines
+    def _find_text_horizontal_bounds(self, line_gray, threshold_ratio=0.02):
+        """Find the horizontal extent of the text inside a grayscale line crop.
+        The crop is binarised with Otsu's threshold (inverted when the mean
+        intensity is above 127). Columns whose ink sum exceeds
+        ``height * 255 * threshold_ratio`` count as text.
+        Returns ``(x_min, x_max)``: the first and last text column, widened by
+        8 pixels and clipped to the crop, or ``(0, width)`` when no column
+        qualifies.
+        """
+        rows, cols = line_gray.shape
+        polarity = cv2.THRESH_BINARY_INV if np.mean(line_gray) > 127 else cv2.THRESH_BINARY
+        _, ink = cv2.threshold(line_gray, 0, 255, polarity + cv2.THRESH_OTSU)
+        column_ink = np.sum(ink, axis=0)
+        text_cols = np.where(column_ink > rows * 255 * threshold_ratio)[0]
+        if len(text_cols) == 0:
+            return 0, cols
+        # Small padding on both sides to prevent character clipping
+        left = max(0, text_cols[0] - 8)
+        right = min(cols, text_cols[-1] + 8)
+        return left, right
+    def _are_boxes_close(self, b1, b2):
+        """Return True when two boxes should be clustered into the same block.
+        Boxes are dicts with 'x_min', 'x_max' and 'y_center'. They are close
+        when their centres are less than 40 px apart vertically and the
+        horizontal gap between them (0 if they overlap) is less than 120 px.
+        """
+        vertical_gap = abs(b1['y_center'] - b2['y_center'])
+        if not vertical_gap < 40:
+            return False
+        # The horizontal limit is deliberately generous: the detector emits one
+        # anchor per 16 px cell of its 256 px input, and those anchors spread
+        # out once mapped back onto a larger tile, so a tight limit would split
+        # a single text line into several blocks.
+        horizontal_gap = max(0, b1['x_min'] - b2['x_max'], b2['x_min'] - b1['x_max'])
+        if not horizontal_gap < 120:
+            return False
+        return True
+    def run(self, img, score_threshold=0.35):
+        """
+        Runs the OCR pipeline on the input image.
+        img: Can be a file path (str) or a numpy array (RGB or Grayscale).
+        score_threshold: Float threshold for text detection.
+        Returns a list of dicts, sorted top-to-bottom:
+        [
+            {
+                "box": [x_min, y_min, x_max, y_max],  # Bounding box in original image pixels
+                "text": "..."  # Recognized text string
+            },
+            ...
+        ]
+        Raises:
+            FileNotFoundError: img is a path that does not exist.
+            ValueError: img is an existing file that OpenCV cannot decode.
+            TypeError: img is neither a str nor a numpy array.
+        """
+        # Load image if file path is provided
+        if isinstance(img, str):
+            if not os.path.exists(img):
+                raise FileNotFoundError(f"Image file not found: {img}")
+            img_bgr = cv2.imread(img)
+            if img_bgr is None:
+                # cv2.imread signals an unreadable/unsupported file by returning
+                # None instead of raising; fail here with a clear message.
+                raise ValueError(f"Image file could not be decoded: {img}")
+            img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+        elif isinstance(img, np.ndarray):
+            if len(img.shape) == 3:
+                img_rgb = img
+            else:
+                img_rgb = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+        else:
+            raise TypeError("img must be a file path (str) or numpy array")
+        orig_h, orig_w = img_rgb.shape[:2]
+        img_gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
+        # 1. Determine tiles
+        if orig_w <= 512 and orig_h <= 512:
+            tiles = [(0, 0, orig_w, orig_h)]
+        else:
+            tiles = self._get_tiles(orig_w, orig_h, tile_size=512, overlap=128)
+        # 2. Run detector on each tile and map box predictions back to global space
+        global_boxes = []
+        for tx_min, ty_min, tx_max, ty_max in tiles:
+            tile_crop = img_gray[ty_min:ty_max, tx_min:tx_max]
+            # detect_raw resizes the tile to 256x256 itself
+            cls_probs, dequantized = self.detector.detect_raw(tile_crop)
+            scale_x = (tx_max - tx_min) / 256.0
+            scale_y = (ty_max - ty_min) / 256.0
+            # Active cells in row-major order, as plain Python ints
+            for y, x in np.argwhere(cls_probs > score_threshold).tolist():
+                vals = dequantized[0, y, x, :]
+                cy = y * 16 + 8
+                cx = x * 16 + 8
+                y_center_local = cy + vals[2] * 16
+                # Same fixed 14 px line height (in 256 space) as TextDetector.detect
+                h_local = 14.0
+                y_min_local = y_center_local - h_local / 2
+                y_max_local = y_center_local + h_local / 2
+                x_min_local = cx + vals[3] * 16
+                x_max_local = cx + vals[1] * 16
+                # Map to global coordinates
+                y_min_global = ty_min + y_min_local * scale_y
+                y_max_global = ty_min + y_max_local * scale_y
+                global_boxes.append({
+                    'x_min': tx_min + x_min_local * scale_x,
+                    'x_max': tx_min + x_max_local * scale_x,
+                    'y_min': y_min_global,
+                    'y_max': y_max_global,
+                    'y_center': (y_min_global + y_max_global) / 2
+                })
+        if not global_boxes:
+            return []
+        # 3. Cluster predictions into global blocks (connected components of
+        #    the "_are_boxes_close" relation). Only the extent of each block is
+        #    used below, so the traversal order inside a component is irrelevant.
+        blocks = []
+        visited = [False] * len(global_boxes)
+        for seed_idx in range(len(global_boxes)):
+            if visited[seed_idx]:
+                continue
+            visited[seed_idx] = True
+            pending = [seed_idx]
+            block = []
+            while pending:
+                curr_box = global_boxes[pending.pop()]
+                block.append(curr_box)
+                for nbr_idx, nbr_box in enumerate(global_boxes):
+                    if not visited[nbr_idx] and self._are_boxes_close(curr_box, nbr_box):
+                        visited[nbr_idx] = True
+                        pending.append(nbr_idx)
+            blocks.append(block)
+        # Sort blocks top-to-bottom
+        blocks.sort(key=lambda b: min(box['y_min'] for box in b))
+        # Recognizer geometry is constant for the whole run. The smaller of
+        # the two spatial input dimensions is treated as the height and the
+        # larger as the width.
+        rec_shape = self.recognizer.input_details[0]['shape']
+        rec_dim1, rec_dim2 = rec_shape[1], rec_shape[2]
+        rec_target_h = min(rec_dim1, rec_dim2)
+        rec_target_w = max(rec_dim1, rec_dim2)
+        overlap_ratio = 0.40 if self.lang in ('zh', 'ja', 'ko') else 0.55
+        results = []
+        # 4. Extract lines and run OCR using horizontal chunking
+        for block in blocks:
+            min_x = min(box['x_min'] for box in block)
+            max_x = max(box['x_max'] for box in block)
+            min_y = min(box['y_min'] for box in block)
+            max_y = max(box['y_max'] for box in block)
+            # Pad block bounds slightly
+            min_x = max(0, int(min_x) - 16)
+            max_x = min(orig_w, int(max_x) + 16)
+            min_y = max(0, int(min_y) - 8)
+            max_y = min(orig_h, int(max_y) + 8)
+            block_crop = img_gray[min_y:max_y, min_x:max_x]
+            if block_crop.size == 0:
+                continue
+            for y_min_rel, y_max_rel in self._split_block_into_lines(block_crop):
+                y_min_line = min_y + y_min_rel
+                y_max_line = min_y + y_max_rel
+                line_gray = img_gray[y_min_line:y_max_line, min_x:max_x]
+                x_min_rel, x_max_rel = self._find_text_horizontal_bounds(line_gray)
+                x_min_line = max(0, int(min_x + x_min_rel))
+                x_max_line = min(orig_w, int(min_x + x_max_rel))
+                # Skip empty lines
+                if (x_max_line - x_min_line) <= 0 or (y_max_line - y_min_line) <= 0:
+                    continue
+                width = x_max_line - x_min_line
+                line_h = y_max_line - y_min_line
+                # Widest chunk the recognizer can take without squishing the
+                # text: keeping the aspect ratio requires
+                #   chunk_w * (target_h / line_h) <= target_w
+                # => chunk_w <= line_h * target_w / target_h
+                chunk_w = max(20, int(line_h * rec_target_w / rec_target_h) - 4)
+                overlap = int(chunk_w * overlap_ratio)
+                if width <= chunk_w:
+                    crop = img_rgb[y_min_line:y_max_line, x_min_line:x_max_line]
+                    text = self.recognizer.recognize(crop)
+                else:
+                    # Slide a chunk_w-wide window left to right; the last
+                    # chunk is clipped to the line end.
+                    text = ""
+                    curr_x = x_min_line
+                    while curr_x < x_max_line:
+                        end_x = min(x_max_line, curr_x + chunk_w)
+                        crop = img_rgb[y_min_line:y_max_line, curr_x:end_x]
+                        chunk_text = self.recognizer.recognize(crop)
+                        text = self._merge_overlapping_texts(text, chunk_text)
+                        if end_x == x_max_line:
+                            break
+                        curr_x = end_x - overlap
+                if self.lang in ('ar', 'he'):
+                    # The recognised string is reversed for these two languages
+                    text = text[::-1]
+                results.append({
+                    "box": [x_min_line, y_min_line, x_max_line, y_max_line],
+                    "text": text
+                })
+        # Sort results top-to-bottom based on y_center of box
+        results.sort(key=lambda r: (r['box'][1] + r['box'][3]) / 2)
+        return results

openmlkit/recognizer.py ADDED Viewed

@@ -0,0 +1,108 @@
+# openmlkit/recognizer.py
+import numpy as np
+import cv2
+import tflite_runtime.interpreter as tflite
+class TextRecognizer:
+    """TFLite-backed line recognizer with greedy CTC decoding.
+    A cropped text line is scaled to the model's input height, padded (or, as
+    a last resort, squeezed) to the input width, run through the interpreter
+    and decoded into a string through ``label_map``.
+    """
+    def __init__(self, model_path, label_map):
+        """Load the TFLite model at ``model_path``; ``label_map`` must offer
+        ``get(index, default)`` returning the string for a class index."""
+        self.interpreter = tflite.Interpreter(model_path=model_path)
+        self.interpreter.allocate_tensors()
+        self.input_details = self.interpreter.get_input_details()
+        self.output_details = self.interpreter.get_output_details()
+        self.label_map = label_map
+    @staticmethod
+    def _dequantize(output_data, quantization):
+        """Convert a raw output tensor to float32 using ``(scale, zero_point)``.
+        A scale of 0 is how TFLite reports a tensor that is not quantized; in
+        that case the values are returned unchanged (as float32) instead of
+        being multiplied by zero.
+        """
+        scale, zero_point = quantization
+        values = output_data.astype(np.float32)
+        if scale == 0:
+            return values
+        return (values - zero_point) * scale
+    def recognize(self, crop_img):
+        """
+        Recognizes the text in a cropped image (can be color or grayscale).
+        Returns the decoded text string ("" for an empty crop).
+        """
+        if crop_img is None or crop_img.size == 0:
+            # Nothing to scale: avoids a division by zero further down
+            return ""
+        if len(crop_img.shape) == 3:
+            crop_gray = cv2.cvtColor(crop_img, cv2.COLOR_RGB2GRAY)
+        else:
+            crop_gray = crop_img
+        # Determine model layout and target dimensions: the larger of the two
+        # spatial input dimensions is taken to be the width.
+        shape = self.input_details[0]['shape']
+        dim1, dim2 = shape[1], shape[2]
+        transpose = dim1 > dim2  # True: [batch, width, height, channels] layout
+        if transpose:
+            target_w, target_h = dim1, dim2
+        else:
+            target_h, target_w = dim1, dim2
+        # Preserve aspect ratio: scale by height, pad width with background.
+        # Directly squishing wide text into target_w causes garbled output.
+        h_src, w_src = crop_gray.shape
+        scale = target_h / h_src
+        # At least one column, otherwise cv2.resize rejects the target size
+        new_w = max(1, int(round(w_src * scale)))
+        if new_w <= target_w:
+            # Scale to target height, then right-pad with background colour
+            interp_flag = cv2.INTER_AREA if scale < 1.0 else cv2.INTER_CUBIC
+            resized = cv2.resize(crop_gray, (new_w, target_h), interpolation=interp_flag)
+            # Estimate the background colour as the median of the four corners
+            corners = [crop_gray[0, 0], crop_gray[0, -1],
+                       crop_gray[-1, 0], crop_gray[-1, -1]]
+            bg = int(np.median(corners))
+            canvas = np.full((target_h, target_w), bg, dtype=np.uint8)
+            canvas[:, :new_w] = resized
+            crop_resized = canvas
+        else:
+            # Chunk is still wider than the model window - squish as last resort
+            crop_resized = cv2.resize(crop_gray, (target_w, target_h),
+                                      interpolation=cv2.INTER_AREA)
+        # Reshape to standard height-width layout
+        input_data = crop_resized.reshape((1, target_h, target_w, 1))
+        if transpose:
+            input_data = np.transpose(input_data, (0, 2, 1, 3))
+        self.interpreter.set_tensor(self.input_details[0]['index'], input_data)
+        self.interpreter.invoke()
+        # Use the first 3-D output tensor as the CTC logits, falling back to
+        # output 0 when there is none.
+        output_detail = next(
+            (out for out in self.output_details if len(out['shape']) == 3),
+            self.output_details[0],
+        )
+        output_data = self.interpreter.get_tensor(output_detail['index'])
+        dequantized = self._dequantize(output_data, output_detail['quantization'])
+        # Decode text
+        return self._ctc_decode(dequantized)
+    def _ctc_decode(self, output_tensor):
+        """Greedy CTC decode of a ``[1, T, V]`` tensor into a string.
+        Takes the arg-max class per time step, collapses consecutive repeats
+        and drops the blank, which is taken to be the last class (V - 1).
+        Indices missing from the label map contribute nothing.
+        """
+        best_paths = np.argmax(output_tensor[0], axis=-1)
+        blank_token = output_tensor.shape[-1] - 1
+        # Keep a step when it is not blank and differs from the previous step
+        keep = best_paths != blank_token
+        keep[1:] &= best_paths[1:] != best_paths[:-1]
+        decoded_indices = best_paths[keep].tolist()
+        return "".join(self.label_map.get(idx, "") for idx in decoded_indices)

openmlkitocr-1.0.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,88 @@
+Metadata-Version: 2.4
+Name: openmlkitOCR
+Version: 1.0.0
+Summary: A lightweight offline OCR library using Google ML Kit TFLite models
+Author: 0cve0
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Topic :: Scientific/Engineering :: Image Recognition
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy<2.0.0
+Requires-Dist: opencv-python
+Requires-Dist: tflite-runtime
+Requires-Dist: huggingface_hub
+Dynamic: license-file
+# OpenMLkit OCR
+A lightweight, offline Python OCR (Optical Character Recognition) library utilizing highly optimized, mobile-ready Google ML Kit TFLite models. It performs text detection using a Region Proposal Network (RPN) and line recognition using a CRNN-CTC architecture.
+---
+## Features
+- **Fully Offline:** Runs entirely locally; no API keys or internet connection are required after the models have been downloaded.
+- **Multilingual Support:** Supports 15+ languages and scripts (Cyrillic/Russian, Latin/English, Chinese, Japanese, Korean, Arabic, Hebrew, and various Indian scripts).
+- **Auto-downloading:** Automatically downloads and caches required models from Hugging Face if they are not present locally.
+- **High Quality Stitching:** Handles wide text lines without squishing by dividing them into overlapping chunks and merging them using fuzzy suffix-prefix alignment.
+---
+## Installation
+Install the package directly using pip:
+```bash
+pip install openmlkitOCR
+```
+Or install from source:
+```bash
+git clone https://github.com/0cve0/OpenMLkitOCR.git
+cd OpenMLkitOCR
+pip install -e .
+```
+---
+## Quick Start
+```python
+import os
+import cv2
+from openmlkit import OpenMLKitOCR
+# Configure Hugging Face model source (or use defaults)
+os.environ["OPENMLKIT_MODEL_REPO"] = "0cve0/OpenMLKitOCR"
+# Initialize OCR pipeline for Cyrillic (Russian) text
+ocr = OpenMLKitOCR(lang='ru')
+# Load image
+img = cv2.imread("scratch/russian_test.png")
+# Run OCR (detect and recognize text)
+results = ocr.run(img, score_threshold=0.35)
+# Output results
+for r in results:
+    print(f"Box: {r['box']} -> Text: {r['text']}")
+```
+---
+## Project Structure
+- `openmlkit/` - Core Python package directory.
+  - `detector.py` - RPN text detection logic.
+  - `recognizer.py` - CRNN text recognition logic.
+  - `labelmap.py` - Binary protobuf label map parsing.
+  - `pipeline.py` - OCR pipeline joining detection, tiling, and recognition.
+---
+## License
+This project is licensed under the Apache 2.0 License. The model weights are subject to Google's terms and licenses for ML Kit.

openmlkitocr-1.0.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+openmlkit/__init__.py,sha256=j_siZuU0kBd1JOEDII5_O3WuR0vXsU_YPynL5b8BeW4,261
+openmlkit/detector.py,sha256=OomIXweHBQjgUkCevdc1ihljWD2iC84ysfIC7CHtgYg,4454
+openmlkit/labelmap.py,sha256=sz8zq3_iFqQNOUtdLLnYMCZ-nkH_G1BLRcpCePDpM_Y,3489
+openmlkit/pipeline.py,sha256=fw6b7sEpv1f4kHd56u5cuYl9Occ0qzUmbZX77Gqpk84,20045
+openmlkit/recognizer.py,sha256=Rn_6nCppkSxTFQgDiKeUQCrCbXAvm5CyBsKm67CracI,4122
+openmlkitocr-1.0.0.dist-info/licenses/LICENSE,sha256=mPHeCzLulNZK-e1kg1dPV6GH6hoE5hgOu5ETNc3KM5U,10172
+openmlkitocr-1.0.0.dist-info/METADATA,sha256=4PtDkN9SaeR1i_EzSLhdb8f76HZOvGwc2gjOCDs-b_c,2660
+openmlkitocr-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+openmlkitocr-1.0.0.dist-info/top_level.txt,sha256=Q-Q8FQ77Bhc6BoZ8NX3Tjoe8rMsUmxv815_uqhmSwJc,10
+openmlkitocr-1.0.0.dist-info/RECORD,,

openmlkitocr-1.0.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

openmlkitocr-1.0.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,176 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that you distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS

openmlkitocr-1.0.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ openmlkit