PyPI - mathcraft-ocr - Versions diffs - 0.1.0__py3-none-any.whl - Mend

mathcraft-ocr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

mathcraft_ocr/__init__.py +39 -0
mathcraft_ocr/__main__.py +6 -0
mathcraft_ocr/adapters/__init__.py +13 -0
mathcraft_ocr/adapters/common.py +46 -0
mathcraft_ocr/adapters/formula_detector.py +131 -0
mathcraft_ocr/adapters/formula_recognizer.py +151 -0
mathcraft_ocr/adapters/text_detector.py +57 -0
mathcraft_ocr/adapters/text_recognizer.py +121 -0
mathcraft_ocr/api.py +14 -0
mathcraft_ocr/cache.py +135 -0
mathcraft_ocr/cli.py +110 -0
mathcraft_ocr/debug_blocks.py +202 -0
mathcraft_ocr/doctor.py +50 -0
mathcraft_ocr/downloader.py +97 -0
mathcraft_ocr/errors.py +21 -0
mathcraft_ocr/hardware.py +203 -0
mathcraft_ocr/image.py +33 -0
mathcraft_ocr/layout.py +892 -0
mathcraft_ocr/manifest.py +89 -0
mathcraft_ocr/manifests/models.v1.json +89 -0
mathcraft_ocr/providers.py +80 -0
mathcraft_ocr/results.py +53 -0
mathcraft_ocr/runtime.py +535 -0
mathcraft_ocr/serialization.py +120 -0
mathcraft_ocr/worker.py +131 -0
mathcraft_ocr-0.1.0.dist-info/METADATA +184 -0
mathcraft_ocr-0.1.0.dist-info/RECORD +31 -0
mathcraft_ocr-0.1.0.dist-info/WHEEL +5 -0
mathcraft_ocr-0.1.0.dist-info/entry_points.txt +3 -0
mathcraft_ocr-0.1.0.dist-info/licenses/LICENSE +21 -0
mathcraft_ocr-0.1.0.dist-info/top_level.txt +1 -0

mathcraft_ocr/__init__.py ADDED Viewed

@@ -0,0 +1,39 @@
+# coding: utf-8
+from __future__ import annotations
+# Package version string; it is also exported through __all__.
+__version__ = "0.1.0"
+# Public names of the package. Apart from __version__, each name is resolved
+# on first access by the module-level __getattr__ in this file.
+__all__ = [
+    "DoctorReport",
+    "FormulaRecognitionResult",
+    "MathCraftBlock",
+    "MathCraftError",
+    "MathCraftRuntime",
+    "MixedRecognitionResult",
+    "OCRRegion",
+    "__version__",
+    "run_doctor",
+]
+def __getattr__(name: str) -> object:
+    """Resolve the package's public attributes lazily.
+
+    The submodule that defines ``name`` is imported only when the attribute
+    is requested, so importing the package does not import it eagerly.
+
+    Raises:
+        AttributeError: If ``name`` is not one of the lazily exported names.
+    """
+    api_exports = (
+        "FormulaRecognitionResult",
+        "MathCraftBlock",
+        "MathCraftRuntime",
+        "MixedRecognitionResult",
+        "OCRRegion",
+    )
+    doctor_exports = ("DoctorReport", "run_doctor")
+    if name == "MathCraftError":
+        from .errors import MathCraftError
+        return MathCraftError
+    if name in doctor_exports:
+        from . import doctor as provider
+    elif name in api_exports:
+        from . import api as provider
+    else:
+        raise AttributeError(name)
+    return getattr(provider, name)

mathcraft_ocr/__main__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# coding: utf-8
+from .cli import main
+# Run the CLI entry point and hand its return value to SystemExit, which
+# uses it as the process exit status.
+raise SystemExit(main())

mathcraft_ocr/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# coding: utf-8
+from .formula_detector import warmup_formula_detector
+from .formula_recognizer import warmup_formula_recognizer
+from .text_detector import warmup_text_detector
+from .text_recognizer import warmup_pp_text_recognizer
+# Warm-up entry points re-exported from the adapter submodules imported above.
+__all__ = [
+    "warmup_formula_detector",
+    "warmup_formula_recognizer",
+    "warmup_text_detector",
+    "warmup_pp_text_recognizer",
+]

mathcraft_ocr/adapters/common.py ADDED Viewed

@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import annotations
+import importlib
+from functools import lru_cache
+from pathlib import Path
+from ..providers import GPU_PROVIDER_NAMES, ProviderInfo
+def _ort():
+    """Import ``onnxruntime`` on demand and return the module object."""
+    module = importlib.import_module("onnxruntime")
+    return module
+def session_providers(provider_info: ProviderInfo) -> list[str]:
+    """Choose the execution-provider list to request for a new session.
+
+    Returns:
+        ``[active, "CPUExecutionProvider"]`` when the active provider is a GPU
+        provider and the CPU provider is available; ``["CPUExecutionProvider"]``
+        when only the CPU provider qualifies; otherwise every available
+        provider, unchanged.
+    """
+    cpu_provider = "CPUExecutionProvider"
+    available = list(provider_info.available_providers)
+    active = provider_info.active_provider
+    if cpu_provider not in available:
+        # Without a CPU provider there is nothing to prefer; pass through.
+        return available
+    if active and active in GPU_PROVIDER_NAMES:
+        return [active, cpu_provider]
+    return [cpu_provider]
+def create_session(model_path: str | Path, provider_info: ProviderInfo):
+    """Return a (cached) ONNX Runtime session for ``model_path``.
+
+    The path is resolved to an absolute string and combined with the provider
+    tuple from ``session_providers`` to form the cache key.
+
+    Raises:
+        RuntimeError: If a GPU provider is active in ``provider_info`` but the
+            session does not report that provider.
+    """
+    resolved_path = str(Path(model_path).resolve())
+    requested = tuple(session_providers(provider_info))
+    session = _create_session_cached(resolved_path, requested)
+    actual = list(session.get_providers() or [])
+    active = provider_info.active_provider
+    gpu_requested = bool(active) and active in GPU_PROVIDER_NAMES
+    if not gpu_requested or active in actual:
+        return session
+    raise RuntimeError(
+        f"requested ONNX GPU provider {active}, but session providers are {actual}"
+    )
+@lru_cache(maxsize=16)
+def _create_session_cached(model_path: str, providers: tuple[str, ...]):
+    """Build an ``InferenceSession``; memoised per (model_path, providers).
+
+    ``providers`` is a tuple so the argument is hashable for ``lru_cache``;
+    it is converted back to a list for ONNX Runtime.
+    """
+    runtime = _ort()
+    provider_list = list(providers)
+    return runtime.InferenceSession(model_path, providers=provider_list)
+def clear_session_cache() -> None:
+    """Drop every session memoised by ``_create_session_cached``."""
+    _create_session_cached.cache_clear()

mathcraft_ocr/adapters/formula_detector.py ADDED Viewed

@@ -0,0 +1,131 @@
+# coding: utf-8
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+import cv2
+import numpy as np
+from .common import create_session
+@dataclass(frozen=True)
+class FormulaBox:
+    """Immutable record describing one detected formula region."""
+    # Four (x, y) corner points. detect_formula_boxes fills them in the order
+    # (x1, y1), (x2, y1), (x2, y2), (x1, y2).
+    box: tuple[
+        tuple[float, float],
+        tuple[float, float],
+        tuple[float, float],
+        tuple[float, float],
+    ]
+    # Score of the highest-scoring class for this detection.
+    score: float
+    # Class name ("embedding" or "isolated" in detect_formula_boxes), or the
+    # numeric class id rendered as a string when the id has no known name.
+    label: str
+def warmup_formula_detector(model_dir: str | Path, provider_info) -> None:
+    """Create (and thereby cache) the session for the formula detector.
+
+    The model is the first ``*mfd*.onnx`` file, in sorted order, directly
+    under ``model_dir``.
+
+    Raises:
+        FileNotFoundError: If no matching file exists.
+    """
+    root = Path(model_dir)
+    matches = sorted(root.glob("*mfd*.onnx"))
+    if matches:
+        create_session(matches[0], provider_info)
+        return
+    raise FileNotFoundError(f"no mfd onnx file found under {root}")
+def _letterbox(image: np.ndarray, target_size: int = 768) -> tuple[np.ndarray, float, tuple[float, float]]:
+    """Fit ``image`` into a square canvas while keeping its aspect ratio.
+
+    The image is scaled so its longer side equals ``target_size``, then pasted
+    centred onto a ``target_size`` x ``target_size`` 3-channel uint8 canvas
+    filled with the value 114.
+
+    Args:
+        image: Array whose first two dimensions are (height, width). The
+            paste into the 3-channel canvas assumes a 3-channel image.
+        target_size: Side length of the square canvas in pixels.
+
+    Returns:
+        ``(canvas, scale, (left, top))`` where ``scale`` is the resize factor
+        and ``(left, top)`` is the pixel offset of the pasted image.
+    """
+    height, width = image.shape[:2]
+    scale = min(target_size / width, target_size / height)
+    # For extreme aspect ratios the short side can round down to 0, which
+    # cv2.resize rejects as an empty target size; keep at least one pixel.
+    new_w = max(1, int(round(width * scale)))
+    new_h = max(1, int(round(height * scale)))
+    resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+    canvas = np.full((target_size, target_size, 3), 114, dtype=np.uint8)
+    pad_x = (target_size - new_w) / 2
+    pad_y = (target_size - new_h) / 2
+    # Subtracting 0.1 before rounding sends an exact .5 padding to the lower
+    # integer, so an odd total padding puts the extra pixel on the far side.
+    left = int(round(pad_x - 0.1))
+    top = int(round(pad_y - 0.1))
+    canvas[top : top + new_h, left : left + new_w] = resized
+    return canvas, scale, (float(left), float(top))
+def _nms_xyxy(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float) -> list[int]:
+    """Greedy non-maximum suppression over ``(x1, y1, x2, y2)`` boxes.
+
+    Boxes are visited from highest to lowest score; each kept box removes the
+    remaining boxes whose IoU with it is greater than ``iou_threshold``.
+
+    Returns:
+        Indices into ``boxes`` of the kept boxes, in descending score order.
+    """
+    if len(boxes) == 0:
+        return []
+    left, top, right, bottom = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
+    areas = np.maximum(0.0, right - left) * np.maximum(0.0, bottom - top)
+    remaining = scores.argsort()[::-1]
+    kept: list[int] = []
+    while remaining.size > 0:
+        best = int(remaining[0])
+        kept.append(best)
+        others = remaining[1:]
+        if others.size == 0:
+            break
+        overlap_w = np.maximum(
+            0.0,
+            np.minimum(right[best], right[others]) - np.maximum(left[best], left[others]),
+        )
+        overlap_h = np.maximum(
+            0.0,
+            np.minimum(bottom[best], bottom[others]) - np.maximum(top[best], top[others]),
+        )
+        overlap = overlap_w * overlap_h
+        combined = areas[best] + areas[others] - overlap
+        # A zero union (degenerate boxes) yields an IoU of 0 instead of dividing by zero.
+        ratio = np.divide(overlap, combined, out=np.zeros_like(overlap), where=combined > 0)
+        remaining = others[ratio <= iou_threshold]
+    return kept
+def detect_formula_boxes(
+    image_rgb: np.ndarray,
+    model_dir: str | Path,
+    provider_info,
+    *,
+    confidence_threshold: float = 0.25,
+    iou_threshold: float = 0.45,
+    input_size: int = 768,
+) -> tuple[FormulaBox, ...]:
+    """Detect formula regions in ``image_rgb`` with the ``*mfd*.onnx`` model.
+
+    The image is letterboxed to ``input_size``, scaled to [0, 1] and run
+    through the model. After transposing, each prediction row is treated as
+    ``(cx, cy, w, h, class scores...)``. Rows below ``confidence_threshold``
+    are dropped, boxes are mapped back to the original image, clipped to its
+    bounds and filtered with class-agnostic NMS.
+
+    Returns:
+        The surviving detections as ``FormulaBox`` records; an empty tuple
+        when nothing passes the threshold.
+
+    Raises:
+        FileNotFoundError: If no ``*mfd*.onnx`` file exists under ``model_dir``.
+    """
+    root = Path(model_dir)
+    model_files = sorted(root.glob("*mfd*.onnx"))
+    if not model_files:
+        raise FileNotFoundError(f"no mfd onnx file found under {root}")
+    session = create_session(model_files[0], provider_info)
+
+    padded, scale, (pad_x, pad_y) = _letterbox(image_rgb, input_size)
+    # HWC uint8 -> NCHW float32 in [0, 1].
+    model_input = (
+        padded.astype(np.float32).transpose(2, 0, 1)[np.newaxis, ...] / 255.0
+    )
+    input_name = session.get_inputs()[0].name
+    raw = session.run(None, {input_name: model_input})[0]
+    preds = np.asarray(raw[0]).T
+    # Need the four box values plus at least two class scores per row.
+    if preds.size == 0 or preds.shape[1] < 6:
+        return ()
+
+    class_scores = preds[:, 4:]
+    class_ids = np.argmax(class_scores, axis=1)
+    scores = class_scores[np.arange(len(class_scores)), class_ids]
+    confident = scores >= confidence_threshold
+    if not np.any(confident):
+        return ()
+    xywh = preds[:, :4][confident]
+    class_ids = class_ids[confident]
+    scores = scores[confident]
+
+    # Centre/size -> corner coordinates in letterboxed space.
+    cx, cy, bw, bh = xywh[:, 0], xywh[:, 1], xywh[:, 2], xywh[:, 3]
+    boxes = np.stack([cx - bw / 2, cy - bh / 2, cx + bw / 2, cy + bh / 2], axis=1)
+    # Undo the letterbox padding and scaling, then clip to the source image.
+    height, width = image_rgb.shape[:2]
+    boxes[:, [0, 2]] = (boxes[:, [0, 2]] - pad_x) / scale
+    boxes[:, [1, 3]] = (boxes[:, [1, 3]] - pad_y) / scale
+    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, width)
+    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, height)
+
+    labels = ("embedding", "isolated")
+    detections: list[FormulaBox] = []
+    for index in _nms_xyxy(boxes, scores, iou_threshold):
+        x1, y1, x2, y2 = boxes[index].tolist()
+        class_id = int(class_ids[index])
+        # Unknown class ids fall back to their numeric value as text.
+        label = labels[class_id] if class_id < len(labels) else str(class_id)
+        detections.append(
+            FormulaBox(
+                box=((x1, y1), (x2, y1), (x2, y2), (x1, y2)),
+                score=float(scores[index]),
+                label=label,
+            )
+        )
+    return tuple(detections)

mathcraft_ocr/adapters/formula_recognizer.py ADDED Viewed

@@ -0,0 +1,151 @@
+# coding: utf-8
+from __future__ import annotations
+import json
+from functools import lru_cache
+from pathlib import Path
+import numpy as np
+from PIL import Image
+from .common import create_session
+def _softmax(logits: np.ndarray) -> np.ndarray:
+    """Softmax over the last axis, shifted by the row maximum for stability."""
+    row_max = np.max(logits, axis=-1, keepdims=True)
+    weights = np.exp(logits - row_max)
+    normaliser = np.sum(weights, axis=-1, keepdims=True)
+    return weights / normaliser
+@lru_cache(maxsize=8)
+def _load_processor(model_dir: str):
+    """Build a ``TrOCRProcessor`` from ``model_dir``; memoised per directory.
+
+    ``transformers`` is imported inside the function, so it is only loaded
+    when a processor is first requested.
+    """
+    from transformers import AutoTokenizer, TrOCRProcessor, ViTImageProcessor
+    return TrOCRProcessor(
+        image_processor=ViTImageProcessor.from_pretrained(model_dir, use_fast=False),
+        tokenizer=AutoTokenizer.from_pretrained(model_dir, use_fast=True),
+    )
+def warmup_formula_recognizer(model_dir: str | Path, provider_info) -> None:
+    """Create (and thereby cache) the encoder and decoder sessions.
+
+    Both model files are checked before any session is created.
+
+    Raises:
+        FileNotFoundError: If ``encoder_model.onnx`` or ``decoder_model.onnx``
+            is missing under ``model_dir``.
+    """
+    root = Path(model_dir)
+    required = (
+        (root / "encoder_model.onnx", f"missing encoder model under {root}"),
+        (root / "decoder_model.onnx", f"missing decoder model under {root}"),
+    )
+    for model_file, message in required:
+        if not model_file.is_file():
+            raise FileNotFoundError(message)
+    for model_file, _ in required:
+        create_session(model_file, provider_info)
+def _first_token_id(value):
+    """Reduce a configured token id to a single value, or ``None`` if absent."""
+    if isinstance(value, (list, tuple)):
+        # Configs may list several ids; the decoding loop supports one, so use the first.
+        return value[0] if value else None
+    return value
+def _load_generation_ids(model_dir: Path, tokenizer) -> tuple[int, int | None]:
+    """Read the decoder start and EOS token ids for a model directory.
+
+    ``generation_config.json`` is read first, then ``config.json``; within
+    each file the top level is read before its ``"decoder"`` section. A value
+    found later replaces an earlier one, but a missing or ``null`` entry never
+    erases an id that was already found. Reading stops once both ids are
+    known. Files that cannot be read, are not valid JSON, or do not contain a
+    JSON object are skipped.
+
+    Args:
+        model_dir: Directory holding the JSON configuration files.
+        tokenizer: Object providing ``bos_token_id`` and ``eos_token_id``,
+            used as fallbacks.
+
+    Returns:
+        ``(decoder_start_id, eos_id)``; ``eos_id`` is ``None`` when neither
+        the files nor the tokenizer define one.
+
+    Raises:
+        ValueError: If no decoder start token id can be determined.
+    """
+    decoder_start_id = None
+    eos_id = None
+    for filename in ("generation_config.json", "config.json"):
+        path = model_dir / filename
+        if not path.is_file():
+            continue
+        try:
+            data = json.loads(path.read_text(encoding="utf-8-sig"))
+        except (OSError, ValueError):
+            # Unreadable or malformed file (JSON and decoding errors are
+            # ValueError subclasses): best effort, try the next one.
+            continue
+        if not isinstance(data, dict):
+            continue
+        sections = [data]
+        decoder_config = data.get("decoder")
+        if isinstance(decoder_config, dict):
+            sections.append(decoder_config)
+        for section in sections:
+            start_candidate = _first_token_id(section.get("decoder_start_token_id"))
+            if start_candidate is not None:
+                decoder_start_id = start_candidate
+            eos_candidate = _first_token_id(section.get("eos_token_id"))
+            if eos_candidate is not None:
+                eos_id = eos_candidate
+        if decoder_start_id is not None and eos_id is not None:
+            break
+    if decoder_start_id is None:
+        decoder_start_id = tokenizer.bos_token_id
+    if decoder_start_id is None:
+        raise ValueError(f"missing decoder_start_token_id under {model_dir}")
+    if eos_id is None:
+        eos_id = tokenizer.eos_token_id
+    return int(decoder_start_id), int(eos_id) if eos_id is not None else None
+def recognize_formula_image(
+    image: Image.Image | np.ndarray,
+    model_dir: str | Path,
+    provider_info,
+    *,
+    max_new_tokens: int = 256,
+) -> tuple[str, float]:
+    """Recognise a single formula image.
+
+    Wraps ``recognize_formula_images`` with a one-element batch and returns
+    its only ``(text, score)`` result.
+    """
+    single_batch = [image]
+    batch_results = recognize_formula_images(
+        single_batch,
+        model_dir,
+        provider_info,
+        max_new_tokens=max_new_tokens,
+    )
+    return batch_results[0]
+def recognize_formula_images(
+    images: list[Image.Image | np.ndarray],
+    model_dir: str | Path,
+    provider_info,
+    *,
+    max_new_tokens: int = 256,
+) -> list[tuple[str, float]]:
+    """Recognise a batch of formula images with greedy decoding.
+
+    The images are preprocessed by the cached processor, encoded once, and
+    then decoded one token at a time by taking the most probable token at
+    each step. The whole token prefix is fed to the decoder on every step.
+
+    Args:
+        images: PIL images or arrays (arrays go through ``Image.fromarray``).
+        model_dir: Directory with ``encoder_model.onnx``, ``decoder_model.onnx``
+            and the processor/tokenizer files.
+        provider_info: Provider description forwarded to ``create_session``.
+        max_new_tokens: Upper bound on decoding steps per batch.
+
+    Returns:
+        One ``(text, score)`` pair per input image, where ``score`` is the
+        mean probability of the emitted tokens (``0.0`` if none were emitted).
+        An empty input list yields an empty list.
+    """
+    if not images:
+        return []
+    root = Path(model_dir)
+    processor = _load_processor(str(root))
+    encoder_session = create_session(root / "encoder_model.onnx", provider_info)
+    decoder_session = create_session(root / "decoder_model.onnx", provider_info)
+    pil_images = [image if isinstance(image, Image.Image) else Image.fromarray(image) for image in images]
+    features = processor(images=pil_images, return_tensors="np")
+    pixel_values = np.asarray(features["pixel_values"], dtype=np.float32)
+    encoder_input_name = encoder_session.get_inputs()[0].name
+    encoder_hidden_states = encoder_session.run(
+        None,
+        {encoder_input_name: pixel_values},
+    )[0]
+    tokenizer = processor.tokenizer
+    decoder_start_id, eos_id = _load_generation_ids(root, tokenizer)
+    batch_size = len(pil_images)
+    input_ids = np.full((batch_size, 1), decoder_start_id, dtype=np.int64)
+    token_ids: list[list[int]] = [[] for _ in range(batch_size)]
+    token_scores: list[list[float]] = [[] for _ in range(batch_size)]
+    finished = np.zeros((batch_size,), dtype=bool)
+    # Rows that already ended keep receiving this filler id so the batch stays rectangular.
+    pad_after_finish_id = eos_id if eos_id is not None else decoder_start_id
+    # The decoder's input names do not change between steps; look them up once.
+    # Input 0 is fed the token ids and input 1 the encoder states.
+    decoder_input_meta = decoder_session.get_inputs()
+    ids_input_name = decoder_input_meta[0].name
+    states_input_name = decoder_input_meta[1].name
+    for _ in range(max_new_tokens):
+        decoder_inputs = {
+            ids_input_name: input_ids,
+            states_input_name: encoder_hidden_states,
+        }
+        logits = decoder_session.run(None, decoder_inputs)[0]
+        # Only the prediction for the last position matters.
+        step_logits = logits[:, -1, :]
+        step_probs = _softmax(step_logits)
+        next_tokens = np.argmax(step_probs, axis=1).astype(np.int64)
+        next_column = next_tokens.copy()
+        for row, next_token in enumerate(next_tokens.tolist()):
+            if finished[row]:
+                next_column[row] = pad_after_finish_id
+                continue
+            if eos_id is not None and next_token == eos_id:
+                finished[row] = True
+                next_column[row] = pad_after_finish_id
+                continue
+            token_ids[row].append(int(next_token))
+            token_scores[row].append(float(step_probs[row, next_token]))
+        if bool(np.all(finished)):
+            break
+        input_ids = np.concatenate(
+            [input_ids, next_column.reshape(batch_size, 1)],
+            axis=1,
+        )
+    results: list[tuple[str, float]] = []
+    for ids, scores in zip(token_ids, token_scores):
+        text = tokenizer.decode(ids, skip_special_tokens=True).strip()
+        score = float(sum(scores) / len(scores)) if scores else 0.0
+        results.append((text, score))
+    return results

mathcraft_ocr/adapters/text_detector.py ADDED Viewed

@@ -0,0 +1,57 @@
+# coding: utf-8
+from __future__ import annotations
+from pathlib import Path
+import numpy as np
+from rapidocr.ch_ppocr_det.utils import DBPostProcess, DetPreProcess
+from .common import create_session
+def _find_detector_model(root: Path) -> Path:
+    """Return the first ``*det*.onnx`` file (sorted order) anywhere under ``root``.
+
+    Raises:
+        FileNotFoundError: If the recursive search finds no such file.
+    """
+    first_match = min(root.glob("**/*det*.onnx"), default=None)
+    if first_match is None:
+        raise FileNotFoundError(f"missing text detector model under {root}")
+    return first_match
+def warmup_text_detector(model_dir: str | Path, provider_info) -> None:
+    """Create (and thereby cache) the session for the text detector model."""
+    detector_path = _find_detector_model(Path(model_dir))
+    create_session(detector_path, provider_info)
+def _limit_side_len(image: np.ndarray) -> int:
+    """Return the image's longer side, capped at 960."""
+    rows, cols = image.shape[0], image.shape[1]
+    longest_side = rows if rows >= cols else cols
+    return longest_side if longest_side <= 960 else 960
+def detect_text_boxes(
+    image_bgr: np.ndarray,
+    model_dir: str | Path,
+    provider_info,
+) -> tuple[np.ndarray, tuple[float, ...]]:
+    """Run the text detector on ``image_bgr``.
+
+    The image goes through rapidocr's ``DetPreProcess``, the ONNX detector
+    model, and rapidocr's ``DBPostProcess``.
+
+    Returns:
+        ``(boxes, scores)`` as produced by the post-processor, with the
+        scores converted to a tuple of Python floats.
+
+    Raises:
+        FileNotFoundError: If no detector model is found under ``model_dir``.
+    """
+    detector_path = _find_detector_model(Path(model_dir))
+    session = create_session(detector_path, provider_info)
+    preprocess = DetPreProcess(
+        limit_side_len=_limit_side_len(image_bgr),
+        limit_type="max",
+        mean=[0.5, 0.5, 0.5],
+        std=[0.5, 0.5, 0.5],
+    )
+    postprocess = DBPostProcess(
+        thresh=0.3,
+        box_thresh=0.5,
+        max_candidates=1000,
+        unclip_ratio=1.6,
+        use_dilation=True,
+    )
+    model_input = preprocess(image_bgr)
+    input_name = session.get_inputs()[0].name
+    outputs = session.run(None, {input_name: model_input})
+    # The post-processor receives the original (height, width).
+    original_hw = (image_bgr.shape[0], image_bgr.shape[1])
+    boxes, scores = postprocess(outputs[0], original_hw)
+    return boxes, tuple(float(value) for value in scores)

mathcraft_ocr/adapters/text_recognizer.py ADDED Viewed

@@ -0,0 +1,121 @@
+# coding: utf-8
+from __future__ import annotations
+from functools import lru_cache
+from pathlib import Path
+import numpy as np
+from rapidocr import EngineType, LangRec, ModelType, OCRVersion
+from rapidocr.ch_ppocr_rec import TextRecInput, TextRecognizer
+from rapidocr.utils.typings import TaskType
+class _Config(dict):
+    """Dictionary whose keys can also be read and written as attributes.
+
+    Nested ``dict`` values are converted to ``_Config`` at construction time,
+    so ``cfg.a.b`` works for nested mappings.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        for key, value in dict(*args, **kwargs).items():
+            self[key] = _Config(value) if isinstance(value, dict) else value
+    def __getattr__(self, name):
+        # Called only after normal attribute lookup fails; fall back to the keys.
+        try:
+            value = self[name]
+        except KeyError as exc:
+            raise AttributeError(name) from exc
+        return value
+    def __setattr__(self, name, value):
+        # Attribute assignment is stored as a dictionary item.
+        self.__setitem__(name, value)
+def warmup_pp_text_recognizer(model_dir: str | Path, provider_info) -> None:
+    """Build (and thereby cache) the PP-OCR recognizer for ``model_dir``.
+
+    The recognizer's ``rec_batch_num`` is set to 1 afterwards.
+    """
+    resolved_dir = Path(model_dir)
+    warmed = _create_pp_text_recognizer(resolved_dir, provider_info)
+    setattr(warmed, "rec_batch_num", 1)
+def recognize_pp_text_lines(
+    images_bgr: list[np.ndarray],
+    model_dir: str | Path,
+    provider_info,
+    *,
+    rec_batch_num: int | None = None,
+) -> list[tuple[str, float]]:
+    """Recognise text in a list of cropped line images.
+
+    Args:
+        images_bgr: Line images passed to the recognizer.
+        model_dir: Directory containing the recognizer model and vocabulary.
+        provider_info: Provider description used to choose CPU or GPU.
+        rec_batch_num: Maximum batch size; ``None`` or ``0`` means 6.
+
+    Returns:
+        One ``(text, score)`` pair per recognised line; an empty list for
+        empty input.
+    """
+    if not images_bgr:
+        return []
+    recognizer = _create_pp_text_recognizer(Path(model_dir), provider_info)
+    batch_cap = max(1, int(rec_batch_num or 6))
+    image_count = max(len(images_bgr), 1)
+    # Never batch more images than were supplied.
+    recognizer.rec_batch_num = image_count if image_count <= batch_cap else batch_cap
+    output = recognizer(TextRecInput(img=images_bgr, return_word_box=False))
+    pairs = zip(output.txts, output.scores)
+    return [(str(text), float(score)) for text, score in pairs]
+def _create_pp_text_recognizer(model_dir: Path, provider_info) -> TextRecognizer:
+    """Return the cached recognizer for ``model_dir`` and the requested device.
+
+    The directory is resolved so equivalent paths share one cache entry; the
+    GPU flag is true only when ``provider_info.device`` equals ``"gpu"``.
+    """
+    resolved_dir = model_dir.resolve()
+    device = getattr(provider_info, "device", "")
+    wants_gpu = bool(device == "gpu")
+    return _create_pp_text_recognizer_cached(str(resolved_dir), wants_gpu)
+@lru_cache(maxsize=8)
+def _create_pp_text_recognizer_cached(model_dir: str, use_cuda: bool) -> TextRecognizer:
+    """Build a rapidocr ``TextRecognizer``; memoised per (model_dir, use_cuda).
+
+    The model is the first ``*rec*.onnx`` file (sorted order) found anywhere
+    under ``model_dir``. Model type and OCR version are guessed from the
+    file and directory names.
+
+    Raises:
+        FileNotFoundError: If no recognizer model or no vocabulary file is
+            found under ``model_dir``.
+    """
+    model_dir = Path(model_dir)
+    model_candidates = sorted(model_dir.glob("**/*rec*.onnx"))
+    if not model_candidates:
+        raise FileNotFoundError(f"no PP-OCR recognizer onnx file found under {model_dir}")
+    model_path = model_candidates[0]
+    dict_path = _find_pp_vocab(model_dir)
+    if dict_path is None:
+        raise FileNotFoundError(f"missing PP-OCR vocabulary under {model_dir}")
+    model_name = model_path.name
+    # Substring checks on the model file name and its directory name.
+    is_server = "server" in model_name or "server" in model_dir.name
+    is_v5 = "v5" in model_name or "v5" in model_dir.name
+    # NOTE(review): _find_pp_vocab in this file only ever returns
+    # "ppocrv5_keys.txt", so this looks like it is always False - confirm.
+    is_english = dict_path.name == "en_dict.txt"
+    # _Config gives the attribute-style access to the settings dictionary.
+    config = _Config({
+        "engine_type": EngineType.ONNXRUNTIME,
+        "lang_type": LangRec.EN if is_english else LangRec.CH,
+        "model_type": ModelType.SERVER if is_server else ModelType.MOBILE,
+        "ocr_version": OCRVersion.PPOCRV5 if is_v5 else OCRVersion.PPOCRV4,
+        "task_type": TaskType.REC,
+        "model_path": str(model_path),
+        "model_dir": None,
+        "rec_keys_path": str(dict_path),
+        "rec_img_shape": [3, 48, 320],
+        "rec_batch_num": 6,
+        "font_path": None,
+        "engine_cfg": {
+            "intra_op_num_threads": -1,
+            "inter_op_num_threads": -1,
+            "enable_cpu_mem_arena": False,
+            "cpu_ep_cfg": {"arena_extend_strategy": "kSameAsRequested"},
+            "use_cuda": use_cuda,
+            "cuda_ep_cfg": {
+                "device_id": 0,
+                "arena_extend_strategy": "kNextPowerOfTwo",
+                "cudnn_conv_algo_search": "EXHAUSTIVE",
+                "do_copy_in_default_stream": True,
+            },
+            "use_dml": False,
+            "dm_ep_cfg": None,
+            "use_cann": False,
+            "cann_ep_cfg": {
+                "device_id": 0,
+                "arena_extend_strategy": "kNextPowerOfTwo",
+                "npu_mem_limit": 21474836480,
+                "op_select_impl_mode": "high_performance",
+                "optypelist_for_implmode": "Gelu",
+                "enable_cann_graph": True,
+            },
+        },
+    })
+    return TextRecognizer(config)
+def clear_text_recognizer_cache() -> None:
+    """Drop every recognizer memoised by ``_create_pp_text_recognizer_cached``."""
+    _create_pp_text_recognizer_cached.cache_clear()
+def _find_pp_vocab(model_dir: Path) -> Path | None:
+    """Return ``model_dir / "ppocrv5_keys.txt"`` if it is a file, else ``None``."""
+    vocab_path = model_dir / "ppocrv5_keys.txt"
+    if vocab_path.is_file():
+        return vocab_path
+    return None

mathcraft_ocr/api.py ADDED Viewed

@@ -0,0 +1,14 @@
+# coding: utf-8
+from .results import FormulaRecognitionResult, MathCraftBlock, MixedRecognitionResult, OCRRegion
+from .runtime import MathCraftRuntime, WarmupComponentStatus, WarmupPlan
+# Names re-exported from the results and runtime modules imported above.
+__all__ = [
+    "FormulaRecognitionResult",
+    "MathCraftBlock",
+    "MathCraftRuntime",
+    "MixedRecognitionResult",
+    "OCRRegion",
+    "WarmupComponentStatus",
+    "WarmupPlan",
+]