ocr-postprocess 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_postprocess/__init__.py +33 -0
- ocr_postprocess/classifier.py +63 -0
- ocr_postprocess/cli.py +130 -0
- ocr_postprocess/engine/__init__.py +0 -0
- ocr_postprocess/engine/denoiser.py +134 -0
- ocr_postprocess/engine/extractor_stage.py +107 -0
- ocr_postprocess/engine/normalizer.py +128 -0
- ocr_postprocess/engine/reconciler.py +170 -0
- ocr_postprocess/engine/reconstructor.py +469 -0
- ocr_postprocess/engine/transform_stage.py +89 -0
- ocr_postprocess/exceptions.py +30 -0
- ocr_postprocess/extractors/__init__.py +0 -0
- ocr_postprocess/extractors/base.py +103 -0
- ocr_postprocess/extractors/helpers.py +63 -0
- ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
- ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
- ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
- ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
- ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
- ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
- ocr_postprocess/extractors/pattern/__init__.py +0 -0
- ocr_postprocess/extractors/pattern/cccd.py +120 -0
- ocr_postprocess/extractors/pattern/cmnd.py +38 -0
- ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
- ocr_postprocess/extractors/pattern/date.py +89 -0
- ocr_postprocess/extractors/pattern/email.py +38 -0
- ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
- ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
- ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
- ocr_postprocess/extractors/pattern/tax_code.py +53 -0
- ocr_postprocess/extractors/registry.py +45 -0
- ocr_postprocess/extractors/structured/__init__.py +0 -0
- ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
- ocr_postprocess/extractors/universal.py +39 -0
- ocr_postprocess/models.py +131 -0
- ocr_postprocess/pipeline.py +179 -0
- ocr_postprocess/profiles/__init__.py +0 -0
- ocr_postprocess/profiles/_generic.yml +13 -0
- ocr_postprocess/profiles/cccd_2024.yml +113 -0
- ocr_postprocess/profiles/dang_kiem.yml +105 -0
- ocr_postprocess/profiles/loader.py +63 -0
- ocr_postprocess/profiles/matcher.py +71 -0
- ocr_postprocess/profiles/schema.py +197 -0
- ocr_postprocess/py.typed +0 -0
- ocr_postprocess/renderer/__init__.py +0 -0
- ocr_postprocess/renderer/json_renderer.py +59 -0
- ocr_postprocess/renderer/llm.py +41 -0
- ocr_postprocess/renderer/markdown.py +172 -0
- ocr_postprocess/scorer.py +78 -0
- ocr_postprocess/transformer.py +304 -0
- ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
- ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
- ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
- ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
- ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Stage 6 — Transform: apply transform ops and compute derived fields."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
9
|
+
from ocr_postprocess.transformer import apply_transforms, compute_field
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def transform_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 6: transform candidate values and compute derived fields.

    Mutates ``ctx.candidates`` in place: applies each field's transform ops to
    its candidates, then appends one synthetic "computed" candidate per
    compute definition whose dependencies are all present. Failures never
    abort the stage — transform failures flag the candidate for review, and
    compute failures are recorded in ``ctx.warnings``.
    """
    profile = ctx.profile
    if not profile:
        return

    # Build field lookup
    field_map = {f.key: f for f in profile.fields}

    # Apply transforms to existing candidates
    transform_failures = 0
    for cand in ctx.candidates:
        field = field_map.get(cand.key)
        if field is None or not field.transform:
            continue
        try:
            cand.value = apply_transforms(cand.value, field.transform)
        except Exception as exc:
            # A failed transform keeps the original value but flags the
            # candidate for human/LLM review instead of dropping it.
            transform_failures += 1
            logger.warning("Transform failed for '%s': %s", cand.key, exc)
            cand.notes.append(f"transform_error: {exc}")
            cand.needs_llm_review = True

    # Compute derived fields (topological order via profile.compute)
    existing_values: dict[str, Any] = {c.key: c.value for c in ctx.candidates}
    # Separate confidence map — dep VALUES are not confidences (values may be > 1)
    existing_confidences: dict[str, float] = {
        c.key: c.confidence for c in ctx.candidates
    }

    for compute_def in profile.compute:
        key = compute_def.get("name") or compute_def.get("key", "")
        expr = compute_def.get("expr") or compute_def.get("compute", "")
        if not key or not expr:
            continue

        deps = compute_def.get("deps", [])
        dep_values = {d: existing_values.get(d) for d in deps}

        # Skip if any dep is None
        if any(v is None for v in dep_values.values()):
            logger.debug("Skipping compute '%s': missing deps %s", key, deps)
            continue

        try:
            result = compute_field(expr, dep_values)
            # Confidence = min confidence of deps (from actual candidate
            # confidences), or 0.9 if there are no deps.
            confidence = (
                min(existing_confidences.get(d, 0.8) for d in deps) if deps else 0.9
            )
            existing_values[key] = result
            # BUG FIX: record the computed field's confidence too, so a later
            # compute that depends on THIS key uses its real confidence
            # instead of silently falling back to the 0.8 default.
            existing_confidences[key] = confidence
            ctx.candidates.append(
                Candidate(
                    key=key,
                    value=result,
                    raw=expr,
                    extractor="computed",
                    sources=["computed"] + [f"dep:{d}" for d in deps],
                    confidence=confidence,
                )
            )
        except Exception as exc:
            logger.warning("Compute '%s' failed: %s", key, exc)
            ctx.warnings.append(f"compute_error: {key} — {exc}")

    computed_count = sum(1 for c in ctx.candidates if c.extractor == "computed")
    if transform_failures:
        logger.warning(
            "Transform stage: %d transform failure(s), %d compute field(s) added",
            transform_failures,
            computed_count,
        )
    else:
        logger.debug(
            "Transform stage: %d compute field(s) added, 0 failures",
            computed_count,
        )
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Custom exceptions for ocr_postprocess."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class OcrPostprocessError(Exception):
    """Root of the ocr_postprocess exception hierarchy.

    Catch this type to handle any error raised by the package.
    """
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ProfileNotFoundError(OcrPostprocessError):
    """Raised when a requested profile id is not found in the registry."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ProfileValidationError(OcrPostprocessError):
    """Raised when a YAML profile fails schema validation."""

    def __init__(self, message: str, field_path: str | None = None) -> None:
        # Keep the offending field path available for programmatic handling.
        self.field_path = field_path
        if field_path:
            super().__init__(f"[{field_path}] {message}")
        else:
            super().__init__(message)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ExtractorNotFoundError(OcrPostprocessError):
    """Raised when an extractor name has not been registered."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TransformError(OcrPostprocessError):
    """Raised when a transform op or compute expression fails."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CyclicComputeError(OcrPostprocessError):
    """Raised when a cyclic dependency is detected in compute fields."""
|
|
File without changes
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Extractor abstract base classes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import TYPE_CHECKING, Literal
|
|
7
|
+
|
|
8
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from ocr_postprocess.profiles.schema import FieldDef
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Extractor(ABC):
    """Base class for all extractors.

    Concrete subclasses set ``name`` and ``kind`` and implement ``extract``.
    """

    # Registry name of the extractor (set on concrete subclasses).
    name: str
    # Which extraction-strategy family this extractor belongs to.
    kind: Literal["pattern", "label_anchor", "structured"]

    @abstractmethod
    def extract(
        self,
        ctx: PipelineContext,
        field: "FieldDef | None" = None,
    ) -> list[Candidate]:
        """Extract candidates from pipeline context.

        Args:
            ctx: Pipeline context holding the text and related indexes.
            field: Optional field definition scoping extraction to one field.

        Returns:
            A (possibly empty) list of ``Candidate`` objects.
        """
        ...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PatternExtractor(Extractor):
    """Extractor that scans full text for a regex pattern."""

    kind: Literal["pattern", "label_anchor", "structured"] = "pattern"
    pattern: str = ""

    def validate(self, value: str) -> bool:
        """Hook for subclasses: extra validation beyond the regex match."""
        return True

    def confidence_for(self, value: str, match: object) -> float:
        """Hook for subclasses: confidence score assigned to a match."""
        return 0.9

    def extract(
        self,
        ctx: PipelineContext,
        field: "FieldDef | None" = None,
    ) -> list[Candidate]:
        """Default pattern scan across all sections."""
        # Import locally to avoid circular imports at package init time
        import regex as re

        if not self.pattern:
            return []

        # The candidate key is the field key when one is given, else the
        # extractor's own name; it does not vary per match.
        target_key = field.key if field else self.name
        haystack = ctx.normalized_text or ctx.raw_text
        found: list[Candidate] = []

        for match in re.compile(self.pattern).finditer(haystack):
            text_value = match.group(0)
            if not self.validate(text_value):
                continue
            score = self.confidence_for(text_value, match)
            found.append(
                Candidate(
                    key=target_key,
                    value=text_value,
                    raw=text_value,
                    extractor=self.name,
                    sources=[f"pattern:{self.name}"],
                    span=(match.start(), match.end()),
                    confidence=score,
                )
            )

        return found
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class LabelAnchorExtractor(Extractor):
    """Extractor that uses label positions to locate values."""

    kind: Literal["pattern", "label_anchor", "structured"] = "label_anchor"

    def extract(
        self,
        ctx: PipelineContext,
        field: "FieldDef | None" = None,
    ) -> list[Candidate]:
        # Base implementation yields nothing; registered label-anchor
        # subclasses override this with a concrete strategy.
        return []
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class StructuredExtractor(Extractor):
    """Extractor that parses structured regions (MRZ, tables)."""

    kind: Literal["pattern", "label_anchor", "structured"] = "structured"

    def extract(
        self,
        ctx: PipelineContext,
        field: "FieldDef | None" = None,
    ) -> list[Candidate]:
        # Base implementation yields nothing; structured subclasses
        # (e.g. MRZ parsers) override this.
        return []
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Shared helpers for extractors: label lookup and fuzzy matching."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from rapidfuzz import fuzz
|
|
8
|
+
|
|
9
|
+
from ocr_postprocess.models import LabelHit, PipelineContext
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def fuzzy_match_label(text: str, label: str, threshold: float = 0.8) -> float:
    """Return the partial_ratio score in [0, 1] if >= threshold, else 0.0."""
    ratio = fuzz.partial_ratio(label.lower(), text.lower()) / 100.0
    if ratio < threshold:
        return 0.0
    return ratio
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def find_label(
    ctx: PipelineContext,
    aliases: list[str],
    fuzzy: bool = True,
    threshold: float = 0.8,
) -> list[LabelHit]:
    """Find all label hits in ctx.label_index matching any alias.

    Exact key matches score 1.0; when ``fuzzy`` is enabled, non-exact keys
    are scored via :func:`fuzzy_match_label` and kept only if they clear
    ``threshold``. Each distinct (section, line, char) hit is returned once.
    """
    results: list[LabelHit] = []
    visited: set[tuple[str, int, int]] = set()

    for alias in aliases:
        wanted = alias.lower()
        for key, key_hits in ctx.label_index.items():
            candidate_key = key.lower()
            if candidate_key == wanted:
                match_score = 1.0
            elif fuzzy:
                match_score = fuzzy_match_label(candidate_key, wanted, threshold)
                if match_score == 0.0:
                    continue
            else:
                continue

            for hit in key_hits:
                # With fuzzy disabled, also skip hits that were themselves
                # added to the index via a fuzzy (non-exact) text search.
                if not fuzzy and hit.fuzzy_score < 1.0:
                    continue
                marker = (hit.section_id, hit.line_index, hit.char_start)
                if marker in visited:
                    continue
                visited.add(marker)
                # Clone with the score for THIS alias and record the alias.
                results.append(
                    hit.model_copy(
                        update={
                            "fuzzy_score": match_score,
                            "aliases_matched": hit.aliases_matched + [alias],
                        }
                    )
                )

    return results
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Label-anchor extractor: value on the line immediately after the label."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ocr_postprocess.extractors.base import LabelAnchorExtractor
|
|
6
|
+
from ocr_postprocess.extractors.helpers import find_label
|
|
7
|
+
from ocr_postprocess.extractors.registry import register
|
|
8
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@register("line_after_label")
|
|
12
|
+
class LineAfterLabelExtractor(LabelAnchorExtractor):
|
|
13
|
+
"""Extract value from line immediately following the label line."""
|
|
14
|
+
|
|
15
|
+
def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
|
|
16
|
+
if not field or not field.aliases:
|
|
17
|
+
return []
|
|
18
|
+
|
|
19
|
+
hits = find_label(ctx, field.aliases, fuzzy=field.fuzzy_label)
|
|
20
|
+
candidates: list[Candidate] = []
|
|
21
|
+
|
|
22
|
+
for hit in hits:
|
|
23
|
+
section = next((s for s in ctx.sections if s.id == hit.section_id), None)
|
|
24
|
+
if section is None:
|
|
25
|
+
continue
|
|
26
|
+
|
|
27
|
+
lines_sorted = sorted(section.lines, key=lambda ln: ln.index)
|
|
28
|
+
label_pos = next(
|
|
29
|
+
(i for i, ln in enumerate(lines_sorted) if ln.index == hit.line_index), None
|
|
30
|
+
)
|
|
31
|
+
if label_pos is None or label_pos + 1 >= len(lines_sorted):
|
|
32
|
+
continue
|
|
33
|
+
|
|
34
|
+
next_line = lines_sorted[label_pos + 1]
|
|
35
|
+
value = next_line.text.strip()
|
|
36
|
+
if not value:
|
|
37
|
+
continue
|
|
38
|
+
|
|
39
|
+
conf = 0.7 * hit.fuzzy_score
|
|
40
|
+
candidates.append(
|
|
41
|
+
Candidate(
|
|
42
|
+
key=field.key,
|
|
43
|
+
value=value,
|
|
44
|
+
raw=value,
|
|
45
|
+
extractor="line_after_label",
|
|
46
|
+
sources=["label_anchor"],
|
|
47
|
+
section_id=hit.section_id,
|
|
48
|
+
line_index=next_line.index,
|
|
49
|
+
confidence=conf,
|
|
50
|
+
)
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
return candidates
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Label-anchor extractor: apply regex after finding a label."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import regex as re
|
|
6
|
+
|
|
7
|
+
from ocr_postprocess.extractors.base import LabelAnchorExtractor
|
|
8
|
+
from ocr_postprocess.extractors.helpers import find_label
|
|
9
|
+
from ocr_postprocess.extractors.registry import register
|
|
10
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@register("regex_after_label")
|
|
14
|
+
class RegexAfterLabelExtractor(LabelAnchorExtractor):
|
|
15
|
+
"""Apply a regex to text after a label; group(1) is the extracted value."""
|
|
16
|
+
|
|
17
|
+
def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
|
|
18
|
+
if not field or not field.aliases or not field.pattern:
|
|
19
|
+
return []
|
|
20
|
+
|
|
21
|
+
compiled = re.compile(field.pattern)
|
|
22
|
+
hits = find_label(ctx, field.aliases, fuzzy=field.fuzzy_label)
|
|
23
|
+
candidates: list[Candidate] = []
|
|
24
|
+
|
|
25
|
+
for hit in hits:
|
|
26
|
+
section = next((s for s in ctx.sections if s.id == hit.section_id), None)
|
|
27
|
+
if section is None:
|
|
28
|
+
continue
|
|
29
|
+
line = next((ln for ln in section.lines if ln.index == hit.line_index), None)
|
|
30
|
+
if line is None:
|
|
31
|
+
continue
|
|
32
|
+
|
|
33
|
+
after = line.text[hit.char_end :]
|
|
34
|
+
m = compiled.search(after)
|
|
35
|
+
|
|
36
|
+
# If no match on label line, search subsequent lines (up to field.next_lines).
|
|
37
|
+
if not m and getattr(field, "next_lines", 0) > 0:
|
|
38
|
+
lines_sorted = sorted(section.lines, key=lambda ln: ln.index)
|
|
39
|
+
label_pos = next(
|
|
40
|
+
(i for i, ln in enumerate(lines_sorted) if ln.index == hit.line_index), None
|
|
41
|
+
)
|
|
42
|
+
if label_pos is not None:
|
|
43
|
+
for extra_ln in lines_sorted[
|
|
44
|
+
label_pos + 1 : label_pos + 1 + getattr(field, "next_lines", 0)
|
|
45
|
+
]:
|
|
46
|
+
m = compiled.search(extra_ln.text)
|
|
47
|
+
if m:
|
|
48
|
+
break
|
|
49
|
+
|
|
50
|
+
if not m:
|
|
51
|
+
continue
|
|
52
|
+
|
|
53
|
+
value = m.group(1) if m.lastindex and m.lastindex >= 1 else m.group(0)
|
|
54
|
+
value = value.strip()
|
|
55
|
+
if not value:
|
|
56
|
+
continue
|
|
57
|
+
|
|
58
|
+
abs_start = hit.char_end + m.start()
|
|
59
|
+
abs_end = hit.char_end + m.end()
|
|
60
|
+
conf = 0.7 * hit.fuzzy_score
|
|
61
|
+
candidates.append(
|
|
62
|
+
Candidate(
|
|
63
|
+
key=field.key,
|
|
64
|
+
value=value,
|
|
65
|
+
raw=value,
|
|
66
|
+
extractor="regex_after_label",
|
|
67
|
+
sources=["label_anchor"],
|
|
68
|
+
section_id=hit.section_id,
|
|
69
|
+
line_index=hit.line_index,
|
|
70
|
+
span=(abs_start, abs_end),
|
|
71
|
+
confidence=conf,
|
|
72
|
+
)
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
return candidates
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Label-anchor extractor: multi-line text until next known label (stop_labels)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ocr_postprocess.extractors.base import LabelAnchorExtractor
|
|
6
|
+
from ocr_postprocess.extractors.helpers import find_label
|
|
7
|
+
from ocr_postprocess.extractors.registry import register
|
|
8
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@register("text_until_next_label")
|
|
12
|
+
class TextUntilNextLabelExtractor(LabelAnchorExtractor):
|
|
13
|
+
"""Collect multi-line text from label until next stop label."""
|
|
14
|
+
|
|
15
|
+
def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
|
|
16
|
+
if not field or not field.aliases:
|
|
17
|
+
return []
|
|
18
|
+
|
|
19
|
+
stop_aliases: list[str] = field.stop_labels or []
|
|
20
|
+
# Also use all known aliases as implicit stops
|
|
21
|
+
if ctx.profile and not stop_aliases:
|
|
22
|
+
for f in ctx.profile.fields:
|
|
23
|
+
if f.key != field.key:
|
|
24
|
+
stop_aliases.extend(f.aliases)
|
|
25
|
+
|
|
26
|
+
stop_hits_by_section: dict[str, list[int]] = {}
|
|
27
|
+
if stop_aliases:
|
|
28
|
+
for s_hit in find_label(ctx, stop_aliases, fuzzy=False):
|
|
29
|
+
stop_hits_by_section.setdefault(s_hit.section_id, []).append(s_hit.line_index)
|
|
30
|
+
|
|
31
|
+
hits = find_label(ctx, field.aliases, fuzzy=field.fuzzy_label)
|
|
32
|
+
candidates: list[Candidate] = []
|
|
33
|
+
|
|
34
|
+
for hit in hits:
|
|
35
|
+
section = next((s for s in ctx.sections if s.id == hit.section_id), None)
|
|
36
|
+
if section is None:
|
|
37
|
+
continue
|
|
38
|
+
|
|
39
|
+
stop_lines = set(stop_hits_by_section.get(hit.section_id, []))
|
|
40
|
+
lines_sorted = sorted(section.lines, key=lambda ln: ln.index)
|
|
41
|
+
label_pos = next(
|
|
42
|
+
(i for i, ln in enumerate(lines_sorted) if ln.index == hit.line_index), None
|
|
43
|
+
)
|
|
44
|
+
if label_pos is None:
|
|
45
|
+
continue
|
|
46
|
+
|
|
47
|
+
# Collect lines from label_pos+1 to next stop
|
|
48
|
+
collected: list[str] = []
|
|
49
|
+
# Also include same-line tail
|
|
50
|
+
label_line = lines_sorted[label_pos]
|
|
51
|
+
tail = label_line.text[hit.char_end :].strip().lstrip(":: ")
|
|
52
|
+
if tail:
|
|
53
|
+
collected.append(tail)
|
|
54
|
+
|
|
55
|
+
for ln in lines_sorted[label_pos + 1 :]:
|
|
56
|
+
if ln.index in stop_lines:
|
|
57
|
+
break
|
|
58
|
+
if ln.text.strip():
|
|
59
|
+
collected.append(ln.text.strip())
|
|
60
|
+
|
|
61
|
+
value = " ".join(collected).strip()
|
|
62
|
+
if not value:
|
|
63
|
+
continue
|
|
64
|
+
|
|
65
|
+
conf = 0.7 * hit.fuzzy_score
|
|
66
|
+
candidates.append(
|
|
67
|
+
Candidate(
|
|
68
|
+
key=field.key,
|
|
69
|
+
value=value,
|
|
70
|
+
raw=value,
|
|
71
|
+
extractor="text_until_next_label",
|
|
72
|
+
sources=["label_anchor"],
|
|
73
|
+
section_id=hit.section_id,
|
|
74
|
+
line_index=hit.line_index,
|
|
75
|
+
confidence=conf,
|
|
76
|
+
)
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
return candidates
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Label-anchor extractor: value between two labels on the same line."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import regex as re
|
|
6
|
+
|
|
7
|
+
from ocr_postprocess.extractors.base import LabelAnchorExtractor
|
|
8
|
+
from ocr_postprocess.extractors.helpers import find_label
|
|
9
|
+
from ocr_postprocess.extractors.registry import register
|
|
10
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@register("value_between_labels")
|
|
14
|
+
class ValueBetweenLabelsExtractor(LabelAnchorExtractor):
|
|
15
|
+
"""Extract value between two consecutive labels on the same line."""
|
|
16
|
+
|
|
17
|
+
def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
|
|
18
|
+
if not field or not field.aliases:
|
|
19
|
+
return []
|
|
20
|
+
|
|
21
|
+
hits = find_label(ctx, field.aliases, fuzzy=field.fuzzy_label)
|
|
22
|
+
candidates: list[Candidate] = []
|
|
23
|
+
|
|
24
|
+
for hit in hits:
|
|
25
|
+
section = next((s for s in ctx.sections if s.id == hit.section_id), None)
|
|
26
|
+
if section is None:
|
|
27
|
+
continue
|
|
28
|
+
line = next((ln for ln in section.lines if ln.index == hit.line_index), None)
|
|
29
|
+
if line is None:
|
|
30
|
+
continue
|
|
31
|
+
|
|
32
|
+
after = line.text[hit.char_end :].strip().lstrip(":: ")
|
|
33
|
+
|
|
34
|
+
# Find end of value = start of another label or end of line
|
|
35
|
+
all_aliases = []
|
|
36
|
+
if ctx.profile:
|
|
37
|
+
for f in ctx.profile.fields:
|
|
38
|
+
all_aliases.extend(f.aliases)
|
|
39
|
+
|
|
40
|
+
end_pos = len(after)
|
|
41
|
+
for alias in all_aliases:
|
|
42
|
+
m = re.search(re.escape(alias), after, re.IGNORECASE)
|
|
43
|
+
if m and m.start() < end_pos:
|
|
44
|
+
end_pos = m.start()
|
|
45
|
+
|
|
46
|
+
value = after[:end_pos].strip()
|
|
47
|
+
if not value:
|
|
48
|
+
continue
|
|
49
|
+
|
|
50
|
+
conf = 0.7 * hit.fuzzy_score
|
|
51
|
+
candidates.append(
|
|
52
|
+
Candidate(
|
|
53
|
+
key=field.key,
|
|
54
|
+
value=value,
|
|
55
|
+
raw=line.text,
|
|
56
|
+
extractor="value_between_labels",
|
|
57
|
+
sources=["label_anchor"],
|
|
58
|
+
section_id=hit.section_id,
|
|
59
|
+
line_index=hit.line_index,
|
|
60
|
+
span=(hit.char_end, hit.char_end + end_pos),
|
|
61
|
+
confidence=conf,
|
|
62
|
+
)
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
return candidates
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Label-anchor extractor: value on the same line as a label."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import regex as re
|
|
6
|
+
|
|
7
|
+
from ocr_postprocess.extractors.base import LabelAnchorExtractor
|
|
8
|
+
from ocr_postprocess.extractors.helpers import find_label
|
|
9
|
+
from ocr_postprocess.extractors.registry import register
|
|
10
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
11
|
+
|
|
12
|
+
# Splits a label tail on its colon separator and any trailing whitespace.
# NOTE(review): the character class shows two colon-like characters — likely
# ASCII ':' plus fullwidth '：' for Vietnamese OCR text; confirm the encoding
# survived, since both render identically in some tools.
_COLON_SPLIT = re.compile(r"[::]\s*")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@register("value_in_same_line")
|
|
16
|
+
class ValueInSameLineExtractor(LabelAnchorExtractor):
|
|
17
|
+
"""Extract value appearing after a label on the same line."""
|
|
18
|
+
|
|
19
|
+
def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
|
|
20
|
+
if not field or not field.aliases:
|
|
21
|
+
return []
|
|
22
|
+
|
|
23
|
+
hits = find_label(ctx, field.aliases, fuzzy=field.fuzzy_label)
|
|
24
|
+
candidates: list[Candidate] = []
|
|
25
|
+
|
|
26
|
+
for hit in hits:
|
|
27
|
+
# Find the section and line
|
|
28
|
+
section = next((s for s in ctx.sections if s.id == hit.section_id), None)
|
|
29
|
+
if section is None:
|
|
30
|
+
continue
|
|
31
|
+
line = next((ln for ln in section.lines if ln.index == hit.line_index), None)
|
|
32
|
+
if line is None:
|
|
33
|
+
continue
|
|
34
|
+
|
|
35
|
+
# Text after label end on the same line
|
|
36
|
+
after = line.text[hit.char_end :].strip().lstrip(":: ")
|
|
37
|
+
after = _COLON_SPLIT.split(after, 1)[-1].strip()
|
|
38
|
+
|
|
39
|
+
# Stop before next label-like pattern
|
|
40
|
+
after = re.split(r"(?:\s{2,}|\t)", after)[0].strip()
|
|
41
|
+
|
|
42
|
+
if not after:
|
|
43
|
+
continue
|
|
44
|
+
|
|
45
|
+
conf = 0.7 * hit.fuzzy_score
|
|
46
|
+
candidates.append(
|
|
47
|
+
Candidate(
|
|
48
|
+
key=field.key,
|
|
49
|
+
value=after,
|
|
50
|
+
raw=line.text,
|
|
51
|
+
extractor="value_in_same_line",
|
|
52
|
+
sources=["label_anchor"],
|
|
53
|
+
section_id=hit.section_id,
|
|
54
|
+
line_index=hit.line_index,
|
|
55
|
+
span=(hit.char_end, hit.char_end + len(after)),
|
|
56
|
+
confidence=conf,
|
|
57
|
+
)
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
return candidates
|
|
File without changes
|