ocr-postprocess 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_postprocess/__init__.py +33 -0
- ocr_postprocess/classifier.py +63 -0
- ocr_postprocess/cli.py +130 -0
- ocr_postprocess/engine/__init__.py +0 -0
- ocr_postprocess/engine/denoiser.py +134 -0
- ocr_postprocess/engine/extractor_stage.py +107 -0
- ocr_postprocess/engine/normalizer.py +128 -0
- ocr_postprocess/engine/reconciler.py +170 -0
- ocr_postprocess/engine/reconstructor.py +469 -0
- ocr_postprocess/engine/transform_stage.py +89 -0
- ocr_postprocess/exceptions.py +30 -0
- ocr_postprocess/extractors/__init__.py +0 -0
- ocr_postprocess/extractors/base.py +103 -0
- ocr_postprocess/extractors/helpers.py +63 -0
- ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
- ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
- ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
- ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
- ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
- ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
- ocr_postprocess/extractors/pattern/__init__.py +0 -0
- ocr_postprocess/extractors/pattern/cccd.py +120 -0
- ocr_postprocess/extractors/pattern/cmnd.py +38 -0
- ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
- ocr_postprocess/extractors/pattern/date.py +89 -0
- ocr_postprocess/extractors/pattern/email.py +38 -0
- ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
- ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
- ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
- ocr_postprocess/extractors/pattern/tax_code.py +53 -0
- ocr_postprocess/extractors/registry.py +45 -0
- ocr_postprocess/extractors/structured/__init__.py +0 -0
- ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
- ocr_postprocess/extractors/universal.py +39 -0
- ocr_postprocess/models.py +131 -0
- ocr_postprocess/pipeline.py +179 -0
- ocr_postprocess/profiles/__init__.py +0 -0
- ocr_postprocess/profiles/_generic.yml +13 -0
- ocr_postprocess/profiles/cccd_2024.yml +113 -0
- ocr_postprocess/profiles/dang_kiem.yml +105 -0
- ocr_postprocess/profiles/loader.py +63 -0
- ocr_postprocess/profiles/matcher.py +71 -0
- ocr_postprocess/profiles/schema.py +197 -0
- ocr_postprocess/py.typed +0 -0
- ocr_postprocess/renderer/__init__.py +0 -0
- ocr_postprocess/renderer/json_renderer.py +59 -0
- ocr_postprocess/renderer/llm.py +41 -0
- ocr_postprocess/renderer/markdown.py +172 -0
- ocr_postprocess/scorer.py +78 -0
- ocr_postprocess/transformer.py +304 -0
- ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
- ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
- ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
- ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
- ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Structured extractor: MRZ CCCD (ICAO 9303) lines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import regex as re
|
|
8
|
+
|
|
9
|
+
from ocr_postprocess.extractors.base import StructuredExtractor
|
|
10
|
+
from ocr_postprocess.extractors.registry import register
|
|
11
|
+
from ocr_postprocess.models import Candidate, PipelineContext
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
_MRZ_LINE = re.compile(r"[A-Z0-9<]{20,}")
|
|
16
|
+
|
|
17
|
+
_MRZ_CHARSET: dict[str, int] = {str(i): i for i in range(10)}
|
|
18
|
+
_MRZ_CHARSET.update({chr(c): c - ord("A") + 10 for c in range(ord("A"), ord("Z") + 1)})
|
|
19
|
+
_MRZ_CHARSET["<"] = 0
|
|
20
|
+
|
|
21
|
+
_WEIGHTS = [7, 3, 1]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _mrz_checksum(s: str) -> int:
|
|
25
|
+
total = 0
|
|
26
|
+
for i, ch in enumerate(s):
|
|
27
|
+
total += _MRZ_CHARSET.get(ch, 0) * _WEIGHTS[i % 3]
|
|
28
|
+
return total % 10
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _check_field(data: str) -> bool:
    """Verify last char is the checksum of preceding chars."""
    if len(data) < 2:
        return False
    check_char = data[-1]
    if not check_char.isdigit():
        # The check position must hold a digit; anything else fails outright.
        return False
    return _mrz_checksum(data[:-1]) == int(check_char)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _parse_mrz(lines: list[str]) -> dict[str, str] | None:
|
|
39
|
+
"""Parse 2-line or 3-line MRZ from CCCD/CMND."""
|
|
40
|
+
if len(lines) < 2:
|
|
41
|
+
return None
|
|
42
|
+
|
|
43
|
+
result: dict[str, str] = {}
|
|
44
|
+
|
|
45
|
+
if len(lines) >= 2 and len(lines[0]) >= 30:
|
|
46
|
+
line1 = lines[0]
|
|
47
|
+
# Line 1: doc type (2), country (3), names
|
|
48
|
+
result["doc_type"] = line1[:2].strip("<")
|
|
49
|
+
result["country"] = line1[2:5].strip("<")
|
|
50
|
+
name_part = line1[5:] if len(line1) > 5 else ""
|
|
51
|
+
if "<<" in name_part:
|
|
52
|
+
surname, given = name_part.split("<<", 1)
|
|
53
|
+
result["surname"] = surname.replace("<", " ").strip()
|
|
54
|
+
result["given_names"] = given.replace("<", " ").strip()
|
|
55
|
+
result["name_ascii"] = name_part.replace("<", " ").strip()
|
|
56
|
+
|
|
57
|
+
if len(lines) >= 2 and len(lines[1]) >= 28:
|
|
58
|
+
line2 = lines[1]
|
|
59
|
+
result["id_number"] = line2[:9].strip("<")
|
|
60
|
+
result["dob"] = _to_date(line2[13:19]) if len(line2) > 19 else ""
|
|
61
|
+
result["sex"] = line2[20] if len(line2) > 20 else ""
|
|
62
|
+
result["expiry"] = _to_date(line2[21:27]) if len(line2) > 27 else ""
|
|
63
|
+
result["nationality"] = line2[27:30].strip("<") if len(line2) > 30 else ""
|
|
64
|
+
|
|
65
|
+
return result
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _to_date(s: str) -> str:
|
|
69
|
+
"""Convert YYMMDD to YYYY-MM-DD (assume 1900/2000 cutoff 30)."""
|
|
70
|
+
if len(s) < 6:
|
|
71
|
+
return s
|
|
72
|
+
try:
|
|
73
|
+
yy, mm, dd = int(s[:2]), int(s[2:4]), int(s[4:6])
|
|
74
|
+
except ValueError:
|
|
75
|
+
return s # malformed MRZ date field
|
|
76
|
+
year = 2000 + yy if yy <= 30 else 1900 + yy
|
|
77
|
+
return f"{year:04d}-{mm:02d}-{dd:02d}"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@register("mrz_cccd")
class MrzCccdExtractor(StructuredExtractor):
    """Extract structured MRZ data from CCCD documents."""

    def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
        # Prefer the normalized text; fall back to the raw OCR output.
        source_text = ctx.normalized_text or ctx.raw_text
        found = [match.group(0) for match in _MRZ_LINE.finditer(source_text)]

        # A usable MRZ needs at least two lines; only the first three matter.
        if len(found) < 2:
            return []

        parsed = _parse_mrz(found[:3])
        if not parsed:
            return []

        candidates = [
            Candidate(
                key=f"mrz_cccd.{sub_key}",
                value=value,
                raw=value,
                extractor="mrz_cccd",
                sources=["structured:mrz"],
                confidence=0.92,
            )
            for sub_key, value in parsed.items()
            if value
        ]

        logger.debug("MRZ CCCD: extracted %d candidates", len(candidates))
        return candidates
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Universal extractor: runs all pattern+structured extractors for hints."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ocr_postprocess.models import PipelineContext
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
# Names of registry extractors that universal_extract() runs over every
# document, regardless of the selected profile.
_UNIVERSAL_EXTRACTORS = [
    "phone_vn",
    "date",
    "email",
    "currency_vnd",
    "plate_vn",
    "tax_code",
    "cccd",
    "cmnd",
    "mrz_cccd",
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def universal_extract(ctx: PipelineContext) -> dict[str, list]:
    """Run all universal extractors and return a hints dict.

    Each extractor in ``_UNIVERSAL_EXTRACTORS`` is resolved from the
    registry and run over *ctx*; non-empty results are collected as
    ``{extractor_name: [candidate values]}``. Extraction is best-effort:
    a missing or crashing extractor is skipped so it cannot break the
    pipeline.

    Args:
        ctx: Shared pipeline state holding the text to extract from.

    Returns:
        Mapping of extractor name to the list of extracted values.
    """
    # Imported here (not at module level) to avoid a circular import.
    from ocr_postprocess.extractors import registry

    hints: dict[str, list] = {}

    for extractor_name in _UNIVERSAL_EXTRACTORS:
        try:
            ext = registry.get_instance(extractor_name)
            candidates = ext.extract(ctx, field=None)
            if candidates:
                hints[extractor_name] = [c.value for c in candidates]
        except Exception:
            # Best-effort by design — but keep the traceback: the previous
            # message alone could not distinguish "extractor not registered"
            # from "extractor crashed on this document".
            logger.debug(
                "Universal extractor '%s' unavailable", extractor_name, exc_info=True
            )

    return hints
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, field_validator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Line(BaseModel):
    """A single line of text in the document after reflow."""

    index: int  # position of this line within the reflowed document
    text: str  # the line's text content
    original_indices: list[int] = []  # presumably indices of the pre-reflow source lines — confirm in reconstructor
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LabelHit(BaseModel):
    """Position of one label found in the text."""

    label: str  # canonical label that was matched
    aliases_matched: list[str] = []  # alias strings that actually produced the match
    section_id: str  # id of the Section the hit belongs to
    line_index: int  # index of the line containing the match
    char_start: int  # start offset of the match (presumably within the line — confirm)
    char_end: int  # end offset of the match
    fuzzy_score: float = 1.0  # match quality; defaults to 1.0 (exact match)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Section(BaseModel):
    """A section of the document, grouping related lines."""

    id: str  # unique section identifier
    title: str | None = None  # optional section heading
    lines: list[Line] = []  # lines belonging to this section
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Candidate(BaseModel):
    """One extracted value for a field."""

    key: str  # field key this value was extracted for
    value: Any  # parsed/normalized value
    raw: str | None = None  # original raw text the value was derived from
    extractor: str  # name of the extractor that produced this candidate
    sources: list[str] = []  # provenance tags (e.g. "structured:mrz")
    section_id: str | None = None  # section the value was found in, if known
    line_index: int | None = None  # line the value was found on, if known
    span: tuple[int, int] | None = None  # character span of the match, if known
    confidence: float = 0.0  # extractor confidence, validated to [0, 1]
    needs_llm_review: bool = False  # flags the value for downstream LLM review
    needs_vision: bool = False  # flags the value for downstream vision review
    conflict: bool = False  # presumably set when sources disagree on this key — confirm in reconciler
    notes: list[str] = []  # free-form annotations accumulated by stages

    @field_validator("confidence")
    @classmethod
    def confidence_range(cls, v: float) -> float:
        """Reject confidence values outside the closed interval [0, 1]."""
        if not 0.0 <= v <= 1.0:
            raise ValueError(f"confidence must be in [0, 1], got {v}")
        return v
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class StageTrace(BaseModel):
    """Debug trace entry recording one stage's execution."""

    stage: str  # stage (function) name
    duration_ms: float  # wall-clock duration in milliseconds
    input_hash: str | None = None  # MD5 of the input, to detect changes
    diff: str | None = None  # unified diff between the stage's input and output
    extra: dict[str, Any] = {}  # extra data, specific to each stage
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class CrossCheck(BaseModel):
    """Result of cross-checking different extraction sources for one field."""

    field_key: str  # field being cross-checked
    sources: list[str]  # list of sources participating in the cross-check
    matched: bool  # True when every source agrees
    detail: str | None = None  # detailed description (internal, used for logging)
    values: list[str] = []  # unique values found (used by the renderer)
    value_sources: list[dict[str, Any]] = []  # [{value, extractor, confidence}] per value
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class ProcessedDocument(BaseModel):
    """Final pipeline result: profile choice, extracted data, renderings."""

    profile_id: str
    profile_score: float
    profile_display_name: str = ""  # profile display name (used by the renderer)
    canonical_text: str = ""  # normalized text after the pipeline (renderer reference context)
    sections: list[Section] = []
    candidates: list[Candidate] = []
    hints: dict[str, list[Any]] = {}
    cross_checks: list[CrossCheck] = []
    warnings: list[str] = []
    overall_confidence: float = 0.0
    debug_trace: list[StageTrace] = []
    markdown: str = ""
    field_labels: dict[str, str] = {}  # field key → VN label (aliases[0]) for the renderer

    def get(self, key: str) -> Candidate | None:
        """Return first candidate matching key."""
        return next((cand for cand in self.candidates if cand.key == key), None)

    def to_json(self) -> dict[str, Any]:
        """Serialise to plain dict (use json_renderer.to_json() for JSON string output)."""
        return self.model_dump()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class PipelineContext(BaseModel):
    """Mutable state shared across pipeline stages."""

    raw_text: str  # initial raw OCR text
    normalized_text: str = ""  # text after stage 1 (Normalizer)
    profile: Any | None = None  # selected DocumentProfile (forward ref avoids circular import)
    classification_score: float = 0.0  # profile classification score (0–1)
    sections: list[Section] = []  # sections after stage 4 (Reconstructor)
    label_index: dict[str, list[LabelHit]] = (
        {}
    )  # label → positions index, used by label-anchor extractors
    candidates: list[Candidate] = []  # all extracted values
    cross_checks: list[CrossCheck] = []  # cross-check results from stage 7 (Reconciler)
    hints: dict[str, list[Any]] = {}  # auxiliary data from the universal extractor
    warnings: list[str] = []  # warnings accumulated across stages
    overall_confidence: float = 0.0  # aggregate score from stage 8 (Scorer)
    debug_trace: list[StageTrace] = []  # detailed per-stage trace (only when debug=True)

    model_config = {"arbitrary_types_allowed": True}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Pipeline orchestrator."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Callable
|
|
9
|
+
|
|
10
|
+
from ocr_postprocess.models import PipelineContext, ProcessedDocument, StageTrace
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# A pipeline stage is any callable that mutates the shared PipelineContext
# in place and returns nothing.
Stage = Callable[[PipelineContext], None]

# Default profiles directory bundled with the package.
# When installed via pip, profiles live next to this file inside the package.
_BUNDLED_PROFILES_DIR = Path(__file__).parent / "profiles"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Pipeline:
    """9-stage OCR post-processing pipeline.

    Holds an ordered list of stage callables plus the document profiles
    loaded from ``profiles_dir``; :meth:`process` runs the stages in order
    over a shared :class:`PipelineContext` and assembles the result.
    """

    def __init__(
        self,
        stages: list[Stage],
        profiles_dir: str | Path = "profiles",
    ) -> None:
        """Create a pipeline from an explicit stage list.

        Args:
            stages: Ordered stage callables executed by :meth:`process`.
            profiles_dir: Directory holding the YAML document profiles.
        """
        self._stages = stages
        self._profiles_dir = Path(profiles_dir)
        self._profiles: dict = {}  # profile_id → profile; filled by _load_profiles()

    @classmethod
    def from_default(cls, profiles_dir: str | Path | None = None) -> "Pipeline":
        """Factory: build pipeline with all default stages loaded.

        Args:
            profiles_dir: Path to YAML profiles directory. Defaults to the
                profiles bundled with the package so no external files are
                required after ``pip install``.
        """
        # Import lazily to avoid circular imports at module level
        from ocr_postprocess.classifier import classify_stage
        from ocr_postprocess.engine.denoiser import denoise_stage
        from ocr_postprocess.engine.extractor_stage import extract_stage
        from ocr_postprocess.engine.normalizer import normalize_stage
        from ocr_postprocess.engine.reconciler import reconcile_stage
        from ocr_postprocess.engine.reconstructor import reconstruct_stage
        from ocr_postprocess.engine.transform_stage import transform_stage
        from ocr_postprocess.renderer.markdown import render_stage
        from ocr_postprocess.scorer import score_stage

        stages: list[Stage] = [
            normalize_stage,
            classify_stage,
            denoise_stage,
            reconstruct_stage,
            extract_stage,
            transform_stage,
            reconcile_stage,
            score_stage,
            render_stage,
        ]
        resolved_dir = Path(profiles_dir) if profiles_dir is not None else _BUNDLED_PROFILES_DIR
        instance = cls(stages=stages, profiles_dir=resolved_dir)
        instance._load_profiles()
        return instance

    def _load_profiles(self) -> None:
        """Load all YAML profiles from profiles_dir into memory."""
        from ocr_postprocess.profiles.loader import load_profiles

        self._profiles = load_profiles(self._profiles_dir)
        logger.info("Pipeline loaded %d profiles", len(self._profiles))

    def classify(self, raw: str) -> tuple[str, float]:
        """Classify raw text and return (profile_id, score).

        Falls back to the ``_generic`` profile when no profile scores at
        least 0.5.
        """
        from ocr_postprocess.profiles.matcher import evaluate

        best_id = "_generic"
        best_score = 0.0

        for pid, profile in self._profiles.items():
            if pid.startswith("_"):
                continue  # skip fallback profiles
            score = evaluate(profile.classify, raw)
            if score > best_score:
                best_score = score
                best_id = pid

        if best_score < 0.5:
            best_id = "_generic"

        return best_id, best_score

    def process(self, raw: str, debug: bool = False) -> ProcessedDocument:
        """Run full pipeline on raw OCR text.

        Args:
            raw: Raw text from OCR engine. Must be a non-empty string.
            debug: When True, attach per-stage timing to the returned document.

        Raises:
            ValueError: If *raw* is not a non-empty string.
        """
        if not isinstance(raw, str):
            raise ValueError(f"raw must be a str, got {type(raw).__name__}")
        if not raw.strip():
            raise ValueError("raw text is empty — nothing to process")

        ctx = PipelineContext(raw_text=raw)

        # Inject loaded profiles into context using __dict__ bypass because
        # PipelineContext is a Pydantic model with no _profiles field — we store
        # transient data here without modifying the public schema.
        ctx.__dict__["_profiles"] = self._profiles

        logger.info(
            "Pipeline starting — %d chars, %d profile(s) loaded",
            len(raw),
            len(self._profiles),
        )

        for stage in self._stages:
            stage_name = getattr(stage, "__name__", str(stage))
            logger.debug("[%s] start", stage_name)
            t0 = time.perf_counter()
            try:
                stage(ctx)
            except Exception:
                # Fail fast: a broken stage invalidates everything downstream.
                # (The previous `failed_stages` accumulator was dead code — the
                # re-raise meant the list could never be read — so it is gone.)
                logger.exception("[%s] FATAL — stage raised an exception", stage_name)
                raise
            finally:
                # Timing is recorded even when the stage raised.
                elapsed_ms = (time.perf_counter() - t0) * 1000
                logger.debug("[%s] done in %.1f ms", stage_name, elapsed_ms)
                if debug:
                    ctx.debug_trace.append(StageTrace(stage=stage_name, duration_ms=elapsed_ms))

        from ocr_postprocess.renderer.markdown import render_markdown

        profile_id = ctx.profile.id if ctx.profile else "_unknown"
        profile_display_name = ctx.profile.display_name if ctx.profile else ""
        field_labels: dict[str, str] = {}
        if ctx.profile:
            for fdef in ctx.profile.fields:
                if fdef.aliases:
                    field_labels[fdef.key] = fdef.aliases[0]

        doc = ProcessedDocument(
            profile_id=profile_id,
            profile_score=ctx.classification_score,
            profile_display_name=profile_display_name,
            canonical_text=ctx.normalized_text,
            sections=ctx.sections,
            candidates=ctx.candidates,
            cross_checks=ctx.cross_checks,
            hints=ctx.hints,
            warnings=ctx.warnings,
            overall_confidence=ctx.overall_confidence,
            debug_trace=ctx.debug_trace,
            field_labels=field_labels,
        )
        try:
            doc.markdown = render_markdown(doc)
        except Exception:
            # Markdown is a convenience rendering; the structured result is
            # still valid, so degrade to an empty string instead of failing.
            logger.exception("render_markdown raised an exception; markdown output will be empty")
            doc.markdown = ""

        logger.info(
            "Pipeline done — profile=%s score=%.2f confidence=%.2f candidates=%d warnings=%d",
            profile_id,
            ctx.classification_score,
            ctx.overall_confidence,
            len(ctx.candidates),
            len(ctx.warnings),
        )
        return doc
|
|
File without changes
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
id: cccd_2024
|
|
2
|
+
version: 1
|
|
3
|
+
display_name: "Căn cước công dân (mẫu 2024)"
|
|
4
|
+
language: ["vi", "en"]
|
|
5
|
+
|
|
6
|
+
classify:
|
|
7
|
+
any_of:
|
|
8
|
+
- all_of:
|
|
9
|
+
- contains_any: ["CĂN CƯỚC", "Citizen Identity"]
|
|
10
|
+
- regex: '\b\d{12}\b'
|
|
11
|
+
- contains_any: ["Số định danh cá nhân", "Personal identification number"]
|
|
12
|
+
|
|
13
|
+
denoise:
|
|
14
|
+
drop_lines:
|
|
15
|
+
regex: ['^[\W_]{3,}$', '^Page \d+/\d+$']
|
|
16
|
+
contains_any: ["SOCIALIST REPUBLIC OF VIET NAM"]
|
|
17
|
+
collapse_repeats: true
|
|
18
|
+
|
|
19
|
+
reconstruct:
|
|
20
|
+
bilingual_pairs:
|
|
21
|
+
- ["Họ và tên", "Full name"]
|
|
22
|
+
- ["Ngày, tháng, năm sinh", "Date of birth"]
|
|
23
|
+
- ["Giới tính", "Sex"]
|
|
24
|
+
- ["Quốc tịch", "Nationality"]
|
|
25
|
+
- ["Quê quán", "Place of origin"]
|
|
26
|
+
- ["Nơi thường trú", "Place of residence"]
|
|
27
|
+
- ["Có giá trị đến", "Date of expiry"]
|
|
28
|
+
- ["Số định danh cá nhân", "Personal identification number"]
|
|
29
|
+
fuzzy_threshold: 0.85
|
|
30
|
+
split_glued_labels: true
|
|
31
|
+
rejoin_wrapped_lines: true
|
|
32
|
+
|
|
33
|
+
extract:
|
|
34
|
+
- name: so_cccd
|
|
35
|
+
aliases: ["Số", "Số định danh", "Personal identification number"]
|
|
36
|
+
extractor: cccd
|
|
37
|
+
required: true
|
|
38
|
+
cross_check_with: [mrz_cccd.so]  # NOTE(review): the MRZ extractor emits sub-key "id_number", not "so" — verify this key or the cross-check can never match
|
|
39
|
+
|
|
40
|
+
- name: ho_va_ten
|
|
41
|
+
aliases: ["Họ và tên", "Full name"]
|
|
42
|
+
extractor: value_in_same_line
|
|
43
|
+
required: true
|
|
44
|
+
cross_check_with: [mrz_cccd.ho_ten]  # NOTE(review): MRZ extractor emits "name_ascii"/"surname"/"given_names", not "ho_ten" — verify
|
|
45
|
+
|
|
46
|
+
- name: ngay_sinh
|
|
47
|
+
aliases: ["Ngày, tháng, năm sinh", "Ngày sinh", "Date of birth"]
|
|
48
|
+
extractor: regex_after_label
|
|
49
|
+
pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
|
|
50
|
+
fuzzy_label: false
|
|
51
|
+
next_lines: 1
|
|
52
|
+
required: true
|
|
53
|
+
transform: [{op: to_date}]
|
|
54
|
+
cross_check_with: [mrz_cccd.ngay_sinh]  # NOTE(review): MRZ extractor emits "dob", not "ngay_sinh" — verify
|
|
55
|
+
|
|
56
|
+
- name: gioi_tinh
|
|
57
|
+
aliases: ["Giới tính", "Sex"]
|
|
58
|
+
extractor: gender_vn
|
|
59
|
+
required: true
|
|
60
|
+
cross_check_with: [mrz_cccd.gioi_tinh]  # NOTE(review): MRZ extractor emits "sex", not "gioi_tinh" — verify
|
|
61
|
+
|
|
62
|
+
- name: quoc_tich
|
|
63
|
+
aliases: ["Quốc tịch", "Nationality"]
|
|
64
|
+
extractor: value_in_same_line
|
|
65
|
+
default: "Việt Nam"
|
|
66
|
+
|
|
67
|
+
- name: que_quan
|
|
68
|
+
# Hỗ trợ cả mẫu cũ ("Quê quán") và mẫu 2024 ("Place of birth" biểu diễn trên dòng song ngữ)
|
|
69
|
+
aliases: ["Quê quán", "Place of origin", "Place of birth"]
|
|
70
|
+
extractor: text_until_next_label
|
|
71
|
+
fuzzy_label: false
|
|
72
|
+
stop_labels: ["Nơi thường trú", "Place of residence", "Nơi cư trú", "Ngày, tháng, năm cấp", "Date of issue"]
|
|
73
|
+
required: true
|
|
74
|
+
|
|
75
|
+
- name: noi_thuong_tru
|
|
76
|
+
# Hỗ trợ cả mẫu cũ ("Nơi thường trú") và mẫu 2024 ("Place of residence" trên dòng "Nơi cư trú /Place of residence")
|
|
77
|
+
aliases: ["Nơi thường trú", "Place of residence"]
|
|
78
|
+
extractor: text_until_next_label
|
|
79
|
+
fuzzy_label: false
|
|
80
|
+
stop_labels: ["Có giá trị đến", "Date of expiry", "Số định danh", "Nơi đăng ký khai sinh", "Place of birth", "Quê quán", "Place of origin"]
|
|
81
|
+
required: true
|
|
82
|
+
|
|
83
|
+
- name: ngay_het_han
|
|
84
|
+
aliases: ["Có giá trị đến", "Date of expiry"]
|
|
85
|
+
extractor: regex_after_label
|
|
86
|
+
pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
|
|
87
|
+
fuzzy_label: false
|
|
88
|
+
next_lines: 1
|
|
89
|
+
transform: [{op: to_date}]
|
|
90
|
+
cross_check_with: [mrz_cccd.ngay_het_han]  # NOTE(review): MRZ extractor emits "expiry", not "ngay_het_han" — verify
|
|
91
|
+
|
|
92
|
+
- name: mrz_cccd
|
|
93
|
+
extractor: mrz_cccd
|
|
94
|
+
optional: true
|
|
95
|
+
|
|
96
|
+
compute:
|
|
97
|
+
- name: tuoi
|
|
98
|
+
expr: "year_now() - year(ngay_sinh)"
|
|
99
|
+
deps: [ngay_sinh]
|
|
100
|
+
|
|
101
|
+
output:
|
|
102
|
+
markdown:
|
|
103
|
+
title: "Căn cước công dân"
|
|
104
|
+
sections:
|
|
105
|
+
- heading: "Thông tin cá nhân"
|
|
106
|
+
fields: [ho_va_ten, ngay_sinh, tuoi, gioi_tinh, quoc_tich]
|
|
107
|
+
- heading: "Định danh"
|
|
108
|
+
fields: [so_cccd, ngay_het_han]
|
|
109
|
+
- heading: "Địa chỉ"
|
|
110
|
+
fields: [que_quan, noi_thuong_tru]
|
|
111
|
+
json:
|
|
112
|
+
include_trace: true
|
|
113
|
+
include_candidates: true
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
id: dang_kiem
|
|
2
|
+
version: 1
|
|
3
|
+
display_name: "Giấy chứng nhận đăng kiểm xe"
|
|
4
|
+
language: ["vi"]
|
|
5
|
+
|
|
6
|
+
classify:
|
|
7
|
+
any_of:
|
|
8
|
+
- contains_any: ["GIẤY CHỨNG NHẬN KIỂM ĐỊNH", "ĐĂNG KIỂM"]
|
|
9
|
+
- all_of:
|
|
10
|
+
- contains_any: ["Biển số đăng ký", "Biển số"]
|
|
11
|
+
- contains_any: ["Số máy", "Số khung"]
|
|
12
|
+
|
|
13
|
+
denoise:
|
|
14
|
+
drop_lines:
|
|
15
|
+
regex: ['^[\W_]{3,}$']
|
|
16
|
+
collapse_repeats: true
|
|
17
|
+
|
|
18
|
+
reconstruct:
|
|
19
|
+
fuzzy_threshold: 0.85
|
|
20
|
+
split_glued_labels: true
|
|
21
|
+
rejoin_wrapped_lines: true
|
|
22
|
+
|
|
23
|
+
extract:
|
|
24
|
+
- name: bien_so
|
|
25
|
+
aliases: ["Biển số đăng ký", "Biển số"]
|
|
26
|
+
extractor: plate_vn
|
|
27
|
+
required: true
|
|
28
|
+
|
|
29
|
+
- name: nhan_hieu
|
|
30
|
+
aliases: ["Nhãn hiệu"]
|
|
31
|
+
extractor: value_in_same_line
|
|
32
|
+
required: true
|
|
33
|
+
|
|
34
|
+
- name: so_loai
|
|
35
|
+
aliases: ["Số loại"]
|
|
36
|
+
extractor: value_in_same_line
|
|
37
|
+
|
|
38
|
+
- name: so_may
|
|
39
|
+
aliases: ["Số máy"]
|
|
40
|
+
extractor: value_in_same_line
|
|
41
|
+
required: true
|
|
42
|
+
|
|
43
|
+
- name: so_khung
|
|
44
|
+
aliases: ["Số khung"]
|
|
45
|
+
extractor: value_in_same_line
|
|
46
|
+
required: true
|
|
47
|
+
|
|
48
|
+
- name: nam_san_xuat
|
|
49
|
+
aliases: ["Năm sản xuất"]
|
|
50
|
+
extractor: value_in_same_line
|
|
51
|
+
transform: [{op: to_int}]
|
|
52
|
+
|
|
53
|
+
- name: niên_hạn  # NOTE(review): only non-ASCII field key in this profile; every other key is ASCII snake_case ("nien_han") — confirm this is intentional (it is referenced again under output.markdown)
|
|
54
|
+
aliases: ["Niên hạn", "Niên hạn sử dụng"]
|
|
55
|
+
extractor: value_in_same_line
|
|
56
|
+
|
|
57
|
+
- name: trong_luong_ban_than
|
|
58
|
+
aliases: ["Trọng lượng bản thân", "Khối lượng bản thân"]
|
|
59
|
+
extractor: value_in_same_line
|
|
60
|
+
transform: [{op: strip_unit, units: ["kg"]}, {op: to_int}]
|
|
61
|
+
|
|
62
|
+
- name: so_nguoi_cho_phep
|
|
63
|
+
aliases: ["Số người cho phép chở", "Cho phép chở"]
|
|
64
|
+
extractor: value_in_same_line
|
|
65
|
+
transform: [{op: regex_group, pattern: '(\d+)'}, {op: to_int}]
|
|
66
|
+
|
|
67
|
+
- name: chu_so_huu
|
|
68
|
+
aliases: ["Tên chủ xe", "Chủ sở hữu", "Chủ xe"]
|
|
69
|
+
extractor: value_in_same_line
|
|
70
|
+
required: true
|
|
71
|
+
|
|
72
|
+
- name: dia_chi
|
|
73
|
+
aliases: ["Địa chỉ"]
|
|
74
|
+
extractor: text_until_next_label
|
|
75
|
+
stop_labels: ["Số máy", "Biển số", "Có giá trị đến"]
|
|
76
|
+
|
|
77
|
+
- name: ngay_kiem_dinh
|
|
78
|
+
aliases: ["Ngày kiểm định", "Kiểm định ngày"]
|
|
79
|
+
extractor: regex_after_label
|
|
80
|
+
pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
|
|
81
|
+
transform: [{op: to_date}]
|
|
82
|
+
|
|
83
|
+
- name: co_gia_tri_den
|
|
84
|
+
aliases: ["Có giá trị đến"]
|
|
85
|
+
extractor: regex_after_label
|
|
86
|
+
pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
|
|
87
|
+
transform: [{op: to_date}]
|
|
88
|
+
required: true
|
|
89
|
+
|
|
90
|
+
output:
|
|
91
|
+
markdown:
|
|
92
|
+
title: "Giấy chứng nhận đăng kiểm"
|
|
93
|
+
sections:
|
|
94
|
+
- heading: "Thông tin xe"
|
|
95
|
+
fields: [bien_so, nhan_hieu, so_loai, nam_san_xuat, niên_hạn]
|
|
96
|
+
- heading: "Số máy / khung"
|
|
97
|
+
fields: [so_may, so_khung]
|
|
98
|
+
- heading: "Tải trọng"
|
|
99
|
+
fields: [trong_luong_ban_than, so_nguoi_cho_phep]
|
|
100
|
+
- heading: "Chủ sở hữu"
|
|
101
|
+
fields: [chu_so_huu, dia_chi]
|
|
102
|
+
- heading: "Kiểm định"
|
|
103
|
+
fields: [ngay_kiem_dinh, co_gia_tri_den]
|
|
104
|
+
json:
|
|
105
|
+
include_trace: true
|