ocr-postprocess 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. ocr_postprocess/__init__.py +33 -0
  2. ocr_postprocess/classifier.py +63 -0
  3. ocr_postprocess/cli.py +130 -0
  4. ocr_postprocess/engine/__init__.py +0 -0
  5. ocr_postprocess/engine/denoiser.py +134 -0
  6. ocr_postprocess/engine/extractor_stage.py +107 -0
  7. ocr_postprocess/engine/normalizer.py +128 -0
  8. ocr_postprocess/engine/reconciler.py +170 -0
  9. ocr_postprocess/engine/reconstructor.py +469 -0
  10. ocr_postprocess/engine/transform_stage.py +89 -0
  11. ocr_postprocess/exceptions.py +30 -0
  12. ocr_postprocess/extractors/__init__.py +0 -0
  13. ocr_postprocess/extractors/base.py +103 -0
  14. ocr_postprocess/extractors/helpers.py +63 -0
  15. ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  16. ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
  17. ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
  18. ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
  19. ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
  20. ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
  21. ocr_postprocess/extractors/pattern/__init__.py +0 -0
  22. ocr_postprocess/extractors/pattern/cccd.py +120 -0
  23. ocr_postprocess/extractors/pattern/cmnd.py +38 -0
  24. ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
  25. ocr_postprocess/extractors/pattern/date.py +89 -0
  26. ocr_postprocess/extractors/pattern/email.py +38 -0
  27. ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
  28. ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
  29. ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
  30. ocr_postprocess/extractors/pattern/tax_code.py +53 -0
  31. ocr_postprocess/extractors/registry.py +45 -0
  32. ocr_postprocess/extractors/structured/__init__.py +0 -0
  33. ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
  34. ocr_postprocess/extractors/universal.py +39 -0
  35. ocr_postprocess/models.py +131 -0
  36. ocr_postprocess/pipeline.py +179 -0
  37. ocr_postprocess/profiles/__init__.py +0 -0
  38. ocr_postprocess/profiles/_generic.yml +13 -0
  39. ocr_postprocess/profiles/cccd_2024.yml +113 -0
  40. ocr_postprocess/profiles/dang_kiem.yml +105 -0
  41. ocr_postprocess/profiles/loader.py +63 -0
  42. ocr_postprocess/profiles/matcher.py +71 -0
  43. ocr_postprocess/profiles/schema.py +197 -0
  44. ocr_postprocess/py.typed +0 -0
  45. ocr_postprocess/renderer/__init__.py +0 -0
  46. ocr_postprocess/renderer/json_renderer.py +59 -0
  47. ocr_postprocess/renderer/llm.py +41 -0
  48. ocr_postprocess/renderer/markdown.py +172 -0
  49. ocr_postprocess/scorer.py +78 -0
  50. ocr_postprocess/transformer.py +304 -0
  51. ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
  52. ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
  53. ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
  54. ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
  55. ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,111 @@
1
+ """Structured extractor: MRZ CCCD (ICAO 9303) lines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ import regex as re
8
+
9
+ from ocr_postprocess.extractors.base import StructuredExtractor
10
+ from ocr_postprocess.extractors.registry import register
11
+ from ocr_postprocess.models import Candidate, PipelineContext
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ _MRZ_LINE = re.compile(r"[A-Z0-9<]{20,}")
16
+
17
+ _MRZ_CHARSET: dict[str, int] = {str(i): i for i in range(10)}
18
+ _MRZ_CHARSET.update({chr(c): c - ord("A") + 10 for c in range(ord("A"), ord("Z") + 1)})
19
+ _MRZ_CHARSET["<"] = 0
20
+
21
+ _WEIGHTS = [7, 3, 1]
22
+
23
+
24
+ def _mrz_checksum(s: str) -> int:
25
+ total = 0
26
+ for i, ch in enumerate(s):
27
+ total += _MRZ_CHARSET.get(ch, 0) * _WEIGHTS[i % 3]
28
+ return total % 10
29
+
30
+
31
+ def _check_field(data: str) -> bool:
32
+ """Verify last char is the checksum of preceding chars."""
33
+ if len(data) < 2:
34
+ return False
35
+ return _mrz_checksum(data[:-1]) == int(data[-1]) if data[-1].isdigit() else False
36
+
37
+
38
+ def _parse_mrz(lines: list[str]) -> dict[str, str] | None:
39
+ """Parse 2-line or 3-line MRZ from CCCD/CMND."""
40
+ if len(lines) < 2:
41
+ return None
42
+
43
+ result: dict[str, str] = {}
44
+
45
+ if len(lines) >= 2 and len(lines[0]) >= 30:
46
+ line1 = lines[0]
47
+ # Line 1: doc type (2), country (3), names
48
+ result["doc_type"] = line1[:2].strip("<")
49
+ result["country"] = line1[2:5].strip("<")
50
+ name_part = line1[5:] if len(line1) > 5 else ""
51
+ if "<<" in name_part:
52
+ surname, given = name_part.split("<<", 1)
53
+ result["surname"] = surname.replace("<", " ").strip()
54
+ result["given_names"] = given.replace("<", " ").strip()
55
+ result["name_ascii"] = name_part.replace("<", " ").strip()
56
+
57
+ if len(lines) >= 2 and len(lines[1]) >= 28:
58
+ line2 = lines[1]
59
+ result["id_number"] = line2[:9].strip("<")
60
+ result["dob"] = _to_date(line2[13:19]) if len(line2) > 19 else ""
61
+ result["sex"] = line2[20] if len(line2) > 20 else ""
62
+ result["expiry"] = _to_date(line2[21:27]) if len(line2) > 27 else ""
63
+ result["nationality"] = line2[27:30].strip("<") if len(line2) > 30 else ""
64
+
65
+ return result
66
+
67
+
68
+ def _to_date(s: str) -> str:
69
+ """Convert YYMMDD to YYYY-MM-DD (assume 1900/2000 cutoff 30)."""
70
+ if len(s) < 6:
71
+ return s
72
+ try:
73
+ yy, mm, dd = int(s[:2]), int(s[2:4]), int(s[4:6])
74
+ except ValueError:
75
+ return s # malformed MRZ date field
76
+ year = 2000 + yy if yy <= 30 else 1900 + yy
77
+ return f"{year:04d}-{mm:02d}-{dd:02d}"
78
+
79
+
80
+ @register("mrz_cccd")
81
+ class MrzCccdExtractor(StructuredExtractor):
82
+ """Extract structured MRZ data from CCCD documents."""
83
+
84
+ def extract(self, ctx: PipelineContext, field=None) -> list[Candidate]:
85
+ text = ctx.normalized_text or ctx.raw_text
86
+ mrz_lines: list[str] = [m.group(0) for m in _MRZ_LINE.finditer(text)]
87
+
88
+ if len(mrz_lines) < 2:
89
+ return []
90
+
91
+ parsed = _parse_mrz(mrz_lines[:3])
92
+ if not parsed:
93
+ return []
94
+
95
+ prefix = "mrz_cccd"
96
+ candidates: list[Candidate] = []
97
+ for sub_key, value in parsed.items():
98
+ if value:
99
+ candidates.append(
100
+ Candidate(
101
+ key=f"{prefix}.{sub_key}",
102
+ value=value,
103
+ raw=value,
104
+ extractor="mrz_cccd",
105
+ sources=["structured:mrz"],
106
+ confidence=0.92,
107
+ )
108
+ )
109
+
110
+ logger.debug("MRZ CCCD: extracted %d candidates", len(candidates))
111
+ return candidates
@@ -0,0 +1,39 @@
1
+ """Universal extractor: runs all pattern+structured extractors for hints."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ocr_postprocess.models import PipelineContext
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
# Extractors run unconditionally on every document (regardless of the matched
# profile) to populate the hints dict consumed downstream.
_UNIVERSAL_EXTRACTORS = [
    "phone_vn",
    "date",
    "email",
    "currency_vnd",
    "plate_vn",
    "tax_code",
    "cccd",
    "cmnd",
    "mrz_cccd",
]


def universal_extract(ctx: PipelineContext) -> dict[str, list]:
    """Run all universal extractors and return a hints dict.

    Args:
        ctx: Shared pipeline context providing the text to scan.

    Returns:
        Mapping of extractor name to the list of values it produced; extractors
        that yielded nothing (or failed) are omitted.

    Extractors are best-effort: any exception is logged at DEBUG level and
    the extractor is skipped, so one broken extractor never aborts the run.
    """
    # Imported here to avoid a circular import at module level.
    from ocr_postprocess.extractors import registry

    hints: dict[str, list] = {}

    for extractor_name in _UNIVERSAL_EXTRACTORS:
        try:
            ext = registry.get_instance(extractor_name)
            candidates = ext.extract(ctx, field=None)
            if candidates:
                hints[extractor_name] = [c.value for c in candidates]
        except Exception:
            # FIX: the previous message claimed the extractor was "unavailable"
            # and discarded the exception, hiding genuine extractor bugs.
            # Keep the best-effort behavior but record the traceback.
            logger.debug(
                "Universal extractor '%s' failed; skipping", extractor_name, exc_info=True
            )

    return hints
@@ -0,0 +1,131 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, field_validator
6
+
7
+
8
class Line(BaseModel):
    """A single text line in the document after reflow."""

    index: int  # position of the line in the reflowed document
    text: str  # line content
    original_indices: list[int] = []  # indices of the raw lines merged into this one
14
+
15
+
16
class LabelHit(BaseModel):
    """Location of one label found in the text."""

    label: str  # canonical label that was matched
    aliases_matched: list[str] = []  # which alias spellings actually matched
    section_id: str  # section the hit was found in
    line_index: int  # line the hit was found on
    char_start: int  # start offset of the match within the line
    char_end: int  # end offset of the match within the line
    fuzzy_score: float = 1.0  # match quality; 1.0 means an exact match
26
+
27
+
28
class Section(BaseModel):
    """A section of the document grouping related lines."""

    id: str  # stable section identifier
    title: str | None = None  # optional section heading
    lines: list[Line] = []  # reflowed lines belonging to this section
34
+
35
+
36
class Candidate(BaseModel):
    """One extracted value for a single field.

    Several candidates may exist for the same ``key``; downstream stages
    rank and reconcile them. Mutable defaults are safe here: Pydantic copies
    them per instance.
    """

    key: str  # field key, e.g. "so_cccd" or "mrz_cccd.dob"
    value: Any  # parsed / transformed value
    raw: str | None = None  # original text as found in the document
    extractor: str  # name of the extractor that produced this candidate
    sources: list[str] = []  # provenance tags, e.g. "structured:mrz"
    section_id: str | None = None  # section the value was found in, if known
    line_index: int | None = None  # line the value was found on, if known
    span: tuple[int, int] | None = None  # character span of the match
    confidence: float = 0.0  # extractor confidence, validated to [0, 1]
    needs_llm_review: bool = False  # flag the value for LLM review downstream
    needs_vision: bool = False  # flag the value for a vision re-check downstream
    conflict: bool = False  # presumably set when sources disagree — see reconciler
    notes: list[str] = []  # free-form diagnostic notes

    @field_validator("confidence")
    @classmethod
    def confidence_range(cls, v: float) -> float:
        """Reject confidence values outside the closed interval [0, 1]."""
        if not 0.0 <= v <= 1.0:
            raise ValueError(f"confidence must be in [0, 1], got {v}")
        return v
59
+
60
+
61
class StageTrace(BaseModel):
    """Debug trace entry recording execution info for one pipeline stage."""

    stage: str  # stage name (the stage callable's __name__)
    duration_ms: float  # wall-clock duration of the stage
    input_hash: str | None = None  # MD5 of the input, to detect changes
    diff: str | None = None  # unified diff between stage input and output
    extra: dict[str, Any] = {}  # stage-specific extra data
69
+
70
+
71
class CrossCheck(BaseModel):
    """Result of cross-checking one field across different extraction sources."""

    field_key: str  # field that was cross-checked
    sources: list[str]  # sources that participated in the cross-check
    matched: bool  # True when all sources agree
    detail: str | None = None  # detailed description (internal, used for logging)
    values: list[str] = []  # unique values found (used by the renderer)
    value_sources: list[dict[str, Any]] = []  # [{value, extractor, confidence}] per value
80
+
81
+
82
class ProcessedDocument(BaseModel):
    """Final pipeline result returned to callers."""

    profile_id: str  # id of the matched profile, or "_unknown"
    profile_score: float  # classification score of the matched profile
    profile_display_name: str = ""  # human-readable profile name (used by the renderer)
    canonical_text: str = (
        ""  # normalized text after the pipeline (renderer reference context)
    )
    sections: list[Section] = []  # document sections after reconstruction
    candidates: list[Candidate] = []  # all extracted values
    hints: dict[str, list[Any]] = {}  # auxiliary values from the universal extractors
    cross_checks: list[CrossCheck] = []  # cross-source agreement results
    warnings: list[str] = []  # warnings accumulated across stages
    overall_confidence: float = 0.0  # aggregate confidence score
    debug_trace: list[StageTrace] = []  # per-stage traces (populated when debug=True)
    markdown: str = ""  # rendered markdown (left empty if rendering fails)
    field_labels: dict[str, str] = {}  # field key -> display label (first alias), for the renderer

    def get(self, key: str) -> Candidate | None:
        """Return first candidate matching key, or None when absent."""
        for c in self.candidates:
            if c.key == key:
                return c
        return None

    def to_json(self) -> dict[str, Any]:
        """Serialise to plain dict (use json_renderer.to_json() for JSON string output)."""
        return self.model_dump()
111
+
112
+
113
class PipelineContext(BaseModel):
    """Mutable state shared across pipeline stages."""

    raw_text: str  # original raw OCR text
    normalized_text: str = ""  # text after stage 1 (Normalizer)
    profile: Any | None = None  # selected DocumentProfile (Any avoids a circular import)
    classification_score: float = 0.0  # profile classification score (0-1)
    sections: list[Section] = []  # sections produced by stage 4 (Reconstructor)
    label_index: dict[str, list[LabelHit]] = (
        {}
    )  # label -> positions index, used by label-anchor extractors
    candidates: list[Candidate] = []  # all extracted values
    cross_checks: list[CrossCheck] = []  # cross-check results from stage 7 (Reconciler)
    hints: dict[str, list[Any]] = {}  # auxiliary data from the universal extractor
    warnings: list[str] = []  # warnings accumulated across stages
    overall_confidence: float = 0.0  # aggregate score from stage 8 (Scorer)
    debug_trace: list[StageTrace] = []  # per-stage trace (only when debug=True)

    # Allow non-Pydantic values (e.g. the injected profile object) as fields.
    model_config = {"arbitrary_types_allowed": True}
@@ -0,0 +1,179 @@
1
+ """Pipeline orchestrator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Callable
9
+
10
+ from ocr_postprocess.models import PipelineContext, ProcessedDocument, StageTrace
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ Stage = Callable[[PipelineContext], None]
15
+
16
+ # Default profiles directory bundled with the package.
17
+ # When installed via pip, profiles live next to this file inside the package.
18
+ _BUNDLED_PROFILES_DIR = Path(__file__).parent / "profiles"
19
+
20
+
21
class Pipeline:
    """9-stage OCR post-processing pipeline.

    Stages are plain callables that mutate a shared ``PipelineContext`` in
    order; ``process()`` orchestrates them and assembles the final
    ``ProcessedDocument``.
    """

    def __init__(
        self,
        stages: list[Stage],
        profiles_dir: str | Path = "profiles",
    ) -> None:
        """Store the stage list and the YAML profiles directory.

        Profiles are NOT loaded here; ``_load_profiles()`` (called by
        ``from_default``) populates ``self._profiles``.
        """
        self._stages = stages
        self._profiles_dir = Path(profiles_dir)
        self._profiles: dict = {}  # profile id -> profile object (see loader)

    @classmethod
    def from_default(cls, profiles_dir: str | Path | None = None) -> "Pipeline":
        """Factory: build pipeline with all default stages loaded.

        Args:
            profiles_dir: Path to YAML profiles directory. Defaults to the
                profiles bundled with the package so no external files are
                required after ``pip install``.
        """
        # Import lazily to avoid circular imports at module level
        from ocr_postprocess.classifier import classify_stage
        from ocr_postprocess.engine.denoiser import denoise_stage
        from ocr_postprocess.engine.extractor_stage import extract_stage
        from ocr_postprocess.engine.normalizer import normalize_stage
        from ocr_postprocess.engine.reconciler import reconcile_stage
        from ocr_postprocess.engine.reconstructor import reconstruct_stage
        from ocr_postprocess.engine.transform_stage import transform_stage
        from ocr_postprocess.renderer.markdown import render_stage
        from ocr_postprocess.scorer import score_stage

        # Canonical stage order: normalize → classify → denoise → reconstruct
        # → extract → transform → reconcile → score → render.
        stages: list[Stage] = [
            normalize_stage,
            classify_stage,
            denoise_stage,
            reconstruct_stage,
            extract_stage,
            transform_stage,
            reconcile_stage,
            score_stage,
            render_stage,
        ]
        resolved_dir = Path(profiles_dir) if profiles_dir is not None else _BUNDLED_PROFILES_DIR
        instance = cls(stages=stages, profiles_dir=resolved_dir)
        instance._load_profiles()
        return instance

    def _load_profiles(self) -> None:
        """Load all YAML profiles from profiles_dir into memory."""
        from ocr_postprocess.profiles.loader import load_profiles

        self._profiles = load_profiles(self._profiles_dir)
        logger.info("Pipeline loaded %d profiles", len(self._profiles))

    def classify(self, raw: str) -> tuple[str, float]:
        """Classify raw text and return (profile_id, score).

        Profiles whose id starts with "_" are fallbacks and excluded from
        scoring; when no profile reaches 0.5 the generic fallback is used.
        """
        from ocr_postprocess.profiles.matcher import evaluate

        best_id = "_generic"
        best_score = 0.0

        for pid, profile in self._profiles.items():
            if pid.startswith("_"):
                continue  # skip fallback profiles
            score = evaluate(profile.classify, raw)
            if score > best_score:
                best_score = score
                best_id = pid

        # Below this threshold no specific profile is trusted.
        if best_score < 0.5:
            best_id = "_generic"

        return best_id, best_score

    def process(self, raw: str, debug: bool = False) -> ProcessedDocument:
        """Run full pipeline on raw OCR text.

        Args:
            raw: Raw text from OCR engine. Must be a non-empty string.
            debug: When True, attach per-stage timing to the returned document.

        Raises:
            ValueError: If *raw* is not a non-empty string.
        """
        if not isinstance(raw, str):
            raise ValueError(f"raw must be a str, got {type(raw).__name__}")
        if not raw.strip():
            raise ValueError("raw text is empty — nothing to process")

        ctx = PipelineContext(raw_text=raw)

        # Inject loaded profiles into context using __dict__ bypass because
        # PipelineContext is a Pydantic model with no _profiles field — we store
        # transient data here without modifying the public schema.
        ctx.__dict__["_profiles"] = self._profiles

        logger.info(
            "Pipeline starting — %d chars, %d profile(s) loaded",
            len(raw),
            len(self._profiles),
        )

        # NOTE(review): failed_stages is appended to but never read — the
        # first failure re-raises immediately, so at most one entry can ever
        # exist. Candidate for removal or for inclusion in the debug trace.
        failed_stages: list[str] = []

        for stage in self._stages:
            stage_name = getattr(stage, "__name__", str(stage))
            logger.debug("[%s] start", stage_name)
            t0 = time.perf_counter()
            try:
                stage(ctx)
            except Exception:
                logger.exception("[%s] FATAL — stage raised an exception", stage_name)
                failed_stages.append(stage_name)
                raise
            finally:
                # Timing runs even on failure so the trace stays complete.
                elapsed_ms = (time.perf_counter() - t0) * 1000
                logger.debug("[%s] done in %.1f ms", stage_name, elapsed_ms)
                if debug:
                    ctx.debug_trace.append(StageTrace(stage=stage_name, duration_ms=elapsed_ms))

        from ocr_postprocess.renderer.markdown import render_markdown

        # Assemble the public result object from the mutated context.
        profile_id = ctx.profile.id if ctx.profile else "_unknown"
        profile_display_name = ctx.profile.display_name if ctx.profile else ""
        field_labels: dict[str, str] = {}
        if ctx.profile:
            # The first alias of each field doubles as its display label.
            for fdef in ctx.profile.fields:
                if fdef.aliases:
                    field_labels[fdef.key] = fdef.aliases[0]
        doc = ProcessedDocument(
            profile_id=profile_id,
            profile_score=ctx.classification_score,
            profile_display_name=profile_display_name,
            canonical_text=ctx.normalized_text,
            sections=ctx.sections,
            candidates=ctx.candidates,
            cross_checks=ctx.cross_checks,
            hints=ctx.hints,
            warnings=ctx.warnings,
            overall_confidence=ctx.overall_confidence,
            debug_trace=ctx.debug_trace,
            field_labels=field_labels,
        )
        # Markdown rendering is best-effort: a renderer bug must not lose the
        # structured result, so failures simply leave markdown empty.
        try:
            doc.markdown = render_markdown(doc)
        except Exception:
            logger.exception("render_markdown raised an exception; markdown output will be empty")
            doc.markdown = ""

        logger.info(
            "Pipeline done — profile=%s score=%.2f confidence=%.2f " "candidates=%d warnings=%d",
            profile_id,
            ctx.classification_score,
            ctx.overall_confidence,
            len(ctx.candidates),
            len(ctx.warnings),
        )
        return doc
File without changes
@@ -0,0 +1,13 @@
1
+ id: _generic
2
+ version: 1
3
+ display_name: "Generic fallback"
4
+ language: ["vi", "en"]
5
+
6
+ classify:
7
+ regex: ".*"
8
+
9
+ extract: []
10
+
11
+ output:
12
+ markdown:
13
+ title: "Document"
@@ -0,0 +1,113 @@
1
+ id: cccd_2024
2
+ version: 1
3
+ display_name: "Căn cước công dân (mẫu 2024)"
4
+ language: ["vi", "en"]
5
+
6
+ classify:
7
+ any_of:
8
+ - all_of:
9
+ - contains_any: ["CĂN CƯỚC", "Citizen Identity"]
10
+ - regex: '\b\d{12}\b'
11
+ - contains_any: ["Số định danh cá nhân", "Personal identification number"]
12
+
13
+ denoise:
14
+ drop_lines:
15
+ regex: ['^[\W_]{3,}$', '^Page \d+/\d+$']
16
+ contains_any: ["SOCIALIST REPUBLIC OF VIET NAM"]
17
+ collapse_repeats: true
18
+
19
+ reconstruct:
20
+ bilingual_pairs:
21
+ - ["Họ và tên", "Full name"]
22
+ - ["Ngày, tháng, năm sinh", "Date of birth"]
23
+ - ["Giới tính", "Sex"]
24
+ - ["Quốc tịch", "Nationality"]
25
+ - ["Quê quán", "Place of origin"]
26
+ - ["Nơi thường trú", "Place of residence"]
27
+ - ["Có giá trị đến", "Date of expiry"]
28
+ - ["Số định danh cá nhân", "Personal identification number"]
29
+ fuzzy_threshold: 0.85
30
+ split_glued_labels: true
31
+ rejoin_wrapped_lines: true
32
+
33
+ extract:
34
+ - name: so_cccd
35
+ aliases: ["Số", "Số định danh", "Personal identification number"]
36
+ extractor: cccd
37
+ required: true
38
+ cross_check_with: [mrz_cccd.so]
39
+
40
+ - name: ho_va_ten
41
+ aliases: ["Họ và tên", "Full name"]
42
+ extractor: value_in_same_line
43
+ required: true
44
+ cross_check_with: [mrz_cccd.ho_ten]
45
+
46
+ - name: ngay_sinh
47
+ aliases: ["Ngày, tháng, năm sinh", "Ngày sinh", "Date of birth"]
48
+ extractor: regex_after_label
49
+ pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
50
+ fuzzy_label: false
51
+ next_lines: 1
52
+ required: true
53
+ transform: [{op: to_date}]
54
+ cross_check_with: [mrz_cccd.ngay_sinh]
55
+
56
+ - name: gioi_tinh
57
+ aliases: ["Giới tính", "Sex"]
58
+ extractor: gender_vn
59
+ required: true
60
+ cross_check_with: [mrz_cccd.gioi_tinh]
61
+
62
+ - name: quoc_tich
63
+ aliases: ["Quốc tịch", "Nationality"]
64
+ extractor: value_in_same_line
65
+ default: "Việt Nam"
66
+
67
+ - name: que_quan
68
+ # Hỗ trợ cả mẫu cũ ("Quê quán") và mẫu 2024 ("Place of birth" biểu diễn trên dòng song ngữ)
69
+ aliases: ["Quê quán", "Place of origin", "Place of birth"]
70
+ extractor: text_until_next_label
71
+ fuzzy_label: false
72
+ stop_labels: ["Nơi thường trú", "Place of residence", "Nơi cư trú", "Ngày, tháng, năm cấp", "Date of issue"]
73
+ required: true
74
+
75
+ - name: noi_thuong_tru
76
+ # Hỗ trợ cả mẫu cũ ("Nơi thường trú") và mẫu 2024 ("Place of residence" trên dòng "Nơi cư trú /Place of residence")
77
+ aliases: ["Nơi thường trú", "Place of residence"]
78
+ extractor: text_until_next_label
79
+ fuzzy_label: false
80
+ stop_labels: ["Có giá trị đến", "Date of expiry", "Số định danh", "Nơi đăng ký khai sinh", "Place of birth", "Quê quán", "Place of origin"]
81
+ required: true
82
+
83
+ - name: ngay_het_han
84
+ aliases: ["Có giá trị đến", "Date of expiry"]
85
+ extractor: regex_after_label
86
+ pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
87
+ fuzzy_label: false
88
+ next_lines: 1
89
+ transform: [{op: to_date}]
90
+ cross_check_with: [mrz_cccd.ngay_het_han]
91
+
92
+ - name: mrz_cccd
93
+ extractor: mrz_cccd
94
+ optional: true
95
+
96
+ compute:
97
+ - name: tuoi
98
+ expr: "year_now() - year(ngay_sinh)"
99
+ deps: [ngay_sinh]
100
+
101
+ output:
102
+ markdown:
103
+ title: "Căn cước công dân"
104
+ sections:
105
+ - heading: "Thông tin cá nhân"
106
+ fields: [ho_va_ten, ngay_sinh, tuoi, gioi_tinh, quoc_tich]
107
+ - heading: "Định danh"
108
+ fields: [so_cccd, ngay_het_han]
109
+ - heading: "Địa chỉ"
110
+ fields: [que_quan, noi_thuong_tru]
111
+ json:
112
+ include_trace: true
113
+ include_candidates: true
@@ -0,0 +1,105 @@
1
+ id: dang_kiem
2
+ version: 1
3
+ display_name: "Giấy chứng nhận đăng kiểm xe"
4
+ language: ["vi"]
5
+
6
+ classify:
7
+ any_of:
8
+ - contains_any: ["GIẤY CHỨNG NHẬN KIỂM ĐỊNH", "ĐĂNG KIỂM"]
9
+ - all_of:
10
+ - contains_any: ["Biển số đăng ký", "Biển số"]
11
+ - contains_any: ["Số máy", "Số khung"]
12
+
13
+ denoise:
14
+ drop_lines:
15
+ regex: ['^[\W_]{3,}$']
16
+ collapse_repeats: true
17
+
18
+ reconstruct:
19
+ fuzzy_threshold: 0.85
20
+ split_glued_labels: true
21
+ rejoin_wrapped_lines: true
22
+
23
+ extract:
24
+ - name: bien_so
25
+ aliases: ["Biển số đăng ký", "Biển số"]
26
+ extractor: plate_vn
27
+ required: true
28
+
29
+ - name: nhan_hieu
30
+ aliases: ["Nhãn hiệu"]
31
+ extractor: value_in_same_line
32
+ required: true
33
+
34
+ - name: so_loai
35
+ aliases: ["Số loại"]
36
+ extractor: value_in_same_line
37
+
38
+ - name: so_may
39
+ aliases: ["Số máy"]
40
+ extractor: value_in_same_line
41
+ required: true
42
+
43
+ - name: so_khung
44
+ aliases: ["Số khung"]
45
+ extractor: value_in_same_line
46
+ required: true
47
+
48
+ - name: nam_san_xuat
49
+ aliases: ["Năm sản xuất"]
50
+ extractor: value_in_same_line
51
+ transform: [{op: to_int}]
52
+
53
+ - name: niên_hạn
54
+ aliases: ["Niên hạn", "Niên hạn sử dụng"]
55
+ extractor: value_in_same_line
56
+
57
+ - name: trong_luong_ban_than
58
+ aliases: ["Trọng lượng bản thân", "Khối lượng bản thân"]
59
+ extractor: value_in_same_line
60
+ transform: [{op: strip_unit, units: ["kg"]}, {op: to_int}]
61
+
62
+ - name: so_nguoi_cho_phep
63
+ aliases: ["Số người cho phép chở", "Cho phép chở"]
64
+ extractor: value_in_same_line
65
+ transform: [{op: regex_group, pattern: '(\d+)'}, {op: to_int}]
66
+
67
+ - name: chu_so_huu
68
+ aliases: ["Tên chủ xe", "Chủ sở hữu", "Chủ xe"]
69
+ extractor: value_in_same_line
70
+ required: true
71
+
72
+ - name: dia_chi
73
+ aliases: ["Địa chỉ"]
74
+ extractor: text_until_next_label
75
+ stop_labels: ["Số máy", "Biển số", "Có giá trị đến"]
76
+
77
+ - name: ngay_kiem_dinh
78
+ aliases: ["Ngày kiểm định", "Kiểm định ngày"]
79
+ extractor: regex_after_label
80
+ pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
81
+ transform: [{op: to_date}]
82
+
83
+ - name: co_gia_tri_den
84
+ aliases: ["Có giá trị đến"]
85
+ extractor: regex_after_label
86
+ pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
87
+ transform: [{op: to_date}]
88
+ required: true
89
+
90
+ output:
91
+ markdown:
92
+ title: "Giấy chứng nhận đăng kiểm"
93
+ sections:
94
+ - heading: "Thông tin xe"
95
+ fields: [bien_so, nhan_hieu, so_loai, nam_san_xuat, niên_hạn]
96
+ - heading: "Số máy / khung"
97
+ fields: [so_may, so_khung]
98
+ - heading: "Tải trọng"
99
+ fields: [trong_luong_ban_than, so_nguoi_cho_phep]
100
+ - heading: "Chủ sở hữu"
101
+ fields: [chu_so_huu, dia_chi]
102
+ - heading: "Kiểm định"
103
+ fields: [ngay_kiem_dinh, co_gia_tri_den]
104
+ json:
105
+ include_trace: true