ocr-postprocess 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. ocr_postprocess/__init__.py +33 -0
  2. ocr_postprocess/classifier.py +63 -0
  3. ocr_postprocess/cli.py +130 -0
  4. ocr_postprocess/engine/__init__.py +0 -0
  5. ocr_postprocess/engine/denoiser.py +134 -0
  6. ocr_postprocess/engine/extractor_stage.py +107 -0
  7. ocr_postprocess/engine/normalizer.py +128 -0
  8. ocr_postprocess/engine/reconciler.py +170 -0
  9. ocr_postprocess/engine/reconstructor.py +469 -0
  10. ocr_postprocess/engine/transform_stage.py +89 -0
  11. ocr_postprocess/exceptions.py +30 -0
  12. ocr_postprocess/extractors/__init__.py +0 -0
  13. ocr_postprocess/extractors/base.py +103 -0
  14. ocr_postprocess/extractors/helpers.py +63 -0
  15. ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  16. ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
  17. ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
  18. ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
  19. ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
  20. ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
  21. ocr_postprocess/extractors/pattern/__init__.py +0 -0
  22. ocr_postprocess/extractors/pattern/cccd.py +120 -0
  23. ocr_postprocess/extractors/pattern/cmnd.py +38 -0
  24. ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
  25. ocr_postprocess/extractors/pattern/date.py +89 -0
  26. ocr_postprocess/extractors/pattern/email.py +38 -0
  27. ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
  28. ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
  29. ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
  30. ocr_postprocess/extractors/pattern/tax_code.py +53 -0
  31. ocr_postprocess/extractors/registry.py +45 -0
  32. ocr_postprocess/extractors/structured/__init__.py +0 -0
  33. ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
  34. ocr_postprocess/extractors/universal.py +39 -0
  35. ocr_postprocess/models.py +131 -0
  36. ocr_postprocess/pipeline.py +179 -0
  37. ocr_postprocess/profiles/__init__.py +0 -0
  38. ocr_postprocess/profiles/_generic.yml +13 -0
  39. ocr_postprocess/profiles/cccd_2024.yml +113 -0
  40. ocr_postprocess/profiles/dang_kiem.yml +105 -0
  41. ocr_postprocess/profiles/loader.py +63 -0
  42. ocr_postprocess/profiles/matcher.py +71 -0
  43. ocr_postprocess/profiles/schema.py +197 -0
  44. ocr_postprocess/py.typed +0 -0
  45. ocr_postprocess/renderer/__init__.py +0 -0
  46. ocr_postprocess/renderer/json_renderer.py +59 -0
  47. ocr_postprocess/renderer/llm.py +41 -0
  48. ocr_postprocess/renderer/markdown.py +172 -0
  49. ocr_postprocess/scorer.py +78 -0
  50. ocr_postprocess/transformer.py +304 -0
  51. ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
  52. ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
  53. ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
  54. ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
  55. ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,63 @@
1
+ """Profile YAML loader."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ import yaml
9
+ from pydantic import ValidationError
10
+
11
+ from ocr_postprocess.exceptions import ProfileValidationError
12
+ from ocr_postprocess.profiles.schema import DocumentProfile
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def load_profile(path: Path) -> DocumentProfile:
    """Load and validate a single YAML profile file.

    Args:
        path: Path to a ``.yml`` profile file.

    Returns:
        The validated DocumentProfile.

    Raises:
        ProfileValidationError: on YAML parse errors, a non-mapping document,
            or schema validation failure. The underlying exception is chained
            as ``__cause__`` so full tracebacks remain available.
    """
    try:
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
    except yaml.YAMLError as exc:
        # Fix: chain the original error (was `raise ...` without `from exc`),
        # otherwise the YAML parser's context is lost from the traceback.
        raise ProfileValidationError(f"YAML parse error: {exc}", field_path=str(path)) from exc

    if not isinstance(raw, dict):
        raise ProfileValidationError("Profile must be a YAML mapping", field_path=str(path))

    # Use filename stem as id if not provided
    if "id" not in raw:
        raw["id"] = path.stem

    try:
        profile = DocumentProfile.model_validate(raw)
    except ValidationError as exc:
        errors = "; ".join(
            f"{'.'.join(str(loc) for loc in e['loc'])}: {e['msg']}" for e in exc.errors()
        )
        # Fix: chain the pydantic ValidationError for the same reason as above.
        raise ProfileValidationError(errors, field_path=str(path)) from exc

    # A mismatched id is suspicious but not fatal — warn, don't fail.
    if profile.id != path.stem:
        logger.warning("Profile id '%s' does not match filename '%s'", profile.id, path.stem)

    logger.debug("Loaded profile '%s' from %s", profile.id, path)
    return profile
45
+
46
+
47
def load_profiles(profiles_dir: str | Path) -> dict[str, DocumentProfile]:
    """Load every ``*.yml`` profile in a directory, keyed by profile id.

    Invalid profiles are logged and skipped; a missing directory yields an
    empty mapping rather than an error.
    """
    root = Path(profiles_dir)
    if not root.is_dir():
        logger.warning("Profiles directory not found: %s", root)
        return {}

    loaded: dict[str, DocumentProfile] = {}
    # Sorted for deterministic load order (later files never shadow earlier
    # ones unless ids collide).
    for yml_path in sorted(root.glob("*.yml")):
        try:
            prof = load_profile(yml_path)
        except ProfileValidationError as exc:
            logger.error("Failed to load profile %s: %s", yml_path.name, exc)
        else:
            loaded[prof.id] = prof

    logger.info("Loaded %d profile(s) from %s", len(loaded), root)
    return loaded
@@ -0,0 +1,71 @@
1
+ """Classify expression evaluator (AND/OR/NOT tree)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+
8
+ from rapidfuzz import fuzz
9
+
10
+ from ocr_postprocess.profiles.schema import ClassifyExpr
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def _contains_any(text: str, keywords: list[str], threshold: float) -> float:
    """Return the best fuzzy partial_ratio across *keywords*, scaled to [0, 1].

    Scores below *threshold* are ignored; 0.0 is returned when no keyword
    clears the bar.
    """
    haystack = text.lower()
    best = 0.0
    for keyword in keywords:
        ratio = fuzz.partial_ratio(keyword.lower(), haystack) / 100.0
        if ratio >= threshold and ratio > best:
            best = ratio
    return best
24
+
25
+
26
+ def _regex_match(text: str, pattern: str) -> float:
27
+ """Return 1.0 if pattern matches, else 0.0."""
28
+ return 1.0 if re.search(pattern, text) else 0.0
29
+
30
+
31
def evaluate(expr: ClassifyExpr | str, text: str, threshold: float | None = None) -> float:
    """Recursively evaluate a ClassifyExpr against text.

    Args:
        expr: Expression node, or a bare string treated as a keyword leaf.
        text: Document text to score against.
        threshold: Optional fuzzy-threshold override; when None, string
            leaves use 0.85 and expression nodes use their own
            ``fuzzy_threshold``.

    Returns score in [0, 1]:
    - 0 means no match / failed condition
    - 1 means perfect match
    """
    if isinstance(expr, str):
        # leaf — keyword
        score = fuzz.partial_ratio(expr.lower(), text.lower()) / 100.0
        # Fix: `(threshold or 0.85)` silently replaced an explicit 0.0
        # threshold with 0.85; compare against None instead (consistent
        # with the `thr = ...` computation below).
        min_score = threshold if threshold is not None else 0.85
        return score if score >= min_score else 0.0

    thr = threshold if threshold is not None else expr.fuzzy_threshold

    # contains_any leaf
    if expr.contains_any is not None:
        return _contains_any(text, expr.contains_any, thr)

    # regex leaf
    if expr.regex is not None:
        return _regex_match(text, expr.regex)

    # NOT — binary result: 1.0 when the inner expression does not match at all
    if expr.not_ is not None:
        inner = evaluate(expr.not_, text, thr)
        return 0.0 if inner > 0.0 else 1.0

    # AND — every child must match; the score is the mean of child scores
    if expr.all_of is not None:
        scores = [evaluate(child, text, thr) for child in expr.all_of]
        if any(s == 0.0 for s in scores):
            return 0.0
        return sum(scores) / len(scores)

    # OR — the best matching child wins
    if expr.any_of is not None:
        scores = [evaluate(child, text, thr) for child in expr.any_of]
        positives = [s for s in scores if s > 0.0]
        return max(positives) if positives else 0.0

    # Unreachable for a validated ClassifyExpr (schema requires ≥1 condition).
    return 0.0
@@ -0,0 +1,197 @@
1
+ """Profile YAML schema — Pydantic v2 models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Literal
6
+
7
+ from pydantic import BaseModel, Field, model_validator
8
+
9
+
10
class ClassifyExpr(BaseModel):
    """AND/OR/NOT expression tree for classifier.

    A node carries either a boolean combinator (``all_of`` / ``any_of`` /
    ``not``) or a leaf condition (``contains_any`` / ``regex``).  Plain
    strings inside combinator lists are treated as fuzzy keyword leaves by
    the evaluator.
    """

    all_of: list["ClassifyExpr | str"] | None = None  # AND over children
    any_of: list["ClassifyExpr | str"] | None = None  # OR over children
    not_: "ClassifyExpr | str | None" = Field(default=None, alias="not")  # negation; YAML key is "not"
    contains_any: list[str] | None = None  # leaf: fuzzy keyword match
    regex: str | None = None  # leaf: regex search against the full text
    fuzzy_threshold: float = 0.85  # minimum fuzzy score for keyword leaves

    model_config = {"populate_by_name": True}

    @model_validator(mode="after")
    def at_least_one_condition(self) -> "ClassifyExpr":
        # Reject empty nodes at load time so profile-authoring mistakes
        # surface immediately instead of silently scoring 0.
        has = any(
            [
                self.all_of is not None,
                self.any_of is not None,
                self.not_ is not None,
                self.contains_any is not None,
                self.regex is not None,
            ]
        )
        if not has:
            raise ValueError("ClassifyExpr must have at least one condition")
        return self
37
+
38
class DropLinesRule(BaseModel):
    """Rules for which lines to drop during denoising.

    NOTE(review): exact matching semantics (search vs. full-line match,
    case sensitivity) live in the denoiser stage — confirm there.
    """

    regex: list[str] = []  # regex patterns; matching lines are dropped
    contains_any: list[str] = []  # substrings; lines containing any are dropped
44
+
45
class NoiseRules(BaseModel):
    """Noise-removal rules applied during denoising stage."""

    # Structured line-drop rules (regex + substring lists).
    drop_lines: DropLinesRule = Field(default_factory=DropLinesRule)
    # Regex patterns removed within lines — presumably applied by the denoiser; confirm there.
    drop_patterns: list[str] = []
    # Pattern → replacement mappings for masking sensitive/noisy text.
    mask_patterns: list[dict[str, str]] = []
    # When True, collapse repeated content (semantics defined by the denoiser stage).
    collapse_repeats: bool = False

    @model_validator(mode="before")
    @classmethod
    def _normalise_drop_lines(cls, data: Any) -> Any:
        # Accept the YAML shorthand `drop_lines: [<substr>, ...]` by promoting
        # a bare list into the structured form with only `contains_any` set.
        if isinstance(data, dict):
            data = dict(data)  # shallow copy — never mutate the caller's dict
            dl = data.get("drop_lines")
            if isinstance(dl, list):
                data["drop_lines"] = {"regex": [], "contains_any": dl}
        return data
63
+
64
class SectionDef(BaseModel):
    """Definition of a document section and how to detect its start."""

    id: str  # section identifier, referenced by FieldDef.section
    start: list[str]  # markers whose presence begins this section
    is_regex: bool = False  # when True, treat `start` entries as regex patterns
71
+
72
class ReconstructConfig(BaseModel):
    """Configuration for the reconstructor stage (6 sub-steps a–f)."""

    enabled_steps: list[str] = ["a", "b", "c", "d", "e", "f"]  # which sub-steps run
    bilingual_separator: str = "/"  # separator between bilingual label halves
    fuzzy_threshold: float = 0.8  # fuzzy-match threshold used by the reconstructor
    multi_label_min_count: int = 2  # min label occurrences — TODO confirm usage in reconstructor
    split_glued_labels: bool = True  # sub-step toggle
    rejoin_wrapped_lines: bool = True  # sub-step toggle
    bilingual_pairs: list[list[str]] = []  # presumably [native, translated] label pairs — confirm in reconstructor
83
+
84
class TransformOpRef(BaseModel):
    """One op in a transform pipeline."""

    name: str  # op name, resolved by the transformer
    args: dict[str, Any] = {}  # keyword arguments forwarded to the op
90
+
91
class FieldDef(BaseModel):
    """Definition of one extractable field in a document profile."""

    model_config = {"populate_by_name": True}

    key: str = Field(default="", alias="name")  # canonical field key; YAML may use "name"
    extractor: str | None = None  # extractor identifier
    aliases: list[str] = []  # alternative label spellings to search for
    section: str | None = None  # restrict extraction to this SectionDef id
    pattern: str | None = None  # regex override for pattern-based extraction
    transform: list[Any] = []  # post-extraction transform steps
    compute: str | None = None  # expression for computed fields
    deps: list[str] = []  # keys a computed field depends on
    constant: Any = None  # fixed value (bypasses extraction)
    default: Any = None  # fallback value when nothing is extracted
    optional: bool = False
    required: bool = False  # required fields feed the scorer's pct_required / penalty
    type: Literal["text", "int", "float", "date", "checkbox", "enum"] = "text"
    enum_values: list[str] | None = None  # allowed values when type == "enum"
    needs_vision: bool = False  # flags fields that need image-level review
    llm_hint: str | None = None  # free-text hint for LLM review
    cross_validate_with: list[str] = Field(default_factory=list)  # keys to cross-check against
    fuzzy_label: bool = True  # allow fuzzy label matching
    confidence_threshold: float = 0.0  # minimum confidence to keep a candidate
    extractor_args: dict[str, Any] = {}  # extra kwargs forwarded to the extractor
    stop_labels: list[str] = []  # labels that terminate value capture
    next_lines: int = 0  # Extra lines to search after the label line (used by regex_after_label)

    @model_validator(mode="before")
    @classmethod
    def _remap_aliases(cls, data: Any) -> Any:
        """Accept YAML field names that differ from model field names."""
        if isinstance(data, dict):
            data = dict(data)  # shallow copy — never mutate the caller's dict
            # cross_check_with → cross_validate_with
            if "cross_check_with" in data and "cross_validate_with" not in data:
                data["cross_validate_with"] = data.pop("cross_check_with")
            # Ensure key is populated if name was given
            if "name" in data and "key" not in data:
                data["key"] = data["name"]
        return data
133
+
134
class StructuredExtractorRef(BaseModel):
    """Reference to a structured extractor with optional args."""

    name: str  # extractor name
    section: str | None = None  # optional section id scoping the extractor
    args: dict[str, Any] = {}  # extra keyword arguments
141
+
142
class OutputConfig(BaseModel):
    """Output format configuration (markdown, JSON)."""

    markdown: dict[str, Any] = {}  # options for the markdown renderer
    json_output: dict[str, Any] = Field(default_factory=dict, alias="json")  # YAML key is "json"
148
+
149
class DocumentProfile(BaseModel):
    """Complete document profile: classification rules, fields, and pipeline config."""

    id: str  # unique profile id (loader defaults it to the YAML filename stem)
    name: str = ""
    display_name: str = ""
    version: int = 1
    language: list[str] = ["vi"]
    extends: str | None = None  # parent profile id — TODO confirm where inheritance is resolved

    classify: ClassifyExpr  # expression tree evaluated by the classifier
    normalize: dict[str, Any] = {}
    noise: NoiseRules = Field(default_factory=NoiseRules)  # YAML key "denoise" remapped below
    sections: list[SectionDef] = []
    reconstruct: ReconstructConfig = Field(default_factory=ReconstructConfig)

    fields: list[FieldDef] = []
    extract: list[FieldDef] = []  # YAML uses 'extract', normalised to fields
    structured_extractors: list[StructuredExtractorRef] = []
    compute: list[dict[str, Any]] = []

    output: OutputConfig = Field(default_factory=OutputConfig)

    model_config = {"populate_by_name": True}

    @model_validator(mode="before")
    @classmethod
    def _remap_yaml_keys(cls, data: Any) -> Any:
        """Map YAML-style keys to schema field names."""
        if isinstance(data, dict):
            data = dict(data)  # shallow copy — never mutate the caller's dict
            # denoise → noise
            if "denoise" in data and "noise" not in data:
                data["noise"] = data.pop("denoise")
        return data

    @model_validator(mode="after")
    def normalise_fields(self) -> "DocumentProfile":
        # Merge 'extract' into 'fields' — only when 'fields' was not given
        # explicitly, so an explicit 'fields' list always wins.
        if self.extract and not self.fields:
            self.fields = self.extract
        return self

    def get_field(self, key: str) -> FieldDef | None:
        """Look up a field by key; returns the first match or None."""
        for f in self.fields:
            if f.key == key:
                return f
        return None
File without changes
File without changes
@@ -0,0 +1,59 @@
1
+ """JSON renderer for ProcessedDocument — output-format.md §9.2."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from ocr_postprocess.models import ProcessedDocument
9
+
10
+
11
+ def _build_output(doc: ProcessedDocument) -> dict[str, Any]:
12
+ """Build structured dict matching the output-format.md JSON schema."""
13
+ return {
14
+ "profile": {
15
+ "id": doc.profile_id,
16
+ "score": round(doc.profile_score, 4),
17
+ },
18
+ "sections": [
19
+ {
20
+ "id": s.id,
21
+ "title": s.title,
22
+ "lines": [line.text for line in s.lines],
23
+ }
24
+ for s in doc.sections
25
+ ],
26
+ "candidates": [
27
+ {
28
+ "key": c.key,
29
+ "value": c.value,
30
+ "raw": c.raw,
31
+ "extractor": c.extractor,
32
+ "sources": c.sources,
33
+ "section_id": c.section_id,
34
+ "confidence": round(c.confidence, 4),
35
+ "needs_llm_review": c.needs_llm_review,
36
+ "needs_vision": c.needs_vision,
37
+ "conflict": c.conflict,
38
+ "notes": c.notes,
39
+ }
40
+ for c in doc.candidates
41
+ ],
42
+ "hints": doc.hints,
43
+ "cross_checks": [
44
+ {
45
+ "field_key": cc.field_key,
46
+ "sources": cc.sources,
47
+ "matched": cc.matched,
48
+ "detail": cc.detail,
49
+ }
50
+ for cc in doc.cross_checks
51
+ ],
52
+ "warnings": doc.warnings,
53
+ "overall_confidence": round(doc.overall_confidence, 4),
54
+ }
55
+
56
+
57
def to_json(doc: ProcessedDocument, indent: int = 2) -> str:
    """Serialize ProcessedDocument to JSON string (output-format.md schema)."""
    payload = _build_output(doc)
    # ensure_ascii=False keeps Vietnamese text readable in the output.
    return json.dumps(payload, ensure_ascii=False, indent=indent)
@@ -0,0 +1,41 @@
1
+ """Renderer: LLM-friendly Markdown output — clean key-value table for use as LLM context."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ocr_postprocess.models import ProcessedDocument
6
+
7
+
8
def render_llm_markdown(doc: ProcessedDocument) -> str:
    """Render a ProcessedDocument as compact Markdown for LLM context.

    Emits only a clean key→value table — no extractor metadata,
    cross-checks, or confidence scores — followed by any pipeline
    warnings, so the output is safe to drop into an LLM prompt.
    """
    out: list[str] = [
        f"# {doc.profile_display_name or doc.profile_id}",
        "",
        "| Trường | Giá trị |",
        "|--------|---------|",
    ]

    # Keep only the first (already reconciled) candidate for each key.
    emitted: set[str] = set()
    for cand in doc.candidates:
        if cand.key in emitted:
            continue
        emitted.add(cand.key)
        display = doc.field_labels.get(cand.key) or cand.key
        text = "" if cand.value is None else str(cand.value)
        if text:
            out.append(f"| {display} | {text} |")

    out.append("")

    if doc.warnings:
        out.append("> **Lưu ý:**")
        out.extend(f"> - {w}" for w in doc.warnings)
        out.append("")

    return "\n".join(out)
@@ -0,0 +1,172 @@
1
+ """Renderer: Markdown output for ProcessedDocument — Stage 9."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ocr_postprocess.models import Candidate, PipelineContext, ProcessedDocument
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def _meta(cand: Candidate) -> str:
13
+ """Build «extractor [flags]» meta string."""
14
+ if cand.extractor in ("constant", "computed"):
15
+ base = f"({cand.extractor})"
16
+ else:
17
+ base = cand.extractor
18
+ if any("checksum" in s for s in cand.sources):
19
+ base += " ✓ checksum"
20
+
21
+ flags = []
22
+ if cand.needs_llm_review:
23
+ flags.append("?review")
24
+ if cand.needs_vision:
25
+ flags.append("?vision")
26
+ if cand.conflict:
27
+ flags.append("!conflict")
28
+
29
+ tail = " ".join(flags)
30
+ inner = (base + " " + tail).strip() if tail else base
31
+ return f"«{inner}»"
32
+
33
+
34
def render_markdown(doc: ProcessedDocument) -> str:
    """Render ProcessedDocument to Markdown per output-format.md spec.

    Layout: title with overall confidence, per-section field listings,
    a fallback table for candidates without a section, hints, and a
    final notes section merging failed cross-checks with missing-field
    warnings.
    """
    lines: list[str] = []
    conf = doc.overall_confidence
    title = doc.profile_display_name or doc.profile_id or "unknown"
    lines.append(f"# {title} (confidence {conf:.2f})")
    lines.append("")

    # Group candidates by section
    section_candidates: dict[str, list[Candidate]] = {}
    unplaced: list[Candidate] = []
    for cand in doc.candidates:
        if cand.section_id:
            section_candidates.setdefault(cand.section_id, []).append(cand)
        else:
            unplaced.append(cand)

    # Per-section listings — value only, no extractor metadata (avoids confusing LLM)
    for section in doc.sections:
        cands = section_candidates.get(section.id, [])
        if not cands:
            continue
        title_part = f" ({section.title})" if section.title else ""
        lines.append(f"## Section: {section.id}{title_part}")
        for cand in cands:
            value_str = str(cand.value) if cand.value is not None else "—"
            label = doc.field_labels.get(cand.key) or cand.key
            lines.append(f"- **{label}**: {value_str}")
        lines.append("")

    # Fallback table for unplaced candidates
    if unplaced:
        lines.append("## Extracted Fields")
        lines.append("")
        lines.append("| Field | Value | Extractor | Confidence | Flags |")
        lines.append("|-------|-------|-----------|------------|-------|")
        for cand in unplaced:
            value_str = str(cand.value) if cand.value is not None else "—"
            conf_str = f"{cand.confidence:.2f}"
            flag_parts = []
            if cand.needs_llm_review:
                flag_parts.append("?review")
            if cand.conflict:
                flag_parts.append("!conflict")
            if cand.needs_vision:
                flag_parts.append("?vision")
            flag_str = " ".join(flag_parts)
            lines.append(
                f"| `{cand.key}` | {value_str} | {_meta(cand)} | {conf_str} | {flag_str} |"
            )
        lines.append("")

    # Hints
    if doc.hints:
        lines.append("## Hints")
        for hint_key, hint_vals in doc.hints.items():
            lines.append(f"- {hint_key}: {hint_vals}")
        lines.append("")

    # Notes ("Lưu ý") — merge conflict warnings + failed cross-checks into a
    # single section. Each line spells out the chosen value + source +
    # confidence so an LLM has enough context to reason about the conflict.
    chosen: dict[str, str] = {c.key: str(c.value) for c in doc.candidates if c.value is not None}
    failed_checks = {cc.field_key: cc for cc in doc.cross_checks if not cc.matched}
    missing = [w for w in doc.warnings if w.startswith("Required field missing")]

    # Helper: extract ±context_lines lines around occurrences of raw strings in canonical_text
    ocr_lines: list[str] = doc.canonical_text.splitlines() if doc.canonical_text else []

    def _snippet(anchor_raw: str, context: int = 2) -> str:
        """Return lines (±context) around the first line containing anchor_raw."""
        if not ocr_lines or not anchor_raw:
            return ""
        for i, ln in enumerate(ocr_lines):
            if anchor_raw in ln:
                lo = max(0, i - context)
                hi = min(len(ocr_lines) - 1, i + context)
                return "\n".join(ocr_lines[lo : hi + 1])
        return ""

    notes: list[str] = []
    for field_key, cc in failed_checks.items():
        label = doc.field_labels.get(field_key)
        # Skip internal/technical fields that have no human-readable label
        if not label:
            continue
        selected = chosen.get(field_key)
        # Build value→{extractor, confidence, raw, line_index} from value_sources
        src_map: dict[str, dict] = {vs["value"]: vs for vs in cc.value_sources}

        # NOTE(review): closure captures the loop-local src_map (B023), but it
        # is only ever called within the same iteration, so the capture is safe.
        def _fmt(val: str) -> str:
            vs = src_map.get(val, {})
            ext = vs.get("extractor", "?")
            conf = vs.get("confidence")
            raw = vs.get("raw", "")
            conf_str = f", {conf:.2f}" if conf is not None else ""
            raw_str = f", raw: `{raw}`" if raw and raw != val else ""
            return f'**"{val}"** ({ext}{conf_str}{raw_str})'

        # Anchor snippet on selected value's raw text (most relevant context)
        anchor = src_map.get(selected or "", {}).get("raw", "") if selected else ""
        if not anchor and src_map:
            anchor = next(iter(src_map.values())).get("raw", "")
        snippet = _snippet(anchor)

        if selected:
            alternatives = [v for v in cc.values if v != selected]
            if alternatives:
                alt_parts = " | ".join(_fmt(v) for v in alternatives)
                note = (
                    f"⚠️ **{label}**: hệ thống chọn {_fmt(selected)}"
                    f" — cũng nhận dạng được: {alt_parts}"
                )
                if snippet:
                    note += f"\n\n ```\n {chr(10).join(' ' + ln for ln in snippet.splitlines())}\n ```"
                notes.append(note)
        elif cc.values:
            parts = " | ".join(_fmt(v) for v in cc.values)
            note = f"⚠️ **{label}**: không thể xác định chắc chắn — tìm thấy: {parts}"
            if snippet:
                note += (
                    f"\n\n ```\n {chr(10).join(' ' + ln for ln in snippet.splitlines())}\n ```"
                )
            notes.append(note)
    for w in missing:
        notes.append(f"⚠️ {w}")

    if notes:
        lines.append("## Lưu ý")
        lines.append("")
        for note in notes:
            lines.append(f"- {note}")
        lines.append("")

    return "\n".join(lines)
168
+
169
+
170
def render_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 9 hook — intentionally a no-op.

    The markdown output is assembled in pipeline.process() once every
    stage has run, so this hook only emits a debug trace.
    """
    logger.debug("Render stage: markdown will be assembled after all stages complete")
@@ -0,0 +1,78 @@
1
+ """Stage 8 — ConfidenceScorer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ocr_postprocess.models import PipelineContext
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def score_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 8: compute overall_confidence.

    Blends four signals (classification, required-field coverage, average
    candidate confidence, cross-check agreement), then applies a penalty
    per still-missing required field and clamps the result to [0, 1].
    """
    profile = ctx.profile
    classification_score = ctx.classification_score
    cands = ctx.candidates

    # Average extractor confidence — 0.0 when there are no candidates.
    # (This expression is identical in both profile branches of the
    # original, so it is computed once up front.)
    avg_conf = sum(c.confidence for c in cands) / len(cands) if cands else 0.0

    if profile and profile.fields:
        # Fraction of required fields that produced at least one candidate.
        required = [f for f in profile.fields if f.required]
        if required:
            present = {c.key for c in cands}
            pct_required = sum(1 for f in required if f.key in present) / len(required)
        else:
            pct_required = 1.0

        # Fraction of cross-checks that agreed; neutral when none ran.
        checks = ctx.cross_checks
        pct_cross = (
            sum(1 for cc in checks if cc.matched) / len(checks) if checks else 1.0
        )
    else:
        # No profile (or no field definitions): structural terms are neutral.
        pct_required = 1.0
        pct_cross = 1.0

    overall = (
        # classification_score: 20% — reward correct document type identification
        classification_score * 0.2
        # pct_required: 30% — largest weight, all required fields must be found
        + pct_required * 0.3
        # avg_conf: 30% — average extractor confidence across all candidates
        + avg_conf * 0.3
        # pct_cross: 20% — reward consistency between independent extractors
        + pct_cross * 0.2
    )

    # Penalty for missing required fields: -0.2 per missing field (capped at 0.0)
    found_keys = {c.key for c in cands}
    missing_required = [
        f.key
        for f in (profile.fields if profile else [])
        if f.required and f.key not in found_keys
    ]
    if missing_required:
        overall = max(0.0, overall - 0.2 * len(missing_required))
        # Reconciler already appended warnings; log here for scorer-level visibility
        for key in missing_required:
            logger.warning("Scorer: required field still missing after reconcile: '%s'", key)

    ctx.overall_confidence = round(min(1.0, max(0.0, overall)), 4)
    logger.debug(
        "Scorer: classify=%.2f required=%.2f avg_conf=%.2f cross=%.2f → overall=%.4f%s",
        classification_score,
        pct_required,
        avg_conf,
        pct_cross,
        ctx.overall_confidence,
        f" (missing required: {missing_required})" if missing_required else "",
    )