ocr-postprocess 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_postprocess/__init__.py +33 -0
- ocr_postprocess/classifier.py +63 -0
- ocr_postprocess/cli.py +130 -0
- ocr_postprocess/engine/__init__.py +0 -0
- ocr_postprocess/engine/denoiser.py +134 -0
- ocr_postprocess/engine/extractor_stage.py +107 -0
- ocr_postprocess/engine/normalizer.py +128 -0
- ocr_postprocess/engine/reconciler.py +170 -0
- ocr_postprocess/engine/reconstructor.py +469 -0
- ocr_postprocess/engine/transform_stage.py +89 -0
- ocr_postprocess/exceptions.py +30 -0
- ocr_postprocess/extractors/__init__.py +0 -0
- ocr_postprocess/extractors/base.py +103 -0
- ocr_postprocess/extractors/helpers.py +63 -0
- ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
- ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
- ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
- ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
- ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
- ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
- ocr_postprocess/extractors/pattern/__init__.py +0 -0
- ocr_postprocess/extractors/pattern/cccd.py +120 -0
- ocr_postprocess/extractors/pattern/cmnd.py +38 -0
- ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
- ocr_postprocess/extractors/pattern/date.py +89 -0
- ocr_postprocess/extractors/pattern/email.py +38 -0
- ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
- ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
- ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
- ocr_postprocess/extractors/pattern/tax_code.py +53 -0
- ocr_postprocess/extractors/registry.py +45 -0
- ocr_postprocess/extractors/structured/__init__.py +0 -0
- ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
- ocr_postprocess/extractors/universal.py +39 -0
- ocr_postprocess/models.py +131 -0
- ocr_postprocess/pipeline.py +179 -0
- ocr_postprocess/profiles/__init__.py +0 -0
- ocr_postprocess/profiles/_generic.yml +13 -0
- ocr_postprocess/profiles/cccd_2024.yml +113 -0
- ocr_postprocess/profiles/dang_kiem.yml +105 -0
- ocr_postprocess/profiles/loader.py +63 -0
- ocr_postprocess/profiles/matcher.py +71 -0
- ocr_postprocess/profiles/schema.py +197 -0
- ocr_postprocess/py.typed +0 -0
- ocr_postprocess/renderer/__init__.py +0 -0
- ocr_postprocess/renderer/json_renderer.py +59 -0
- ocr_postprocess/renderer/llm.py +41 -0
- ocr_postprocess/renderer/markdown.py +172 -0
- ocr_postprocess/scorer.py +78 -0
- ocr_postprocess/transformer.py +304 -0
- ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
- ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
- ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
- ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
- ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Profile YAML loader."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
from pydantic import ValidationError
|
|
10
|
+
|
|
11
|
+
from ocr_postprocess.exceptions import ProfileValidationError
|
|
12
|
+
from ocr_postprocess.profiles.schema import DocumentProfile
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_profile(path: Path) -> DocumentProfile:
    """Load and validate a single YAML profile file.

    Args:
        path: Path to a ``.yml`` profile file.

    Returns:
        The validated :class:`DocumentProfile`.

    Raises:
        ProfileValidationError: On YAML syntax errors, a non-mapping YAML
            document, or Pydantic schema validation failure.
    """
    try:
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
    except yaml.YAMLError as exc:
        # Chain the original error so YAML position info stays in the traceback.
        raise ProfileValidationError(f"YAML parse error: {exc}", field_path=str(path)) from exc

    if not isinstance(raw, dict):
        raise ProfileValidationError("Profile must be a YAML mapping", field_path=str(path))

    # Use filename stem as id if not provided
    if "id" not in raw:
        raw["id"] = path.stem

    try:
        profile = DocumentProfile.model_validate(raw)
    except ValidationError as exc:
        errors = "; ".join(
            f"{'.'.join(str(loc) for loc in e['loc'])}: {e['msg']}" for e in exc.errors()
        )
        # Chain the ValidationError so the full pydantic detail is preserved.
        raise ProfileValidationError(errors, field_path=str(path)) from exc

    # An id/filename mismatch is tolerated but flagged — other code keys on id.
    if profile.id != path.stem:
        logger.warning("Profile id '%s' does not match filename '%s'", profile.id, path.stem)

    logger.debug("Loaded profile '%s' from %s", profile.id, path)
    return profile
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def load_profiles(profiles_dir: str | Path) -> dict[str, DocumentProfile]:
    """Load every ``*.yml`` profile in a directory, keyed by profile id.

    Invalid profiles are logged and skipped; the scan never aborts.
    """
    root = Path(profiles_dir)
    if not root.is_dir():
        logger.warning("Profiles directory not found: %s", root)
        return {}

    loaded: dict[str, DocumentProfile] = {}
    for candidate in sorted(root.glob("*.yml")):
        try:
            prof = load_profile(candidate)
        except ProfileValidationError as exc:
            logger.error("Failed to load profile %s: %s", candidate.name, exc)
        else:
            loaded[prof.id] = prof

    logger.info("Loaded %d profile(s) from %s", len(loaded), root)
    return loaded
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Classify expression evaluator (AND/OR/NOT tree)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from rapidfuzz import fuzz
|
|
9
|
+
|
|
10
|
+
from ocr_postprocess.profiles.schema import ClassifyExpr
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _contains_any(text: str, keywords: list[str], threshold: float) -> float:
    """Return the highest fuzzy partial_ratio score (0–1) among keywords
    that clear *threshold*, or 0.0 when none do."""
    haystack = text.lower()
    qualifying = [
        s
        for kw in keywords
        if (s := fuzz.partial_ratio(kw.lower(), haystack) / 100.0) >= threshold
    ]
    return max(qualifying, default=0.0)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _regex_match(text: str, pattern: str) -> float:
|
|
27
|
+
"""Return 1.0 if pattern matches, else 0.0."""
|
|
28
|
+
return 1.0 if re.search(pattern, text) else 0.0
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def evaluate(expr: ClassifyExpr | str, text: str, threshold: float | None = None) -> float:
    """Recursively evaluate a ClassifyExpr against text.

    Args:
        expr: Expression node. A bare string is a fuzzy-keyword leaf.
        text: Document text to classify.
        threshold: Optional fuzzy-threshold override, propagated to children.

    Returns score in [0, 1]:
    - 0 means no match / failed condition
    - 1 means perfect match
    """
    if isinstance(expr, str):
        # leaf — keyword. Explicit None check so an explicit threshold of 0.0
        # is honoured (the previous ``threshold or 0.85`` treated 0.0 as unset,
        # inconsistent with the ``is not None`` check below).
        leaf_thr = 0.85 if threshold is None else threshold
        score = fuzz.partial_ratio(expr.lower(), text.lower()) / 100.0
        return score if score >= leaf_thr else 0.0

    thr = threshold if threshold is not None else expr.fuzzy_threshold

    # contains_any leaf
    if expr.contains_any is not None:
        return _contains_any(text, expr.contains_any, thr)

    # regex leaf
    if expr.regex is not None:
        return _regex_match(text, expr.regex)

    # NOT — any positive inner score fails the condition
    if expr.not_ is not None:
        inner = evaluate(expr.not_, text, thr)
        return 0.0 if inner > 0.0 else 1.0

    # AND — every child must score > 0; result is the mean of child scores
    if expr.all_of is not None:
        scores = [evaluate(child, text, thr) for child in expr.all_of]
        if any(s == 0.0 for s in scores):
            return 0.0
        return sum(scores) / len(scores)

    # OR — best positive child score wins
    if expr.any_of is not None:
        scores = [evaluate(child, text, thr) for child in expr.any_of]
        positives = [s for s in scores if s > 0.0]
        return max(positives) if positives else 0.0

    return 0.0
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Profile YAML schema — Pydantic v2 models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Literal
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field, model_validator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ClassifyExpr(BaseModel):
    """AND/OR/NOT expression tree consumed by the document classifier."""

    all_of: list["ClassifyExpr | str"] | None = None
    any_of: list["ClassifyExpr | str"] | None = None
    not_: "ClassifyExpr | str | None" = Field(default=None, alias="not")
    contains_any: list[str] | None = None
    regex: str | None = None
    fuzzy_threshold: float = 0.85

    model_config = {"populate_by_name": True}

    @model_validator(mode="after")
    def at_least_one_condition(self) -> "ClassifyExpr":
        """Reject a node that carries no condition of any kind."""
        conditions = (self.all_of, self.any_of, self.not_, self.contains_any, self.regex)
        if all(cond is None for cond in conditions):
            raise ValueError("ClassifyExpr must have at least one condition")
        return self
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DropLinesRule(BaseModel):
    """Rules for which lines to drop during denoising."""

    # Regex patterns identifying lines to drop — presumably applied per line
    # by engine/denoiser.py; confirm there.
    regex: list[str] = []
    # Plain substrings; a line containing any of them is dropped (assumed —
    # verify against the denoiser implementation).
    contains_any: list[str] = []
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class NoiseRules(BaseModel):
    """Noise-removal rules applied during denoising stage."""

    drop_lines: DropLinesRule = Field(default_factory=DropLinesRule)
    drop_patterns: list[str] = []
    mask_patterns: list[dict[str, str]] = []
    collapse_repeats: bool = False

    @model_validator(mode="before")
    @classmethod
    def _normalise_drop_lines(cls, data: Any) -> Any:
        """Accept YAML shorthand: a bare list under ``drop_lines`` is treated
        as its ``contains_any`` list."""
        if not isinstance(data, dict):
            return data
        data = dict(data)
        shorthand = data.get("drop_lines")
        if isinstance(shorthand, list):
            data["drop_lines"] = {"regex": [], "contains_any": shorthand}
        return data
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class SectionDef(BaseModel):
    """Definition of a document section and how to detect its start."""

    # Unique section identifier — presumably what FieldDef.section refers to.
    id: str
    # Marker strings that open the section.
    start: list[str]
    # When True, `start` entries are treated as regular expressions rather
    # than literals (interpretation happens in the section splitter, not here).
    is_regex: bool = False
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ReconstructConfig(BaseModel):
    """Configuration for the reconstructor stage (6 sub-steps a–f)."""

    # Which reconstructor sub-steps "a".."f" run; all enabled by default.
    enabled_steps: list[str] = ["a", "b", "c", "d", "e", "f"]
    # Separator between the two language variants on a bilingual line (assumed).
    bilingual_separator: str = "/"
    # Fuzzy-match threshold used by the reconstructor, 0–1.
    fuzzy_threshold: float = 0.8
    # Minimum label count for a line to count as multi-label — TODO confirm
    # exact semantics against engine/reconstructor.py.
    multi_label_min_count: int = 2
    split_glued_labels: bool = True
    rejoin_wrapped_lines: bool = True
    # Pairs of equivalent labels across languages (assumed; see reconstructor).
    bilingual_pairs: list[list[str]] = []
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class TransformOpRef(BaseModel):
    """One op in a transform pipeline."""

    # Registered transform-op name (resolution happens in transformer.py).
    name: str
    # Keyword arguments forwarded to the op.
    args: dict[str, Any] = {}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class FieldDef(BaseModel):
    """Definition of one extractable field in a document profile."""

    model_config = {"populate_by_name": True}

    # Canonical field key; YAML may supply it under the alias 'name'.
    key: str = Field(default="", alias="name")
    # Name of the extractor to use (registry lookup happens elsewhere).
    extractor: str | None = None
    # Alternative label spellings — presumably for label-anchored extractors.
    aliases: list[str] = []
    # Restrict extraction to one section id; None = whole document (assumed).
    section: str | None = None
    # Regex pattern for pattern-based extraction.
    pattern: str | None = None
    # Transform pipeline applied to the raw value (op names or op refs).
    transform: list[Any] = []
    # Expression for computed fields — evaluated elsewhere; TODO confirm syntax.
    compute: str | None = None
    # Keys of fields this computed field depends on.
    deps: list[str] = []
    # Fixed value emitted as-is when set.
    constant: Any = None
    # Fallback value when extraction finds nothing (assumed).
    default: Any = None
    optional: bool = False
    # Required fields factor into overall confidence and warnings (see scorer.py).
    required: bool = False
    type: Literal["text", "int", "float", "date", "checkbox", "enum"] = "text"
    # Allowed values when type == "enum".
    enum_values: list[str] | None = None
    # Flag the field as needing a vision model to resolve.
    needs_vision: bool = False
    # Free-text hint forwarded to an LLM reviewer.
    llm_hint: str | None = None
    # Keys of other fields to cross-validate against; YAML alias: cross_check_with.
    cross_validate_with: list[str] = Field(default_factory=list)
    # Allow fuzzy matching of the field label.
    fuzzy_label: bool = True
    # Minimum confidence for a candidate — exact enforcement point not visible here.
    confidence_threshold: float = 0.0
    # Extra keyword args forwarded to the extractor.
    extractor_args: dict[str, Any] = {}
    # Labels that terminate value capture (assumed; used by label-anchor extractors).
    stop_labels: list[str] = []
    next_lines: int = 0  # Extra lines to search after the label line (used by regex_after_label)

    @model_validator(mode="before")
    @classmethod
    def _remap_aliases(cls, data: Any) -> Any:
        """Accept YAML field names that differ from model field names."""
        if isinstance(data, dict):
            data = dict(data)
            # cross_check_with → cross_validate_with
            if "cross_check_with" in data and "cross_validate_with" not in data:
                data["cross_validate_with"] = data.pop("cross_check_with")
            # Ensure key is populated if name was given
            if "name" in data and "key" not in data:
                data["key"] = data["name"]
        return data
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class StructuredExtractorRef(BaseModel):
    """Reference to a structured extractor with optional args."""

    # Registered structured-extractor name (e.g. the MRZ extractor).
    name: str
    # Optional section id scoping the extractor (semantics live in the engine).
    section: str | None = None
    # Extra keyword arguments passed through to the extractor.
    args: dict[str, Any] = {}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class OutputConfig(BaseModel):
    """Output format configuration (markdown, JSON)."""

    # Free-form options for the markdown renderer.
    markdown: dict[str, Any] = {}
    # Options for the JSON renderer; the YAML key is 'json'.
    json_output: dict[str, Any] = Field(default_factory=dict, alias="json")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class DocumentProfile(BaseModel):
    """Complete document profile: classification rules, fields, and pipeline config."""

    id: str
    name: str = ""
    display_name: str = ""
    version: int = 1
    language: list[str] = ["vi"]
    extends: str | None = None

    classify: ClassifyExpr
    normalize: dict[str, Any] = {}
    noise: NoiseRules = Field(default_factory=NoiseRules)
    sections: list[SectionDef] = []
    reconstruct: ReconstructConfig = Field(default_factory=ReconstructConfig)

    fields: list[FieldDef] = []
    extract: list[FieldDef] = []  # YAML spells this 'extract'; folded into fields below
    structured_extractors: list[StructuredExtractorRef] = []
    compute: list[dict[str, Any]] = []

    output: OutputConfig = Field(default_factory=OutputConfig)

    model_config = {"populate_by_name": True}

    @model_validator(mode="before")
    @classmethod
    def _remap_yaml_keys(cls, data: Any) -> Any:
        """Map YAML-style keys to schema field names (``denoise`` → ``noise``)."""
        if not isinstance(data, dict):
            return data
        data = dict(data)
        if "denoise" in data and "noise" not in data:
            data["noise"] = data.pop("denoise")
        return data

    @model_validator(mode="after")
    def normalise_fields(self) -> "DocumentProfile":
        """Fold the 'extract' list into 'fields' when 'fields' was not given."""
        if self.extract and not self.fields:
            self.fields = self.extract
        return self

    def get_field(self, key: str) -> FieldDef | None:
        """Return the field definition with the given key, or None."""
        return next((fd for fd in self.fields if fd.key == key), None)
|
ocr_postprocess/py.typed
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""JSON renderer for ProcessedDocument — output-format.md §9.2."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ocr_postprocess.models import ProcessedDocument
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _build_output(doc: ProcessedDocument) -> dict[str, Any]:
|
|
12
|
+
"""Build structured dict matching the output-format.md JSON schema."""
|
|
13
|
+
return {
|
|
14
|
+
"profile": {
|
|
15
|
+
"id": doc.profile_id,
|
|
16
|
+
"score": round(doc.profile_score, 4),
|
|
17
|
+
},
|
|
18
|
+
"sections": [
|
|
19
|
+
{
|
|
20
|
+
"id": s.id,
|
|
21
|
+
"title": s.title,
|
|
22
|
+
"lines": [line.text for line in s.lines],
|
|
23
|
+
}
|
|
24
|
+
for s in doc.sections
|
|
25
|
+
],
|
|
26
|
+
"candidates": [
|
|
27
|
+
{
|
|
28
|
+
"key": c.key,
|
|
29
|
+
"value": c.value,
|
|
30
|
+
"raw": c.raw,
|
|
31
|
+
"extractor": c.extractor,
|
|
32
|
+
"sources": c.sources,
|
|
33
|
+
"section_id": c.section_id,
|
|
34
|
+
"confidence": round(c.confidence, 4),
|
|
35
|
+
"needs_llm_review": c.needs_llm_review,
|
|
36
|
+
"needs_vision": c.needs_vision,
|
|
37
|
+
"conflict": c.conflict,
|
|
38
|
+
"notes": c.notes,
|
|
39
|
+
}
|
|
40
|
+
for c in doc.candidates
|
|
41
|
+
],
|
|
42
|
+
"hints": doc.hints,
|
|
43
|
+
"cross_checks": [
|
|
44
|
+
{
|
|
45
|
+
"field_key": cc.field_key,
|
|
46
|
+
"sources": cc.sources,
|
|
47
|
+
"matched": cc.matched,
|
|
48
|
+
"detail": cc.detail,
|
|
49
|
+
}
|
|
50
|
+
for cc in doc.cross_checks
|
|
51
|
+
],
|
|
52
|
+
"warnings": doc.warnings,
|
|
53
|
+
"overall_confidence": round(doc.overall_confidence, 4),
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def to_json(doc: ProcessedDocument, indent: int = 2) -> str:
    """Serialize ProcessedDocument to JSON string (output-format.md schema)."""
    payload = _build_output(doc)
    return json.dumps(payload, ensure_ascii=False, indent=indent)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Renderer: LLM-friendly Markdown output — clean key-value table for use as LLM context."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ocr_postprocess.models import ProcessedDocument
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def render_llm_markdown(doc: ProcessedDocument) -> str:
    """Render *doc* as a compact Markdown key-value table for LLM context.

    Emits only clean field/value rows — no extractor metadata, cross-checks,
    or confidence scores — so the output is suitable to paste into an LLM
    prompt as-is.
    """
    out: list[str] = []

    out.append(f"# {doc.profile_display_name or doc.profile_id}")
    out.append("")
    out.append("| Trường | Giá trị |")
    out.append("|--------|---------|")

    # One row per field key: candidates are already reconciled, so the first
    # occurrence of a key is the chosen value.
    emitted: set[str] = set()
    for cand in doc.candidates:
        if cand.key in emitted:
            continue
        emitted.add(cand.key)
        display = doc.field_labels.get(cand.key) or cand.key
        text = "" if cand.value is None else str(cand.value)
        if text:
            out.append(f"| {display} | {text} |")

    out.append("")

    if doc.warnings:
        out.append("> **Lưu ý:**")
        out.extend(f"> - {w}" for w in doc.warnings)
        out.append("")

    return "\n".join(out)
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Renderer: Markdown output for ProcessedDocument — Stage 9."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ocr_postprocess.models import Candidate, PipelineContext, ProcessedDocument
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _meta(cand: Candidate) -> str:
|
|
13
|
+
"""Build «extractor [flags]» meta string."""
|
|
14
|
+
if cand.extractor in ("constant", "computed"):
|
|
15
|
+
base = f"({cand.extractor})"
|
|
16
|
+
else:
|
|
17
|
+
base = cand.extractor
|
|
18
|
+
if any("checksum" in s for s in cand.sources):
|
|
19
|
+
base += " ✓ checksum"
|
|
20
|
+
|
|
21
|
+
flags = []
|
|
22
|
+
if cand.needs_llm_review:
|
|
23
|
+
flags.append("?review")
|
|
24
|
+
if cand.needs_vision:
|
|
25
|
+
flags.append("?vision")
|
|
26
|
+
if cand.conflict:
|
|
27
|
+
flags.append("!conflict")
|
|
28
|
+
|
|
29
|
+
tail = " ".join(flags)
|
|
30
|
+
inner = (base + " " + tail).strip() if tail else base
|
|
31
|
+
return f"«{inner}»"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def render_markdown(doc: ProcessedDocument) -> str:
    """Render ProcessedDocument to Markdown per output-format.md spec.

    Output structure: title with overall confidence, per-section field
    listings, a fallback table for candidates without a section, a Hints
    section, and a final "Lưu ý" (Notes) section combining failed
    cross-checks and missing-required-field warnings.
    """
    lines: list[str] = []
    conf = doc.overall_confidence
    title = doc.profile_display_name or doc.profile_id or "unknown"
    lines.append(f"# {title} (confidence {conf:.2f})")
    lines.append("")

    # Group candidates by section
    section_candidates: dict[str, list[Candidate]] = {}
    unplaced: list[Candidate] = []
    for cand in doc.candidates:
        if cand.section_id:
            section_candidates.setdefault(cand.section_id, []).append(cand)
        else:
            unplaced.append(cand)

    # Per-section listings — value only, no extractor metadata (avoids confusing LLM)
    for section in doc.sections:
        cands = section_candidates.get(section.id, [])
        if not cands:
            continue
        title_part = f" ({section.title})" if section.title else ""
        lines.append(f"## Section: {section.id}{title_part}")
        for cand in cands:
            value_str = str(cand.value) if cand.value is not None else "—"
            label = doc.field_labels.get(cand.key) or cand.key
            lines.append(f"- **{label}**: {value_str}")
        lines.append("")

    # Fallback table for unplaced candidates
    if unplaced:
        lines.append("## Extracted Fields")
        lines.append("")
        lines.append("| Field | Value | Extractor | Confidence | Flags |")
        lines.append("|-------|-------|-----------|------------|-------|")
        for cand in unplaced:
            value_str = str(cand.value) if cand.value is not None else "—"
            conf_str = f"{cand.confidence:.2f}"
            flag_parts = []
            if cand.needs_llm_review:
                flag_parts.append("?review")
            if cand.conflict:
                flag_parts.append("!conflict")
            if cand.needs_vision:
                flag_parts.append("?vision")
            flag_str = " ".join(flag_parts)
            lines.append(
                f"| `{cand.key}` | {value_str} | {_meta(cand)} | {conf_str} | {flag_str} |"
            )
        lines.append("")

    # Hints
    if doc.hints:
        lines.append("## Hints")
        for hint_key, hint_vals in doc.hints.items():
            lines.append(f"- {hint_key}: {hint_vals}")
        lines.append("")

    # "Lưu ý" (Notes) — merge conflict warnings + failed cross-checks into one section.
    # Each note states the chosen value + source + confidence so an LLM has enough
    # context to reason about the conflict.
    chosen: dict[str, str] = {c.key: str(c.value) for c in doc.candidates if c.value is not None}
    failed_checks = {cc.field_key: cc for cc in doc.cross_checks if not cc.matched}
    missing = [w for w in doc.warnings if w.startswith("Required field missing")]

    # Helper: extract ±context_lines lines around occurrences of raw strings in canonical_text
    ocr_lines: list[str] = doc.canonical_text.splitlines() if doc.canonical_text else []

    def _snippet(anchor_raw: str, context: int = 2) -> str:
        """Return lines (±context) around the first line containing anchor_raw."""
        if not ocr_lines or not anchor_raw:
            return ""
        for i, ln in enumerate(ocr_lines):
            if anchor_raw in ln:
                lo = max(0, i - context)
                hi = min(len(ocr_lines) - 1, i + context)
                return "\n".join(ocr_lines[lo : hi + 1])
        return ""

    notes: list[str] = []
    for field_key, cc in failed_checks.items():
        label = doc.field_labels.get(field_key)
        # Skip internal/technical fields that have no human-readable label
        if not label:
            continue
        selected = chosen.get(field_key)
        # Build value→{extractor, confidence, raw, line_index} from value_sources
        src_map: dict[str, dict] = {vs["value"]: vs for vs in cc.value_sources}

        def _fmt(val: str) -> str:
            # Format a value with its extractor, confidence, and raw text (when
            # the raw text differs from the cleaned value).
            vs = src_map.get(val, {})
            ext = vs.get("extractor", "?")
            conf = vs.get("confidence")
            raw = vs.get("raw", "")
            conf_str = f", {conf:.2f}" if conf is not None else ""
            raw_str = f", raw: `{raw}`" if raw and raw != val else ""
            return f'**"{val}"** ({ext}{conf_str}{raw_str})'

        # Anchor snippet on selected value's raw text (most relevant context)
        anchor = src_map.get(selected or "", {}).get("raw", "") if selected else ""
        if not anchor and src_map:
            anchor = next(iter(src_map.values())).get("raw", "")
        snippet = _snippet(anchor)

        if selected:
            alternatives = [v for v in cc.values if v != selected]
            if alternatives:
                alt_parts = " | ".join(_fmt(v) for v in alternatives)
                note = (
                    f"⚠️ **{label}**: hệ thống chọn {_fmt(selected)}"
                    f" — cũng nhận dạng được: {alt_parts}"
                )
                if snippet:
                    note += f"\n\n ```\n {chr(10).join(' ' + ln for ln in snippet.splitlines())}\n ```"
                notes.append(note)
        elif cc.values:
            parts = " | ".join(_fmt(v) for v in cc.values)
            note = f"⚠️ **{label}**: không thể xác định chắc chắn — tìm thấy: {parts}"
            if snippet:
                note += (
                    f"\n\n ```\n {chr(10).join(' ' + ln for ln in snippet.splitlines())}\n ```"
                )
            notes.append(note)
    for w in missing:
        notes.append(f"⚠️ {w}")

    if notes:
        lines.append("## Lưu ý")
        lines.append("")
        for note in notes:
            lines.append(f"- {note}")
        lines.append("")

    return "\n".join(lines)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def render_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 9: no-op hook — markdown built in pipeline.process()."""
    # Intentionally does nothing beyond logging: the final markdown is
    # assembled by pipeline.process() after all stages have run.
    logger.debug("Render stage: markdown will be assembled after all stages complete")
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Stage 8 — ConfidenceScorer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ocr_postprocess.models import PipelineContext
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def score_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 8: compute and store ctx.overall_confidence.

    Blends four signals — classification score, required-field coverage,
    mean candidate confidence, and cross-check agreement — then subtracts
    a penalty per required field that is still missing.
    """
    profile = ctx.profile
    classification_score = ctx.classification_score
    candidates = ctx.candidates

    # Mean extractor confidence across all candidates (0 when there are none).
    avg_conf = sum(c.confidence for c in candidates) / len(candidates) if candidates else 0.0

    if profile and profile.fields:
        # Fraction of required fields for which at least one candidate exists.
        required_fields = [f for f in profile.fields if f.required]
        if required_fields:
            found_keys = {c.key for c in candidates}
            hits = sum(1 for f in required_fields if f.key in found_keys)
            pct_required = hits / len(required_fields)
        else:
            pct_required = 1.0

        # Fraction of cross-checks that agreed (1.0 when none were run).
        checks = ctx.cross_checks
        pct_cross = sum(1 for cc in checks if cc.matched) / len(checks) if checks else 1.0
    else:
        # No profile (or no field definitions): nothing to require or cross-check.
        pct_required = 1.0
        pct_cross = 1.0

    overall = (
        # classification_score: 20% — reward correct document type identification
        classification_score * 0.2
        # pct_required: 30% — largest weight, all required fields must be found
        + pct_required * 0.3
        # avg_conf: 30% — average extractor confidence across all candidates
        + avg_conf * 0.3
        # pct_cross: 20% — reward consistency between independent extractors
        + pct_cross * 0.2
    )

    # Penalty for missing required fields: -0.2 per missing field (capped at 0.0)
    found = {c.key for c in candidates}
    missing_required = [
        f.key
        for f in (profile.fields if profile else [])
        if f.required and f.key not in found
    ]
    if missing_required:
        overall = max(0.0, overall - 0.2 * len(missing_required))
        # Reconciler already appended warnings; log here for scorer-level visibility
        for key in missing_required:
            logger.warning("Scorer: required field still missing after reconcile: '%s'", key)

    ctx.overall_confidence = round(min(1.0, max(0.0, overall)), 4)
    logger.debug(
        "Scorer: classify=%.2f required=%.2f avg_conf=%.2f cross=%.2f → overall=%.4f%s",
        classification_score,
        pct_required,
        avg_conf,
        pct_cross,
        ctx.overall_confidence,
        f" (missing required: {missing_required})" if missing_required else "",
    )
|