ocr-postprocess 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_postprocess/__init__.py +33 -0
- ocr_postprocess/classifier.py +63 -0
- ocr_postprocess/cli.py +130 -0
- ocr_postprocess/engine/__init__.py +0 -0
- ocr_postprocess/engine/denoiser.py +134 -0
- ocr_postprocess/engine/extractor_stage.py +107 -0
- ocr_postprocess/engine/normalizer.py +128 -0
- ocr_postprocess/engine/reconciler.py +170 -0
- ocr_postprocess/engine/reconstructor.py +469 -0
- ocr_postprocess/engine/transform_stage.py +89 -0
- ocr_postprocess/exceptions.py +30 -0
- ocr_postprocess/extractors/__init__.py +0 -0
- ocr_postprocess/extractors/base.py +103 -0
- ocr_postprocess/extractors/helpers.py +63 -0
- ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
- ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
- ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
- ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
- ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
- ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
- ocr_postprocess/extractors/pattern/__init__.py +0 -0
- ocr_postprocess/extractors/pattern/cccd.py +120 -0
- ocr_postprocess/extractors/pattern/cmnd.py +38 -0
- ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
- ocr_postprocess/extractors/pattern/date.py +89 -0
- ocr_postprocess/extractors/pattern/email.py +38 -0
- ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
- ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
- ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
- ocr_postprocess/extractors/pattern/tax_code.py +53 -0
- ocr_postprocess/extractors/registry.py +45 -0
- ocr_postprocess/extractors/structured/__init__.py +0 -0
- ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
- ocr_postprocess/extractors/universal.py +39 -0
- ocr_postprocess/models.py +131 -0
- ocr_postprocess/pipeline.py +179 -0
- ocr_postprocess/profiles/__init__.py +0 -0
- ocr_postprocess/profiles/_generic.yml +13 -0
- ocr_postprocess/profiles/cccd_2024.yml +113 -0
- ocr_postprocess/profiles/dang_kiem.yml +105 -0
- ocr_postprocess/profiles/loader.py +63 -0
- ocr_postprocess/profiles/matcher.py +71 -0
- ocr_postprocess/profiles/schema.py +197 -0
- ocr_postprocess/py.typed +0 -0
- ocr_postprocess/renderer/__init__.py +0 -0
- ocr_postprocess/renderer/json_renderer.py +59 -0
- ocr_postprocess/renderer/llm.py +41 -0
- ocr_postprocess/renderer/markdown.py +172 -0
- ocr_postprocess/scorer.py +78 -0
- ocr_postprocess/transformer.py +304 -0
- ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
- ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
- ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
- ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
- ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""Transform pipeline: built-in ops and simpleeval compute."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import datetime as _dt
|
|
6
|
+
import logging
|
|
7
|
+
import re
|
|
8
|
+
from typing import Any, Callable
|
|
9
|
+
|
|
10
|
+
from dateutil import parser as dateutil_parser
|
|
11
|
+
from simpleeval import EvalWithCompoundTypes, InvalidExpression
|
|
12
|
+
|
|
13
|
+
from ocr_postprocess.exceptions import CyclicComputeError, TransformError
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
_OPS: dict[str, Callable[..., Any]] = {}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def transform_op(name: str):
    """Build a decorator that registers a callable in ``_OPS`` under *name*.

    The decorated function is stored in the module-level registry and then
    returned unchanged. Registering a second function under an existing
    name replaces the earlier entry.
    """

    def register(fn: Callable) -> Callable:
        _OPS.update({name: fn})
        return fn

    return register
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Built-in ops
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@transform_op("strip")
|
|
36
|
+
def _strip(value: Any, **_: Any) -> Any:
|
|
37
|
+
return value.strip() if isinstance(value, str) else value
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@transform_op("upper")
|
|
41
|
+
def _upper(value: Any, **_: Any) -> Any:
|
|
42
|
+
return value.upper() if isinstance(value, str) else value
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@transform_op("lower")
|
|
46
|
+
def _lower(value: Any, **_: Any) -> Any:
|
|
47
|
+
return value.lower() if isinstance(value, str) else value
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@transform_op("replace")
|
|
51
|
+
def _replace(value: Any, from_: str = "", to: str = "", **_: Any) -> Any:
|
|
52
|
+
if not isinstance(value, str):
|
|
53
|
+
return value
|
|
54
|
+
return value.replace(from_, to)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@transform_op("regex_group")
|
|
58
|
+
def _regex_group(value: Any, pattern: str = "", group: int = 1, **_: Any) -> Any:
|
|
59
|
+
if not isinstance(value, str):
|
|
60
|
+
return value
|
|
61
|
+
m = re.search(pattern, value)
|
|
62
|
+
if m:
|
|
63
|
+
try:
|
|
64
|
+
return m.group(group)
|
|
65
|
+
except IndexError:
|
|
66
|
+
return None
|
|
67
|
+
return None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@transform_op("regex_sub")
|
|
71
|
+
def _regex_sub(value: Any, pattern: str = "", repl: str = "", **_: Any) -> Any:
|
|
72
|
+
if not isinstance(value, str):
|
|
73
|
+
return value
|
|
74
|
+
return re.sub(pattern, repl, value)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@transform_op("split_take")
|
|
78
|
+
def _split_take(value: Any, sep: str = "/", index: int = 0, **_: Any) -> Any:
|
|
79
|
+
if not isinstance(value, str):
|
|
80
|
+
return value
|
|
81
|
+
parts = value.split(sep)
|
|
82
|
+
return parts[index].strip() if 0 <= index < len(parts) else None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@transform_op("split_first")
|
|
86
|
+
def _split_first(value: Any, sep: str = "/", **_: Any) -> Any:
    """Return the stripped text before the first *sep*.

    Shorthand for ``_split_take`` with ``index=0``; non-string handling is
    whatever that helper does.
    """
    first_piece = _split_take(value, sep=sep, index=0)
    return first_piece
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@transform_op("strip_unit")
|
|
91
|
+
def _strip_unit(value: Any, units: list[str] | None = None, **_: Any) -> Any:
|
|
92
|
+
if not isinstance(value, str):
|
|
93
|
+
return value
|
|
94
|
+
result = value.strip()
|
|
95
|
+
if units:
|
|
96
|
+
for unit in units:
|
|
97
|
+
escaped = re.escape(unit)
|
|
98
|
+
result = re.sub(rf"\s*{escaped}\s*$", "", result, flags=re.IGNORECASE).strip()
|
|
99
|
+
return result
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@transform_op("dedup_words")
|
|
103
|
+
def _dedup_words(value: Any, **_: Any) -> Any:
|
|
104
|
+
if not isinstance(value, str):
|
|
105
|
+
return value
|
|
106
|
+
words = value.split()
|
|
107
|
+
deduped = []
|
|
108
|
+
for w in words:
|
|
109
|
+
if not deduped or w != deduped[-1]:
|
|
110
|
+
deduped.append(w)
|
|
111
|
+
return " ".join(deduped)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@transform_op("to_int")
|
|
115
|
+
def _to_int(value: Any, **_: Any) -> Any:
|
|
116
|
+
if isinstance(value, int):
|
|
117
|
+
return value
|
|
118
|
+
if isinstance(value, str):
|
|
119
|
+
cleaned = re.sub(r"[^\d\-]", "", value.replace(",", "").replace(".", ""))
|
|
120
|
+
try:
|
|
121
|
+
return int(cleaned)
|
|
122
|
+
except (ValueError, TypeError):
|
|
123
|
+
return None
|
|
124
|
+
return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@transform_op("to_float")
|
|
128
|
+
def _to_float(value: Any, **_: Any) -> Any:
|
|
129
|
+
if isinstance(value, float):
|
|
130
|
+
return value
|
|
131
|
+
if isinstance(value, str):
|
|
132
|
+
cleaned = value.replace(",", ".").strip()
|
|
133
|
+
try:
|
|
134
|
+
return float(cleaned)
|
|
135
|
+
except (ValueError, TypeError):
|
|
136
|
+
return None
|
|
137
|
+
return None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
@transform_op("to_date")
|
|
141
|
+
def _to_date(value: Any, format: str = "", output: str = "iso", **_: Any) -> Any:
|
|
142
|
+
if not isinstance(value, str):
|
|
143
|
+
return value
|
|
144
|
+
# Already ISO YYYY-MM-DD — return unchanged to avoid mis-parsing with dayfirst=True.
|
|
145
|
+
# Example: "2024-06-10" with dayfirst=True would become "2024-10-06" (wrong).
|
|
146
|
+
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", value.strip()):
|
|
147
|
+
return value.strip()
|
|
148
|
+
try:
|
|
149
|
+
dt = dateutil_parser.parse(value, dayfirst=True)
|
|
150
|
+
if output == "iso":
|
|
151
|
+
return dt.date().isoformat()
|
|
152
|
+
return str(dt.date())
|
|
153
|
+
except (ValueError, TypeError):
|
|
154
|
+
return None
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@transform_op("pick")
|
|
158
|
+
def _pick(value: Any, kind: str = "year", **_: Any) -> Any:
|
|
159
|
+
if not isinstance(value, str):
|
|
160
|
+
return value
|
|
161
|
+
if kind == "year":
|
|
162
|
+
m = re.search(r"\b(19|20)\d{2}\b", value)
|
|
163
|
+
return m.group(0) if m else None
|
|
164
|
+
return value
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@transform_op("enum_normalize")
|
|
168
|
+
def _enum_normalize(value: Any, map: dict[str, str] | None = None, **_: Any) -> Any:
|
|
169
|
+
if not isinstance(value, str) or not map:
|
|
170
|
+
return value
|
|
171
|
+
return map.get(value, map.get(value.lower(), value))
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@transform_op("default_if_empty")
|
|
175
|
+
def _default_if_empty(value: Any, default: Any = None, **_: Any) -> Any:
|
|
176
|
+
if value is None or value == "" or value == []:
|
|
177
|
+
return default
|
|
178
|
+
return value
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# ---------------------------------------------------------------------------
|
|
182
|
+
# Apply pipeline
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _parse_op(op_ref: Any) -> tuple[str, dict[str, Any]]:
|
|
187
|
+
"""Parse op reference from various YAML notations.
|
|
188
|
+
|
|
189
|
+
Supports:
|
|
190
|
+
"strip" -> ("strip", {})
|
|
191
|
+
{"op": "to_date", ...} -> ("to_date", {...})
|
|
192
|
+
{"to_date": {...}} -> ("to_date", {...})
|
|
193
|
+
"""
|
|
194
|
+
if isinstance(op_ref, str):
|
|
195
|
+
return op_ref, {}
|
|
196
|
+
if isinstance(op_ref, dict):
|
|
197
|
+
if "op" in op_ref:
|
|
198
|
+
name = op_ref["op"]
|
|
199
|
+
args = {k: v for k, v in op_ref.items() if k != "op"}
|
|
200
|
+
return name, args
|
|
201
|
+
# single-key dict notation
|
|
202
|
+
if len(op_ref) == 1:
|
|
203
|
+
name = next(iter(op_ref))
|
|
204
|
+
args = op_ref[name] if isinstance(op_ref[name], dict) else {}
|
|
205
|
+
return name, args
|
|
206
|
+
raise TransformError(f"Cannot parse op reference: {op_ref!r}")
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def apply_transforms(value: Any, ops: list[Any]) -> Any:
    """Apply a list of transform ops sequentially to *value*.

    Once an op returns ``None`` the remaining ops are skipped, except
    ``default_if_empty``, which still runs so that a configured default can
    replace the missing value. If it supplies a non-``None`` value, the ops
    after it run normally. Every op reference is parsed and checked against
    the registry even when it is skipped, so a misconfigured pipeline fails
    regardless of the data.

    Args:
        value: Initial value. The first op always runs, even if it is ``None``.
        ops: Op references in any notation understood by ``_parse_op``.

    Returns:
        The transformed value, or ``None`` when the chain produced no value.

    Raises:
        TransformError: On an unparseable or unknown op, or when an op raises.
    """
    short_circuited = False
    for op_ref in ops:
        name, args = _parse_op(op_ref)
        if name not in _OPS:
            raise TransformError(f"Unknown transform op: '{name}'")
        if short_circuited and name != "default_if_empty":
            # Nothing to transform; only a default can revive the value.
            continue
        try:
            value = _OPS[name](value, **args)
        except TransformError:
            raise
        except Exception as exc:
            raise TransformError(f"Op '{name}' failed: {exc}") from exc
        short_circuited = value is None
    return value
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
# ---------------------------------------------------------------------------
|
|
227
|
+
# Compute (simpleeval)
|
|
228
|
+
# ---------------------------------------------------------------------------
|
|
229
|
+
|
|
230
|
+
_SAFE_FUNCTIONS = {
|
|
231
|
+
"min": min,
|
|
232
|
+
"max": max,
|
|
233
|
+
"abs": abs,
|
|
234
|
+
"round": round,
|
|
235
|
+
"len": len,
|
|
236
|
+
"str": str,
|
|
237
|
+
"int": int,
|
|
238
|
+
"float": float,
|
|
239
|
+
# Date helpers for compute expressions (e.g. year_now() - year(ngay_sinh))
|
|
240
|
+
"year_now": lambda: _dt.date.today().year,
|
|
241
|
+
"year": lambda s: _dt.date.fromisoformat(str(s)).year if s else None,
|
|
242
|
+
"month": lambda s: _dt.date.fromisoformat(str(s)).month if s else None,
|
|
243
|
+
"day": lambda s: _dt.date.fromisoformat(str(s)).day if s else None,
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def compute_field(expr: str, deps: dict[str, Any]) -> Any:
    """Evaluate a compute expression with simpleeval.

    The entries of *deps* are exposed to the expression as names and
    ``_SAFE_FUNCTIONS`` as its callable functions.

    Raises:
        TransformError: If evaluation raises anything. The message
            distinguishes simpleeval's ``InvalidExpression`` from other
            exceptions.
    """
    sandbox = EvalWithCompoundTypes(names=deps, functions=_SAFE_FUNCTIONS)
    try:
        result = sandbox.eval(expr)
    except Exception as exc:
        if isinstance(exc, InvalidExpression):
            message = f"compute expression failed: {exc!r}"
        else:
            message = f"compute expression error: {exc}"
        raise TransformError(message) from exc
    return result
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
# ---------------------------------------------------------------------------
|
|
262
|
+
# Topological sort for compute fields
|
|
263
|
+
# ---------------------------------------------------------------------------
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def topo_sort_compute(fields: list[dict[str, Any]]) -> list[str]:
    """Return compute-field keys in dependency order.

    Only fields with a truthy ``"compute"`` entry take part. A dependency
    listed in ``"deps"`` that is not itself a compute field is ignored for
    ordering. Fields with no ordering constraint between them keep their
    declaration order.

    Args:
        fields: Field definitions; each needs a ``"key"`` and may carry
            ``"compute"`` and ``"deps"`` (a missing or null ``"deps"`` means
            no dependencies).

    Returns:
        Keys ordered so that every field comes after the compute fields it
        depends on.

    Raises:
        CyclicComputeError: If the dependencies cannot be fully ordered. The
            message lists, in declaration order, every field that could not
            be placed (cycle members and anything depending on them).
    """
    # Local import keeps this block self-contained.
    from collections import deque

    compute_fields = {f["key"]: f for f in fields if f.get("compute")}

    # Adjacency: dep -> fields that depend on it, plus unmet-dependency counts.
    dependents: dict[str, list[str]] = {k: [] for k in compute_fields}
    in_degree: dict[str, int] = dict.fromkeys(compute_fields, 0)
    for key, field in compute_fields.items():
        for dep in field.get("deps") or []:
            if dep in compute_fields:
                dependents[dep].append(key)
                in_degree[key] += 1

    # Kahn's algorithm; deque gives O(1) pops from the front.
    ready = deque(k for k, deg in in_degree.items() if deg == 0)
    order: list[str] = []
    while ready:
        node = ready.popleft()
        order.append(node)
        for dependent in dependents[node]:
            in_degree[dependent] -= 1
            if in_degree[dependent] == 0:
                ready.append(dependent)

    if len(order) != len(compute_fields):
        placed = set(order)
        # Same "{...}" rendering as a set, but in stable declaration order.
        stuck = ", ".join(repr(k) for k in compute_fields if k not in placed)
        raise CyclicComputeError(f"Cyclic compute dependency detected: {{{stuck}}}")

    return order
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def all_op_names() -> list[str]:
    """List the names of every op currently in the ``_OPS`` registry."""
    return [*_OPS]
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ocr-postprocess
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.
|
|
5
|
+
Home-page: https://github.com/ohmygodvt95/ocr-postprocess
|
|
6
|
+
Author: ohmygodvt95
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: ocr post-processing nlp document extraction vietnamese
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Text Processing
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: pydantic>=2.5
|
|
21
|
+
Requires-Dist: PyYAML>=6.0
|
|
22
|
+
Requires-Dist: rapidfuzz>=3.5
|
|
23
|
+
Requires-Dist: regex>=2024.1.24
|
|
24
|
+
Requires-Dist: typer>=0.12
|
|
25
|
+
Requires-Dist: python-dateutil>=2.9
|
|
26
|
+
Requires-Dist: simpleeval>=0.9.13
|
|
27
|
+
Dynamic: author
|
|
28
|
+
Dynamic: classifier
|
|
29
|
+
Dynamic: description
|
|
30
|
+
Dynamic: description-content-type
|
|
31
|
+
Dynamic: home-page
|
|
32
|
+
Dynamic: keywords
|
|
33
|
+
Dynamic: license
|
|
34
|
+
Dynamic: requires-dist
|
|
35
|
+
Dynamic: requires-python
|
|
36
|
+
Dynamic: summary
|
|
37
|
+
|
|
38
|
+
# ocr-postprocess
|
|
39
|
+
|
|
40
|
+
Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, và render ra JSON/Markdown.
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install ocr-postprocess
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Hoặc cài từ source:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
git clone https://github.com/ohmygodvt95/ocr-postprocess
|
|
52
|
+
cd ocr-postprocess
|
|
53
|
+
pip install -e .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Library usage
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from ocr_postprocess import Pipeline, ProcessedDocument, OcrPostprocessError
|
|
60
|
+
|
|
61
|
+
# Sử dụng profiles bundled sẵn (no extra files needed)
|
|
62
|
+
pipeline = Pipeline.from_default()
|
|
63
|
+
|
|
64
|
+
raw_text = open("scan.txt").read()
|
|
65
|
+
|
|
66
|
+
try:
|
|
67
|
+
doc: ProcessedDocument = pipeline.process(raw_text)
|
|
68
|
+
except OcrPostprocessError as exc:
|
|
69
|
+
print(f"Pipeline error: {exc}")
|
|
70
|
+
raise
|
|
71
|
+
|
|
72
|
+
# Lấy một trường
|
|
73
|
+
name_candidate = doc.get("ho_va_ten")
|
|
74
|
+
if name_candidate:
|
|
75
|
+
print(name_candidate.value) # "NGUYỄN VĂN A"
|
|
76
|
+
print(name_candidate.confidence) # 0.91
|
|
77
|
+
|
|
78
|
+
# Toàn bộ trường đã trích
|
|
79
|
+
fields = {c.key: c.value for c in doc.candidates}
|
|
80
|
+
|
|
81
|
+
# Export JSON
|
|
82
|
+
import json
|
|
83
|
+
print(json.dumps(doc.to_json(), ensure_ascii=False, indent=2))
|
|
84
|
+
|
|
85
|
+
# Export Markdown
|
|
86
|
+
print(doc.markdown)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Custom profiles directory
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
# Dùng thư mục profiles riêng
|
|
93
|
+
pipeline = Pipeline.from_default(profiles_dir="my_profiles/")
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Classify only
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
profile_id, score = pipeline.classify(raw_text)
|
|
100
|
+
# "cccd_2024", 0.97
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### ProcessedDocument fields
|
|
104
|
+
|
|
105
|
+
| Field | Type | Mô tả |
|
|
106
|
+
|---|---|---|
|
|
107
|
+
| `profile_id` | `str` | Profile được match |
|
|
108
|
+
| `profile_score` | `float` | Điểm classify (0–1) |
|
|
109
|
+
| `candidates` | `list[Candidate]` | Tất cả trường đã trích |
|
|
110
|
+
| `overall_confidence` | `float` | Điểm tin cậy tổng hợp |
|
|
111
|
+
| `warnings` | `list[str]` | Cảnh báo từ pipeline |
|
|
112
|
+
| `markdown` | `str` | Kết quả render Markdown |
|
|
113
|
+
| `cross_checks` | `list[CrossCheck]` | Kết quả cross-check |
|
|
114
|
+
|
|
115
|
+
## CLI
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Process a document
|
|
119
|
+
ocrpp process scan.txt
|
|
120
|
+
|
|
121
|
+
# Markdown output
|
|
122
|
+
ocrpp process scan.txt --format markdown
|
|
123
|
+
|
|
124
|
+
# Classify only
|
|
125
|
+
ocrpp classify scan.txt
|
|
126
|
+
|
|
127
|
+
# Validate a profile
|
|
128
|
+
ocrpp validate-profile profiles/my_profile.yml
|
|
129
|
+
|
|
130
|
+
# Custom profiles directory
|
|
131
|
+
ocrpp process scan.txt --profiles ./my_profiles/
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Adding a custom profile
|
|
135
|
+
|
|
136
|
+
Tạo file YAML trong thư mục profiles của bạn:
|
|
137
|
+
|
|
138
|
+
```yaml
|
|
139
|
+
id: my_doc
|
|
140
|
+
version: 1
|
|
141
|
+
display_name: "My document type"
|
|
142
|
+
|
|
143
|
+
classify:
|
|
144
|
+
any_of:
|
|
145
|
+
- contains_any: ["MY DOCUMENT HEADER"]
|
|
146
|
+
|
|
147
|
+
extract:
|
|
148
|
+
- name: document_number
|
|
149
|
+
aliases: ["Document No", "Số chứng từ"]
|
|
150
|
+
extractor: value_in_same_line
|
|
151
|
+
required: true
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Sau đó:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
pipeline = Pipeline.from_default(profiles_dir="my_profiles/")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Exceptions
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from ocr_postprocess import (
|
|
164
|
+
OcrPostprocessError, # base
|
|
165
|
+
ProfileNotFoundError,
|
|
166
|
+
ProfileValidationError,
|
|
167
|
+
ExtractorNotFoundError,
|
|
168
|
+
TransformError,
|
|
169
|
+
)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Development
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
python -m venv .venv && source .venv/bin/activate
|
|
176
|
+
pip install -r requirements-dev.txt
|
|
177
|
+
pip install -e .
|
|
178
|
+
|
|
179
|
+
pytest # all tests
|
|
180
|
+
pytest tests/unit # unit only
|
|
181
|
+
pytest -m golden # golden/regression
|
|
182
|
+
pytest -n auto --cov # parallel + coverage
|
|
183
|
+
ruff check . && black --check .
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Docs
|
|
187
|
+
|
|
188
|
+
Xem [docs/README.md](docs/README.md) để biết chi tiết về pipeline stages và profile schema.
|
|
189
|
+
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
ocr_postprocess/__init__.py,sha256=f5tskc50hqq_cbaWIRgrs6dw2_-2UKFk3c_hxG_bAiY,861
|
|
2
|
+
ocr_postprocess/classifier.py,sha256=PENB6t0SWRzMxc9UGDj9nP4AFqWJHFYua3u0JXqWmFU,1986
|
|
3
|
+
ocr_postprocess/cli.py,sha256=pSdg9nhpue8Ql-vf0O6SGk5gwctjipG_OAIidsxHB0w,4259
|
|
4
|
+
ocr_postprocess/exceptions.py,sha256=lHa1r4B1b1TOPdElV3da-mYws-Uu7IQNBgCGCN-taFQ,856
|
|
5
|
+
ocr_postprocess/models.py,sha256=9Fb7t5DT7CrcnKd2Vnmn1OSzHYFkxqCLkVSnXQMYCcA,4510
|
|
6
|
+
ocr_postprocess/pipeline.py,sha256=QLEaU5Fjg0lj1LQvM573ZGM8Zz4o_41BOrTXq1h2W18,6637
|
|
7
|
+
ocr_postprocess/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
ocr_postprocess/scorer.py,sha256=i1bPKkWdBxJUq4GVVPxE7GSSkLhs0YgYKERZav3f9tk,2738
|
|
9
|
+
ocr_postprocess/transformer.py,sha256=ouN5sZZAnQmB22AJr0oG5XmmxrzUiD0xrn1GTRLCNJ4,9349
|
|
10
|
+
ocr_postprocess/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
ocr_postprocess/engine/denoiser.py,sha256=77UNAv75TlV4sVrLT5LHXLUnOdVQBn4UbJtDU0Pq6iQ,4345
|
|
12
|
+
ocr_postprocess/engine/extractor_stage.py,sha256=ArpK-V0oVEkP2yKl_D2x68blCeeYSc4CORZc1TJ_y0Q,3423
|
|
13
|
+
ocr_postprocess/engine/normalizer.py,sha256=BBqAzeyNV2wv-5ewu5SJMte-kjgo9jgVekimh7maQys,3878
|
|
14
|
+
ocr_postprocess/engine/reconciler.py,sha256=ohxErHyn0PavcgUldnhfv442-HvEdtk6PlLyubooP8s,6156
|
|
15
|
+
ocr_postprocess/engine/reconstructor.py,sha256=sVmn-8nygFTlT31aWe-uxbfGDdgEr1bwW-78fIfjuMo,16645
|
|
16
|
+
ocr_postprocess/engine/transform_stage.py,sha256=DIz44lOUDQodS1hUmEtRGw2mF_dxfyPp1LTlzxt-xz4,3330
|
|
17
|
+
ocr_postprocess/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
ocr_postprocess/extractors/base.py,sha256=u8bzVK0wxv3X-J_lfICEHwG-L__89j5w_XbDHErfChY,2857
|
|
19
|
+
ocr_postprocess/extractors/helpers.py,sha256=kzY-g7KfGI1iySIN0oDzDqtLOE9fIMIVP551OqrSrs0,2141
|
|
20
|
+
ocr_postprocess/extractors/registry.py,sha256=ULh_BoyUbC17VMdnXv1ffzFokZizJXfsxx53ziLbs3U,1092
|
|
21
|
+
ocr_postprocess/extractors/universal.py,sha256=98IP5HfIneLeLHJsvXu1Spxado0XNAsbL_M-x-hEMM0,991
|
|
22
|
+
ocr_postprocess/extractors/label_anchor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
|
+
ocr_postprocess/extractors/label_anchor/line_after_label.py,sha256=gKFzsveaAerUxox-i3BvFSZWIZeeqn33N4jIRxbPECc,1879
|
|
24
|
+
ocr_postprocess/extractors/label_anchor/regex_after_label.py,sha256=PTzOoj8MdXcy9iHZHgbqJ0C76ZdYHeKY4taSVY5vx6s,2802
|
|
25
|
+
ocr_postprocess/extractors/label_anchor/text_until_next_label.py,sha256=jO7JIW7VQNSuhbCkAhDtV43dgyhk70HyCY6f_928-W4,2957
|
|
26
|
+
ocr_postprocess/extractors/label_anchor/value_between_labels.py,sha256=LTUyExIAuhezjy9jTBUvvRDfHppb2bXI_kjvo4Ds20Q,2284
|
|
27
|
+
ocr_postprocess/extractors/label_anchor/value_in_same_line.py,sha256=7aTKbNm0qoBQ86_ovYGEb6egqTHx8fpNhSDLdvcKd68,2078
|
|
28
|
+
ocr_postprocess/extractors/pattern/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
|
+
ocr_postprocess/extractors/pattern/cccd.py,sha256=g6gNwbJnVDN7gJtn3aaPGOPGzBSZPVc9WohlTrUYePU,2651
|
|
30
|
+
ocr_postprocess/extractors/pattern/cmnd.py,sha256=mfETtPV9xVRzhmcOrN73BySoyEkVKv8z9OfPsxaq5ok,1150
|
|
31
|
+
ocr_postprocess/extractors/pattern/currency_vnd.py,sha256=cFVE_6O4feOGwd961YxByvmVqg_NatBMT8wjvVlmn-8,1541
|
|
32
|
+
ocr_postprocess/extractors/pattern/date.py,sha256=MSsR0vpYi_ElvaeUdOsfyEKE2ulw4mwe5sjBmwqqQqo,2906
|
|
33
|
+
ocr_postprocess/extractors/pattern/email.py,sha256=u0qE31YsAWXhWpaHJbFQE4N1CWOsvXj_1Wnb5gxWEDY,1157
|
|
34
|
+
ocr_postprocess/extractors/pattern/gender_vn.py,sha256=yWYGYmjH4PUyn87tcdl5RQt8f21ChG-EngJYfZd7fP8,1383
|
|
35
|
+
ocr_postprocess/extractors/pattern/phone_vn.py,sha256=5quJ22uUsJySn0EBUvbdY3iuh8Pk8U1QA1E6OWn_YUY,1972
|
|
36
|
+
ocr_postprocess/extractors/pattern/plate_vn.py,sha256=GXWdd2qlBDT_mOQUd-ZP25WLJ5mde5mQyti8Zn1IDpo,1284
|
|
37
|
+
ocr_postprocess/extractors/pattern/tax_code.py,sha256=Ywd3fRT8Pjdu5CZDvoeFOArWyGQKbIpjEuXR2BmwcWw,1775
|
|
38
|
+
ocr_postprocess/extractors/structured/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
+
ocr_postprocess/extractors/structured/mrz_cccd.py,sha256=DAd-BIxW78Pqf3j3xhNVYp0Xrqzz5qQeKXxhaCC9OYc,3623
|
|
40
|
+
ocr_postprocess/profiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
+
ocr_postprocess/profiles/_generic.yml,sha256=oD73q5wnj76zhiK4CvMy8qxuhec1Sj-J-Rws8tPTs_M,161
|
|
42
|
+
ocr_postprocess/profiles/cccd_2024.yml,sha256=pQagCWOzOJ-pHWKlNYr131FlNcOVfacvhZJt2tUGqmg,3568
|
|
43
|
+
ocr_postprocess/profiles/dang_kiem.yml,sha256=QPryEckEkIohYkQeVuUsSh7Na6Z9uWOo45eCkO5HYec,2890
|
|
44
|
+
ocr_postprocess/profiles/loader.py,sha256=pxS1R9UEMgOovhS9BEfmDQydt64oRFafUAxgbc5y2Cg,2108
|
|
45
|
+
ocr_postprocess/profiles/matcher.py,sha256=VCqV72NMqITw8Y62quJut7ZxmRwC3sD_1kDo0a1TUKo,2092
|
|
46
|
+
ocr_postprocess/profiles/schema.py,sha256=tUnYd5BfLMG1B9v9a7Qfa-jIoz6GwJcV_daAkkUvXLA,6157
|
|
47
|
+
ocr_postprocess/renderer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
48
|
+
ocr_postprocess/renderer/json_renderer.py,sha256=hR2L0Brs7QN5-GCgPLNsqk2ZBjcjkeCvFE6olVEuTII,1841
|
|
49
|
+
ocr_postprocess/renderer/llm.py,sha256=PIog7ddLaShJDGGiDLLr_L7VITfAzGl0DyLubut0Oog,1350
|
|
50
|
+
ocr_postprocess/renderer/markdown.py,sha256=fklOqrk3m2c8kyAmxDJFKZkpc49M18SUttZkr1CPu6k,6866
|
|
51
|
+
ocr_postprocess-0.1.0.dist-info/METADATA,sha256=iVzgWPUxLNHI_hfZs2BKlB5fK5XlbsjBAKURm5u34Xw,4592
|
|
52
|
+
ocr_postprocess-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
53
|
+
ocr_postprocess-0.1.0.dist-info/entry_points.txt,sha256=TzBHQMR05LYex2OIneKnqqyoM0sDGZGb03VLhH8RAds,50
|
|
54
|
+
ocr_postprocess-0.1.0.dist-info/top_level.txt,sha256=EJBbBVsIAzSLT376rE4C9-wYGhR6P8OMw2117it7dz8,16
|
|
55
|
+
ocr_postprocess-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ocr_postprocess
|