ocr-postprocess 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. ocr_postprocess/__init__.py +33 -0
  2. ocr_postprocess/classifier.py +63 -0
  3. ocr_postprocess/cli.py +130 -0
  4. ocr_postprocess/engine/__init__.py +0 -0
  5. ocr_postprocess/engine/denoiser.py +134 -0
  6. ocr_postprocess/engine/extractor_stage.py +107 -0
  7. ocr_postprocess/engine/normalizer.py +128 -0
  8. ocr_postprocess/engine/reconciler.py +170 -0
  9. ocr_postprocess/engine/reconstructor.py +469 -0
  10. ocr_postprocess/engine/transform_stage.py +89 -0
  11. ocr_postprocess/exceptions.py +30 -0
  12. ocr_postprocess/extractors/__init__.py +0 -0
  13. ocr_postprocess/extractors/base.py +103 -0
  14. ocr_postprocess/extractors/helpers.py +63 -0
  15. ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  16. ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
  17. ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
  18. ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
  19. ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
  20. ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
  21. ocr_postprocess/extractors/pattern/__init__.py +0 -0
  22. ocr_postprocess/extractors/pattern/cccd.py +120 -0
  23. ocr_postprocess/extractors/pattern/cmnd.py +38 -0
  24. ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
  25. ocr_postprocess/extractors/pattern/date.py +89 -0
  26. ocr_postprocess/extractors/pattern/email.py +38 -0
  27. ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
  28. ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
  29. ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
  30. ocr_postprocess/extractors/pattern/tax_code.py +53 -0
  31. ocr_postprocess/extractors/registry.py +45 -0
  32. ocr_postprocess/extractors/structured/__init__.py +0 -0
  33. ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
  34. ocr_postprocess/extractors/universal.py +39 -0
  35. ocr_postprocess/models.py +131 -0
  36. ocr_postprocess/pipeline.py +179 -0
  37. ocr_postprocess/profiles/__init__.py +0 -0
  38. ocr_postprocess/profiles/_generic.yml +13 -0
  39. ocr_postprocess/profiles/cccd_2024.yml +113 -0
  40. ocr_postprocess/profiles/dang_kiem.yml +105 -0
  41. ocr_postprocess/profiles/loader.py +63 -0
  42. ocr_postprocess/profiles/matcher.py +71 -0
  43. ocr_postprocess/profiles/schema.py +197 -0
  44. ocr_postprocess/py.typed +0 -0
  45. ocr_postprocess/renderer/__init__.py +0 -0
  46. ocr_postprocess/renderer/json_renderer.py +59 -0
  47. ocr_postprocess/renderer/llm.py +41 -0
  48. ocr_postprocess/renderer/markdown.py +172 -0
  49. ocr_postprocess/scorer.py +78 -0
  50. ocr_postprocess/transformer.py +304 -0
  51. ocr_postprocess-0.1.0.dist-info/METADATA +189 -0
  52. ocr_postprocess-0.1.0.dist-info/RECORD +55 -0
  53. ocr_postprocess-0.1.0.dist-info/WHEEL +5 -0
  54. ocr_postprocess-0.1.0.dist-info/entry_points.txt +2 -0
  55. ocr_postprocess-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,304 @@
1
+ """Transform pipeline: built-in ops and simpleeval compute."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as _dt
6
+ import logging
7
+ import re
8
+ from typing import Any, Callable
9
+
10
+ from dateutil import parser as dateutil_parser
11
+ from simpleeval import EvalWithCompoundTypes, InvalidExpression
12
+
13
+ from ocr_postprocess.exceptions import CyclicComputeError, TransformError
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
# Registry of transform ops, keyed by the name used in op references.
_OPS: dict[str, Callable[..., Any]] = {}


def transform_op(name: str):
    """Return a decorator that registers the decorated callable under *name*.

    The callable is stored in the module-level ``_OPS`` registry and handed
    back unchanged, so it stays directly callable. Registering a name twice
    replaces the earlier entry.
    """

    def register(fn: Callable) -> Callable:
        _OPS[name] = fn
        return fn

    return register
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Built-in ops
32
+ # ---------------------------------------------------------------------------
33
+
34
+
35
@transform_op("strip")
def _strip(value: Any, **_: Any) -> Any:
    """Trim surrounding whitespace from a string; return other values as-is."""
    if isinstance(value, str):
        return value.strip()
    return value
38
+
39
+
40
@transform_op("upper")
def _upper(value: Any, **_: Any) -> Any:
    """Upper-case a string; return other values as-is."""
    if isinstance(value, str):
        return value.upper()
    return value
43
+
44
+
45
@transform_op("lower")
def _lower(value: Any, **_: Any) -> Any:
    """Lower-case a string; return other values as-is."""
    if isinstance(value, str):
        return value.lower()
    return value
48
+
49
+
50
@transform_op("replace")
def _replace(value: Any, from_: str = "", to: str = "", **extra: Any) -> Any:
    """Replace every occurrence of a substring in a string value.

    Args:
        value: Value to transform; non-strings are returned unchanged.
        from_: Substring to search for. Because ``from`` is a Python keyword,
            the plain ``from`` spelling is also accepted (via ``extra``) when
            ``from_`` is not supplied.
        to: Replacement text.
        **extra: Remaining op arguments; ignored apart from the ``from`` alias.

    Returns:
        The string with replacements applied, or ``value`` untouched when it
        is not a string or no search text was given.
    """
    if not isinstance(value, str):
        return value
    needle = from_ or extra.get("from") or ""
    if not needle:
        # str.replace("", to) would insert `to` between every character,
        # so a missing search string is treated as a no-op.
        return value
    return value.replace(needle, to)
55
+
56
+
57
@transform_op("regex_group")
def _regex_group(value: Any, pattern: str = "", group: int = 1, **_: Any) -> Any:
    """Return one capture group of the first match of *pattern* in a string.

    Non-string values are returned unchanged. ``None`` is returned when the
    pattern does not match or the requested group does not exist.
    """
    if not isinstance(value, str):
        return value
    found = re.search(pattern, value)
    if found is None:
        return None
    try:
        return found.group(group)
    except IndexError:
        # The pattern has no such group.
        return None
68
+
69
+
70
@transform_op("regex_sub")
def _regex_sub(value: Any, pattern: str = "", repl: str = "", **_: Any) -> Any:
    """Substitute every match of *pattern* with *repl* in a string value.

    Non-string values are returned unchanged.
    """
    if isinstance(value, str):
        return re.sub(pattern, repl, value)
    return value
75
+
76
+
77
@transform_op("split_take")
def _split_take(value: Any, sep: str = "/", index: int = 0, **_: Any) -> Any:
    """Split a string on *sep* and return the stripped piece at *index*.

    Non-string values are returned unchanged. ``None`` is returned when
    *index* is negative or past the last piece.
    """
    if not isinstance(value, str):
        return value
    pieces = value.split(sep)
    if index < 0 or index >= len(pieces):
        return None
    return pieces[index].strip()
83
+
84
+
85
@transform_op("split_first")
def _split_first(value: Any, sep: str = "/", **_: Any) -> Any:
    """Return the first piece of *value* split on *sep* (``split_take`` at index 0)."""
    first_piece = _split_take(value, sep=sep, index=0)
    return first_piece
88
+
89
+
90
@transform_op("strip_unit")
def _strip_unit(value: Any, units: list[str] | None = None, **_: Any) -> Any:
    """Strip trailing unit suffixes (case-insensitive) from a string value.

    The value is whitespace-trimmed first; each unit in *units* is then
    removed from the end in the order given, together with the whitespace
    around it. Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    for suffix in units or ():
        # Escape the unit so characters such as "." or "/" match literally.
        trailing = rf"\s*{re.escape(suffix)}\s*$"
        text = re.sub(trailing, "", text, flags=re.IGNORECASE).strip()
    return text
100
+
101
+
102
@transform_op("dedup_words")
def _dedup_words(value: Any, **_: Any) -> Any:
    """Collapse runs of identical adjacent words in a string.

    Words are whitespace-separated and compared exactly; the result is
    re-joined with single spaces. Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    collapsed: list[str] = []
    for word in value.split():
        if collapsed and collapsed[-1] == word:
            continue  # same as the previous word: drop the repeat
        collapsed.append(word)
    return " ".join(collapsed)
112
+
113
+
114
@transform_op("to_int")
def _to_int(value: Any, **_: Any) -> Any:
    """Convert a value to ``int``.

    Integers are returned unchanged. For strings, "," and "." are removed and
    every character other than digits and "-" is dropped before conversion.
    Returns ``None`` when the cleaned text is not a valid integer, or when the
    value is neither an int nor a string.
    """
    if isinstance(value, int):
        return value
    if not isinstance(value, str):
        return None
    without_separators = value.replace(",", "").replace(".", "")
    digits = re.sub(r"[^\d\-]", "", without_separators)
    try:
        return int(digits)
    except (ValueError, TypeError):
        return None
125
+
126
+
127
@transform_op("to_float")
def _to_float(value: Any, **_: Any) -> Any:
    """Convert a value to ``float``.

    Floats are returned unchanged and plain integers are widened to float.
    For strings, "," is treated as the decimal separator (replaced by ".")
    and surrounding whitespace is trimmed before conversion.

    Returns:
        The float value, or ``None`` when the input cannot be converted
        (unparseable text, an integer too large for a float, or an
        unsupported type such as ``bool``).
    """
    if isinstance(value, float):
        return value
    # bool is a subclass of int but is not a meaningful numeric input here.
    if isinstance(value, int) and not isinstance(value, bool):
        try:
            return float(value)
        except OverflowError:
            return None
    if isinstance(value, str):
        cleaned = value.replace(",", ".").strip()
        try:
            return float(cleaned)
        except (ValueError, TypeError):
            return None
    return None
138
+
139
+
140
@transform_op("to_date")
def _to_date(value: Any, format: str = "", output: str = "iso", **_: Any) -> Any:
    """Parse a date string (day-first) and return it as ``YYYY-MM-DD``.

    Args:
        value: Value to parse; non-strings are returned unchanged.
        format: Accepted for interface compatibility; currently not used.
        output: Accepted for interface compatibility; the result is the ISO
            date string for every value.

    Returns:
        The ISO date string, or ``None`` when the text cannot be parsed.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    # Already ISO YYYY-MM-DD: return unchanged to avoid mis-parsing with
    # dayfirst=True ("2024-06-10" would otherwise become "2024-10-06").
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", text):
        return text
    try:
        parsed = dateutil_parser.parse(value, dayfirst=True)
    except (ValueError, TypeError, OverflowError):
        # dateutil raises ParserError (a ValueError) for unknown formats and
        # OverflowError for out-of-range numeric components.
        return None
    return parsed.date().isoformat()
155
+
156
+
157
@transform_op("pick")
def _pick(value: Any, kind: str = "year", **_: Any) -> Any:
    """Pick a sub-value out of a string; only ``kind="year"`` is implemented.

    For ``kind="year"`` the first standalone four-digit number starting with
    19 or 20 is returned, or ``None`` when there is none. Non-string values
    and any other *kind* return the value unchanged.
    """
    if not isinstance(value, str) or kind != "year":
        return value
    hit = re.search(r"\b(19|20)\d{2}\b", value)
    return hit.group(0) if hit else None
165
+
166
+
167
@transform_op("enum_normalize")
def _enum_normalize(value: Any, map: dict[str, str] | None = None, **_: Any) -> Any:
    """Map a string onto its canonical form using the *map* lookup table.

    The exact value is looked up first, then its lower-cased form; when
    neither is a key the value is returned unchanged. Non-string values, or a
    missing/empty *map*, also return the value unchanged.
    """
    if not isinstance(value, str) or not map:
        return value
    lowercase_match = map.get(value.lower(), value)
    return map.get(value, lowercase_match)
172
+
173
+
174
@transform_op("default_if_empty")
def _default_if_empty(value: Any, default: Any = None, **_: Any) -> Any:
    """Return *default* when the value is ``None``, ``""`` or ``[]``; else the value."""
    is_empty = value is None or value == "" or value == []
    return default if is_empty else value
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # Apply pipeline
183
+ # ---------------------------------------------------------------------------
184
+
185
+
186
def _parse_op(op_ref: Any) -> tuple[str, dict[str, Any]]:
    """Normalize an op reference into a ``(name, args)`` pair.

    Accepted notations:
        "strip"                 -> ("strip", {})
        {"op": "to_date", ...}  -> ("to_date", {...remaining keys...})
        {"to_date": {...}}      -> ("to_date", {...})

    In the single-key notation a non-dict value yields empty args.

    Raises:
        TransformError: If the reference matches none of the notations.
    """
    if isinstance(op_ref, str):
        return op_ref, {}
    if isinstance(op_ref, dict):
        if "op" in op_ref:
            extras = {k: v for k, v in op_ref.items() if k != "op"}
            return op_ref["op"], extras
        # single-key dict notation
        if len(op_ref) == 1:
            ((op_name, payload),) = op_ref.items()
            return op_name, (payload if isinstance(payload, dict) else {})
    raise TransformError(f"Cannot parse op reference: {op_ref!r}")
207
+
208
+
209
def apply_transforms(value: Any, ops: list[Any]) -> Any:
    """Run *value* through each op in *ops*, in order, and return the result.

    Processing stops as soon as an op produces ``None``.

    Raises:
        TransformError: If an op reference cannot be parsed, names an
            unregistered op, or the op itself raises (the original exception
            is chained as the cause).
    """
    current = value
    for op_ref in ops:
        name, args = _parse_op(op_ref)
        if name not in _OPS:
            raise TransformError(f"Unknown transform op: '{name}'")
        try:
            current = _OPS[name](current, **args)
        except TransformError:
            raise
        except Exception as exc:
            raise TransformError(f"Op '{name}' failed: {exc}") from exc
        if current is None:
            # Nothing left to transform; skip the remaining ops.
            return None
    return current
224
+
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # Compute (simpleeval)
228
+ # ---------------------------------------------------------------------------
229
+
230
def _iso_date_part(part: str) -> Callable[[Any], Any]:
    """Build a helper returning attribute *part* of an ISO date string."""

    def getter(s: Any) -> Any:
        # Falsy input (None, "") yields None instead of raising.
        return getattr(_dt.date.fromisoformat(str(s)), part) if s else None

    return getter


# Functions exposed to compute expressions.
_SAFE_FUNCTIONS = {
    "min": min,
    "max": max,
    "abs": abs,
    "round": round,
    "len": len,
    "str": str,
    "int": int,
    "float": float,
    # Date helpers for compute expressions (e.g. year_now() - year(ngay_sinh))
    "year_now": lambda: _dt.date.today().year,
    "year": _iso_date_part("year"),
    "month": _iso_date_part("month"),
    "day": _iso_date_part("day"),
}
245
+
246
+
247
def compute_field(expr: str, deps: dict[str, Any]) -> Any:
    """Evaluate the simpleeval expression *expr* and return its result.

    The entries of *deps* are available to the expression as names, and the
    callables in ``_SAFE_FUNCTIONS`` as functions.

    Raises:
        TransformError: If evaluation fails for any reason; the original
            exception is chained as the cause.
    """
    engine = EvalWithCompoundTypes(names=deps, functions=_SAFE_FUNCTIONS)
    try:
        result = engine.eval(expr)
    except InvalidExpression as exc:
        raise TransformError(f"compute expression failed: {exc!r}") from exc
    except Exception as exc:
        raise TransformError(f"compute expression error: {exc}") from exc
    return result
259
+
260
+
261
+ # ---------------------------------------------------------------------------
262
+ # Topological sort for compute fields
263
+ # ---------------------------------------------------------------------------
264
+
265
+
266
def topo_sort_compute(fields: list[dict[str, Any]]) -> list[str]:
    """Return the keys of compute fields in dependency order.

    Only fields with a truthy ``compute`` entry are included. A field's
    ``deps`` entries that do not name another compute field are ignored; a
    missing or null ``deps`` is treated as "no dependencies".

    Args:
        fields: Field definitions; each compute field must have a ``key``.

    Returns:
        Keys ordered so that every field comes after the compute fields it
        depends on.

    Raises:
        CyclicComputeError: If the dependencies contain a cycle.
    """
    from collections import deque

    compute_fields = {f["key"]: f for f in fields if f.get("compute")}

    # Adjacency: dep -> fields that depend on it, plus unmet-dependency counts.
    dependents: dict[str, list[str]] = {key: [] for key in compute_fields}
    in_degree: dict[str, int] = dict.fromkeys(compute_fields, 0)

    for key, field in compute_fields.items():
        # `or ()` also covers an explicit `deps: null`.
        for dep in field.get("deps") or ():
            if dep in compute_fields:
                dependents[dep].append(key)
                in_degree[key] += 1

    # Kahn's algorithm; a deque gives O(1) pops from the front.
    ready = deque(key for key, degree in in_degree.items() if degree == 0)
    order: list[str] = []
    while ready:
        node = ready.popleft()
        order.append(node)
        for dependent in dependents[node]:
            in_degree[dependent] -= 1
            if in_degree[dependent] == 0:
                ready.append(dependent)

    if len(order) != len(compute_fields):
        # Whatever was never emitted sits on, or behind, a cycle.
        cycle_keys = set(compute_fields) - set(order)
        raise CyclicComputeError(f"Cyclic compute dependency detected: {cycle_keys}")

    return order
300
+
301
+
302
def all_op_names() -> list[str]:
    """Return the names of all registered transform ops, in registration order."""
    return [*_OPS]
@@ -0,0 +1,189 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr-postprocess
3
+ Version: 0.1.0
4
+ Summary: Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.
5
+ Home-page: https://github.com/ohmygodvt95/ocr-postprocess
6
+ Author: ohmygodvt95
7
+ License: MIT
8
+ Keywords: ocr post-processing nlp document extraction vietnamese
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Text Processing
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: pydantic>=2.5
21
+ Requires-Dist: PyYAML>=6.0
22
+ Requires-Dist: rapidfuzz>=3.5
23
+ Requires-Dist: regex>=2024.1.24
24
+ Requires-Dist: typer>=0.12
25
+ Requires-Dist: python-dateutil>=2.9
26
+ Requires-Dist: simpleeval>=0.9.13
27
+ Dynamic: author
28
+ Dynamic: classifier
29
+ Dynamic: description
30
+ Dynamic: description-content-type
31
+ Dynamic: home-page
32
+ Dynamic: keywords
33
+ Dynamic: license
34
+ Dynamic: requires-dist
35
+ Dynamic: requires-python
36
+ Dynamic: summary
37
+
38
+ # ocr-postprocess
39
+
40
+ Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, và render ra JSON/Markdown.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install ocr-postprocess
46
+ ```
47
+
48
+ Hoặc cài từ source:
49
+
50
+ ```bash
51
+ git clone https://github.com/ohmygodvt95/ocr-postprocess
52
+ cd ocr-postprocess
53
+ pip install -e .
54
+ ```
55
+
56
+ ## Library usage
57
+
58
+ ```python
59
+ from ocr_postprocess import Pipeline, ProcessedDocument, OcrPostprocessError
60
+
61
+ # Sử dụng profiles bundled sẵn (no extra files needed)
62
+ pipeline = Pipeline.from_default()
63
+
64
+ raw_text = open("scan.txt").read()
65
+
66
+ try:
67
+ doc: ProcessedDocument = pipeline.process(raw_text)
68
+ except OcrPostprocessError as exc:
69
+ print(f"Pipeline error: {exc}")
70
+ raise
71
+
72
+ # Lấy một trường
73
+ name_candidate = doc.get("ho_va_ten")
74
+ if name_candidate:
75
+ print(name_candidate.value) # "NGUYỄN VĂN A"
76
+ print(name_candidate.confidence) # 0.91
77
+
78
+ # Toàn bộ trường đã trích
79
+ fields = {c.key: c.value for c in doc.candidates}
80
+
81
+ # Export JSON
82
+ import json
83
+ print(json.dumps(doc.to_json(), ensure_ascii=False, indent=2))
84
+
85
+ # Export Markdown
86
+ print(doc.markdown)
87
+ ```
88
+
89
+ ### Custom profiles directory
90
+
91
+ ```python
92
+ # Dùng thư mục profiles riêng
93
+ pipeline = Pipeline.from_default(profiles_dir="my_profiles/")
94
+ ```
95
+
96
+ ### Classify only
97
+
98
+ ```python
99
+ profile_id, score = pipeline.classify(raw_text)
100
+ # "cccd_2024", 0.97
101
+ ```
102
+
103
+ ### ProcessedDocument fields
104
+
105
+ | Field | Type | Mô tả |
106
+ |---|---|---|
107
+ | `profile_id` | `str` | Profile được match |
108
+ | `profile_score` | `float` | Điểm classify (0–1) |
109
+ | `candidates` | `list[Candidate]` | Tất cả trường đã trích |
110
+ | `overall_confidence` | `float` | Điểm tin cậy tổng hợp |
111
+ | `warnings` | `list[str]` | Cảnh báo từ pipeline |
112
+ | `markdown` | `str` | Kết quả render Markdown |
113
+ | `cross_checks` | `list[CrossCheck]` | Kết quả cross-check |
114
+
115
+ ## CLI
116
+
117
+ ```bash
118
+ # Process a document
119
+ ocrpp process scan.txt
120
+
121
+ # Markdown output
122
+ ocrpp process scan.txt --format markdown
123
+
124
+ # Classify only
125
+ ocrpp classify scan.txt
126
+
127
+ # Validate a profile
128
+ ocrpp validate-profile profiles/my_profile.yml
129
+
130
+ # Custom profiles directory
131
+ ocrpp process scan.txt --profiles ./my_profiles/
132
+ ```
133
+
134
+ ## Adding a custom profile
135
+
136
+ Tạo file YAML trong thư mục profiles của bạn:
137
+
138
+ ```yaml
139
+ id: my_doc
140
+ version: 1
141
+ display_name: "My document type"
142
+
143
+ classify:
144
+ any_of:
145
+ - contains_any: ["MY DOCUMENT HEADER"]
146
+
147
+ extract:
148
+ - name: document_number
149
+ aliases: ["Document No", "Số chứng từ"]
150
+ extractor: value_in_same_line
151
+ required: true
152
+ ```
153
+
154
+ Sau đó:
155
+
156
+ ```python
157
+ pipeline = Pipeline.from_default(profiles_dir="my_profiles/")
158
+ ```
159
+
160
+ ## Exceptions
161
+
162
+ ```python
163
+ from ocr_postprocess import (
164
+ OcrPostprocessError, # base
165
+ ProfileNotFoundError,
166
+ ProfileValidationError,
167
+ ExtractorNotFoundError,
168
+ TransformError,
169
+ )
170
+ ```
171
+
172
+ ## Development
173
+
174
+ ```bash
175
+ python -m venv .venv && source .venv/bin/activate
176
+ pip install -r requirements-dev.txt
177
+ pip install -e .
178
+
179
+ pytest # all tests
180
+ pytest tests/unit # unit only
181
+ pytest -m golden # golden/regression
182
+ pytest -n auto --cov # parallel + coverage
183
+ ruff check . && black --check .
184
+ ```
185
+
186
+ ## Docs
187
+
188
+ Xem [docs/README.md](docs/README.md) để biết chi tiết về pipeline stages và profile schema.
189
+
@@ -0,0 +1,55 @@
1
+ ocr_postprocess/__init__.py,sha256=f5tskc50hqq_cbaWIRgrs6dw2_-2UKFk3c_hxG_bAiY,861
2
+ ocr_postprocess/classifier.py,sha256=PENB6t0SWRzMxc9UGDj9nP4AFqWJHFYua3u0JXqWmFU,1986
3
+ ocr_postprocess/cli.py,sha256=pSdg9nhpue8Ql-vf0O6SGk5gwctjipG_OAIidsxHB0w,4259
4
+ ocr_postprocess/exceptions.py,sha256=lHa1r4B1b1TOPdElV3da-mYws-Uu7IQNBgCGCN-taFQ,856
5
+ ocr_postprocess/models.py,sha256=9Fb7t5DT7CrcnKd2Vnmn1OSzHYFkxqCLkVSnXQMYCcA,4510
6
+ ocr_postprocess/pipeline.py,sha256=QLEaU5Fjg0lj1LQvM573ZGM8Zz4o_41BOrTXq1h2W18,6637
7
+ ocr_postprocess/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ ocr_postprocess/scorer.py,sha256=i1bPKkWdBxJUq4GVVPxE7GSSkLhs0YgYKERZav3f9tk,2738
9
+ ocr_postprocess/transformer.py,sha256=ouN5sZZAnQmB22AJr0oG5XmmxrzUiD0xrn1GTRLCNJ4,9349
10
+ ocr_postprocess/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ ocr_postprocess/engine/denoiser.py,sha256=77UNAv75TlV4sVrLT5LHXLUnOdVQBn4UbJtDU0Pq6iQ,4345
12
+ ocr_postprocess/engine/extractor_stage.py,sha256=ArpK-V0oVEkP2yKl_D2x68blCeeYSc4CORZc1TJ_y0Q,3423
13
+ ocr_postprocess/engine/normalizer.py,sha256=BBqAzeyNV2wv-5ewu5SJMte-kjgo9jgVekimh7maQys,3878
14
+ ocr_postprocess/engine/reconciler.py,sha256=ohxErHyn0PavcgUldnhfv442-HvEdtk6PlLyubooP8s,6156
15
+ ocr_postprocess/engine/reconstructor.py,sha256=sVmn-8nygFTlT31aWe-uxbfGDdgEr1bwW-78fIfjuMo,16645
16
+ ocr_postprocess/engine/transform_stage.py,sha256=DIz44lOUDQodS1hUmEtRGw2mF_dxfyPp1LTlzxt-xz4,3330
17
+ ocr_postprocess/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ ocr_postprocess/extractors/base.py,sha256=u8bzVK0wxv3X-J_lfICEHwG-L__89j5w_XbDHErfChY,2857
19
+ ocr_postprocess/extractors/helpers.py,sha256=kzY-g7KfGI1iySIN0oDzDqtLOE9fIMIVP551OqrSrs0,2141
20
+ ocr_postprocess/extractors/registry.py,sha256=ULh_BoyUbC17VMdnXv1ffzFokZizJXfsxx53ziLbs3U,1092
21
+ ocr_postprocess/extractors/universal.py,sha256=98IP5HfIneLeLHJsvXu1Spxado0XNAsbL_M-x-hEMM0,991
22
+ ocr_postprocess/extractors/label_anchor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
+ ocr_postprocess/extractors/label_anchor/line_after_label.py,sha256=gKFzsveaAerUxox-i3BvFSZWIZeeqn33N4jIRxbPECc,1879
24
+ ocr_postprocess/extractors/label_anchor/regex_after_label.py,sha256=PTzOoj8MdXcy9iHZHgbqJ0C76ZdYHeKY4taSVY5vx6s,2802
25
+ ocr_postprocess/extractors/label_anchor/text_until_next_label.py,sha256=jO7JIW7VQNSuhbCkAhDtV43dgyhk70HyCY6f_928-W4,2957
26
+ ocr_postprocess/extractors/label_anchor/value_between_labels.py,sha256=LTUyExIAuhezjy9jTBUvvRDfHppb2bXI_kjvo4Ds20Q,2284
27
+ ocr_postprocess/extractors/label_anchor/value_in_same_line.py,sha256=7aTKbNm0qoBQ86_ovYGEb6egqTHx8fpNhSDLdvcKd68,2078
28
+ ocr_postprocess/extractors/pattern/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
+ ocr_postprocess/extractors/pattern/cccd.py,sha256=g6gNwbJnVDN7gJtn3aaPGOPGzBSZPVc9WohlTrUYePU,2651
30
+ ocr_postprocess/extractors/pattern/cmnd.py,sha256=mfETtPV9xVRzhmcOrN73BySoyEkVKv8z9OfPsxaq5ok,1150
31
+ ocr_postprocess/extractors/pattern/currency_vnd.py,sha256=cFVE_6O4feOGwd961YxByvmVqg_NatBMT8wjvVlmn-8,1541
32
+ ocr_postprocess/extractors/pattern/date.py,sha256=MSsR0vpYi_ElvaeUdOsfyEKE2ulw4mwe5sjBmwqqQqo,2906
33
+ ocr_postprocess/extractors/pattern/email.py,sha256=u0qE31YsAWXhWpaHJbFQE4N1CWOsvXj_1Wnb5gxWEDY,1157
34
+ ocr_postprocess/extractors/pattern/gender_vn.py,sha256=yWYGYmjH4PUyn87tcdl5RQt8f21ChG-EngJYfZd7fP8,1383
35
+ ocr_postprocess/extractors/pattern/phone_vn.py,sha256=5quJ22uUsJySn0EBUvbdY3iuh8Pk8U1QA1E6OWn_YUY,1972
36
+ ocr_postprocess/extractors/pattern/plate_vn.py,sha256=GXWdd2qlBDT_mOQUd-ZP25WLJ5mde5mQyti8Zn1IDpo,1284
37
+ ocr_postprocess/extractors/pattern/tax_code.py,sha256=Ywd3fRT8Pjdu5CZDvoeFOArWyGQKbIpjEuXR2BmwcWw,1775
38
+ ocr_postprocess/extractors/structured/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ ocr_postprocess/extractors/structured/mrz_cccd.py,sha256=DAd-BIxW78Pqf3j3xhNVYp0Xrqzz5qQeKXxhaCC9OYc,3623
40
+ ocr_postprocess/profiles/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
+ ocr_postprocess/profiles/_generic.yml,sha256=oD73q5wnj76zhiK4CvMy8qxuhec1Sj-J-Rws8tPTs_M,161
42
+ ocr_postprocess/profiles/cccd_2024.yml,sha256=pQagCWOzOJ-pHWKlNYr131FlNcOVfacvhZJt2tUGqmg,3568
43
+ ocr_postprocess/profiles/dang_kiem.yml,sha256=QPryEckEkIohYkQeVuUsSh7Na6Z9uWOo45eCkO5HYec,2890
44
+ ocr_postprocess/profiles/loader.py,sha256=pxS1R9UEMgOovhS9BEfmDQydt64oRFafUAxgbc5y2Cg,2108
45
+ ocr_postprocess/profiles/matcher.py,sha256=VCqV72NMqITw8Y62quJut7ZxmRwC3sD_1kDo0a1TUKo,2092
46
+ ocr_postprocess/profiles/schema.py,sha256=tUnYd5BfLMG1B9v9a7Qfa-jIoz6GwJcV_daAkkUvXLA,6157
47
+ ocr_postprocess/renderer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
+ ocr_postprocess/renderer/json_renderer.py,sha256=hR2L0Brs7QN5-GCgPLNsqk2ZBjcjkeCvFE6olVEuTII,1841
49
+ ocr_postprocess/renderer/llm.py,sha256=PIog7ddLaShJDGGiDLLr_L7VITfAzGl0DyLubut0Oog,1350
50
+ ocr_postprocess/renderer/markdown.py,sha256=fklOqrk3m2c8kyAmxDJFKZkpc49M18SUttZkr1CPu6k,6866
51
+ ocr_postprocess-0.1.0.dist-info/METADATA,sha256=iVzgWPUxLNHI_hfZs2BKlB5fK5XlbsjBAKURm5u34Xw,4592
52
+ ocr_postprocess-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
53
+ ocr_postprocess-0.1.0.dist-info/entry_points.txt,sha256=TzBHQMR05LYex2OIneKnqqyoM0sDGZGb03VLhH8RAds,50
54
+ ocr_postprocess-0.1.0.dist-info/top_level.txt,sha256=EJBbBVsIAzSLT376rE4C9-wYGhR6P8OMw2117it7dz8,16
55
+ ocr_postprocess-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ ocrpp = ocr_postprocess.cli:app
@@ -0,0 +1 @@
1
+ ocr_postprocess