ocr-postprocess 0.1.0__tar.gz → 0.1.2__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (65)
  1. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/PKG-INFO +1 -1
  2. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/cli.py +42 -19
  3. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/engine/reconciler.py +1 -1
  4. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/pipeline.py +2 -2
  5. ocr_postprocess-0.1.2/ocr_postprocess/profiles/cccd_qr.yml +89 -0
  6. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess.egg-info/PKG-INFO +1 -1
  7. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess.egg-info/SOURCES.txt +1 -0
  8. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/setup.py +1 -1
  9. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/MANIFEST.in +0 -0
  10. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/README.md +0 -0
  11. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/__init__.py +0 -0
  12. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/classifier.py +0 -0
  13. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/engine/__init__.py +0 -0
  14. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/engine/denoiser.py +0 -0
  15. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/engine/extractor_stage.py +0 -0
  16. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/engine/normalizer.py +0 -0
  17. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/engine/reconstructor.py +0 -0
  18. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/engine/transform_stage.py +0 -0
  19. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/exceptions.py +0 -0
  20. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/__init__.py +0 -0
  21. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/base.py +0 -0
  22. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/helpers.py +0 -0
  23. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  24. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/label_anchor/line_after_label.py +0 -0
  25. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/label_anchor/regex_after_label.py +0 -0
  26. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/label_anchor/text_until_next_label.py +0 -0
  27. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/label_anchor/value_between_labels.py +0 -0
  28. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/label_anchor/value_in_same_line.py +0 -0
  29. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/__init__.py +0 -0
  30. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/cccd.py +0 -0
  31. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/cmnd.py +0 -0
  32. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/currency_vnd.py +0 -0
  33. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/date.py +0 -0
  34. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/email.py +0 -0
  35. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/gender_vn.py +0 -0
  36. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/phone_vn.py +0 -0
  37. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/plate_vn.py +0 -0
  38. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/pattern/tax_code.py +0 -0
  39. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/registry.py +0 -0
  40. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/structured/__init__.py +0 -0
  41. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/structured/mrz_cccd.py +0 -0
  42. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/extractors/universal.py +0 -0
  43. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/models.py +0 -0
  44. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/profiles/__init__.py +0 -0
  45. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/profiles/_generic.yml +0 -0
  46. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/profiles/cccd_2024.yml +0 -0
  47. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/profiles/dang_kiem.yml +0 -0
  48. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/profiles/loader.py +0 -0
  49. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/profiles/matcher.py +0 -0
  50. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/profiles/schema.py +0 -0
  51. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/py.typed +0 -0
  52. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/renderer/__init__.py +0 -0
  53. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/renderer/json_renderer.py +0 -0
  54. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/renderer/llm.py +0 -0
  55. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/renderer/markdown.py +0 -0
  56. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/scorer.py +0 -0
  57. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/transformer.py +0 -0
  58. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess.egg-info/dependency_links.txt +0 -0
  59. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess.egg-info/entry_points.txt +0 -0
  60. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess.egg-info/requires.txt +0 -0
  61. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess.egg-info/top_level.txt +0 -0
  62. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/pyproject.toml +0 -0
  63. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/requirements-dev.txt +0 -0
  64. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/requirements.txt +0 -0
  65. {ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr-postprocess
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.
5
5
  Home-page: https://github.com/ohmygodvt95/ocr-postprocess
6
6
  Author: ohmygodvt95
@@ -32,18 +32,22 @@ def classify(
32
32
  input_file: Optional[Path] = typer.Argument(
33
33
  None, help="Path to raw text file. Reads stdin if omitted."
34
34
  ),
35
- profiles_dir: Path = typer.Option(
36
- Path("profiles"), "--profiles", "-p", help="Profiles directory."
35
+ profiles_dir: Optional[Path] = typer.Option(
36
+ None, "--profiles", "-p", help="Profiles directory. Defaults to bundled profiles."
37
37
  ),
38
- log_level: str = typer.Option("info", "--log-level", "-l"),
38
+ log_level: str = typer.Option("warning", "--log-level", "-l"),
39
39
  ) -> None:
40
40
  """Classify a document and print the best matching profile ID."""
41
41
  _setup_logging(log_level)
42
42
  from ocr_postprocess.pipeline import Pipeline
43
43
 
44
44
  raw = _read_input(input_file)
45
- pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
46
- profile_id, score = pipeline.classify(raw)
45
+ try:
46
+ pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir) if profiles_dir else None)
47
+ profile_id, score = pipeline.classify(raw)
48
+ except Exception as exc:
49
+ typer.echo(f"Error: {exc}", err=True)
50
+ raise typer.Exit(code=1)
47
51
  typer.echo(f"{profile_id}\t{score:.4f}")
48
52
 
49
53
 
@@ -52,14 +56,14 @@ def process(
52
56
  input_file: Optional[Path] = typer.Argument(
53
57
  None, help="Path to raw text file. Reads stdin if omitted."
54
58
  ),
55
- profiles_dir: Path = typer.Option(
56
- Path("profiles"), "--profiles", "-p", help="Profiles directory."
59
+ profiles_dir: Optional[Path] = typer.Option(
60
+ None, "--profiles", "-p", help="Profiles directory. Defaults to bundled profiles."
57
61
  ),
58
62
  output_format: str = typer.Option(
59
63
  "json", "--format", "-f", help="Output format: json, markdown, or llm."
60
64
  ),
61
65
  debug: bool = typer.Option(False, "--debug", "-d", help="Include debug trace."),
62
- log_level: str = typer.Option("info", "--log-level", "-l"),
66
+ log_level: str = typer.Option("warning", "--log-level", "-l"),
63
67
  ) -> None:
64
68
  """Process a document and print extracted fields."""
65
69
  _setup_logging(log_level)
@@ -69,8 +73,12 @@ def process(
69
73
  from ocr_postprocess.renderer.markdown import render_markdown
70
74
 
71
75
  raw = _read_input(input_file)
72
- pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
73
- doc = pipeline.process(raw, debug=debug)
76
+ try:
77
+ pipeline = Pipeline.from_default(profiles_dir=profiles_dir)
78
+ doc = pipeline.process(raw, debug=debug)
79
+ except Exception as exc:
80
+ typer.echo(f"Error: {exc}", err=True)
81
+ raise typer.Exit(code=1)
74
82
 
75
83
  if output_format == "markdown":
76
84
  typer.echo(render_markdown(doc))
@@ -83,7 +91,7 @@ def process(
83
91
  @app.command("validate-profile")
84
92
  def validate_profile(
85
93
  profile_file: Path = typer.Argument(..., help="Path to YAML profile file."),
86
- log_level: str = typer.Option("info", "--log-level", "-l"),
94
+ log_level: str = typer.Option("warning", "--log-level", "-l"),
87
95
  ) -> None:
88
96
  """Validate a YAML profile file and print errors if any."""
89
97
  _setup_logging(log_level)
@@ -100,10 +108,10 @@ def validate_profile(
100
108
  @app.command("dump-canonical")
101
109
  def dump_canonical(
102
110
  input_file: Optional[Path] = typer.Argument(None, help="Path to raw text file."),
103
- profiles_dir: Path = typer.Option(
104
- Path("profiles"), "--profiles", "-p", help="Profiles directory."
111
+ profiles_dir: Optional[Path] = typer.Option(
112
+ None, "--profiles", "-p", help="Profiles directory. Defaults to bundled profiles."
105
113
  ),
106
- log_level: str = typer.Option("info", "--log-level", "-l"),
114
+ log_level: str = typer.Option("warning", "--log-level", "-l"),
107
115
  ) -> None:
108
116
  """Process document and dump canonical JSON to stdout."""
109
117
  _setup_logging(log_level)
@@ -111,18 +119,33 @@ def dump_canonical(
111
119
  from ocr_postprocess.renderer.json_renderer import to_json
112
120
 
113
121
  raw = _read_input(input_file)
114
- pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
115
- doc = pipeline.process(raw)
122
+ try:
123
+ pipeline = Pipeline.from_default(profiles_dir=profiles_dir)
124
+ doc = pipeline.process(raw)
125
+ except Exception as exc:
126
+ typer.echo(f"Error: {exc}", err=True)
127
+ raise typer.Exit(code=1)
116
128
  typer.echo(to_json(doc))
117
129
 
118
130
 
119
131
  def _read_input(path: Optional[Path]) -> str:
120
132
  """Read text from file path or stdin; exit with code 1 if no input available."""
121
133
  if path is not None:
122
- return path.read_text(encoding="utf-8")
134
+ if not path.exists():
135
+ typer.echo(f"Error: file not found: {path}", err=True)
136
+ raise typer.Exit(code=1)
137
+ text = path.read_text(encoding="utf-8")
138
+ if not text.strip():
139
+ typer.echo(f"Error: file is empty: {path}", err=True)
140
+ raise typer.Exit(code=1)
141
+ return text
123
142
  if not sys.stdin.isatty():
124
- return sys.stdin.read()
125
- typer.echo("Error: no input provided", err=True)
143
+ text = sys.stdin.read()
144
+ if not text.strip():
145
+ typer.echo("Error: empty input from stdin", err=True)
146
+ raise typer.Exit(code=1)
147
+ return text
148
+ typer.echo("Error: no input provided (pass a file path or pipe text via stdin)", err=True)
126
149
  raise typer.Exit(code=1)
127
150
 
128
151
 
@@ -98,7 +98,7 @@ def reconcile_stage(ctx: PipelineContext) -> None:
98
98
  field_label = fdef.aliases[0]
99
99
  readable_vals = ", ".join(f'"{v}"' for v in seen_vals)
100
100
  warning_msg = f"Conflict: {field_label} — nhiều giá trị khác nhau: {readable_vals}"
101
- logger.warning("Conflict for field '%s': %s", key, detail)
101
+ logger.debug("Conflict for field '%s': %s", key, detail)
102
102
  ctx.warnings.append(warning_msg)
103
103
  merged.append(
104
104
  best.model_copy(
@@ -24,10 +24,10 @@ class Pipeline:
24
24
  def __init__(
25
25
  self,
26
26
  stages: list[Stage],
27
- profiles_dir: str | Path = "profiles",
27
+ profiles_dir: str | Path | None = None,
28
28
  ) -> None:
29
29
  self._stages = stages
30
- self._profiles_dir = Path(profiles_dir)
30
+ self._profiles_dir = Path(profiles_dir) if profiles_dir is not None else _BUNDLED_PROFILES_DIR
31
31
  self._profiles: dict = {}
32
32
 
33
33
  @classmethod
@@ -0,0 +1,89 @@
1
+ id: cccd_qr
2
+ version: 1
3
+ display_name: "Căn cước công dân (quét mã QR)"
4
+ language: ["vi"]
5
+
6
+ classify:
7
+ any_of:
8
+ - contains_any: ["mã QR in trên CCCD", "Zalo không chịu trách nhiệm"]
9
+ - all_of:
10
+ - contains_any: ["Số CCCD", "Số CMND"]
11
+ - contains_any: ["Ngày cấp CCCD"]
12
+
13
+ denoise:
14
+ drop_lines:
15
+ contains_any:
16
+ - "Thông tin này được lấy từ mã QR"
17
+ - "Zalo không chịu trách nhiệm"
18
+ collapse_repeats: true
19
+
20
+ reconstruct:
21
+ fuzzy_threshold: 0.85
22
+ split_glued_labels: true
23
+ rejoin_wrapped_lines: true
24
+
25
+ extract:
26
+ - name: tieu_de
27
+ constant: "Kết quả quét mã QR Code"
28
+
29
+ - name: so_cccd
30
+ aliases: ["Số CCCD"]
31
+ extractor: line_after_label
32
+ required: true
33
+ fuzzy_label: false
34
+ transform:
35
+ - {op: replace, from_: " ", to: ""}
36
+
37
+ - name: so_cmnd
38
+ aliases: ["Số CMND"]
39
+ extractor: line_after_label
40
+ fuzzy_label: false
41
+ transform:
42
+ - {op: replace, from_: " ", to: ""}
43
+
44
+ - name: ho_va_ten
45
+ aliases: ["Họ và tên"]
46
+ extractor: line_after_label
47
+ required: true
48
+ fuzzy_label: false
49
+
50
+ - name: ngay_sinh
51
+ aliases: ["Ngày sinh"]
52
+ extractor: regex_after_label
53
+ pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
54
+ fuzzy_label: false
55
+ next_lines: 1
56
+ required: true
57
+ transform:
58
+ - {op: to_date}
59
+
60
+ - name: gioi_tinh
61
+ aliases: ["Giới tính", "Nữ", "Nam"]
62
+ extractor: gender_vn
63
+ required: true
64
+
65
+ - name: noi_thuong_tru
66
+ aliases: ["Nơi thường trú"]
67
+ extractor: text_until_next_label
68
+ fuzzy_label: false
69
+ stop_labels: ["Ngày sinh", "Ngày cấp CCCD", "Giới tính"]
70
+ required: true
71
+
72
+ - name: ngay_cap
73
+ aliases: ["Ngày cấp CCCD", "Ngày cấp"]
74
+ extractor: regex_after_label
75
+ pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
76
+ fuzzy_label: false
77
+ next_lines: 1
78
+ required: true
79
+ transform:
80
+ - {op: to_date}
81
+
82
+ output:
83
+ markdown:
84
+ title: "Kết quả quét mã QR Code"
85
+ sections:
86
+ - heading: "Thông tin cá nhân"
87
+ fields: [so_cccd, so_cmnd, ho_va_ten, gioi_tinh, ngay_sinh]
88
+ - heading: "Địa chỉ & cấp"
89
+ fields: [noi_thuong_tru, ngay_cap]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr-postprocess
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.
5
5
  Home-page: https://github.com/ohmygodvt95/ocr-postprocess
6
6
  Author: ohmygodvt95
@@ -52,6 +52,7 @@ ocr_postprocess/extractors/structured/mrz_cccd.py
52
52
  ocr_postprocess/profiles/__init__.py
53
53
  ocr_postprocess/profiles/_generic.yml
54
54
  ocr_postprocess/profiles/cccd_2024.yml
55
+ ocr_postprocess/profiles/cccd_qr.yml
55
56
  ocr_postprocess/profiles/dang_kiem.yml
56
57
  ocr_postprocess/profiles/loader.py
57
58
  ocr_postprocess/profiles/matcher.py
@@ -5,7 +5,7 @@ with open("README.md", encoding="utf-8") as f:
5
5
 
6
6
  setup(
7
7
  name="ocr-postprocess",
8
- version="0.1.0",
8
+ version="0.1.2",
9
9
  description="Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.",
10
10
  long_description=long_description,
11
11
  long_description_content_type="text/markdown",