ocr-postprocess 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. ocr_postprocess-0.1.0/MANIFEST.in +5 -0
  2. ocr_postprocess-0.1.0/PKG-INFO +189 -0
  3. ocr_postprocess-0.1.0/README.md +152 -0
  4. ocr_postprocess-0.1.0/ocr_postprocess/__init__.py +33 -0
  5. ocr_postprocess-0.1.0/ocr_postprocess/classifier.py +63 -0
  6. ocr_postprocess-0.1.0/ocr_postprocess/cli.py +130 -0
  7. ocr_postprocess-0.1.0/ocr_postprocess/engine/__init__.py +0 -0
  8. ocr_postprocess-0.1.0/ocr_postprocess/engine/denoiser.py +134 -0
  9. ocr_postprocess-0.1.0/ocr_postprocess/engine/extractor_stage.py +107 -0
  10. ocr_postprocess-0.1.0/ocr_postprocess/engine/normalizer.py +128 -0
  11. ocr_postprocess-0.1.0/ocr_postprocess/engine/reconciler.py +170 -0
  12. ocr_postprocess-0.1.0/ocr_postprocess/engine/reconstructor.py +469 -0
  13. ocr_postprocess-0.1.0/ocr_postprocess/engine/transform_stage.py +89 -0
  14. ocr_postprocess-0.1.0/ocr_postprocess/exceptions.py +30 -0
  15. ocr_postprocess-0.1.0/ocr_postprocess/extractors/__init__.py +0 -0
  16. ocr_postprocess-0.1.0/ocr_postprocess/extractors/base.py +103 -0
  17. ocr_postprocess-0.1.0/ocr_postprocess/extractors/helpers.py +63 -0
  18. ocr_postprocess-0.1.0/ocr_postprocess/extractors/label_anchor/__init__.py +0 -0
  19. ocr_postprocess-0.1.0/ocr_postprocess/extractors/label_anchor/line_after_label.py +53 -0
  20. ocr_postprocess-0.1.0/ocr_postprocess/extractors/label_anchor/regex_after_label.py +75 -0
  21. ocr_postprocess-0.1.0/ocr_postprocess/extractors/label_anchor/text_until_next_label.py +79 -0
  22. ocr_postprocess-0.1.0/ocr_postprocess/extractors/label_anchor/value_between_labels.py +65 -0
  23. ocr_postprocess-0.1.0/ocr_postprocess/extractors/label_anchor/value_in_same_line.py +60 -0
  24. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/__init__.py +0 -0
  25. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/cccd.py +120 -0
  26. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/cmnd.py +38 -0
  27. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/currency_vnd.py +48 -0
  28. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/date.py +89 -0
  29. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/email.py +38 -0
  30. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/gender_vn.py +48 -0
  31. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/phone_vn.py +83 -0
  32. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/plate_vn.py +39 -0
  33. ocr_postprocess-0.1.0/ocr_postprocess/extractors/pattern/tax_code.py +53 -0
  34. ocr_postprocess-0.1.0/ocr_postprocess/extractors/registry.py +45 -0
  35. ocr_postprocess-0.1.0/ocr_postprocess/extractors/structured/__init__.py +0 -0
  36. ocr_postprocess-0.1.0/ocr_postprocess/extractors/structured/mrz_cccd.py +111 -0
  37. ocr_postprocess-0.1.0/ocr_postprocess/extractors/universal.py +39 -0
  38. ocr_postprocess-0.1.0/ocr_postprocess/models.py +131 -0
  39. ocr_postprocess-0.1.0/ocr_postprocess/pipeline.py +179 -0
  40. ocr_postprocess-0.1.0/ocr_postprocess/profiles/__init__.py +0 -0
  41. ocr_postprocess-0.1.0/ocr_postprocess/profiles/_generic.yml +13 -0
  42. ocr_postprocess-0.1.0/ocr_postprocess/profiles/cccd_2024.yml +113 -0
  43. ocr_postprocess-0.1.0/ocr_postprocess/profiles/dang_kiem.yml +105 -0
  44. ocr_postprocess-0.1.0/ocr_postprocess/profiles/loader.py +63 -0
  45. ocr_postprocess-0.1.0/ocr_postprocess/profiles/matcher.py +71 -0
  46. ocr_postprocess-0.1.0/ocr_postprocess/profiles/schema.py +197 -0
  47. ocr_postprocess-0.1.0/ocr_postprocess/py.typed +0 -0
  48. ocr_postprocess-0.1.0/ocr_postprocess/renderer/__init__.py +0 -0
  49. ocr_postprocess-0.1.0/ocr_postprocess/renderer/json_renderer.py +59 -0
  50. ocr_postprocess-0.1.0/ocr_postprocess/renderer/llm.py +41 -0
  51. ocr_postprocess-0.1.0/ocr_postprocess/renderer/markdown.py +172 -0
  52. ocr_postprocess-0.1.0/ocr_postprocess/scorer.py +78 -0
  53. ocr_postprocess-0.1.0/ocr_postprocess/transformer.py +304 -0
  54. ocr_postprocess-0.1.0/ocr_postprocess.egg-info/PKG-INFO +189 -0
  55. ocr_postprocess-0.1.0/ocr_postprocess.egg-info/SOURCES.txt +62 -0
  56. ocr_postprocess-0.1.0/ocr_postprocess.egg-info/dependency_links.txt +1 -0
  57. ocr_postprocess-0.1.0/ocr_postprocess.egg-info/entry_points.txt +2 -0
  58. ocr_postprocess-0.1.0/ocr_postprocess.egg-info/requires.txt +7 -0
  59. ocr_postprocess-0.1.0/ocr_postprocess.egg-info/top_level.txt +1 -0
  60. ocr_postprocess-0.1.0/pyproject.toml +13 -0
  61. ocr_postprocess-0.1.0/requirements-dev.txt +7 -0
  62. ocr_postprocess-0.1.0/requirements.txt +7 -0
  63. ocr_postprocess-0.1.0/setup.cfg +4 -0
  64. ocr_postprocess-0.1.0/setup.py +45 -0
@@ -0,0 +1,5 @@
1
+ include README.md
2
+ include requirements.txt
3
+ include requirements-dev.txt
4
+ recursive-include ocr_postprocess/profiles *.yml
5
+ include ocr_postprocess/py.typed
@@ -0,0 +1,189 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr-postprocess
3
+ Version: 0.1.0
4
+ Summary: Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.
5
+ Home-page: https://github.com/ohmygodvt95/ocr-postprocess
6
+ Author: ohmygodvt95
7
+ License: MIT
8
+ Keywords: ocr post-processing nlp document extraction vietnamese
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Text Processing
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: pydantic>=2.5
21
+ Requires-Dist: PyYAML>=6.0
22
+ Requires-Dist: rapidfuzz>=3.5
23
+ Requires-Dist: regex>=2024.1.24
24
+ Requires-Dist: typer>=0.12
25
+ Requires-Dist: python-dateutil>=2.9
26
+ Requires-Dist: simpleeval>=0.9.13
27
+ Dynamic: author
28
+ Dynamic: classifier
29
+ Dynamic: description
30
+ Dynamic: description-content-type
31
+ Dynamic: home-page
32
+ Dynamic: keywords
33
+ Dynamic: license
34
+ Dynamic: requires-dist
35
+ Dynamic: requires-python
36
+ Dynamic: summary
37
+
38
+ # ocr-postprocess
39
+
40
+ Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, và render ra JSON/Markdown.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install ocr-postprocess
46
+ ```
47
+
48
+ Hoặc cài từ source:
49
+
50
+ ```bash
51
+ git clone https://github.com/ohmygodvt95/ocr-postprocess
52
+ cd ocr-postprocess
53
+ pip install -e .
54
+ ```
55
+
56
+ ## Library usage
57
+
58
+ ```python
59
+ from ocr_postprocess import Pipeline, ProcessedDocument, OcrPostprocessError
60
+
61
+ # Sử dụng profiles bundled sẵn (no extra files needed)
62
+ pipeline = Pipeline.from_default()
63
+
64
+ raw_text = open("scan.txt").read()
65
+
66
+ try:
67
+ doc: ProcessedDocument = pipeline.process(raw_text)
68
+ except OcrPostprocessError as exc:
69
+ print(f"Pipeline error: {exc}")
70
+ raise
71
+
72
+ # Lấy một trường
73
+ name_candidate = doc.get("ho_va_ten")
74
+ if name_candidate:
75
+ print(name_candidate.value) # "NGUYỄN VĂN A"
76
+ print(name_candidate.confidence) # 0.91
77
+
78
+ # Toàn bộ trường đã trích
79
+ fields = {c.key: c.value for c in doc.candidates}
80
+
81
+ # Export JSON
82
+ import json
83
+ print(json.dumps(doc.to_json(), ensure_ascii=False, indent=2))
84
+
85
+ # Export Markdown
86
+ print(doc.markdown)
87
+ ```
88
+
89
+ ### Custom profiles directory
90
+
91
+ ```python
92
+ # Dùng thư mục profiles riêng
93
+ pipeline = Pipeline.from_default(profiles_dir="my_profiles/")
94
+ ```
95
+
96
+ ### Classify only
97
+
98
+ ```python
99
+ profile_id, score = pipeline.classify(raw_text)
100
+ # "cccd_2024", 0.97
101
+ ```
102
+
103
+ ### ProcessedDocument fields
104
+
105
+ | Field | Type | Mô tả |
106
+ |---|---|---|
107
+ | `profile_id` | `str` | Profile được match |
108
+ | `profile_score` | `float` | Điểm classify (0–1) |
109
+ | `candidates` | `list[Candidate]` | Tất cả trường đã trích |
110
+ | `overall_confidence` | `float` | Điểm tin cậy tổng hợp |
111
+ | `warnings` | `list[str]` | Cảnh báo từ pipeline |
112
+ | `markdown` | `str` | Kết quả render Markdown |
113
+ | `cross_checks` | `list[CrossCheck]` | Kết quả cross-check |
114
+
115
+ ## CLI
116
+
117
+ ```bash
118
+ # Process a document
119
+ ocrpp process scan.txt
120
+
121
+ # Markdown output
122
+ ocrpp process scan.txt --format markdown
123
+
124
+ # Classify only
125
+ ocrpp classify scan.txt
126
+
127
+ # Validate a profile
128
+ ocrpp validate-profile profiles/my_profile.yml
129
+
130
+ # Custom profiles directory
131
+ ocrpp process scan.txt --profiles ./my_profiles/
132
+ ```
133
+
134
+ ## Adding a custom profile
135
+
136
+ Tạo file YAML trong thư mục profiles của bạn:
137
+
138
+ ```yaml
139
+ id: my_doc
140
+ version: 1
141
+ display_name: "My document type"
142
+
143
+ classify:
144
+ any_of:
145
+ - contains_any: ["MY DOCUMENT HEADER"]
146
+
147
+ extract:
148
+ - name: document_number
149
+ aliases: ["Document No", "Số chứng từ"]
150
+ extractor: value_in_same_line
151
+ required: true
152
+ ```
153
+
154
+ Sau đó:
155
+
156
+ ```python
157
+ pipeline = Pipeline.from_default(profiles_dir="my_profiles/")
158
+ ```
159
+
160
+ ## Exceptions
161
+
162
+ ```python
163
+ from ocr_postprocess import (
164
+ OcrPostprocessError, # base
165
+ ProfileNotFoundError,
166
+ ProfileValidationError,
167
+ ExtractorNotFoundError,
168
+ TransformError,
169
+ )
170
+ ```
171
+
172
+ ## Development
173
+
174
+ ```bash
175
+ python -m venv .venv && source .venv/bin/activate
176
+ pip install -r requirements-dev.txt
177
+ pip install -e .
178
+
179
+ pytest # all tests
180
+ pytest tests/unit # unit only
181
+ pytest -m golden # golden/regression
182
+ pytest -n auto --cov # parallel + coverage
183
+ ruff check . && black --check .
184
+ ```
185
+
186
+ ## Docs
187
+
188
+ Xem [docs/README.md](docs/README.md) để biết chi tiết về pipeline stages và profile schema.
189
+
@@ -0,0 +1,152 @@
1
+ # ocr-postprocess
2
+
3
+ Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, và render ra JSON/Markdown.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install ocr-postprocess
9
+ ```
10
+
11
+ Hoặc cài từ source:
12
+
13
+ ```bash
14
+ git clone https://github.com/ohmygodvt95/ocr-postprocess
15
+ cd ocr-postprocess
16
+ pip install -e .
17
+ ```
18
+
19
+ ## Library usage
20
+
21
+ ```python
22
+ from ocr_postprocess import Pipeline, ProcessedDocument, OcrPostprocessError
23
+
24
+ # Sử dụng profiles bundled sẵn (no extra files needed)
25
+ pipeline = Pipeline.from_default()
26
+
27
+ raw_text = open("scan.txt").read()
28
+
29
+ try:
30
+ doc: ProcessedDocument = pipeline.process(raw_text)
31
+ except OcrPostprocessError as exc:
32
+ print(f"Pipeline error: {exc}")
33
+ raise
34
+
35
+ # Lấy một trường
36
+ name_candidate = doc.get("ho_va_ten")
37
+ if name_candidate:
38
+ print(name_candidate.value) # "NGUYỄN VĂN A"
39
+ print(name_candidate.confidence) # 0.91
40
+
41
+ # Toàn bộ trường đã trích
42
+ fields = {c.key: c.value for c in doc.candidates}
43
+
44
+ # Export JSON
45
+ import json
46
+ print(json.dumps(doc.to_json(), ensure_ascii=False, indent=2))
47
+
48
+ # Export Markdown
49
+ print(doc.markdown)
50
+ ```
51
+
52
+ ### Custom profiles directory
53
+
54
+ ```python
55
+ # Dùng thư mục profiles riêng
56
+ pipeline = Pipeline.from_default(profiles_dir="my_profiles/")
57
+ ```
58
+
59
+ ### Classify only
60
+
61
+ ```python
62
+ profile_id, score = pipeline.classify(raw_text)
63
+ # "cccd_2024", 0.97
64
+ ```
65
+
66
+ ### ProcessedDocument fields
67
+
68
+ | Field | Type | Mô tả |
69
+ |---|---|---|
70
+ | `profile_id` | `str` | Profile được match |
71
+ | `profile_score` | `float` | Điểm classify (0–1) |
72
+ | `candidates` | `list[Candidate]` | Tất cả trường đã trích |
73
+ | `overall_confidence` | `float` | Điểm tin cậy tổng hợp |
74
+ | `warnings` | `list[str]` | Cảnh báo từ pipeline |
75
+ | `markdown` | `str` | Kết quả render Markdown |
76
+ | `cross_checks` | `list[CrossCheck]` | Kết quả cross-check |
77
+
78
+ ## CLI
79
+
80
+ ```bash
81
+ # Process a document
82
+ ocrpp process scan.txt
83
+
84
+ # Markdown output
85
+ ocrpp process scan.txt --format markdown
86
+
87
+ # Classify only
88
+ ocrpp classify scan.txt
89
+
90
+ # Validate a profile
91
+ ocrpp validate-profile profiles/my_profile.yml
92
+
93
+ # Custom profiles directory
94
+ ocrpp process scan.txt --profiles ./my_profiles/
95
+ ```
96
+
97
+ ## Adding a custom profile
98
+
99
+ Tạo file YAML trong thư mục profiles của bạn:
100
+
101
+ ```yaml
102
+ id: my_doc
103
+ version: 1
104
+ display_name: "My document type"
105
+
106
+ classify:
107
+ any_of:
108
+ - contains_any: ["MY DOCUMENT HEADER"]
109
+
110
+ extract:
111
+ - name: document_number
112
+ aliases: ["Document No", "Số chứng từ"]
113
+ extractor: value_in_same_line
114
+ required: true
115
+ ```
116
+
117
+ Sau đó:
118
+
119
+ ```python
120
+ pipeline = Pipeline.from_default(profiles_dir="my_profiles/")
121
+ ```
122
+
123
+ ## Exceptions
124
+
125
+ ```python
126
+ from ocr_postprocess import (
127
+ OcrPostprocessError, # base
128
+ ProfileNotFoundError,
129
+ ProfileValidationError,
130
+ ExtractorNotFoundError,
131
+ TransformError,
132
+ )
133
+ ```
134
+
135
+ ## Development
136
+
137
+ ```bash
138
+ python -m venv .venv && source .venv/bin/activate
139
+ pip install -r requirements-dev.txt
140
+ pip install -e .
141
+
142
+ pytest # all tests
143
+ pytest tests/unit # unit only
144
+ pytest -m golden # golden/regression
145
+ pytest -n auto --cov # parallel + coverage
146
+ ruff check . && black --check .
147
+ ```
148
+
149
+ ## Docs
150
+
151
+ Xem [docs/README.md](docs/README.md) để biết chi tiết về pipeline stages và profile schema.
152
+
@@ -0,0 +1,33 @@
1
+ """ocr_postprocess — OCR post-processing pipeline."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from ocr_postprocess.exceptions import (
6
+ CyclicComputeError,
7
+ ExtractorNotFoundError,
8
+ OcrPostprocessError,
9
+ ProfileNotFoundError,
10
+ ProfileValidationError,
11
+ TransformError,
12
+ )
13
+ from ocr_postprocess.models import PipelineContext, ProcessedDocument
14
+ from ocr_postprocess.pipeline import Pipeline
15
+ from ocr_postprocess.renderer.llm import render_llm_markdown
16
+
17
+ __all__ = [
18
+ # Core
19
+ "Pipeline",
20
+ "ProcessedDocument",
21
+ "PipelineContext",
22
+ # Renderers
23
+ "render_llm_markdown",
24
+ # Exceptions — import these to catch errors without knowing internal paths
25
+ "OcrPostprocessError",
26
+ "ProfileNotFoundError",
27
+ "ProfileValidationError",
28
+ "ExtractorNotFoundError",
29
+ "TransformError",
30
+ "CyclicComputeError",
31
+ # Version
32
+ "__version__",
33
+ ]
@@ -0,0 +1,63 @@
1
+ """Stage 2 — Classifier: select best matching DocumentProfile."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ocr_postprocess.models import PipelineContext
8
+ from ocr_postprocess.profiles.matcher import evaluate
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ def classify_stage(ctx: PipelineContext) -> None:
14
+ """Pipeline stage 2: classify document and load matching profile."""
15
+ profiles: dict = ctx.__dict__.get("_profiles", {})
16
+
17
+ if not profiles:
18
+ logger.warning("No profiles loaded; using generic fallback")
19
+ ctx.classification_score = 0.0
20
+ return
21
+
22
+ text = ctx.normalized_text or ctx.raw_text
23
+ best_id: str | None = None
24
+ best_score = 0.0
25
+ second_score = 0.0
26
+
27
+ scores: list[tuple[float, str]] = []
28
+ for pid, profile in profiles.items():
29
+ if pid.startswith("_"):
30
+ continue # skip generic fallback in main scoring
31
+ try:
32
+ score = evaluate(profile.classify, text)
33
+ except Exception:
34
+ logger.exception("Error evaluating classify for profile '%s'", pid)
35
+ score = 0.0
36
+ scores.append((score, pid))
37
+
38
+ scores.sort(reverse=True)
39
+
40
+ if scores:
41
+ best_score, best_id = scores[0]
42
+ second_score = scores[1][0] if len(scores) > 1 else 0.0
43
+
44
+ if best_score < 0.5:
45
+ # Fall back to _generic
46
+ best_id = "_generic"
47
+ best_score = 0.0
48
+ logger.info("Classification score too low; using _generic")
49
+ elif len(scores) > 1 and best_score - second_score < 0.1:
50
+ logger.warning(
51
+ "Ambiguous classification: '%s'=%.2f vs '%s'=%.2f",
52
+ best_id,
53
+ best_score,
54
+ scores[1][1],
55
+ second_score,
56
+ )
57
+ else:
58
+ best_id = "_generic"
59
+
60
+ profile_obj = profiles.get(best_id)
61
+ ctx.profile = profile_obj
62
+ ctx.classification_score = best_score
63
+ logger.info("Classified as '%s' (score=%.3f)", best_id, best_score)
@@ -0,0 +1,130 @@
1
+ """Typer CLI for ocr_postprocess."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import typer
11
+
12
+ app = typer.Typer(name="ocrpp", help="OCR post-processing pipeline CLI.", add_completion=False)
13
+
14
+ _LOG_LEVELS = {
15
+ "debug": logging.DEBUG,
16
+ "info": logging.INFO,
17
+ "warning": logging.WARNING,
18
+ "error": logging.ERROR,
19
+ }
20
+
21
+
22
+ def _setup_logging(level: str) -> None:
23
+ """Configure root logger with given level string."""
24
+ logging.basicConfig(
25
+ level=_LOG_LEVELS.get(level.lower(), logging.INFO),
26
+ format="%(levelname)s %(name)s: %(message)s",
27
+ )
28
+
29
+
30
@app.command()
def classify(
    input_file: Optional[Path] = typer.Argument(
        None, help="Path to raw text file. Reads stdin if omitted."
    ),
    profiles_dir: Path = typer.Option(
        Path("profiles"), "--profiles", "-p", help="Profiles directory."
    ),
    log_level: str = typer.Option("info", "--log-level", "-l"),
) -> None:
    """Classify a document and print the best matching profile ID."""
    _setup_logging(log_level)
    # Imported lazily so `ocrpp --help` stays fast.
    from ocr_postprocess.pipeline import Pipeline

    text = _read_input(input_file)
    pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
    best_id, best_score = pipeline.classify(text)
    typer.echo(f"{best_id}\t{best_score:.4f}")
48
+
49
+
50
@app.command()
def process(
    input_file: Optional[Path] = typer.Argument(
        None, help="Path to raw text file. Reads stdin if omitted."
    ),
    profiles_dir: Path = typer.Option(
        Path("profiles"), "--profiles", "-p", help="Profiles directory."
    ),
    output_format: str = typer.Option(
        "json", "--format", "-f", help="Output format: json, markdown, or llm."
    ),
    debug: bool = typer.Option(False, "--debug", "-d", help="Include debug trace."),
    log_level: str = typer.Option("info", "--log-level", "-l"),
) -> None:
    """Process a document and print extracted fields."""
    _setup_logging(log_level)
    # Imported lazily so `ocrpp --help` stays fast.
    from ocr_postprocess.pipeline import Pipeline
    from ocr_postprocess.renderer.json_renderer import to_json
    from ocr_postprocess.renderer.llm import render_llm_markdown
    from ocr_postprocess.renderer.markdown import render_markdown

    text = _read_input(input_file)
    pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
    doc = pipeline.process(text, debug=debug)

    # Dispatch on the requested format; anything unrecognized renders as JSON.
    renderers = {"markdown": render_markdown, "llm": render_llm_markdown}
    render = renderers.get(output_format, to_json)
    typer.echo(render(doc))
81
+
82
+
83
+ @app.command("validate-profile")
84
+ def validate_profile(
85
+ profile_file: Path = typer.Argument(..., help="Path to YAML profile file."),
86
+ log_level: str = typer.Option("info", "--log-level", "-l"),
87
+ ) -> None:
88
+ """Validate a YAML profile file and print errors if any."""
89
+ _setup_logging(log_level)
90
+ from ocr_postprocess.profiles.loader import load_profile
91
+
92
+ try:
93
+ profile = load_profile(profile_file)
94
+ typer.echo(f"OK: {profile.id} (v{profile.version})")
95
+ except Exception as exc:
96
+ typer.echo(f"ERROR: {exc}", err=True)
97
+ raise typer.Exit(code=1)
98
+
99
+
100
+ @app.command("dump-canonical")
101
+ def dump_canonical(
102
+ input_file: Optional[Path] = typer.Argument(None, help="Path to raw text file."),
103
+ profiles_dir: Path = typer.Option(
104
+ Path("profiles"), "--profiles", "-p", help="Profiles directory."
105
+ ),
106
+ log_level: str = typer.Option("info", "--log-level", "-l"),
107
+ ) -> None:
108
+ """Process document and dump canonical JSON to stdout."""
109
+ _setup_logging(log_level)
110
+ from ocr_postprocess.pipeline import Pipeline
111
+ from ocr_postprocess.renderer.json_renderer import to_json
112
+
113
+ raw = _read_input(input_file)
114
+ pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
115
+ doc = pipeline.process(raw)
116
+ typer.echo(to_json(doc))
117
+
118
+
119
+ def _read_input(path: Optional[Path]) -> str:
120
+ """Read text from file path or stdin; exit with code 1 if no input available."""
121
+ if path is not None:
122
+ return path.read_text(encoding="utf-8")
123
+ if not sys.stdin.isatty():
124
+ return sys.stdin.read()
125
+ typer.echo("Error: no input provided", err=True)
126
+ raise typer.Exit(code=1)
127
+
128
+
129
+ if __name__ == "__main__":
130
+ app()
@@ -0,0 +1,134 @@
1
+ """Stage 3 — Denoiser: remove boilerplate lines and patterns."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ import regex as re
8
+
9
+ from ocr_postprocess.models import PipelineContext
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Built-in rules (always applied regardless of profile config)
14
+ _BUILTIN_DROP_PATTERNS = [
15
+ re.compile(
16
+ r"^[\W_]{3,}$"
17
+ ), # lines consisting only of non-alphanumeric chars (e.g. "---", "***")
18
+ re.compile(r"^\s{0,2}\S\s{0,2}$"), # single character with optional surrounding spaces
19
+ ]
20
+
21
+
22
+ def _is_builtin_noise(line: str) -> bool:
23
+ """Return True if line matches built-in noise rules."""
24
+ stripped = line.strip()
25
+ if len(stripped) < 3:
26
+ return True
27
+ for pat in _BUILTIN_DROP_PATTERNS:
28
+ if pat.fullmatch(stripped):
29
+ return True
30
+ return False
31
+
32
+
33
+ def denoise(
34
+ text: str,
35
+ drop_line_patterns: list[str] | None = None,
36
+ drop_inline_patterns: list[str] | None = None,
37
+ mask_patterns: list[dict[str, str]] | None = None,
38
+ collapse_repeats: bool = False,
39
+ protected_substrings: set[str] | None = None,
40
+ ) -> str:
41
+ """Remove noise from text, preserving line count (empty string for dropped lines)."""
42
+ compiled_drop = [re.compile(p) for p in (drop_line_patterns or [])]
43
+ compiled_inline = [re.compile(p) for p in (drop_inline_patterns or [])]
44
+ compiled_mask = [
45
+ (re.compile(m["pattern"]), m.get("replacement", "")) for m in (mask_patterns or [])
46
+ ]
47
+
48
+ lines = text.split("\n")
49
+ prev_line: str | None = None
50
+ result: list[str] = []
51
+
52
+ for line in lines:
53
+ # 1. Consecutive duplicate
54
+ if collapse_repeats and line == prev_line and line.strip():
55
+ result.append("")
56
+ continue
57
+
58
+ # 2. Built-in noise (skip if line contains protected label)
59
+ stripped = line.strip()
60
+ if _is_builtin_noise(stripped):
61
+ if protected_substrings and any(p in line for p in protected_substrings):
62
+ pass # keep
63
+ else:
64
+ result.append("")
65
+ prev_line = line
66
+ continue
67
+
68
+ # 3. Profile drop_lines rules
69
+ dropped = False
70
+ for pat in compiled_drop:
71
+ if pat.search(stripped):
72
+ if protected_substrings and any(p in line for p in protected_substrings):
73
+ break
74
+ dropped = True
75
+ break
76
+ if dropped:
77
+ result.append("")
78
+ prev_line = line
79
+ continue
80
+
81
+ # 4. Inline drop patterns
82
+ for pat in compiled_inline:
83
+ line = pat.sub("", line)
84
+
85
+ # 5. Mask patterns
86
+ for pat, repl in compiled_mask:
87
+ line = pat.sub(repl, line)
88
+
89
+ result.append(line)
90
+ prev_line = line
91
+
92
+ return "\n".join(result)
93
+
94
+
95
def denoise_stage(ctx: PipelineContext) -> None:
    """Pipeline stage 3: denoise text.

    Builds the drop/mask rule set from the matched profile's noise config,
    collects field aliases as protected substrings, and rewrites
    ``ctx.normalized_text`` with the denoised result.
    """
    profile = ctx.profile
    noise = profile.noise if profile else None

    # Field labels must never be removed as noise — collect them as protected.
    protected: set[str] = set()
    if profile:
        for field in profile.fields:
            protected.update(field.aliases)

    # Merge explicit drop regexes with literal contains_any phrases (escaped).
    line_patterns: list[str] = []
    if noise and noise.drop_lines:
        rule = noise.drop_lines
        line_patterns.extend(rule.regex)
        line_patterns.extend(re.escape(phrase) for phrase in rule.contains_any)

    text = ctx.normalized_text or ctx.raw_text
    non_empty_before = sum(1 for ln in text.splitlines() if ln.strip())
    ctx.normalized_text = denoise(
        text,
        drop_line_patterns=line_patterns,
        drop_inline_patterns=list(noise.drop_patterns) if noise else [],
        mask_patterns=list(noise.mask_patterns) if noise else [],
        collapse_repeats=noise.collapse_repeats if noise else False,
        protected_substrings=protected,
    )
    non_empty_after = sum(1 for ln in ctx.normalized_text.splitlines() if ln.strip())
    logger.debug(
        "Denoiser: %d non-empty lines → %d (dropped %d, %d drop-patterns, %d mask-patterns)",
        non_empty_before,
        non_empty_after,
        non_empty_before - non_empty_after,
        len(line_patterns),
        len(noise.mask_patterns) if noise else 0,
    )