PyPI - ocr-postprocess - Version diff - 0.1.0.tar.gz → 0.1.2.tar.gz - Mend

ocr-postprocess 0.1.0.tar.gz → 0.1.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65)

{ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocr-postprocess
-Version: 0.1.0
+Version: 0.1.2
 Summary: Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.
 Home-page: https://github.com/ohmygodvt95/ocr-postprocess
 Author: ohmygodvt95

{ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/cli.py RENAMED Viewed

@@ -32,18 +32,22 @@ def classify(
     input_file: Optional[Path] = typer.Argument(
         None, help="Path to raw text file. Reads stdin if omitted."
     ),
-    profiles_dir: Path = typer.Option(
-        Path("profiles"), "--profiles", "-p", help="Profiles directory."
+    profiles_dir: Optional[Path] = typer.Option(
+        None, "--profiles", "-p", help="Profiles directory. Defaults to bundled profiles."
     ),
-    log_level: str = typer.Option("info", "--log-level", "-l"),
+    log_level: str = typer.Option("warning", "--log-level", "-l"),
 ) -> None:
     """Classify a document and print the best matching profile ID."""
     _setup_logging(log_level)
     from ocr_postprocess.pipeline import Pipeline
     raw = _read_input(input_file)
-    pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
-    profile_id, score = pipeline.classify(raw)
+    try:
+        pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir) if profiles_dir else None)
+        profile_id, score = pipeline.classify(raw)
+    except Exception as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        raise typer.Exit(code=1)
     typer.echo(f"{profile_id}\t{score:.4f}")
@@ -52,14 +56,14 @@ def process(
     input_file: Optional[Path] = typer.Argument(
         None, help="Path to raw text file. Reads stdin if omitted."
     ),
-    profiles_dir: Path = typer.Option(
-        Path("profiles"), "--profiles", "-p", help="Profiles directory."
+    profiles_dir: Optional[Path] = typer.Option(
+        None, "--profiles", "-p", help="Profiles directory. Defaults to bundled profiles."
     ),
     output_format: str = typer.Option(
         "json", "--format", "-f", help="Output format: json, markdown, or llm."
     ),
     debug: bool = typer.Option(False, "--debug", "-d", help="Include debug trace."),
-    log_level: str = typer.Option("info", "--log-level", "-l"),
+    log_level: str = typer.Option("warning", "--log-level", "-l"),
 ) -> None:
     """Process a document and print extracted fields."""
     _setup_logging(log_level)
@@ -69,8 +73,12 @@ def process(
     from ocr_postprocess.renderer.markdown import render_markdown
     raw = _read_input(input_file)
-    pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
-    doc = pipeline.process(raw, debug=debug)
+    try:
+        pipeline = Pipeline.from_default(profiles_dir=profiles_dir)
+        doc = pipeline.process(raw, debug=debug)
+    except Exception as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        raise typer.Exit(code=1)
     if output_format == "markdown":
         typer.echo(render_markdown(doc))
@@ -83,7 +91,7 @@ def process(
 @app.command("validate-profile")
 def validate_profile(
     profile_file: Path = typer.Argument(..., help="Path to YAML profile file."),
-    log_level: str = typer.Option("info", "--log-level", "-l"),
+    log_level: str = typer.Option("warning", "--log-level", "-l"),
 ) -> None:
     """Validate a YAML profile file and print errors if any."""
     _setup_logging(log_level)
@@ -100,10 +108,10 @@ def validate_profile(
 @app.command("dump-canonical")
 def dump_canonical(
     input_file: Optional[Path] = typer.Argument(None, help="Path to raw text file."),
-    profiles_dir: Path = typer.Option(
-        Path("profiles"), "--profiles", "-p", help="Profiles directory."
+    profiles_dir: Optional[Path] = typer.Option(
+        None, "--profiles", "-p", help="Profiles directory. Defaults to bundled profiles."
     ),
-    log_level: str = typer.Option("info", "--log-level", "-l"),
+    log_level: str = typer.Option("warning", "--log-level", "-l"),
 ) -> None:
     """Process document and dump canonical JSON to stdout."""
     _setup_logging(log_level)
@@ -111,18 +119,33 @@ def dump_canonical(
     from ocr_postprocess.renderer.json_renderer import to_json
     raw = _read_input(input_file)
-    pipeline = Pipeline.from_default(profiles_dir=str(profiles_dir))
-    doc = pipeline.process(raw)
+    try:
+        pipeline = Pipeline.from_default(profiles_dir=profiles_dir)
+        doc = pipeline.process(raw)
+    except Exception as exc:
+        typer.echo(f"Error: {exc}", err=True)
+        raise typer.Exit(code=1)
     typer.echo(to_json(doc))
 def _read_input(path: Optional[Path]) -> str:
     """Read text from file path or stdin; exit with code 1 if no input available."""
     if path is not None:
-        return path.read_text(encoding="utf-8")
+        if not path.exists():
+            typer.echo(f"Error: file not found: {path}", err=True)
+            raise typer.Exit(code=1)
+        text = path.read_text(encoding="utf-8")
+        if not text.strip():
+            typer.echo(f"Error: file is empty: {path}", err=True)
+            raise typer.Exit(code=1)
+        return text
     if not sys.stdin.isatty():
-        return sys.stdin.read()
-    typer.echo("Error: no input provided", err=True)
+        text = sys.stdin.read()
+        if not text.strip():
+            typer.echo("Error: empty input from stdin", err=True)
+            raise typer.Exit(code=1)
+        return text
+    typer.echo("Error: no input provided (pass a file path or pipe text via stdin)", err=True)
     raise typer.Exit(code=1)

{ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/engine/reconciler.py RENAMED Viewed

@@ -98,7 +98,7 @@ def reconcile_stage(ctx: PipelineContext) -> None:
                     field_label = fdef.aliases[0]
             readable_vals = ", ".join(f'"{v}"' for v in seen_vals)
             warning_msg = f"Conflict: {field_label} — nhiều giá trị khác nhau: {readable_vals}"
-            logger.warning("Conflict for field '%s': %s", key, detail)
+            logger.debug("Conflict for field '%s': %s", key, detail)
             ctx.warnings.append(warning_msg)
             merged.append(
                 best.model_copy(

{ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess/pipeline.py RENAMED Viewed

@@ -24,10 +24,10 @@ class Pipeline:
     def __init__(
         self,
         stages: list[Stage],
-        profiles_dir: str | Path = "profiles",
+        profiles_dir: str | Path | None = None,
     ) -> None:
         self._stages = stages
-        self._profiles_dir = Path(profiles_dir)
+        self._profiles_dir = Path(profiles_dir) if profiles_dir is not None else _BUNDLED_PROFILES_DIR
         self._profiles: dict = {}
     @classmethod

ocr_postprocess-0.1.2/ocr_postprocess/profiles/cccd_qr.yml ADDED Viewed

@@ -0,0 +1,89 @@
+id: cccd_qr
+version: 1
+display_name: "Căn cước công dân (quét mã QR)"
+language: ["vi"]
+classify:
+  any_of:
+    - contains_any: ["mã QR in trên CCCD", "Zalo không chịu trách nhiệm"]
+    - all_of:
+        - contains_any: ["Số CCCD", "Số CMND"]
+        - contains_any: ["Ngày cấp CCCD"]
+denoise:
+  drop_lines:
+    contains_any:
+      - "Thông tin này được lấy từ mã QR"
+      - "Zalo không chịu trách nhiệm"
+  collapse_repeats: true
+reconstruct:
+  fuzzy_threshold: 0.85
+  split_glued_labels: true
+  rejoin_wrapped_lines: true
+extract:
+  - name: tieu_de
+    constant: "Kết quả quét mã QR Code"
+  - name: so_cccd
+    aliases: ["Số CCCD"]
+    extractor: line_after_label
+    required: true
+    fuzzy_label: false
+    transform:
+      - {op: replace, from_: " ", to: ""}
+  - name: so_cmnd
+    aliases: ["Số CMND"]
+    extractor: line_after_label
+    fuzzy_label: false
+    transform:
+      - {op: replace, from_: " ", to: ""}
+  - name: ho_va_ten
+    aliases: ["Họ và tên"]
+    extractor: line_after_label
+    required: true
+    fuzzy_label: false
+  - name: ngay_sinh
+    aliases: ["Ngày sinh"]
+    extractor: regex_after_label
+    pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
+    fuzzy_label: false
+    next_lines: 1
+    required: true
+    transform:
+      - {op: to_date}
+  - name: gioi_tinh
+    aliases: ["Giới tính", "Nữ", "Nam"]
+    extractor: gender_vn
+    required: true
+  - name: noi_thuong_tru
+    aliases: ["Nơi thường trú"]
+    extractor: text_until_next_label
+    fuzzy_label: false
+    stop_labels: ["Ngày sinh", "Ngày cấp CCCD", "Giới tính"]
+    required: true
+  - name: ngay_cap
+    aliases: ["Ngày cấp CCCD", "Ngày cấp"]
+    extractor: regex_after_label
+    pattern: '(\d{1,2}[/\-.\s]\d{1,2}[/\-.\s]\d{4})'
+    fuzzy_label: false
+    next_lines: 1
+    required: true
+    transform:
+      - {op: to_date}
+output:
+  markdown:
+    title: "Kết quả quét mã QR Code"
+    sections:
+      - heading: "Thông tin cá nhân"
+        fields: [so_cccd, so_cmnd, ho_va_ten, gioi_tinh, ngay_sinh]
+      - heading: "Địa chỉ & cấp"
+        fields: [noi_thuong_tru, ngay_cap]

{ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocr-postprocess
-Version: 0.1.0
+Version: 0.1.2
 Summary: Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.
 Home-page: https://github.com/ohmygodvt95/ocr-postprocess
 Author: ohmygodvt95

{ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/ocr_postprocess.egg-info/SOURCES.txt RENAMED Viewed

@@ -52,6 +52,7 @@ ocr_postprocess/extractors/structured/mrz_cccd.py
 ocr_postprocess/profiles/__init__.py
 ocr_postprocess/profiles/_generic.yml
 ocr_postprocess/profiles/cccd_2024.yml
+ocr_postprocess/profiles/cccd_qr.yml
 ocr_postprocess/profiles/dang_kiem.yml
 ocr_postprocess/profiles/loader.py
 ocr_postprocess/profiles/matcher.py

{ocr_postprocess-0.1.0 → ocr_postprocess-0.1.2}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ with open("README.md", encoding="utf-8") as f:
 setup(
     name="ocr-postprocess",
-    version="0.1.0",
+    version="0.1.2",
     description="Biến raw OCR text thành structured document — trích xuất trường dữ liệu, xử lý nhiễu, cross-check, render JSON/Markdown.",
     long_description=long_description,
     long_description_content_type="text/markdown",