docmirror-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. docmirror/__init__.py +64 -0
  2. docmirror/__main__.py +254 -0
  3. docmirror/adapters/__init__.py +47 -0
  4. docmirror/adapters/data/__init__.py +6 -0
  5. docmirror/adapters/data/structured.py +80 -0
  6. docmirror/adapters/image/__init__.py +6 -0
  7. docmirror/adapters/image/image.py +134 -0
  8. docmirror/adapters/office/__init__.py +6 -0
  9. docmirror/adapters/office/excel.py +113 -0
  10. docmirror/adapters/office/omml_extractor.py +111 -0
  11. docmirror/adapters/office/ppt.py +107 -0
  12. docmirror/adapters/office/word.py +157 -0
  13. docmirror/adapters/pdf/__init__.py +6 -0
  14. docmirror/adapters/pdf/pdf.py +126 -0
  15. docmirror/adapters/web/__init__.py +6 -0
  16. docmirror/adapters/web/email.py +115 -0
  17. docmirror/adapters/web/web.py +113 -0
  18. docmirror/configs/__init__.py +18 -0
  19. docmirror/configs/column_aliases.yaml +178 -0
  20. docmirror/configs/domain_registry.py +206 -0
  21. docmirror/configs/hints.yaml +99 -0
  22. docmirror/configs/institution_registry.yaml +164 -0
  23. docmirror/configs/key_synonyms.yaml +95 -0
  24. docmirror/configs/pipeline_registry.py +108 -0
  25. docmirror/configs/settings.py +229 -0
  26. docmirror/core/__init__.py +14 -0
  27. docmirror/core/exceptions.py +131 -0
  28. docmirror/core/extraction/__init__.py +16 -0
  29. docmirror/core/extraction/entity_collector.py +31 -0
  30. docmirror/core/extraction/extractor.py +2002 -0
  31. docmirror/core/extraction/foundation.py +126 -0
  32. docmirror/core/extraction/html_utils.py +57 -0
  33. docmirror/core/extraction/image_converter.py +48 -0
  34. docmirror/core/extraction/pre_analyzer.py +618 -0
  35. docmirror/core/extraction/quality_router.py +228 -0
  36. docmirror/core/extraction/table_postprocessor.py +97 -0
  37. docmirror/core/factory.py +57 -0
  38. docmirror/core/layout/__init__.py +7 -0
  39. docmirror/core/layout/graph_router.py +421 -0
  40. docmirror/core/layout/layout_analysis.py +1437 -0
  41. docmirror/core/layout/layout_model.py +197 -0
  42. docmirror/core/layout/spatial_graph.py +304 -0
  43. docmirror/core/ocr/__init__.py +8 -0
  44. docmirror/core/ocr/aistudio_provider.py +146 -0
  45. docmirror/core/ocr/fallback.py +1791 -0
  46. docmirror/core/ocr/formula_chars.py +261 -0
  47. docmirror/core/ocr/formula_engine.py +350 -0
  48. docmirror/core/ocr/image_preprocessing.py +369 -0
  49. docmirror/core/ocr/ocr_postprocess.py +367 -0
  50. docmirror/core/ocr/table_reconstruction.py +335 -0
  51. docmirror/core/ocr/vision/__init__.py +7 -0
  52. docmirror/core/ocr/vision/rapidocr_engine.py +340 -0
  53. docmirror/core/ocr/vision/seal_detector.py +252 -0
  54. docmirror/core/security/__init__.py +6 -0
  55. docmirror/core/security/forgery_detector.py +184 -0
  56. docmirror/core/table/__init__.py +8 -0
  57. docmirror/core/table/extraction/__init__.py +60 -0
  58. docmirror/core/table/extraction/char_strategy.py +835 -0
  59. docmirror/core/table/extraction/classifier.py +225 -0
  60. docmirror/core/table/extraction/engine.py +856 -0
  61. docmirror/core/table/extraction/grid_tensor.py +94 -0
  62. docmirror/core/table/extraction/pdfplumber_strategy.py +170 -0
  63. docmirror/core/table/extraction/pipe_strategy.py +234 -0
  64. docmirror/core/table/extraction/rapid_table_engine.py +97 -0
  65. docmirror/core/table/extraction/signal_processor.py +413 -0
  66. docmirror/core/table/extraction/template_injector.py +184 -0
  67. docmirror/core/table/extraction/utils.py +231 -0
  68. docmirror/core/table/merger.py +181 -0
  69. docmirror/core/table/page_state.py +109 -0
  70. docmirror/core/table/postprocess.py +744 -0
  71. docmirror/core/table/table_structure_fix.py +697 -0
  72. docmirror/core/utils/__init__.py +8 -0
  73. docmirror/core/utils/text_utils.py +160 -0
  74. docmirror/core/utils/vocabulary.py +379 -0
  75. docmirror/core/utils/watermark.py +238 -0
  76. docmirror/framework/__init__.py +25 -0
  77. docmirror/framework/base.py +350 -0
  78. docmirror/framework/cache.py +139 -0
  79. docmirror/framework/dispatcher.py +351 -0
  80. docmirror/framework/orchestrator.py +221 -0
  81. docmirror/middlewares/__init__.py +25 -0
  82. docmirror/middlewares/alignment/__init__.py +15 -0
  83. docmirror/middlewares/alignment/amount_splitter.py +179 -0
  84. docmirror/middlewares/alignment/header_alignment.py +209 -0
  85. docmirror/middlewares/base.py +346 -0
  86. docmirror/middlewares/detection/__init__.py +13 -0
  87. docmirror/middlewares/detection/institution_detector.py +169 -0
  88. docmirror/middlewares/detection/language_detector.py +57 -0
  89. docmirror/middlewares/detection/scene_detector.py +308 -0
  90. docmirror/middlewares/extraction/__init__.py +12 -0
  91. docmirror/middlewares/extraction/entity_extractor.py +226 -0
  92. docmirror/middlewares/extraction/generic_entity_extractor.py +44 -0
  93. docmirror/middlewares/validation/__init__.py +12 -0
  94. docmirror/middlewares/validation/mutation_analyzer.py +234 -0
  95. docmirror/middlewares/validation/validator.py +488 -0
  96. docmirror/models/__init__.py +25 -0
  97. docmirror/models/construction/__init__.py +11 -0
  98. docmirror/models/construction/_shared.py +46 -0
  99. docmirror/models/construction/builder.py +341 -0
  100. docmirror/models/entities/__init__.py +18 -0
  101. docmirror/models/entities/document_types.py +126 -0
  102. docmirror/models/entities/domain.py +149 -0
  103. docmirror/models/entities/domain_models.py +214 -0
  104. docmirror/models/entities/enhanced.py +271 -0
  105. docmirror/models/entities/perception_result.py +382 -0
  106. docmirror/models/errors.py +103 -0
  107. docmirror/models/tracking/__init__.py +11 -0
  108. docmirror/models/tracking/mutation.py +93 -0
  109. docmirror/plugins/__init__.py +210 -0
  110. docmirror/plugins/bank_statement.py +104 -0
  111. docmirror/py.typed +0 -0
  112. docmirror/server/__init__.py +6 -0
  113. docmirror/server/api.py +141 -0
  114. docmirror/server/schemas.py +44 -0
  115. docmirror-0.2.0.dist-info/METADATA +202 -0
  116. docmirror-0.2.0.dist-info/RECORD +120 -0
  117. docmirror-0.2.0.dist-info/WHEEL +4 -0
  118. docmirror-0.2.0.dist-info/entry_points.txt +2 -0
  119. docmirror-0.2.0.dist-info/licenses/AUTHORS.md +13 -0
  120. docmirror-0.2.0.dist-info/licenses/LICENSE +201 -0
docmirror/__init__.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ DocMirror: Universal Document Parsing Engine
9
+
10
+ Directory structure:
11
+ - core/: Core extraction engines (CoreExtractor, LayoutAnalysis, TableExtraction)
12
+ - models/: Data models (BaseResult, EnhancedResult, PerceptionResult)
13
+ - middlewares/: Middleware pipeline (SceneDetector, EntityExtractor, Validator, ...)
14
+ - configs/: Configuration (settings, pipeline_registry, institution_registry)
15
+ - framework/: Pipeline orchestration (dispatcher, orchestrator, cache)
16
+ - adapters/: Format adapters (PDF, Image, Office, Email, Web)
17
+ - plugins/: Domain plugins (bank_statement, ...)
18
+
19
+ Single public entry point: perceive_document()
20
+ """
21
+
22
+ __version__ = "0.2.0"
23
+ __author__ = "Adam Lin <adamlin@valuemapglobal.com>"
24
+ __copyright__ = "Copyright 2026, ValueMap Global"
25
+ __license__ = "Apache 2.0"
26
+
27
+ import logging
28
+ import sys
29
+
30
+ # Configure root logger with millisecond precision, process/thread IDs, and source context
31
+ logging.basicConfig(
32
+ format="%(asctime)s.%(msecs)03d - [%(levelname)s] [%(process)d:%(threadName)s] %(name)s:%(lineno)d - %(message)s",
33
+ datefmt="%Y-%m-%d %H:%M:%S",
34
+ level=logging.INFO,
35
+ stream=sys.stdout,
36
+ )
37
+
38
+ from docmirror.core.factory import perceive_document, PerceptionFactory
39
+ from docmirror.models.entities.document_types import DocumentType
40
+ from docmirror.models.entities.perception_result import PerceptionResult
41
+ from docmirror.models.entities.domain_models import DomainData
42
+ from docmirror.framework.dispatcher import ParserDispatcher
43
+ from docmirror.framework.dispatcher import ParserDispatcher as DocumentProcessingOrchestrator # compat
44
+ from docmirror.framework.base import ParserOutput
45
+ from docmirror.framework.orchestrator import Orchestrator
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ # backward-compat alias — callers importing PerceptionResponse get ParserOutput
50
+ PerceptionResponse = ParserOutput
51
+
52
+
53
+ __all__ = [
54
+ "perceive_document",
55
+ "PerceptionFactory",
56
+ "PerceptionResult",
57
+ "PerceptionResponse",
58
+ "DocumentType",
59
+ "DomainData",
60
+ "ParserDispatcher",
61
+ "DocumentProcessingOrchestrator",
62
+ "ParserOutput",
63
+ "Orchestrator",
64
+ ]
docmirror/__main__.py ADDED
@@ -0,0 +1,254 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """CLI entry point for DocMirror document parsing engine.
8
+
9
+ Provides single-file and batch-directory parsing with rich progress
10
+ display, multiple output formats, and result persistence.
11
+ """
12
+ from __future__ import annotations
13
+ import asyncio
14
+ import argparse
15
+ import json
16
+ from pathlib import Path
17
+ from rich.console import Console
18
+ from rich.panel import Panel
19
+ from rich.progress import Progress, SpinnerColumn, TextColumn
20
+ from rich.table import Table
21
+
22
+ console = Console()
23
+
24
+ # Default output directory (relative to cwd)
25
+ DEFAULT_OUTPUT_DIR = Path("output")
26
+
27
+
28
+ def _safe_str(s: str) -> str:
29
+ """Encode/decode to replace surrogates so console.print() never raises UnicodeEncodeError."""
30
+ if not isinstance(s, str):
31
+ s = str(s)
32
+ return s.encode("utf-8", errors="replace").decode("utf-8")
33
+
34
+
35
# Skip these when discovering files in a directory
SKIP_NAMES = {".DS_Store", ".gitkeep", "Thumbs.db"}


def discover_files(root: Path) -> list[Path]:
    """Recursively collect all regular files under *root*.

    Traversal is deterministic (lexicographically sorted paths) and
    housekeeping files listed in SKIP_NAMES are excluded.
    """
    return [
        entry
        for entry in sorted(root.rglob("*"))
        if entry.is_file() and entry.name not in SKIP_NAMES
    ]
46
+
47
+
48
+ BANNER = r"""[cyan]
49
+ ____ __ __ _
50
+ | _ \ ___ ___| \/ (_)_ __ _ __ ___ _ __
51
+ | | | |/ _ \ / __| |\/| | | '__| '__/ _ \| '__|
52
+ | |_| | (_) | (__| | | | | | | | | (_) | |
53
+ |____/ \___/ \___|_| |_|_|_| |_| \___/|_|
54
+ [/cyan]
55
+ [bold white]Universal Document Parsing Engine[/bold white]
56
+ [yellow]Support us with a ⭐ on GitHub: https://github.com/valuemapglobal/docmirror[/yellow]
57
+ """
58
+
59
def print_banner() -> None:
    """Render the ASCII-art BANNER inside a cyan rich Panel on the module console."""
    console.print(Panel(BANNER, border_style="cyan", padding=(1, 2)))
61
+
62
def show_authors() -> None:
    """Print the contributors panel followed by a call-for-contributions line."""
    console.print(Panel("[bold cyan]Made with \u2764\ufe0f by[/bold cyan]\n[white]Adam Lin[/white]", title="Authors", border_style="cyan"))
    console.print("\n[yellow]Want your name here? Contribute to DocMirror at: https://github.com/valuemapglobal/docmirror[/yellow]\n")
65
+
66
+
67
def save_result(result_dict: dict, source_path: Path, output_dir: Path) -> Path:
    """Persist *result_dict* as pretty-printed JSON under *output_dir*.

    The output file is named after the source document's stem. If that
    name is taken, numeric suffixes (``_1``, ``_2``, ...) are tried until
    a free name is found, so existing results are never overwritten.

    Returns the path of the file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = source_path.stem
    candidate = output_dir / f"{stem}.json"

    # Avoid overwriting: walk _1, _2, ... until an unused name appears.
    suffix = 0
    while candidate.exists():
        suffix += 1
        candidate = output_dir / f"{stem}_{suffix}.json"

    payload = json.dumps(result_dict, ensure_ascii=False, indent=2)
    candidate.write_text(payload, encoding="utf-8")
    return candidate
80
+
81
+
82
async def parse_document(file_path: str, format_out: str, output_dir: Path, no_save: bool, skip_cache: bool = False) -> None:
    """Parse a single document and render results to the console.

    Runs perceive_document() while a simulated stage-progress animation
    plays, then prints a summary table (success) or failure guidance, and
    optionally saves the API dict as JSON via save_result().

    Args:
        file_path: Path to the document to parse (must be an existing file).
        format_out: Requested output format.
            NOTE(review): currently unused in this function body — confirm
            whether formatting was meant to be applied before saving.
        output_dir: Directory where the result JSON is written.
        no_save: When True, skip writing the result to disk.
        skip_cache: Forwarded to perceive_document() to force a re-parse.
    """
    # Deferred project imports keep CLI startup fast and avoid import cycles.
    from docmirror.core.factory import perceive_document
    from docmirror.models.entities.document_types import DocumentType

    path = Path(file_path).resolve()
    if not path.exists():
        console.print(f"[bold red]Error[/bold red]: File not found: {file_path}")
        return
    if path.is_dir():
        console.print(f"[bold red]Error[/bold red]: Path is a directory (use it as the batch root to parse all files inside): {path}")
        return

    # ── Pipeline stage definitions for progress display ──
    # (percent-complete, description) pairs; purely cosmetic — not tied to
    # the parser's real progress.
    STAGES = [
        (5, "[cyan]Loading document...[/cyan]"),
        (15, "[cyan]Extracting pages...[/cyan]"),
        (35, "[cyan]Detecting layout & tables...[/cyan]"),
        (55, "[cyan]Running OCR & text extraction...[/cyan]"),
        (70, "[cyan]Analyzing entities & structure...[/cyan]"),
        (85, "[cyan]Mapping columns & validating...[/cyan]"),
        (95, "[cyan]Building result...[/cyan]"),
    ]

    from rich.progress import BarColumn, TaskProgressColumn, TimeElapsedColumn

    progress = Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(bar_width=30),
        TaskProgressColumn(),
        TimeElapsedColumn(),
        console=console,
    )

    async def _animate_progress(progress, task_id) -> None:
        """Simulate stage-based progress while parsing runs."""
        import time
        start = time.monotonic()
        stage_idx = 0
        while not progress.tasks[task_id].finished:
            elapsed = time.monotonic() - start
            # Advance through stages based on elapsed time
            # Rough heuristic: ~2s per stage for a typical document
            target_stage = min(int(elapsed / 2.0), len(STAGES) - 1)
            while stage_idx <= target_stage and stage_idx < len(STAGES):
                pct, desc = STAGES[stage_idx]
                progress.update(task_id, completed=pct, description=desc)
                stage_idx += 1
            await asyncio.sleep(0.15)

    with progress:
        task_id = progress.add_task(
            STAGES[0][1], total=100,
        )
        # Start progress animation concurrently with parsing
        import time as _time
        _wall_start = _time.monotonic()
        anim_task = asyncio.create_task(_animate_progress(progress, task_id))
        try:
            result = await perceive_document(path, DocumentType.OTHER, skip_cache=skip_cache)
            progress.update(task_id, completed=100, description="[bold green]✅ Done![/bold green]")
            # NOTE(review): cancelled task is never awaited; asyncio may log a
            # "was never retrieved" warning — confirm acceptable for a CLI.
            anim_task.cancel()
        except Exception as e:
            progress.update(task_id, completed=100, description="[bold red]❌ Failed[/bold red]")
            anim_task.cancel()
            console.print(f"[bold red]Critical Error:[/bold red] {_safe_str(str(e))}")
            return

    wall_elapsed_ms = (_time.monotonic() - _wall_start) * 1000

    # ── Display results (outside spinner) ──
    try:
        api_dict = result.to_api_dict()

        if result.success:
            console.print(f"\n[bold green]\u2705 Parsing Complete![/bold green]")

            table = Table(show_header=False, border_style="green")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", style="white")

            table.add_row("Status", str(result.status))
            table.add_row("Confidence", f"{result.confidence:.2%}")
            table.add_row("Pages", str(result.content.page_count))
            table.add_row("Tables Found", str(len(result.tables)))
            table.add_row("Extracted Text", f"{len(result.content.text)} chars")
            table.add_row("Time Elapsed", f"{wall_elapsed_ms:.0f} ms")

            # Detect cached results: internal timing >> wall time
            # (a cached hit returns almost instantly while the recorded
            # pipeline timing reflects the original slow parse).
            is_cached = (
                result.timing and result.timing.elapsed_ms > 0
                and wall_elapsed_ms < result.timing.elapsed_ms * 0.5
                and wall_elapsed_ms < 2000
            )
            if is_cached:
                table.add_row("", "[dim italic]⚡ cached result[/dim italic]")

            console.print(table)

            # Guard against division by zero on sub-millisecond wall times.
            effective_ms = max(wall_elapsed_ms, 1)
            speed = len(result.content.text) / (effective_ms / 1000)
            console.print(f"\n[bold magenta]\u26a1 BLAZING FAST:[/bold magenta] Processed at {speed:.0f} chars/sec!")
            console.print(f"[dim]Copy this benchmark and share it on Twitter / V2EX to show off your speed! \u26a1[/dim]")
        else:
            console.print(f"\n[bold red]\u274c Parsing Failed[/bold red]")
            if result.error:
                console.print(f"[red]{_safe_str(result.error.message)}[/red]")

            console.print("\n[bold yellow]Open Source Power[/bold yellow]")
            console.print("[white]Encountered an unsupported exotic format? This is how we improve![/white]")
            console.print("[white]Please attach the logs and a sample document by opening an issue at:[/white]")
            console.print("[cyan]https://github.com/valuemapglobal/docmirror/issues[/cyan]")

        # Save result to disk (both success and failure, for diagnostics)
        if not no_save:
            saved_path = save_result(api_dict, path, output_dir)
            console.print(f"\n[bold blue]\U0001f4be Result saved to:[/bold blue] [white]{saved_path}[/white]")

    except Exception as e:
        # Top-level CLI boundary: report instead of crashing the batch loop.
        console.print(f"[bold red]Critical Error:[/bold red] {_safe_str(str(e))}")
202
+
203
def main() -> None:
    """CLI entry point: parse a single document or a whole directory tree.

    With no positional argument, prints the banner and usage help. With a
    directory argument, recursively parses every discovered file (honouring
    ``--exclude`` substring filters); with a file argument, parses just it.
    """
    parser = argparse.ArgumentParser(description="DocMirror - Universal Document Parsing Engine")
    parser.add_argument("file", nargs="?", help="Path to a document or a directory (recursively parse all files under it)")
    parser.add_argument("--format", default="markdown", choices=["markdown", "json", "text"], help="Output format")
    parser.add_argument("--output-dir", "-o", type=Path, default=DEFAULT_OUTPUT_DIR, help="Directory to save parse results (default: ./output)")
    parser.add_argument("--no-save", action="store_true", help="Do not save result to disk")
    parser.add_argument("--skip-cache", action="store_true", help="Skip cache and force a full re-parse")
    parser.add_argument("--exclude", action="append", default=[], metavar="SUBSTR",
                        help="Skip files whose path contains SUBSTR (e.g. --exclude 工商银行); can be repeated")
    parser.add_argument("--authors", action="store_true", help="Show contributors and authors")

    args = parser.parse_args()

    # Informational modes first: credits, then usage help when no path given.
    if args.authors:
        print_banner()
        show_authors()
        return
    if not args.file:
        print_banner()
        parser.print_help()
        return

    print_banner()
    target = Path(args.file).resolve()
    if not target.exists():
        console.print(f"[bold red]Error[/bold red]: Path not found: {target}")
        return

    if not target.is_dir():
        # Single-document mode.
        asyncio.run(parse_document(args.file, args.format, args.output_dir, args.no_save, args.skip_cache))
        return

    # Batch mode: recursively parse every file under the directory.
    candidates = discover_files(target)
    if args.exclude:
        def _is_excluded(candidate: Path) -> bool:
            return any(pattern in str(candidate) for pattern in args.exclude)

        skipped = [f for f in candidates if _is_excluded(f)]
        candidates = [f for f in candidates if not _is_excluded(f)]
        if skipped:
            console.print(f"[dim]Excluding {len(skipped)} file(s) matching: {', '.join(args.exclude)}[/dim]")
    if not candidates:
        console.print(f"[bold yellow]No files found under[/bold yellow] {target}")
        return
    console.print(f"[bold cyan]Batch mode:[/bold cyan] {len(candidates)} file(s) under [white]{target}[/white]\n")

    async def _run_batch() -> None:
        for idx, doc in enumerate(candidates, 1):
            console.print(f"\n[bold blue][{idx}/{len(candidates)}][/bold blue] {doc.name}")
            await parse_document(str(doc), args.format, args.output_dir, args.no_save, args.skip_cache)

    asyncio.run(_run_batch())
252
+
253
+ if __name__ == "__main__":
254
+ main()
@@ -0,0 +1,47 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Adapters — Format-specific document converters.
9
+ ================================================
10
+
11
+ Each adapter is responsible for:
12
+ 1. Converting a specific file format into an immutable ``BaseResult``.
13
+ 2. Optionally returning a ``ParserOutput`` for backward compatibility.
14
+
15
+ Adapters contain NO business logic — all domain-specific enhancement
16
+ is handled by the middleware pipeline downstream.
17
+
18
+ Supported formats:
19
+ - PDF → PDFAdapter
20
+ - Image → ImageAdapter (VLM + OCR fallback)
21
+ - Word → WordAdapter (.docx via python-docx)
22
+ - Excel → ExcelAdapter (.xlsx via openpyxl)
23
+ - PPT → PPTAdapter (.pptx via python-pptx)
24
+ - Email → EmailAdapter (.eml via stdlib email)
25
+ - HTML → WebAdapter (raw text extraction)
26
+ - JSON/CSV → StructuredAdapter
27
+ """
28
+
29
+ from .pdf.pdf import PDFAdapter
30
+ from .image.image import ImageAdapter
31
+ from .web.email import EmailAdapter
32
+ from .office.excel import ExcelAdapter
33
+ from .office.word import WordAdapter
34
+ from .office.ppt import PPTAdapter
35
+ from .data.structured import StructuredAdapter
36
+ from .web.web import WebAdapter
37
+
38
# Public adapter surface re-exported at the docmirror.adapters package level.
__all__ = [
    "PDFAdapter",
    "ImageAdapter",
    "EmailAdapter",
    "ExcelAdapter",
    "WordAdapter",
    "PPTAdapter",
    "StructuredAdapter",
    "WebAdapter",
]
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Structured Data Adapter — JSON/CSV → BaseResult
9
+ =================================================
10
+
11
+ Handles structured data files that already have a well-defined schema.
12
+
13
+ Processing logic by format:
14
+
15
+ **JSON (.json)**:
16
+ - Loads the entire file into a Python object.
17
+ - If the root object is a dict, creates a ``key_value`` Block with the
18
+ dict as raw_content (suitable for flat key-value documents).
19
+ - The full_text is the pretty-printed JSON (2-space indent).
20
+
21
+ **CSV (.csv)**:
22
+ - Reads all rows via Python's csv.reader (default dialect).
23
+ - Creates a single ``table`` Block with the 2D list of row data.
24
+ - The full_text is the comma-joined rows.
25
+
26
+ Both formats produce a single-page BaseResult with:
27
+ - metadata.source_format set to the file extension (without dot).
28
+
29
+ .. note::
30
+ For JSON arrays (e.g., list of records), the current implementation
31
+ does not create structured blocks. A future enhancement could
32
+ detect list-of-dicts patterns and convert them to table Blocks.
33
+ """
34
+ from __future__ import annotations
35
+
36
+
37
+ import csv
38
+ import json
39
+ import logging
40
+ from pathlib import Path
41
+
42
+ from docmirror.framework.base import BaseParser
43
+ from docmirror.models.entities.domain import BaseResult, Block, PageLayout
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
class StructuredAdapter(BaseParser):
    """Structured data (JSON/CSV) format adapter."""

    async def to_base_result(self, file_path: Path) -> BaseResult:
        """
        Parse a JSON or CSV file into a BaseResult.

        Dispatches on the (lower-cased) file extension:

        * ``.json`` — a dict root becomes a single ``key_value`` Block and
          the full text is the pretty-printed JSON (2-space indent).
          Non-dict roots (arrays, scalars) currently yield no blocks and
          empty text.
        * ``.csv`` — all rows (header included) become one ``table`` Block;
          the full text is the comma-joined rows.
        * any other extension — an empty single-page result.

        Returns:
            A single-page BaseResult; ``metadata["source_format"]`` is the
            extension without the leading dot.

        Raises:
            json.JSONDecodeError: if a ``.json`` file is not valid JSON.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        ext = file_path.suffix.lower()
        # Lazy %-style args: formatting is skipped when INFO is disabled.
        logger.info("[StructuredAdapter] Starting native extraction for %s file: %s", ext, file_path)
        blocks = []
        text = ""

        if ext == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Dict objects → key-value Block for flat entity data
            if isinstance(data, dict):
                blocks.append(Block(block_type="key_value", raw_content=data, page=0))
                text = json.dumps(data, indent=2, ensure_ascii=False)

        elif ext == ".csv":
            # FIX: the csv module requires newline="" on the file object so
            # that newlines embedded inside quoted fields are parsed
            # correctly instead of being split into separate rows.
            with open(file_path, "r", encoding="utf-8", newline="") as f:
                rows = list(csv.reader(f))
            if rows:
                # All CSV rows (including header) → single table Block
                blocks.append(Block(block_type="table", raw_content=rows, page=0))
                text = "\n".join(",".join(r) for r in rows)

        page = PageLayout(page_number=0, blocks=tuple(blocks))
        return BaseResult(pages=(page,), full_text=text, metadata={"source_format": ext.lstrip(".")})
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
@@ -0,0 +1,134 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Image Adapter — Image → BaseResult
9
+ ====================================
10
+
11
+ Converts image files (JPG, PNG, TIFF, etc.) into structured data using
12
+ RapidOCR (ONNX Runtime) for plain text extraction. This adapter produces a single
13
+ text Block without complex structured table/entity data, as it currently operates
14
+ in a purely CPU-bound environment without Vision-Language Models.
15
+ """
16
+ from __future__ import annotations
17
+
18
+
19
+ import logging
20
+ from pathlib import Path
21
+
22
+ from docmirror.framework.base import BaseParser
23
+ from docmirror.models.entities.domain import BaseResult, Block, PageLayout
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class ImageAdapter(BaseParser):
    """
    Image format adapter using OCR extraction.

    Produces a single text Block containing all recognized text lines joined by newlines.
    """

    async def to_base_result(self, file_path: Path) -> BaseResult:
        """
        Convert an image file to BaseResult using OCR.

        Delegates to _ocr_fallback(); unreadable images degrade to an
        empty-text result rather than raising.
        """
        logger.info(f"[ImageAdapter] Starting image parsing for: {file_path}")
        result = await self._ocr_fallback(file_path)
        logger.info(f"[ImageAdapter] Completed image parsing for: {file_path}")
        return result

    async def _ocr_fallback(self, file_path: Path) -> BaseResult:
        """
        Extract text from the image. When image quality is below
        ``external_ocr_quality_threshold`` and ``external_ocr_provider``
        is set, delegates to the external provider; otherwise uses RapidOCR.

        Returns a BaseResult with a single text Block containing all
        recognized text lines joined by newlines.
        """
        # Lazy import so that merely loading the adapter does not require
        # OpenCV to be installed.
        import cv2
        logger.debug(f"[ImageAdapter] Reading image file: {file_path}")
        img = cv2.imread(str(file_path))
        if img is None:
            # cv2.imread returns None for unreadable/unsupported files;
            # degrade to an empty result instead of raising.
            logger.error(f"[ImageAdapter] Failed to read image, cv2.imread returned None: {file_path.name}")
            text = ""
        else:
            text = self._extract_text_from_image(img, file_path)

        # No Block at all when OCR produced no text (empty page).
        blocks = [Block(block_type="text", raw_content=text, page=0)] if text else []
        page = PageLayout(page_number=0, blocks=tuple(blocks))
        return BaseResult(pages=(page,), full_text=text, metadata={"source_format": "image_ocr"})

    def _extract_text_from_image(self, img, file_path: Path) -> str:
        """Use built-in or external OCR depending on image quality."""
        from docmirror.configs.settings import default_settings
        from docmirror.core.ocr.fallback import (
            _resolve_external_ocr_provider,
            assess_image_quality_from_bgr,
        )
        # Both settings are optional; missing attributes mean "external OCR off".
        threshold = getattr(default_settings, "external_ocr_quality_threshold", None)
        provider = _resolve_external_ocr_provider(
            getattr(default_settings, "external_ocr_provider", None)
        )
        quality = assess_image_quality_from_bgr(img)
        logger.debug(
            "[ImageAdapter] OCR route: quality=%s, threshold=%s, external_provider=%s → %s",
            quality,
            threshold,
            "set" if provider is not None else "unset",
            "external" if (threshold is not None and provider is not None and quality < threshold) else "builtin",
        )
        # External route only when BOTH threshold and provider are configured
        # AND the measured quality falls below the threshold.
        if (
            threshold is not None
            and provider is not None
            and quality < threshold
        ):
            try:
                # NOTE(review): dpi=200 looks like an assumed default for
                # standalone images — confirm against the provider contract.
                out = provider(img, page_idx=0, dpi=200)
            except Exception as e:
                # Best-effort: a failing external provider falls through to
                # the built-in engine below instead of aborting the parse.
                logger.warning(f"[ImageAdapter] External OCR failed: {e}")
                out = None
            if out is not None:
                logger.info(
                    f"[ImageAdapter] Delegated to external OCR (quality={quality})"
                )
                return self._text_from_ocr_result(out)
        try:
            from docmirror.core.ocr.vision.rapidocr_engine import get_ocr_engine
            engine = get_ocr_engine()
            words = engine.detect_image_words(img)
            # w[4] is presumably the recognized text of each word tuple —
            # verify against detect_image_words' return shape.
            return "\n".join(w[4] for w in words) if words else ""
        except Exception as e:
            logger.warning(f"[ImageAdapter] OCR fallback failed: {e}")
            return ""

    @staticmethod
    def _text_from_ocr_result(out) -> str:
        """Convert external OCR result (list of words or dict) to plain text."""
        # List form: word tuples where index 4 holds the text (same layout
        # the built-in engine branch above expects).
        if isinstance(out, list):
            return "\n".join(w[4] for w in out if len(w) > 4)
        if isinstance(out, dict):
            # Preferred dict form: explicit per-line entries under "lines".
            lines = out.get("lines", [])
            if lines:
                return "\n".join(
                    line.get("text", "") if isinstance(line, dict) else str(line)
                    for line in lines
                )
            # Fallback dict form: header / table / footer sections.
            # NOTE(review): assumes header_text/footer_text are strings when
            # present (a None value would break .strip()) — confirm provider
            # output schema.
            header = out.get("header_text", "").strip()
            footer = out.get("footer_text", "").strip()
            table = out.get("table", [])
            parts = [header] if header else []
            if table:
                for row in table:
                    if isinstance(row, (list, tuple)):
                        # Falsy cells (None, "") are dropped from the row.
                        parts.append(" | ".join(str(c) for c in row if c))
                    else:
                        parts.append(str(row))
            if footer:
                parts.append(footer)
            return "\n".join(parts) if parts else ""
        # Unknown payload type → no text.
        return ""
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+