docmirror-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. docmirror/__init__.py +64 -0
  2. docmirror/__main__.py +254 -0
  3. docmirror/adapters/__init__.py +47 -0
  4. docmirror/adapters/data/__init__.py +6 -0
  5. docmirror/adapters/data/structured.py +80 -0
  6. docmirror/adapters/image/__init__.py +6 -0
  7. docmirror/adapters/image/image.py +134 -0
  8. docmirror/adapters/office/__init__.py +6 -0
  9. docmirror/adapters/office/excel.py +113 -0
  10. docmirror/adapters/office/omml_extractor.py +111 -0
  11. docmirror/adapters/office/ppt.py +107 -0
  12. docmirror/adapters/office/word.py +157 -0
  13. docmirror/adapters/pdf/__init__.py +6 -0
  14. docmirror/adapters/pdf/pdf.py +126 -0
  15. docmirror/adapters/web/__init__.py +6 -0
  16. docmirror/adapters/web/email.py +115 -0
  17. docmirror/adapters/web/web.py +113 -0
  18. docmirror/configs/__init__.py +18 -0
  19. docmirror/configs/column_aliases.yaml +178 -0
  20. docmirror/configs/domain_registry.py +206 -0
  21. docmirror/configs/hints.yaml +99 -0
  22. docmirror/configs/institution_registry.yaml +164 -0
  23. docmirror/configs/key_synonyms.yaml +95 -0
  24. docmirror/configs/pipeline_registry.py +108 -0
  25. docmirror/configs/settings.py +229 -0
  26. docmirror/core/__init__.py +14 -0
  27. docmirror/core/exceptions.py +131 -0
  28. docmirror/core/extraction/__init__.py +16 -0
  29. docmirror/core/extraction/entity_collector.py +31 -0
  30. docmirror/core/extraction/extractor.py +2002 -0
  31. docmirror/core/extraction/foundation.py +126 -0
  32. docmirror/core/extraction/html_utils.py +57 -0
  33. docmirror/core/extraction/image_converter.py +48 -0
  34. docmirror/core/extraction/pre_analyzer.py +618 -0
  35. docmirror/core/extraction/quality_router.py +228 -0
  36. docmirror/core/extraction/table_postprocessor.py +97 -0
  37. docmirror/core/factory.py +57 -0
  38. docmirror/core/layout/__init__.py +7 -0
  39. docmirror/core/layout/graph_router.py +421 -0
  40. docmirror/core/layout/layout_analysis.py +1437 -0
  41. docmirror/core/layout/layout_model.py +197 -0
  42. docmirror/core/layout/spatial_graph.py +304 -0
  43. docmirror/core/ocr/__init__.py +8 -0
  44. docmirror/core/ocr/aistudio_provider.py +146 -0
  45. docmirror/core/ocr/fallback.py +1791 -0
  46. docmirror/core/ocr/formula_chars.py +261 -0
  47. docmirror/core/ocr/formula_engine.py +350 -0
  48. docmirror/core/ocr/image_preprocessing.py +369 -0
  49. docmirror/core/ocr/ocr_postprocess.py +367 -0
  50. docmirror/core/ocr/table_reconstruction.py +335 -0
  51. docmirror/core/ocr/vision/__init__.py +7 -0
  52. docmirror/core/ocr/vision/rapidocr_engine.py +340 -0
  53. docmirror/core/ocr/vision/seal_detector.py +252 -0
  54. docmirror/core/security/__init__.py +6 -0
  55. docmirror/core/security/forgery_detector.py +184 -0
  56. docmirror/core/table/__init__.py +8 -0
  57. docmirror/core/table/extraction/__init__.py +60 -0
  58. docmirror/core/table/extraction/char_strategy.py +835 -0
  59. docmirror/core/table/extraction/classifier.py +225 -0
  60. docmirror/core/table/extraction/engine.py +856 -0
  61. docmirror/core/table/extraction/grid_tensor.py +94 -0
  62. docmirror/core/table/extraction/pdfplumber_strategy.py +170 -0
  63. docmirror/core/table/extraction/pipe_strategy.py +234 -0
  64. docmirror/core/table/extraction/rapid_table_engine.py +97 -0
  65. docmirror/core/table/extraction/signal_processor.py +413 -0
  66. docmirror/core/table/extraction/template_injector.py +184 -0
  67. docmirror/core/table/extraction/utils.py +231 -0
  68. docmirror/core/table/merger.py +181 -0
  69. docmirror/core/table/page_state.py +109 -0
  70. docmirror/core/table/postprocess.py +744 -0
  71. docmirror/core/table/table_structure_fix.py +697 -0
  72. docmirror/core/utils/__init__.py +8 -0
  73. docmirror/core/utils/text_utils.py +160 -0
  74. docmirror/core/utils/vocabulary.py +379 -0
  75. docmirror/core/utils/watermark.py +238 -0
  76. docmirror/framework/__init__.py +25 -0
  77. docmirror/framework/base.py +350 -0
  78. docmirror/framework/cache.py +139 -0
  79. docmirror/framework/dispatcher.py +351 -0
  80. docmirror/framework/orchestrator.py +221 -0
  81. docmirror/middlewares/__init__.py +25 -0
  82. docmirror/middlewares/alignment/__init__.py +15 -0
  83. docmirror/middlewares/alignment/amount_splitter.py +179 -0
  84. docmirror/middlewares/alignment/header_alignment.py +209 -0
  85. docmirror/middlewares/base.py +346 -0
  86. docmirror/middlewares/detection/__init__.py +13 -0
  87. docmirror/middlewares/detection/institution_detector.py +169 -0
  88. docmirror/middlewares/detection/language_detector.py +57 -0
  89. docmirror/middlewares/detection/scene_detector.py +308 -0
  90. docmirror/middlewares/extraction/__init__.py +12 -0
  91. docmirror/middlewares/extraction/entity_extractor.py +226 -0
  92. docmirror/middlewares/extraction/generic_entity_extractor.py +44 -0
  93. docmirror/middlewares/validation/__init__.py +12 -0
  94. docmirror/middlewares/validation/mutation_analyzer.py +234 -0
  95. docmirror/middlewares/validation/validator.py +488 -0
  96. docmirror/models/__init__.py +25 -0
  97. docmirror/models/construction/__init__.py +11 -0
  98. docmirror/models/construction/_shared.py +46 -0
  99. docmirror/models/construction/builder.py +341 -0
  100. docmirror/models/entities/__init__.py +18 -0
  101. docmirror/models/entities/document_types.py +126 -0
  102. docmirror/models/entities/domain.py +149 -0
  103. docmirror/models/entities/domain_models.py +214 -0
  104. docmirror/models/entities/enhanced.py +271 -0
  105. docmirror/models/entities/perception_result.py +382 -0
  106. docmirror/models/errors.py +103 -0
  107. docmirror/models/tracking/__init__.py +11 -0
  108. docmirror/models/tracking/mutation.py +93 -0
  109. docmirror/plugins/__init__.py +210 -0
  110. docmirror/plugins/bank_statement.py +104 -0
  111. docmirror/py.typed +0 -0
  112. docmirror/server/__init__.py +6 -0
  113. docmirror/server/api.py +141 -0
  114. docmirror/server/schemas.py +44 -0
  115. docmirror-0.2.0.dist-info/METADATA +202 -0
  116. docmirror-0.2.0.dist-info/RECORD +120 -0
  117. docmirror-0.2.0.dist-info/WHEEL +4 -0
  118. docmirror-0.2.0.dist-info/entry_points.txt +2 -0
  119. docmirror-0.2.0.dist-info/licenses/AUTHORS.md +13 -0
  120. docmirror-0.2.0.dist-info/licenses/LICENSE +201 -0
docmirror/__init__.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ DocMirror: Universal Document Parsing Engine
9
+
10
+ Directory structure:
11
+ - core/: Core extraction engines (CoreExtractor, LayoutAnalysis, TableExtraction)
12
+ - models/: Data models (BaseResult, EnhancedResult, PerceptionResult)
13
+ - middlewares/: Middleware pipeline (SceneDetector, EntityExtractor, Validator, ...)
14
+ - configs/: Configuration (settings, pipeline_registry, institution_registry)
15
+ - framework/: Pipeline orchestration (dispatcher, orchestrator, cache)
16
+ - adapters/: Format adapters (PDF, Image, Office, Email, Web)
17
+ - plugins/: Domain plugins (bank_statement, ...)
18
+
19
+ Single public entry point: perceive_document()
20
+ """
21
+
22
+ __version__ = "0.2.0"
23
+ __author__ = "Adam Lin <adamlin@valuemapglobal.com>"
24
+ __copyright__ = "Copyright 2026, ValueMap Global"
25
+ __license__ = "Apache 2.0"
26
+
27
+ import logging
28
+ import sys
29
+
30
+ # Configure root logger with millisecond precision, process/thread IDs, and source context
31
+ logging.basicConfig(
32
+ format="%(asctime)s.%(msecs)03d - [%(levelname)s] [%(process)d:%(threadName)s] %(name)s:%(lineno)d - %(message)s",
33
+ datefmt="%Y-%m-%d %H:%M:%S",
34
+ level=logging.INFO,
35
+ stream=sys.stdout,
36
+ )
37
+
38
+ from docmirror.core.factory import perceive_document, PerceptionFactory
39
+ from docmirror.models.entities.document_types import DocumentType
40
+ from docmirror.models.entities.perception_result import PerceptionResult
41
+ from docmirror.models.entities.domain_models import DomainData
42
+ from docmirror.framework.dispatcher import ParserDispatcher
43
+ from docmirror.framework.dispatcher import ParserDispatcher as DocumentProcessingOrchestrator # compat
44
+ from docmirror.framework.base import ParserOutput
45
+ from docmirror.framework.orchestrator import Orchestrator
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ # backward-compat alias — callers importing PerceptionResponse get ParserOutput
50
+ PerceptionResponse = ParserOutput
51
+
52
+
53
+ __all__ = [
54
+ "perceive_document",
55
+ "PerceptionFactory",
56
+ "PerceptionResult",
57
+ "PerceptionResponse",
58
+ "DocumentType",
59
+ "DomainData",
60
+ "ParserDispatcher",
61
+ "DocumentProcessingOrchestrator",
62
+ "ParserOutput",
63
+ "Orchestrator",
64
+ ]
docmirror/__main__.py ADDED
@@ -0,0 +1,254 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """CLI entry point for DocMirror document parsing engine.
8
+
9
+ Provides single-file and batch-directory parsing with rich progress
10
+ display, multiple output formats, and result persistence.
11
+ """
12
+ from __future__ import annotations
13
+ import asyncio
14
+ import argparse
15
+ import json
16
+ from pathlib import Path
17
+ from rich.console import Console
18
+ from rich.panel import Panel
19
+ from rich.progress import Progress, SpinnerColumn, TextColumn
20
+ from rich.table import Table
21
+
22
+ console = Console()
23
+
24
+ # Default output directory (relative to cwd)
25
+ DEFAULT_OUTPUT_DIR = Path("output")
26
+
27
+
28
+ def _safe_str(s: str) -> str:
29
+ """Encode/decode to replace surrogates so console.print() never raises UnicodeEncodeError."""
30
+ if not isinstance(s, str):
31
+ s = str(s)
32
+ return s.encode("utf-8", errors="replace").decode("utf-8")
33
+
34
+
35
# Skip these when discovering files in a directory
SKIP_NAMES = {".DS_Store", ".gitkeep", "Thumbs.db"}


def discover_files(root: Path) -> list[Path]:
    """Recursively collect all regular files under *root*.

    Traversal is deterministic (lexicographically sorted paths) and
    housekeeping files listed in SKIP_NAMES are excluded.
    """
    return [
        entry
        for entry in sorted(root.rglob("*"))
        if entry.is_file() and entry.name not in SKIP_NAMES
    ]
46
+
47
+
48
+ BANNER = r"""[cyan]
49
+ ____ __ __ _
50
+ | _ \ ___ ___| \/ (_)_ __ _ __ ___ _ __
51
+ | | | |/ _ \ / __| |\/| | | '__| '__/ _ \| '__|
52
+ | |_| | (_) | (__| | | | | | | | | (_) | |
53
+ |____/ \___/ \___|_| |_|_|_| |_| \___/|_|
54
+ [/cyan]
55
+ [bold white]Universal Document Parsing Engine[/bold white]
56
+ [yellow]Support us with a ⭐ on GitHub: https://github.com/valuemapglobal/docmirror[/yellow]
57
+ """
58
+
59
def print_banner() -> None:
    """Render the ASCII-art BANNER inside a cyan rich Panel on the module console."""
    console.print(Panel(BANNER, border_style="cyan", padding=(1, 2)))
61
+
62
def show_authors() -> None:
    """Print the contributors panel followed by a call-for-contributions line."""
    console.print(Panel("[bold cyan]Made with \u2764\ufe0f by[/bold cyan]\n[white]Adam Lin[/white]", title="Authors", border_style="cyan"))
    console.print("\n[yellow]Want your name here? Contribute to DocMirror at: https://github.com/valuemapglobal/docmirror[/yellow]\n")
65
+
66
+
67
def save_result(result_dict: dict, source_path: Path, output_dir: Path) -> Path:
    """Persist *result_dict* as pretty-printed JSON under *output_dir*.

    The output file is named after the source document's stem. If that
    name is taken, numeric suffixes (``_1``, ``_2``, ...) are tried until
    a free name is found, so existing results are never overwritten.

    Returns the path of the file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = source_path.stem
    candidate = output_dir / f"{stem}.json"

    # Avoid overwriting: walk _1, _2, ... until an unused name appears.
    suffix = 0
    while candidate.exists():
        suffix += 1
        candidate = output_dir / f"{stem}_{suffix}.json"

    payload = json.dumps(result_dict, ensure_ascii=False, indent=2)
    candidate.write_text(payload, encoding="utf-8")
    return candidate
80
+
81
+
82
async def parse_document(file_path: str, format_out: str, output_dir: Path, no_save: bool, skip_cache: bool = False) -> None:
    """Parse a single document and render results to the console.

    Runs perceive_document() while a simulated stage-progress animation
    plays, then prints a summary table (success) or failure guidance, and
    optionally saves the API dict as JSON via save_result().

    Args:
        file_path: Path to the document to parse (must be an existing file).
        format_out: Requested output format.
            NOTE(review): currently unused in this function body — confirm
            whether formatting was meant to be applied before saving.
        output_dir: Directory where the result JSON is written.
        no_save: When True, skip writing the result to disk.
        skip_cache: Forwarded to perceive_document() to force a re-parse.
    """
    # Deferred project imports keep CLI startup fast and avoid import cycles.
    from docmirror.core.factory import perceive_document
    from docmirror.models.entities.document_types import DocumentType

    path = Path(file_path).resolve()
    if not path.exists():
        console.print(f"[bold red]Error[/bold red]: File not found: {file_path}")
        return
    if path.is_dir():
        console.print(f"[bold red]Error[/bold red]: Path is a directory (use it as the batch root to parse all files inside): {path}")
        return

    # ── Pipeline stage definitions for progress display ──
    # (percent-complete, description) pairs; purely cosmetic — not tied to
    # the parser's real progress.
    STAGES = [
        (5, "[cyan]Loading document...[/cyan]"),
        (15, "[cyan]Extracting pages...[/cyan]"),
        (35, "[cyan]Detecting layout & tables...[/cyan]"),
        (55, "[cyan]Running OCR & text extraction...[/cyan]"),
        (70, "[cyan]Analyzing entities & structure...[/cyan]"),
        (85, "[cyan]Mapping columns & validating...[/cyan]"),
        (95, "[cyan]Building result...[/cyan]"),
    ]

    from rich.progress import BarColumn, TaskProgressColumn, TimeElapsedColumn

    progress = Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(bar_width=30),
        TaskProgressColumn(),
        TimeElapsedColumn(),
        console=console,
    )

    async def _animate_progress(progress, task_id) -> None:
        """Simulate stage-based progress while parsing runs."""
        import time
        start = time.monotonic()
        stage_idx = 0
        while not progress.tasks[task_id].finished:
            elapsed = time.monotonic() - start
            # Advance through stages based on elapsed time
            # Rough heuristic: ~2s per stage for a typical document
            target_stage = min(int(elapsed / 2.0), len(STAGES) - 1)
            while stage_idx <= target_stage and stage_idx < len(STAGES):
                pct, desc = STAGES[stage_idx]
                progress.update(task_id, completed=pct, description=desc)
                stage_idx += 1
            await asyncio.sleep(0.15)

    with progress:
        task_id = progress.add_task(
            STAGES[0][1], total=100,
        )
        # Start progress animation concurrently with parsing
        import time as _time
        _wall_start = _time.monotonic()
        anim_task = asyncio.create_task(_animate_progress(progress, task_id))
        try:
            result = await perceive_document(path, DocumentType.OTHER, skip_cache=skip_cache)
            progress.update(task_id, completed=100, description="[bold green]✅ Done![/bold green]")
            # NOTE(review): cancelled task is never awaited; asyncio may log a
            # "was never retrieved" warning — confirm acceptable for a CLI.
            anim_task.cancel()
        except Exception as e:
            progress.update(task_id, completed=100, description="[bold red]❌ Failed[/bold red]")
            anim_task.cancel()
            console.print(f"[bold red]Critical Error:[/bold red] {_safe_str(str(e))}")
            return

    wall_elapsed_ms = (_time.monotonic() - _wall_start) * 1000

    # ── Display results (outside spinner) ──
    try:
        api_dict = result.to_api_dict()

        if result.success:
            console.print(f"\n[bold green]\u2705 Parsing Complete![/bold green]")

            table = Table(show_header=False, border_style="green")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", style="white")

            table.add_row("Status", str(result.status))
            table.add_row("Confidence", f"{result.confidence:.2%}")
            table.add_row("Pages", str(result.content.page_count))
            table.add_row("Tables Found", str(len(result.tables)))
            table.add_row("Extracted Text", f"{len(result.content.text)} chars")
            table.add_row("Time Elapsed", f"{wall_elapsed_ms:.0f} ms")

            # Detect cached results: internal timing >> wall time
            # (a cached hit returns almost instantly while the recorded
            # pipeline timing reflects the original slow parse).
            is_cached = (
                result.timing and result.timing.elapsed_ms > 0
                and wall_elapsed_ms < result.timing.elapsed_ms * 0.5
                and wall_elapsed_ms < 2000
            )
            if is_cached:
                table.add_row("", "[dim italic]⚡ cached result[/dim italic]")

            console.print(table)

            # Guard against division by zero on sub-millisecond wall times.
            effective_ms = max(wall_elapsed_ms, 1)
            speed = len(result.content.text) / (effective_ms / 1000)
            console.print(f"\n[bold magenta]\u26a1 BLAZING FAST:[/bold magenta] Processed at {speed:.0f} chars/sec!")
            console.print(f"[dim]Copy this benchmark and share it on Twitter / V2EX to show off your speed! \u26a1[/dim]")
        else:
            console.print(f"\n[bold red]\u274c Parsing Failed[/bold red]")
            if result.error:
                console.print(f"[red]{_safe_str(result.error.message)}[/red]")

            console.print("\n[bold yellow]Open Source Power[/bold yellow]")
            console.print("[white]Encountered an unsupported exotic format? This is how we improve![/white]")
            console.print("[white]Please attach the logs and a sample document by opening an issue at:[/white]")
            console.print("[cyan]https://github.com/valuemapglobal/docmirror/issues[/cyan]")

        # Save result to disk (both success and failure, for diagnostics)
        if not no_save:
            saved_path = save_result(api_dict, path, output_dir)
            console.print(f"\n[bold blue]\U0001f4be Result saved to:[/bold blue] [white]{saved_path}[/white]")

    except Exception as e:
        # Top-level CLI boundary: report instead of crashing the batch loop.
        console.print(f"[bold red]Critical Error:[/bold red] {_safe_str(str(e))}")
202
+
203
def main() -> None:
    """CLI entry point: parse a single document or a whole directory tree.

    With no positional argument, prints the banner and usage help. With a
    directory argument, recursively parses every discovered file (honouring
    ``--exclude`` substring filters); with a file argument, parses just it.
    """
    parser = argparse.ArgumentParser(description="DocMirror - Universal Document Parsing Engine")
    parser.add_argument("file", nargs="?", help="Path to a document or a directory (recursively parse all files under it)")
    parser.add_argument("--format", default="markdown", choices=["markdown", "json", "text"], help="Output format")
    parser.add_argument("--output-dir", "-o", type=Path, default=DEFAULT_OUTPUT_DIR, help="Directory to save parse results (default: ./output)")
    parser.add_argument("--no-save", action="store_true", help="Do not save result to disk")
    parser.add_argument("--skip-cache", action="store_true", help="Skip cache and force a full re-parse")
    parser.add_argument("--exclude", action="append", default=[], metavar="SUBSTR",
                        help="Skip files whose path contains SUBSTR (e.g. --exclude 工商银行); can be repeated")
    parser.add_argument("--authors", action="store_true", help="Show contributors and authors")

    args = parser.parse_args()

    # Informational modes first: credits, then usage help when no path given.
    if args.authors:
        print_banner()
        show_authors()
        return
    if not args.file:
        print_banner()
        parser.print_help()
        return

    print_banner()
    target = Path(args.file).resolve()
    if not target.exists():
        console.print(f"[bold red]Error[/bold red]: Path not found: {target}")
        return

    if not target.is_dir():
        # Single-document mode.
        asyncio.run(parse_document(args.file, args.format, args.output_dir, args.no_save, args.skip_cache))
        return

    # Batch mode: recursively parse every file under the directory.
    candidates = discover_files(target)
    if args.exclude:
        def _is_excluded(candidate: Path) -> bool:
            return any(pattern in str(candidate) for pattern in args.exclude)

        skipped = [f for f in candidates if _is_excluded(f)]
        candidates = [f for f in candidates if not _is_excluded(f)]
        if skipped:
            console.print(f"[dim]Excluding {len(skipped)} file(s) matching: {', '.join(args.exclude)}[/dim]")
    if not candidates:
        console.print(f"[bold yellow]No files found under[/bold yellow] {target}")
        return
    console.print(f"[bold cyan]Batch mode:[/bold cyan] {len(candidates)} file(s) under [white]{target}[/white]\n")

    async def _run_batch() -> None:
        for idx, doc in enumerate(candidates, 1):
            console.print(f"\n[bold blue][{idx}/{len(candidates)}][/bold blue] {doc.name}")
            await parse_document(str(doc), args.format, args.output_dir, args.no_save, args.skip_cache)

    asyncio.run(_run_batch())
252
+
253
+ if __name__ == "__main__":
254
+ main()
@@ -0,0 +1,47 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Adapters — Format-specific document converters.
9
+ ================================================
10
+
11
+ Each adapter is responsible for:
12
+ 1. Converting a specific file format into an immutable ``BaseResult``.
13
+ 2. Optionally returning a ``ParserOutput`` for backward compatibility.
14
+
15
+ Adapters contain NO business logic — all domain-specific enhancement
16
+ is handled by the middleware pipeline downstream.
17
+
18
+ Supported formats:
19
+ - PDF → PDFAdapter
20
+ - Image → ImageAdapter (VLM + OCR fallback)
21
+ - Word → WordAdapter (.docx via python-docx)
22
+ - Excel → ExcelAdapter (.xlsx via openpyxl)
23
+ - PPT → PPTAdapter (.pptx via python-pptx)
24
+ - Email → EmailAdapter (.eml via stdlib email)
25
+ - HTML → WebAdapter (raw text extraction)
26
+ - JSON/CSV → StructuredAdapter
27
+ """
28
+
29
+ from .pdf.pdf import PDFAdapter
30
+ from .image.image import ImageAdapter
31
+ from .web.email import EmailAdapter
32
+ from .office.excel import ExcelAdapter
33
+ from .office.word import WordAdapter
34
+ from .office.ppt import PPTAdapter
35
+ from .data.structured import StructuredAdapter
36
+ from .web.web import WebAdapter
37
+
38
# Public adapter surface re-exported at the docmirror.adapters package level.
__all__ = [
    "PDFAdapter",
    "ImageAdapter",
    "EmailAdapter",
    "ExcelAdapter",
    "WordAdapter",
    "PPTAdapter",
    "StructuredAdapter",
    "WebAdapter",
]
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Structured Data Adapter — JSON/CSV → BaseResult
9
+ =================================================
10
+
11
+ Handles structured data files that already have a well-defined schema.
12
+
13
+ Processing logic by format:
14
+
15
+ **JSON (.json)**:
16
+ - Loads the entire file into a Python object.
17
+ - If the root object is a dict, creates a ``key_value`` Block with the
18
+ dict as raw_content (suitable for flat key-value documents).
19
+ - The full_text is the pretty-printed JSON (2-space indent).
20
+
21
+ **CSV (.csv)**:
22
+ - Reads all rows via Python's csv.reader (default dialect).
23
+ - Creates a single ``table`` Block with the 2D list of row data.
24
+ - The full_text is the comma-joined rows.
25
+
26
+ Both formats produce a single-page BaseResult with:
27
+ - metadata.source_format set to the file extension (without dot).
28
+
29
+ .. note::
30
+ For JSON arrays (e.g., list of records), the current implementation
31
+ does not create structured blocks. A future enhancement could
32
+ detect list-of-dicts patterns and convert them to table Blocks.
33
+ """
34
+ from __future__ import annotations
35
+
36
+
37
+ import csv
38
+ import json
39
+ import logging
40
+ from pathlib import Path
41
+
42
+ from docmirror.framework.base import BaseParser
43
+ from docmirror.models.entities.domain import BaseResult, Block, PageLayout
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
class StructuredAdapter(BaseParser):
    """Structured data (JSON/CSV) format adapter."""

    async def to_base_result(self, file_path: Path) -> BaseResult:
        """
        Parse a JSON or CSV file into a BaseResult.

        Dispatches on the (lower-cased) file extension:

        * ``.json`` — a dict root becomes a single ``key_value`` Block and
          the full text is the pretty-printed JSON (2-space indent).
          Non-dict roots (arrays, scalars) currently yield no blocks and
          empty text.
        * ``.csv`` — all rows (header included) become one ``table`` Block;
          the full text is the comma-joined rows.
        * any other extension — an empty single-page result.

        Returns:
            A single-page BaseResult; ``metadata["source_format"]`` is the
            extension without the leading dot.

        Raises:
            json.JSONDecodeError: if a ``.json`` file is not valid JSON.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        ext = file_path.suffix.lower()
        # Lazy %-style args: formatting is skipped when INFO is disabled.
        logger.info("[StructuredAdapter] Starting native extraction for %s file: %s", ext, file_path)
        blocks = []
        text = ""

        if ext == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Dict objects → key-value Block for flat entity data
            if isinstance(data, dict):
                blocks.append(Block(block_type="key_value", raw_content=data, page=0))
                text = json.dumps(data, indent=2, ensure_ascii=False)

        elif ext == ".csv":
            # FIX: the csv module requires newline="" on the file object so
            # that newlines embedded inside quoted fields are parsed
            # correctly instead of being split into separate rows.
            with open(file_path, "r", encoding="utf-8", newline="") as f:
                rows = list(csv.reader(f))
            if rows:
                # All CSV rows (including header) → single table Block
                blocks.append(Block(block_type="table", raw_content=rows, page=0))
                text = "\n".join(",".join(r) for r in rows)

        page = PageLayout(page_number=0, blocks=tuple(blocks))
        return BaseResult(pages=(page,), full_text=text, metadata={"source_format": ext.lstrip(".")})
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
@@ -0,0 +1,134 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Image Adapter — Image → BaseResult
9
+ ====================================
10
+
11
+ Converts image files (JPG, PNG, TIFF, etc.) into structured data using
12
+ RapidOCR (ONNX Runtime) for plain text extraction. This adapter produces a single
13
+ text Block without complex structured table/entity data, as it currently operates
14
+ in a purely CPU-bound environment without Vision-Language Models.
15
+ """
16
+ from __future__ import annotations
17
+
18
+
19
+ import logging
20
+ from pathlib import Path
21
+
22
+ from docmirror.framework.base import BaseParser
23
+ from docmirror.models.entities.domain import BaseResult, Block, PageLayout
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class ImageAdapter(BaseParser):
    """
    Image format adapter using OCR extraction.

    Produces a single text Block containing all recognized text lines joined by newlines.
    """

    async def to_base_result(self, file_path: Path) -> BaseResult:
        """
        Convert an image file to BaseResult using OCR.

        Delegates to _ocr_fallback(); unreadable images degrade to an
        empty-text result rather than raising.
        """
        logger.info(f"[ImageAdapter] Starting image parsing for: {file_path}")
        result = await self._ocr_fallback(file_path)
        logger.info(f"[ImageAdapter] Completed image parsing for: {file_path}")
        return result

    async def _ocr_fallback(self, file_path: Path) -> BaseResult:
        """
        Extract text from the image. When image quality is below
        ``external_ocr_quality_threshold`` and ``external_ocr_provider``
        is set, delegates to the external provider; otherwise uses RapidOCR.

        Returns a BaseResult with a single text Block containing all
        recognized text lines joined by newlines.
        """
        # Lazy import so that merely loading the adapter does not require
        # OpenCV to be installed.
        import cv2
        logger.debug(f"[ImageAdapter] Reading image file: {file_path}")
        img = cv2.imread(str(file_path))
        if img is None:
            # cv2.imread returns None for unreadable/unsupported files;
            # degrade to an empty result instead of raising.
            logger.error(f"[ImageAdapter] Failed to read image, cv2.imread returned None: {file_path.name}")
            text = ""
        else:
            text = self._extract_text_from_image(img, file_path)

        # No Block at all when OCR produced no text (empty page).
        blocks = [Block(block_type="text", raw_content=text, page=0)] if text else []
        page = PageLayout(page_number=0, blocks=tuple(blocks))
        return BaseResult(pages=(page,), full_text=text, metadata={"source_format": "image_ocr"})

    def _extract_text_from_image(self, img, file_path: Path) -> str:
        """Use built-in or external OCR depending on image quality."""
        from docmirror.configs.settings import default_settings
        from docmirror.core.ocr.fallback import (
            _resolve_external_ocr_provider,
            assess_image_quality_from_bgr,
        )
        # Both settings are optional; missing attributes mean "external OCR off".
        threshold = getattr(default_settings, "external_ocr_quality_threshold", None)
        provider = _resolve_external_ocr_provider(
            getattr(default_settings, "external_ocr_provider", None)
        )
        quality = assess_image_quality_from_bgr(img)
        logger.debug(
            "[ImageAdapter] OCR route: quality=%s, threshold=%s, external_provider=%s → %s",
            quality,
            threshold,
            "set" if provider is not None else "unset",
            "external" if (threshold is not None and provider is not None and quality < threshold) else "builtin",
        )
        # External route only when BOTH threshold and provider are configured
        # AND the measured quality falls below the threshold.
        if (
            threshold is not None
            and provider is not None
            and quality < threshold
        ):
            try:
                # NOTE(review): dpi=200 looks like an assumed default for
                # standalone images — confirm against the provider contract.
                out = provider(img, page_idx=0, dpi=200)
            except Exception as e:
                # Best-effort: a failing external provider falls through to
                # the built-in engine below instead of aborting the parse.
                logger.warning(f"[ImageAdapter] External OCR failed: {e}")
                out = None
            if out is not None:
                logger.info(
                    f"[ImageAdapter] Delegated to external OCR (quality={quality})"
                )
                return self._text_from_ocr_result(out)
        try:
            from docmirror.core.ocr.vision.rapidocr_engine import get_ocr_engine
            engine = get_ocr_engine()
            words = engine.detect_image_words(img)
            # w[4] is presumably the recognized text of each word tuple —
            # verify against detect_image_words' return shape.
            return "\n".join(w[4] for w in words) if words else ""
        except Exception as e:
            logger.warning(f"[ImageAdapter] OCR fallback failed: {e}")
            return ""

    @staticmethod
    def _text_from_ocr_result(out) -> str:
        """Convert external OCR result (list of words or dict) to plain text."""
        # List form: word tuples where index 4 holds the text (same layout
        # the built-in engine branch above expects).
        if isinstance(out, list):
            return "\n".join(w[4] for w in out if len(w) > 4)
        if isinstance(out, dict):
            # Preferred dict form: explicit per-line entries under "lines".
            lines = out.get("lines", [])
            if lines:
                return "\n".join(
                    line.get("text", "") if isinstance(line, dict) else str(line)
                    for line in lines
                )
            # Fallback dict form: header / table / footer sections.
            # NOTE(review): assumes header_text/footer_text are strings when
            # present (a None value would break .strip()) — confirm provider
            # output schema.
            header = out.get("header_text", "").strip()
            footer = out.get("footer_text", "").strip()
            table = out.get("table", [])
            parts = [header] if header else []
            if table:
                for row in table:
                    if isinstance(row, (list, tuple)):
                        # Falsy cells (None, "") are dropped from the row.
                        parts.append(" | ".join(str(c) for c in row if c))
                    else:
                        parts.append(str(row))
            if footer:
                parts.append(footer)
            return "\n".join(parts) if parts else ""
        # Unknown payload type → no text.
        return ""
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2026 ValueMap Global and contributors. All rights reserved.
2
+ # Author: Adam Lin <adamlin@valuemapglobal.com>
3
+ #
4
+ # This source code is licensed under the Apache 2.0 license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+