docfold 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docfold might be problematic. Click here for more details.

docfold/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """docfold - Turn any document into structured data."""
2
+
3
from importlib.metadata import PackageNotFoundError as _PackageNotFoundError
from importlib.metadata import version as _distribution_version

# The version used to be a hard-coded literal that had drifted from the
# released distribution (the 0.3.0 wheel still reported "0.1.0").  Read it from
# the installed package metadata so the two can no longer disagree.
try:
    __version__ = _distribution_version("docfold")
except _PackageNotFoundError:
    # Running from a source tree that was never installed: fall back to the
    # version of the release this file shipped in.
    __version__ = "0.3.0"
4
+
5
+ from docfold.engines.base import DocumentEngine, EngineResult, OutputFormat
6
+ from docfold.engines.router import BatchResult, EngineRouter, ProgressCallback
7
+
8
+ __all__ = [
9
+ "BatchResult",
10
+ "DocumentEngine",
11
+ "EngineResult",
12
+ "EngineRouter",
13
+ "OutputFormat",
14
+ "ProgressCallback",
15
+ ]
docfold/cli.py ADDED
@@ -0,0 +1,250 @@
1
+ """docfold CLI entry-point."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import asyncio
7
+ import sys
8
+
9
+
10
def main(argv: list[str] | None = None) -> None:
    """Parse the command line and run the selected docfold sub-command.

    Args:
        argv: Arguments to parse.  ``None`` lets ``argparse`` fall back to
            ``sys.argv[1:]``.

    When no sub-command is given the help text is printed and the process
    exits with status 0.
    """
    parser = argparse.ArgumentParser(
        prog="docfold",
        description="Turn any document into structured data.",
    )
    subcommands = parser.add_subparsers(dest="command")

    # convert: run one document through a single engine
    convert_parser = subcommands.add_parser(
        "convert", help="Convert a document to structured text"
    )
    convert_parser.add_argument("file", help="Path to the input document")
    convert_parser.add_argument(
        "-e", "--engine", help="Engine to use. Default: auto-select."
    )
    convert_parser.add_argument(
        "-f",
        "--format",
        choices=["markdown", "html", "json", "text"],
        default="markdown",
        help="Output format (default: markdown)",
    )
    convert_parser.add_argument(
        "-o", "--output", help="Output file path. If omitted, prints to stdout."
    )
    convert_parser.add_argument(
        "--engines",
        help="Comma-separated list of allowed engines (restricts selection).",
    )

    # engines: takes no arguments of its own
    subcommands.add_parser("engines", help="List available engines and their status")

    # compare: run several engines on the same document
    compare_parser = subcommands.add_parser("compare", help="Compare engines on a document")
    compare_parser.add_argument("file", help="Path to the input document")
    compare_parser.add_argument(
        "-e", "--engines", help="Comma-separated engine names. Default: all available."
    )

    # evaluate: benchmark engines against a dataset directory
    evaluate_parser = subcommands.add_parser("evaluate", help="Run evaluation benchmark")
    evaluate_parser.add_argument("dataset", help="Path to evaluation dataset directory")
    evaluate_parser.add_argument(
        "-e", "--engines", help="Comma-separated engine names. Default: all available."
    )
    evaluate_parser.add_argument(
        "-o", "--output", help="Output file for evaluation report (JSON)."
    )

    args = parser.parse_args(argv)

    chosen = args.command
    if chosen is None:
        parser.print_help()
        sys.exit(0)

    # Lambdas keep the handler lookup lazy: only the selected command's
    # function is resolved and invoked.
    dispatch = {
        "convert": lambda: asyncio.run(_cmd_convert(args)),
        "engines": lambda: _cmd_engines(),
        "compare": lambda: asyncio.run(_cmd_compare(args)),
        "evaluate": lambda: asyncio.run(_cmd_evaluate(args)),
    }
    handler = dispatch.get(chosen)
    if handler is not None:
        handler()
76
+
77
+
78
def _build_router():
    """Build a router with all discoverable engines.

    Each adapter module is imported and instantiated in turn; an adapter that
    cannot be imported or constructed is skipped, so missing optional
    dependencies never prevent the CLI from starting.

    Returns:
        An ``EngineRouter`` with every adapter that loaded successfully
        registered, in the order listed below.
    """
    import importlib
    import logging

    from docfold.engines.router import EngineRouter

    log = logging.getLogger(__name__)
    router = EngineRouter()

    # (module under docfold.engines, adapter class) in registration order.
    adapters = (
        ("docling_engine", "DoclingEngine"),
        ("mineru_engine", "MinerUEngine"),
        ("marker_engine", "MarkerEngine"),
        ("pymupdf_engine", "PyMuPDFEngine"),
        ("paddleocr_engine", "PaddleOCREngine"),
        ("tesseract_engine", "TesseractEngine"),
        ("unstructured_engine", "UnstructuredEngine"),
        ("llamaparse_engine", "LlamaParseEngine"),
        ("mistral_ocr_engine", "MistralOCREngine"),
        ("zerox_engine", "ZeroxEngine"),
        ("textract_engine", "TextractEngine"),
        ("google_docai_engine", "GoogleDocAIEngine"),
        ("azure_docint_engine", "AzureDocIntEngine"),
    )

    for module_name, class_name in adapters:
        try:
            module = importlib.import_module(f"docfold.engines.{module_name}")
            router.register(getattr(module, class_name)())
        except Exception:
            # Discovery is deliberately best-effort, but leave a trace so a
            # broken adapter can be diagnosed with debug logging enabled
            # instead of vanishing silently.
            log.debug("Skipping engine adapter %s", class_name, exc_info=True)

    return router
164
+
165
+
166
async def _cmd_convert(args) -> None:
    """Convert ``args.file`` and write the result to a file or stdout.

    Args:
        args: Parsed ``convert`` namespace.  Reads ``file``, ``engine``,
            ``engines``, ``format`` and ``output``.
    """
    from docfold.engines.base import OutputFormat

    # Trim whitespace and drop empty entries so "a, b" and "a,b," both restrict
    # selection to exactly {"a", "b"} instead of producing names that can never
    # match a registered engine.
    allowed = {
        name.strip() for name in (args.engines or "").split(",") if name.strip()
    }

    router = _build_router()
    if allowed:
        # NOTE(review): this sets a private attribute on the router; prefer a
        # public constructor argument or setter if EngineRouter offers one.
        router._allowed_engines = allowed
    fmt = OutputFormat(args.format)

    result = await router.process(args.file, output_format=fmt, engine_hint=args.engine)

    output = result.content
    if not args.output:
        print(output)
        return

    with open(args.output, "w", encoding="utf-8") as f:
        f.write(output)
    eng = result.engine_name
    ms = result.processing_time_ms
    print(f"Written to {args.output} (engine={eng}, {ms}ms)")
186
+
187
+
188
def _cmd_engines() -> None:
    """Print one table row per registered engine.

    Columns are the engine name, whether it is available, a ``+``/``-`` flag
    for four capabilities, and up to six supported extensions.
    """
    listing = _build_router().list_engines()
    if not listing:
        print("No engines registered. Install extras: pip install docfold[all]")
        return

    def mark(capabilities, key):
        # "+" when the capability is advertised and truthy, "-" otherwise.
        return "+" if capabilities.get(key) else "-"

    max_shown = 6
    print(f"{'Engine':<14} {'Status':<9} {'BBox':>4} {'Conf':>4} {'Tbl':>4} {'Img':>4} Formats")
    print("-" * 78)
    for entry in listing:
        status = "YES" if entry["available"] else "no"
        capabilities = entry.get("capabilities", {})
        bbox = mark(capabilities, "bounding_boxes")
        conf = mark(capabilities, "confidence")
        tbl = mark(capabilities, "table_structure")
        img = mark(capabilities, "images")
        shown = ", ".join(entry["extensions"][:max_shown])
        if len(entry["extensions"]) > max_shown:
            shown += ", ..."
        print(f"{entry['name']:<14} {status:<9} {bbox:>4} {conf:>4} {tbl:>4} {img:>4} {shown}")
209
+
210
+
211
async def _cmd_compare(args) -> None:
    """Run several engines on ``args.file`` and print a preview of each result.

    Args:
        args: Parsed ``compare`` namespace.  Reads ``file`` and ``engines``.
    """
    from docfold.engines.base import OutputFormat

    router = _build_router()

    # Trim whitespace and drop empty entries so "a, b" selects engines "a" and
    # "b"; an empty selection means "no restriction", as when the flag is omitted.
    requested = [
        name.strip() for name in (args.engines or "").split(",") if name.strip()
    ]
    engine_names = requested or None

    results = await router.compare(args.file, OutputFormat.MARKDOWN, engines=engine_names)

    preview_limit = 500  # characters of content shown per engine
    for name, result in results.items():
        print(f"\n{'=' * 60}")
        print(f"Engine: {name} | Time: {result.processing_time_ms}ms | Pages: {result.pages}")
        print(f"{'=' * 60}")
        preview = result.content[:preview_limit]
        if len(result.content) > preview_limit:
            preview += "\n... (truncated)"
        print(preview)
228
+
229
+
230
async def _cmd_evaluate(args) -> None:
    """Run the evaluation benchmark and emit its report as JSON.

    Args:
        args: Parsed ``evaluate`` namespace.  Reads ``dataset``, ``engines``
            and ``output``.

    The report is written to ``args.output`` when given, otherwise printed.
    """
    from docfold.evaluation.runner import EvaluationRunner

    router = _build_router()

    # Trim whitespace and drop empty entries so "a, b" selects engines "a" and
    # "b"; an empty selection means "no restriction", as when the flag is omitted.
    requested = [
        name.strip() for name in (args.engines or "").split(",") if name.strip()
    ]
    engine_names = requested or None

    runner = EvaluationRunner(router, dataset_path=args.dataset)
    report = await runner.run(engines=engine_names)
    report_json = report.to_json()

    if not args.output:
        print(report_json)
        return

    with open(args.output, "w", encoding="utf-8") as f:
        f.write(report_json)
    print(f"Report written to {args.output}")
247
+
248
+
249
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,12 @@
1
+ """Document structuring engine adapters."""
2
+
3
+ from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
4
+ from docfold.engines.router import EngineRouter
5
+
6
+ __all__ = [
7
+ "DocumentEngine",
8
+ "EngineCapabilities",
9
+ "EngineResult",
10
+ "EngineRouter",
11
+ "OutputFormat",
12
+ ]
@@ -0,0 +1,197 @@
1
+ """Azure Document Intelligence engine adapter — cloud document analysis.
2
+
3
+ Install: ``pip install docfold[azure-docint]``
4
+
5
+ Requires Azure credentials:
6
+ - ``AZURE_DOCINT_ENDPOINT`` — the endpoint URL
7
+ - ``AZURE_DOCINT_KEY`` — the API key
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any
16
+
17
+ from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Extensions (lowercase, without the dot) reported by
# AzureDocIntEngine.supported_extensions: PDF, raster images, Office formats
# and HTML.
_SUPPORTED_EXTENSIONS = {
    "pdf", "png", "jpg", "jpeg", "tiff", "tif", "bmp",
    "docx", "xlsx", "pptx", "html",
}
25
+
26
+
27
class AzureDocIntEngine(DocumentEngine):
    """Adapter for Azure Document Intelligence (formerly Form Recognizer).

    Uses the ``prebuilt-layout`` model by default for general-purpose
    document analysis with table, heading, and reading order extraction.

    Supports DOCX, XLSX, PPTX natively in addition to PDF and images.

    See https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/
    """

    def __init__(
        self,
        endpoint: str | None = None,
        key: str | None = None,
        model_id: str = "prebuilt-layout",
    ) -> None:
        """Store connection settings; no network access happens here.

        Args:
            endpoint: Service endpoint URL.  Falls back to the
                ``AZURE_DOCINT_ENDPOINT`` environment variable.
            key: API key.  Falls back to ``AZURE_DOCINT_KEY``.
            model_id: Identifier of the analysis model to run.
        """
        self._endpoint = endpoint or os.getenv("AZURE_DOCINT_ENDPOINT")
        self._key = key or os.getenv("AZURE_DOCINT_KEY")
        self._model_id = model_id

    @property
    def name(self) -> str:
        """Engine identifier, ``"azure_docint"``."""
        return "azure_docint"

    @property
    def supported_extensions(self) -> set[str]:
        """File extensions (without dots) this engine accepts."""
        return _SUPPORTED_EXTENSIONS

    @property
    def capabilities(self) -> EngineCapabilities:
        """Enrichments this engine advertises."""
        return EngineCapabilities(
            bounding_boxes=True, confidence=True, table_structure=True,
            heading_detection=True, reading_order=True,
        )

    def is_available(self) -> bool:
        """Return ``True`` when the SDK imports and both credentials are set."""
        try:
            import azure.ai.documentintelligence  # noqa: F401

            return bool(self._endpoint and self._key)
        except ImportError:
            return False

    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """Analyze ``file_path`` and return a unified :class:`EngineResult`.

        The blocking SDK call runs in the default executor so the event loop
        stays responsive.

        Args:
            file_path: Path of the document to upload.
            output_format: Desired format of ``EngineResult.content``.
            **kwargs: Accepted for interface compatibility; unused.
        """
        import asyncio

        start = time.perf_counter()

        loop = asyncio.get_running_loop()
        content, metadata, boxes, conf, tables = await loop.run_in_executor(
            None, self._analyze, file_path, output_format
        )

        elapsed_ms = int((time.perf_counter() - start) * 1000)

        return EngineResult(
            content=content,
            format=output_format,
            engine_name=self.name,
            processing_time_ms=elapsed_ms,
            metadata=metadata,
            # The page count was only buried in metadata before, leaving the
            # dedicated ``pages`` field empty for this engine.
            pages=metadata.get("page_count"),
            bounding_boxes=boxes,
            confidence=conf,
            tables=tables,
        )

    def _analyze(
        self,
        file_path: str,
        output_format: OutputFormat,
    ) -> tuple[str, dict, list[dict], float | None, list[dict] | None]:
        """Run the synchronous Azure analysis and shape its output.

        Returns:
            ``(content, metadata, bounding_boxes, average_confidence, tables)``
            where ``average_confidence`` is ``None`` if no paragraph carried a
            confidence and ``tables`` is ``None`` if no table was extracted.
        """
        import html
        import json

        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.core.credentials import AzureKeyCredential

        # Only request markdown when markdown is what will be emitted or
        # wrapped; a plain-text request should not come back with markup.
        content_format = "text" if output_format == OutputFormat.TEXT else "markdown"

        client = DocumentIntelligenceClient(
            endpoint=self._endpoint,
            credential=AzureKeyCredential(self._key),
        )

        # Close the client (and its HTTP transport) once the result is in.
        with client:
            with open(file_path, "rb") as f:
                poller = client.begin_analyze_document(
                    model_id=self._model_id,
                    analyze_request=f,
                    content_type="application/octet-stream",
                    output_content_format=content_format,
                )

            result = poller.result()

        full_text = result.content or ""

        # One entry per paragraph; confidences are collected separately so the
        # average ignores paragraphs that report none.
        bounding_boxes: list[dict[str, Any]] = []
        confidences: list[float] = []

        for paragraph in result.paragraphs or []:
            conf = paragraph.confidence
            if conf is not None:
                confidences.append(conf)

            if paragraph.bounding_regions:
                # Only the first region is recorded for each paragraph.
                region = paragraph.bounding_regions[0]
                polygon = region.polygon
                page_num = region.page_number
            else:
                polygon = None
                page_num = 1

            bounding_boxes.append({
                "type": "paragraph",
                "role": paragraph.role,
                "text": paragraph.content,
                "polygon": polygon,
                "page": page_num,
                "confidence": conf,
            })

        avg_conf = sum(confidences) / len(confidences) if confidences else None

        tables: list[dict[str, Any]] = []
        for table in result.tables or []:
            table_data = self._extract_table(table)
            if table_data:
                tables.append(table_data)

        page_count = len(result.pages or [])

        if output_format == OutputFormat.JSON:
            data = {"text": full_text, "page_count": page_count}
            content = json.dumps(data, ensure_ascii=False)
        elif output_format == OutputFormat.HTML:
            # Escape the text: raw "<", ">" or "&" from the document would
            # otherwise be parsed as markup and corrupt the page.
            content = f"<html><body><pre>{html.escape(full_text)}</pre></body></html>"
        else:
            content = full_text

        metadata = {
            "page_count": page_count,
            "model_id": self._model_id,
            "paragraph_count": len(result.paragraphs or []),
            "table_count": len(tables),
        }

        return content, metadata, bounding_boxes, avg_conf, tables or None

    def _extract_table(self, table: Any) -> dict[str, Any] | None:
        """Extract table structure from Azure table object.

        Returns:
            ``None`` for a table without cells, otherwise a dict with
            ``row_count``, ``column_count`` and ``rows``.  Each row is a dict
            keyed ``col_0 .. col_{n-1}``; a column with no cell in that row
            (for example one covered by a merged cell) maps to ``""`` so every
            row has the same keys.
        """
        if not table.cells:
            return None

        rows: dict[int, dict[int, str]] = {}
        for cell in table.cells:
            rows.setdefault(cell.row_index, {})[cell.column_index] = cell.content or ""

        # Use the declared width when it is a usable integer, but never drop a
        # cell whose index lies beyond it.
        declared = table.column_count if isinstance(table.column_count, int) else 0
        widest_seen = 1 + max(col for cells in rows.values() for col in cells)
        width = max(declared, widest_seen)

        return {
            "row_count": table.row_count,
            "column_count": table.column_count,
            "rows": [
                {f"col_{c}": rows[r].get(c, "") for c in range(width)}
                for r in sorted(rows)
            ],
        }
@@ -0,0 +1,111 @@
1
+ """Base interface for document structuring engines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+ from enum import Enum
8
+ from typing import Any
9
+
10
+
11
@dataclass(frozen=True)
class EngineCapabilities:
    """Declares what enrichments an engine can populate in EngineResult."""

    # Every flag defaults to False, so an engine only names what it supports.
    # Instances are immutable (frozen dataclass).
    bounding_boxes: bool = False
    confidence: bool = False
    images: bool = False
    table_structure: bool = False
    # NOTE(review): the two flags below have no matching EngineResult field in
    # this file; presumably they describe the quality of ``content`` — confirm.
    heading_detection: bool = False
    reading_order: bool = False
21
+
22
+
23
class OutputFormat(str, Enum):
    """Output formats an engine can be asked to produce.

    Mixes in ``str``, so each member is also a string equal to its value.
    """

    MARKDOWN = "markdown"
    HTML = "html"
    JSON = "json"
    TEXT = "text"
28
+
29
+
30
@dataclass
class EngineResult:
    """Common result type shared by every structuring engine.

    Adapters translate their engine-specific output into this dataclass, which
    keeps callers independent of any single engine's native output shape.
    Only the first three fields are required; the rest are optional
    enrichments with neutral defaults.
    """

    # --- required ---

    content: str
    """The converted document as a string (markdown, html, plain text, or a json string)."""

    format: OutputFormat
    """Which format ``content`` is in."""

    engine_name: str
    """Name of the engine that generated this result."""

    # --- optional enrichments ---

    metadata: dict[str, Any] = field(default_factory=dict)
    """Free-form engine-specific details (model versions, configuration used, ...)."""

    pages: int | None = None
    """How many pages were processed, when that is meaningful."""

    images: dict[str, str] | None = None
    """Extracted images, mapping ``filename`` to base64-encoded data."""

    tables: list[dict[str, Any]] | None = None
    """Extracted tables, as a list of row-dicts."""

    bounding_boxes: list[dict[str, Any]] | None = None
    """Bounding boxes of layout elements, e.g. ``[{type, bbox, page, ...}]``."""

    confidence: float | None = None
    """Overall confidence in the range [0, 1], when the engine reports one."""

    processing_time_ms: int = 0
    """Elapsed wall-clock time for processing, in milliseconds."""
69
+
70
+
71
class DocumentEngine(ABC):
    """Interface every engine adapter implements.

    Subclasses must provide ``name``, ``supported_extensions``, ``process``
    and ``is_available``; ``capabilities`` is optional.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique lowercase identifier of the engine, such as ``'docling'``."""
        ...

    @property
    @abstractmethod
    def supported_extensions(self) -> set[str]:
        """Extensions the engine accepts, written without dots (e.g. ``{'pdf', 'docx'}``)."""
        ...

    @property
    def capabilities(self) -> EngineCapabilities:
        """Enrichments this engine fills in on :class:`EngineResult`.

        The base implementation advertises nothing (every flag ``False``);
        adapters override it to declare what they support.
        """
        return EngineCapabilities()

    @abstractmethod
    def is_available(self) -> bool:
        """Tell whether the engine's dependencies are installed and ready to use."""
        ...

    @abstractmethod
    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """Convert the document at ``file_path`` into a unified :class:`EngineResult`."""
        ...

    def __repr__(self) -> str:
        # Note: building the repr calls is_available().
        cls_name = type(self).__name__
        return f"<{cls_name} name={self.name!r} available={self.is_available()}>"
@@ -0,0 +1,101 @@
1
+ """Docling engine adapter.
2
+
3
+ Install: ``pip install docfold[docling]``
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import time
10
+ from typing import Any
11
+
12
+ from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Extensions (lowercase, without the dot) reported by
# DoclingEngine.supported_extensions: documents, raster images, and
# audio/subtitle files.
_SUPPORTED_EXTENSIONS = {
    "pdf", "docx", "pptx", "xlsx", "html",
    "png", "jpg", "jpeg", "tiff", "tif",
    "wav", "mp3", "vtt",
}
21
+
22
+
23
class DoclingEngine(DocumentEngine):
    """Adapter for the Docling document conversion framework.

    See https://github.com/docling-project/docling

    NOTE(review): ``pipeline`` and ``ocr_enabled`` are stored and echoed in the
    result metadata, but the converter is built with default options, so they
    do not currently influence conversion.
    """

    def __init__(self, pipeline: str = "standard", ocr_enabled: bool = True) -> None:
        """Record the configuration; the converter itself is created lazily.

        Args:
            pipeline: Pipeline label, ``"standard"`` or ``"vlm"``.
            ocr_enabled: Whether OCR is requested.
        """
        self._pipeline = pipeline  # "standard" or "vlm"
        self._ocr_enabled = ocr_enabled
        self._converter = None

    @property
    def name(self) -> str:
        """Engine identifier, ``"docling"``."""
        return "docling"

    @property
    def supported_extensions(self) -> set[str]:
        """File extensions (without dots) this engine accepts."""
        return _SUPPORTED_EXTENSIONS

    @property
    def capabilities(self) -> EngineCapabilities:
        """Enrichments this engine advertises."""
        return EngineCapabilities(
            bounding_boxes=True, images=True, table_structure=True,
            heading_detection=True, reading_order=True,
        )

    def is_available(self) -> bool:
        """Return ``True`` when the ``docling`` package can be imported."""
        try:
            import docling  # noqa: F401
            return True
        except ImportError:
            return False

    def _get_converter(self):  # noqa: ANN202
        """Lazy-init the Docling DocumentConverter."""
        if self._converter is None:
            from docling.document_converter import DocumentConverter
            self._converter = DocumentConverter()
        return self._converter

    async def process(
        self,
        file_path: str,
        output_format: OutputFormat = OutputFormat.MARKDOWN,
        **kwargs: Any,
    ) -> EngineResult:
        """Convert ``file_path`` with Docling and export it as ``output_format``.

        Args:
            file_path: Path of the document to convert.
            output_format: Desired format of ``EngineResult.content``.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            An :class:`EngineResult` whose metadata echoes the configured
            ``pipeline`` and ``ocr_enabled`` values.
        """
        import asyncio
        import json

        start = time.perf_counter()
        converter = self._get_converter()

        # Docling's convert() is synchronous — run in executor
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, converter.convert, file_path)

        doc = result.document

        if output_format == OutputFormat.HTML:
            content = doc.export_to_html()
        elif output_format == OutputFormat.JSON:
            content = json.dumps(doc.export_to_dict(), ensure_ascii=False)
        elif output_format == OutputFormat.TEXT and hasattr(doc, "export_to_text"):
            # A TEXT request used to fall through to markdown, so the result
            # was labelled TEXT while still carrying markdown markup.
            content = doc.export_to_text()
        else:
            # MARKDOWN, plus the fallback when the document object offers no
            # plain-text export.
            content = doc.export_to_markdown()

        elapsed_ms = int((time.perf_counter() - start) * 1000)

        return EngineResult(
            content=content,
            format=output_format,
            engine_name=self.name,
            processing_time_ms=elapsed_ms,
            metadata={
                "pipeline": self._pipeline,
                "ocr_enabled": self._ocr_enabled,
            },
        )