PyPI - docfold - Versions diffs - 0.3.0__py3-none-any.whl - Mend

docfold 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docfold might be problematic. Click here for more details.

Files changed (29) hide show

docfold/__init__.py +15 -0
docfold/cli.py +250 -0
docfold/engines/__init__.py +12 -0
docfold/engines/azure_docint_engine.py +197 -0
docfold/engines/base.py +111 -0
docfold/engines/docling_engine.py +101 -0
docfold/engines/google_docai_engine.py +215 -0
docfold/engines/llamaparse_engine.py +107 -0
docfold/engines/marker_engine.py +146 -0
docfold/engines/mineru_engine.py +102 -0
docfold/engines/mistral_ocr_engine.py +128 -0
docfold/engines/paddleocr_engine.py +127 -0
docfold/engines/pymupdf_engine.py +92 -0
docfold/engines/router.py +409 -0
docfold/engines/tesseract_engine.py +111 -0
docfold/engines/textract_engine.py +209 -0
docfold/engines/unstructured_engine.py +115 -0
docfold/engines/zerox_engine.py +112 -0
docfold/evaluation/__init__.py +17 -0
docfold/evaluation/metrics.py +172 -0
docfold/evaluation/runner.py +183 -0
docfold/preprocessing/__init__.py +5 -0
docfold/preprocessing/detector.py +107 -0
docfold/py.typed +0 -0
docfold-0.3.0.dist-info/METADATA +458 -0
docfold-0.3.0.dist-info/RECORD +29 -0
docfold-0.3.0.dist-info/WHEEL +4 -0
docfold-0.3.0.dist-info/entry_points.txt +2 -0
docfold-0.3.0.dist-info/licenses/LICENSE +21 -0

docfold/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""docfold - Turn any document into structured data."""
+__version__ = "0.1.0"
+from docfold.engines.base import DocumentEngine, EngineResult, OutputFormat
+from docfold.engines.router import BatchResult, EngineRouter, ProgressCallback
+__all__ = [
+    "BatchResult",
+    "DocumentEngine",
+    "EngineResult",
+    "EngineRouter",
+    "OutputFormat",
+    "ProgressCallback",
+]

docfold/cli.py ADDED Viewed

@@ -0,0 +1,250 @@
+"""docfold CLI entry-point."""
+from __future__ import annotations
+import argparse
+import asyncio
+import sys
+def main(argv: list[str] | None = None) -> None:
+    parser = argparse.ArgumentParser(
+        prog="docfold",
+        description="Turn any document into structured data.",
+    )
+    sub = parser.add_subparsers(dest="command")
+    # --- convert ---
+    convert_p = sub.add_parser("convert", help="Convert a document to structured text")
+    convert_p.add_argument("file", help="Path to the input document")
+    convert_p.add_argument(
+        "-e", "--engine",
+        help="Engine to use. Default: auto-select.",
+    )
+    convert_p.add_argument(
+        "-f", "--format",
+        choices=["markdown", "html", "json", "text"],
+        default="markdown",
+        help="Output format (default: markdown)",
+    )
+    convert_p.add_argument(
+        "-o", "--output",
+        help="Output file path. If omitted, prints to stdout.",
+    )
+    convert_p.add_argument(
+        "--engines",
+        help="Comma-separated list of allowed engines (restricts selection).",
+    )
+    # --- engines ---
+    sub.add_parser("engines", help="List available engines and their status")
+    # --- compare ---
+    compare_p = sub.add_parser("compare", help="Compare engines on a document")
+    compare_p.add_argument("file", help="Path to the input document")
+    compare_p.add_argument(
+        "-e", "--engines",
+        help="Comma-separated engine names. Default: all available.",
+    )
+    # --- evaluate ---
+    eval_p = sub.add_parser("evaluate", help="Run evaluation benchmark")
+    eval_p.add_argument("dataset", help="Path to evaluation dataset directory")
+    eval_p.add_argument(
+        "-e", "--engines",
+        help="Comma-separated engine names. Default: all available.",
+    )
+    eval_p.add_argument(
+        "-o", "--output",
+        help="Output file for evaluation report (JSON).",
+    )
+    args = parser.parse_args(argv)
+    if args.command is None:
+        parser.print_help()
+        sys.exit(0)
+    if args.command == "convert":
+        asyncio.run(_cmd_convert(args))
+    elif args.command == "engines":
+        _cmd_engines()
+    elif args.command == "compare":
+        asyncio.run(_cmd_compare(args))
+    elif args.command == "evaluate":
+        asyncio.run(_cmd_evaluate(args))
+def _build_router():
+    """Build a router with all discoverable engines."""
+    from docfold.engines.router import EngineRouter
+    router = EngineRouter()
+    # Try importing each engine adapter; register if available
+    try:
+        from docfold.engines.docling_engine import DoclingEngine
+        router.register(DoclingEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.mineru_engine import MinerUEngine
+        router.register(MinerUEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.marker_engine import MarkerEngine
+        router.register(MarkerEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.pymupdf_engine import PyMuPDFEngine
+        router.register(PyMuPDFEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.paddleocr_engine import PaddleOCREngine
+        router.register(PaddleOCREngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.tesseract_engine import TesseractEngine
+        router.register(TesseractEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.unstructured_engine import UnstructuredEngine
+        router.register(UnstructuredEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.llamaparse_engine import LlamaParseEngine
+        router.register(LlamaParseEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.mistral_ocr_engine import MistralOCREngine
+        router.register(MistralOCREngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.zerox_engine import ZeroxEngine
+        router.register(ZeroxEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.textract_engine import TextractEngine
+        router.register(TextractEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.google_docai_engine import GoogleDocAIEngine
+        router.register(GoogleDocAIEngine())
+    except Exception:
+        pass
+    try:
+        from docfold.engines.azure_docint_engine import AzureDocIntEngine
+        router.register(AzureDocIntEngine())
+    except Exception:
+        pass
+    return router
+async def _cmd_convert(args) -> None:
+    from docfold.engines.base import OutputFormat
+    allowed = set(args.engines.split(",")) if args.engines else None
+    router = _build_router()
+    if allowed:
+        router._allowed_engines = allowed
+    fmt = OutputFormat(args.format)
+    result = await router.process(args.file, output_format=fmt, engine_hint=args.engine)
+    output = result.content
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(output)
+        eng = result.engine_name
+        ms = result.processing_time_ms
+        print(f"Written to {args.output} (engine={eng}, {ms}ms)")
+    else:
+        print(output)
+def _cmd_engines() -> None:
+    router = _build_router()
+    engines = router.list_engines()
+    if not engines:
+        print("No engines registered. Install extras: pip install docfold[all]")
+        return
+    print(f"{'Engine':<14} {'Status':<9} {'BBox':>4} {'Conf':>4} {'Tbl':>4} {'Img':>4}  Formats")
+    print("-" * 78)
+    for e in engines:
+        status = "YES" if e["available"] else "no"
+        caps = e.get("capabilities", {})
+        bbox = "+" if caps.get("bounding_boxes") else "-"
+        conf = "+" if caps.get("confidence") else "-"
+        tbl = "+" if caps.get("table_structure") else "-"
+        img = "+" if caps.get("images") else "-"
+        exts = ", ".join(e["extensions"][:6])
+        if len(e["extensions"]) > 6:
+            exts += ", ..."
+        print(f"{e['name']:<14} {status:<9} {bbox:>4} {conf:>4} {tbl:>4} {img:>4}  {exts}")
+async def _cmd_compare(args) -> None:
+    from docfold.engines.base import OutputFormat
+    router = _build_router()
+    engine_names = args.engines.split(",") if args.engines else None
+    results = await router.compare(args.file, OutputFormat.MARKDOWN, engines=engine_names)
+    for name, result in results.items():
+        print(f"\n{'=' * 60}")
+        print(f"Engine: {name} | Time: {result.processing_time_ms}ms | Pages: {result.pages}")
+        print(f"{'=' * 60}")
+        # Print first 500 chars of content as preview
+        preview = result.content[:500]
+        if len(result.content) > 500:
+            preview += "\n... (truncated)"
+        print(preview)
+async def _cmd_evaluate(args) -> None:
+    from docfold.evaluation.runner import EvaluationRunner
+    router = _build_router()
+    engine_names = args.engines.split(",") if args.engines else None
+    runner = EvaluationRunner(router, dataset_path=args.dataset)
+    report = await runner.run(engines=engine_names)
+    report_json = report.to_json()
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(report_json)
+        print(f"Report written to {args.output}")
+    else:
+        print(report_json)
+# Run the CLI when this module is executed as a script.
+if __name__ == "__main__":
+    main()

docfold/engines/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""Document structuring engine adapters."""
+from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
+from docfold.engines.router import EngineRouter
+# Public names re-exported from this package.
+__all__ = [
+    "DocumentEngine",
+    "EngineCapabilities",
+    "EngineResult",
+    "EngineRouter",
+    "OutputFormat",
+]

docfold/engines/azure_docint_engine.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""Azure Document Intelligence engine adapter — cloud document analysis.
+Install: ``pip install docfold[azure-docint]``
+Requires Azure credentials:
+- ``AZURE_DOCINT_ENDPOINT`` — the endpoint URL
+- ``AZURE_DOCINT_KEY`` — the API key
+"""
+from __future__ import annotations
+import logging
+import os
+import time
+from typing import Any
+from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
+logger = logging.getLogger(__name__)
+# File extensions (lowercase, no leading dot) returned by
+# ``AzureDocIntEngine.supported_extensions``.
+_SUPPORTED_EXTENSIONS = {
+    "pdf", "png", "jpg", "jpeg", "tiff", "tif", "bmp",
+    "docx", "xlsx", "pptx", "html",
+}
+class AzureDocIntEngine(DocumentEngine):
+    """Adapter for Azure Document Intelligence (formerly Form Recognizer).
+    Uses the ``prebuilt-layout`` model by default for general-purpose
+    document analysis with table, heading, and reading order extraction.
+    Supports DOCX, XLSX, PPTX natively in addition to PDF and images.
+    See https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/
+    """
+    def __init__(
+        self,
+        endpoint: str | None = None,
+        key: str | None = None,
+        model_id: str = "prebuilt-layout",
+    ) -> None:
+        self._endpoint = endpoint or os.getenv("AZURE_DOCINT_ENDPOINT")
+        self._key = key or os.getenv("AZURE_DOCINT_KEY")
+        self._model_id = model_id
+    @property
+    def name(self) -> str:
+        return "azure_docint"
+    @property
+    def supported_extensions(self) -> set[str]:
+        return _SUPPORTED_EXTENSIONS
+    @property
+    def capabilities(self) -> EngineCapabilities:
+        return EngineCapabilities(
+            bounding_boxes=True, confidence=True, table_structure=True,
+            heading_detection=True, reading_order=True,
+        )
+    def is_available(self) -> bool:
+        try:
+            import azure.ai.documentintelligence  # noqa: F401
+            return bool(self._endpoint and self._key)
+        except ImportError:
+            return False
+    async def process(
+        self,
+        file_path: str,
+        output_format: OutputFormat = OutputFormat.MARKDOWN,
+        **kwargs: Any,
+    ) -> EngineResult:
+        import asyncio
+        start = time.perf_counter()
+        loop = asyncio.get_running_loop()
+        content, metadata, boxes, conf, tables = await loop.run_in_executor(
+            None, self._analyze, file_path, output_format
+        )
+        elapsed_ms = int((time.perf_counter() - start) * 1000)
+        return EngineResult(
+            content=content,
+            format=output_format,
+            engine_name=self.name,
+            processing_time_ms=elapsed_ms,
+            metadata=metadata,
+            bounding_boxes=boxes,
+            confidence=conf,
+            tables=tables,
+        )
+    def _analyze(
+        self,
+        file_path: str,
+        output_format: OutputFormat,
+    ) -> tuple[str, dict, list[dict], float | None, list[dict] | None]:
+        from azure.ai.documentintelligence import DocumentIntelligenceClient
+        from azure.core.credentials import AzureKeyCredential
+        client = DocumentIntelligenceClient(
+            endpoint=self._endpoint,
+            credential=AzureKeyCredential(self._key),
+        )
+        with open(file_path, "rb") as f:
+            poller = client.begin_analyze_document(
+                model_id=self._model_id,
+                analyze_request=f,
+                content_type="application/octet-stream",
+                output_content_format="markdown",
+            )
+        result = poller.result()
+        # Primary content — Azure returns markdown by default
+        full_text = result.content or ""
+        # Extract bounding boxes and confidence from paragraphs
+        bounding_boxes: list[dict[str, Any]] = []
+        confidences: list[float] = []
+        for paragraph in result.paragraphs or []:
+            conf = paragraph.confidence
+            if conf is not None:
+                confidences.append(conf)
+            polygon = None
+            if paragraph.bounding_regions:
+                region = paragraph.bounding_regions[0]
+                polygon = region.polygon
+                page_num = region.page_number
+            else:
+                page_num = 1
+            bounding_boxes.append({
+                "type": "paragraph",
+                "role": paragraph.role,
+                "text": paragraph.content,
+                "polygon": polygon,
+                "page": page_num,
+                "confidence": conf,
+            })
+        avg_conf = sum(confidences) / len(confidences) if confidences else None
+        # Extract tables
+        tables: list[dict[str, Any]] = []
+        for table in result.tables or []:
+            table_data = self._extract_table(table)
+            if table_data:
+                tables.append(table_data)
+        # Format output
+        if output_format == OutputFormat.JSON:
+            import json
+            data = {"text": full_text, "page_count": len(result.pages or [])}
+            content = json.dumps(data, ensure_ascii=False)
+        elif output_format == OutputFormat.HTML:
+            content = f"<html><body><pre>{full_text}</pre></body></html>"
+        else:
+            content = full_text
+        metadata = {
+            "page_count": len(result.pages or []),
+            "model_id": self._model_id,
+            "paragraph_count": len(result.paragraphs or []),
+            "table_count": len(tables),
+        }
+        return content, metadata, bounding_boxes, avg_conf, tables or None
+    def _extract_table(self, table: Any) -> dict[str, Any] | None:
+        """Extract table structure from Azure table object."""
+        if not table.cells:
+            return None
+        rows: dict[int, dict[int, str]] = {}
+        for cell in table.cells:
+            row_idx = cell.row_index
+            col_idx = cell.column_index
+            rows.setdefault(row_idx, {})[col_idx] = cell.content or ""
+        return {
+            "row_count": table.row_count,
+            "column_count": table.column_count,
+            "rows": [
+                {f"col_{c}": rows[r].get(c, "") for c in sorted(rows[r])}
+                for r in sorted(rows)
+            ],
+        }

docfold/engines/base.py ADDED Viewed

@@ -0,0 +1,111 @@
+"""Base interface for document structuring engines."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+@dataclass(frozen=True)
+class EngineCapabilities:
+    """Declares what enrichments an engine can populate in EngineResult.
+    Instances are immutable (``frozen=True``) and every flag defaults to
+    ``False``, so an engine only has to name the enrichments it supports.
+    """
+    # These three share their names with EngineResult fields; presumably each
+    # flag advertises that the matching field is populated — confirm.
+    bounding_boxes: bool = False
+    confidence: bool = False
+    images: bool = False
+    # Presumably refers to ``EngineResult.tables`` — confirm.
+    table_structure: bool = False
+    # No EngineResult field with these names is visible here; presumably they
+    # describe properties of the produced content — confirm.
+    heading_detection: bool = False
+    reading_order: bool = False
+class OutputFormat(str, Enum):
+    """Output formats a caller can request from an engine.
+    Members are also ``str`` instances, and a member can be looked up from its
+    string value (``OutputFormat("markdown")``).
+    """
+    MARKDOWN = "markdown"
+    HTML = "html"
+    JSON = "json"
+    TEXT = "text"
+@dataclass
+class EngineResult:
+    """Unified result returned by all structuring engines.
+    Every engine adapter must produce this dataclass so that callers
+    never depend on engine-specific output shapes.
+    Only ``content``, ``format`` and ``engine_name`` are required; every
+    other field has a default.
+    """
+    content: str
+    """Primary output string (markdown, html, plain text, or json string)."""
+    format: OutputFormat
+    """Format of ``content``."""
+    engine_name: str
+    """Identifier of the engine that produced this result."""
+    # --- optional enrichments ---
+    metadata: dict[str, Any] = field(default_factory=dict)
+    """Engine-specific metadata (model versions, config used, etc.)."""
+    pages: int | None = None
+    """Number of pages processed (if applicable)."""
+    images: dict[str, str] | None = None
+    """Extracted images as ``{filename: base64_data}``."""
+    tables: list[dict[str, Any]] | None = None
+    """Extracted tables as list of row-dicts."""
+    bounding_boxes: list[dict[str, Any]] | None = None
+    """Layout element bounding boxes ``[{type, bbox, page, ...}]``."""
+    confidence: float | None = None
+    """Overall confidence score in [0, 1] (if the engine provides one)."""
+    processing_time_ms: int = 0
+    """Wall-clock processing time in milliseconds."""
+class DocumentEngine(ABC):
+    """Abstract base class that every engine adapter must implement."""
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique, lowercase engine identifier (e.g. ``'docling'``)."""
+        ...
+    @property
+    @abstractmethod
+    def supported_extensions(self) -> set[str]:
+        """File extensions this engine can handle, without dots (e.g. ``{'pdf', 'docx'}``)."""
+        ...
+    @abstractmethod
+    async def process(
+        self,
+        file_path: str,
+        output_format: OutputFormat = OutputFormat.MARKDOWN,
+        **kwargs: Any,
+    ) -> EngineResult:
+        """Process a document and return a unified :class:`EngineResult`."""
+        ...
+    @abstractmethod
+    def is_available(self) -> bool:
+        """Return ``True`` if the engine's dependencies are installed and ready."""
+        ...
+    @property
+    def capabilities(self) -> EngineCapabilities:
+        """Declare what enrichments this engine populates in :class:`EngineResult`.
+        Engines should override this to advertise their capabilities.
+        Defaults to all ``False``.
+        """
+        return EngineCapabilities()
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__} name={self.name!r} available={self.is_available()}>"

docfold/engines/docling_engine.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Docling engine adapter.
+Install: ``pip install docfold[docling]``
+"""
+from __future__ import annotations
+import logging
+import time
+from typing import Any
+from docfold.engines.base import DocumentEngine, EngineCapabilities, EngineResult, OutputFormat
+logger = logging.getLogger(__name__)
+# File extensions (lowercase, no leading dot) returned by
+# ``DoclingEngine.supported_extensions``.
+_SUPPORTED_EXTENSIONS = {
+    "pdf", "docx", "pptx", "xlsx", "html",
+    "png", "jpg", "jpeg", "tiff", "tif",
+    "wav", "mp3", "vtt",
+}
+class DoclingEngine(DocumentEngine):
+    """Adapter for the Docling document conversion framework.
+    See https://github.com/docling-project/docling
+    """
+    def __init__(self, pipeline: str = "standard", ocr_enabled: bool = True) -> None:
+        self._pipeline = pipeline  # "standard" or "vlm"
+        self._ocr_enabled = ocr_enabled
+        self._converter = None
+    @property
+    def name(self) -> str:
+        return "docling"
+    @property
+    def supported_extensions(self) -> set[str]:
+        return _SUPPORTED_EXTENSIONS
+    @property
+    def capabilities(self) -> EngineCapabilities:
+        return EngineCapabilities(
+            bounding_boxes=True, images=True, table_structure=True,
+            heading_detection=True, reading_order=True,
+        )
+    def is_available(self) -> bool:
+        try:
+            import docling  # noqa: F401
+            return True
+        except ImportError:
+            return False
+    def _get_converter(self):  # noqa: ANN202
+        """Lazy-init the Docling DocumentConverter."""
+        if self._converter is None:
+            from docling.document_converter import DocumentConverter
+            self._converter = DocumentConverter()
+        return self._converter
+    async def process(
+        self,
+        file_path: str,
+        output_format: OutputFormat = OutputFormat.MARKDOWN,
+        **kwargs: Any,
+    ) -> EngineResult:
+        import asyncio
+        start = time.perf_counter()
+        converter = self._get_converter()
+        # Docling's convert() is synchronous — run in executor
+        loop = asyncio.get_running_loop()
+        result = await loop.run_in_executor(None, converter.convert, file_path)
+        doc = result.document
+        if output_format == OutputFormat.MARKDOWN:
+            content = doc.export_to_markdown()
+        elif output_format == OutputFormat.HTML:
+            content = doc.export_to_html()
+        elif output_format == OutputFormat.JSON:
+            import json
+            content = json.dumps(doc.export_to_dict(), ensure_ascii=False)
+        else:
+            content = doc.export_to_markdown()
+        elapsed_ms = int((time.perf_counter() - start) * 1000)
+        return EngineResult(
+            content=content,
+            format=output_format,
+            engine_name=self.name,
+            processing_time_ms=elapsed_ms,
+            metadata={
+                "pipeline": self._pipeline,
+                "ocr_enabled": self._ocr_enabled,
+            },
+        )