document-extraction-tools 0.0.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- document_extraction_tools/__init__.py +0 -0
- document_extraction_tools/base/__init__.py +27 -0
- document_extraction_tools/base/converter/__init__.py +0 -0
- document_extraction_tools/base/converter/base_converter.py +40 -0
- document_extraction_tools/base/evaluator/__init__.py +0 -0
- document_extraction_tools/base/evaluator/base_evaluator.py +40 -0
- document_extraction_tools/base/exporter/__init__.py +0 -0
- document_extraction_tools/base/exporter/base_evaluation_exporter.py +43 -0
- document_extraction_tools/base/exporter/base_extraction_exporter.py +41 -0
- document_extraction_tools/base/extractor/__init__.py +0 -0
- document_extraction_tools/base/extractor/base_extractor.py +41 -0
- document_extraction_tools/base/file_lister/__init__.py +0 -0
- document_extraction_tools/base/file_lister/base_file_lister.py +37 -0
- document_extraction_tools/base/reader/__init__.py +0 -0
- document_extraction_tools/base/reader/base_reader.py +36 -0
- document_extraction_tools/base/test_data_loader/__init__.py +0 -0
- document_extraction_tools/base/test_data_loader/base_test_data_loader.py +44 -0
- document_extraction_tools/config/__init__.py +51 -0
- document_extraction_tools/config/base_converter_config.py +14 -0
- document_extraction_tools/config/base_evaluation_exporter_config.py +14 -0
- document_extraction_tools/config/base_evaluator_config.py +14 -0
- document_extraction_tools/config/base_extraction_exporter_config.py +14 -0
- document_extraction_tools/config/base_extractor_config.py +14 -0
- document_extraction_tools/config/base_file_lister_config.py +14 -0
- document_extraction_tools/config/base_reader_config.py +14 -0
- document_extraction_tools/config/base_test_data_loader_config.py +14 -0
- document_extraction_tools/config/config_loader.py +201 -0
- document_extraction_tools/config/evaluation_orchestrator_config.py +20 -0
- document_extraction_tools/config/evaluation_pipeline_config.py +32 -0
- document_extraction_tools/config/extraction_orchestrator_config.py +20 -0
- document_extraction_tools/config/extraction_pipeline_config.py +30 -0
- document_extraction_tools/py.typed +0 -0
- document_extraction_tools/runners/__init__.py +10 -0
- document_extraction_tools/runners/evaluation/__init__.py +0 -0
- document_extraction_tools/runners/evaluation/evaluation_orchestrator.py +260 -0
- document_extraction_tools/runners/extraction/__init__.py +0 -0
- document_extraction_tools/runners/extraction/extraction_orchestrator.py +202 -0
- document_extraction_tools/types/__init__.py +20 -0
- document_extraction_tools/types/document.py +79 -0
- document_extraction_tools/types/document_bytes.py +27 -0
- document_extraction_tools/types/evaluation_example.py +21 -0
- document_extraction_tools/types/evaluation_result.py +16 -0
- document_extraction_tools/types/path_identifier.py +16 -0
- document_extraction_tools/types/schema.py +7 -0
- document_extraction_tools-0.0.1rc1.dist-info/METADATA +15 -0
- document_extraction_tools-0.0.1rc1.dist-info/RECORD +47 -0
- document_extraction_tools-0.0.1rc1.dist-info/WHEEL +4 -0
document_extraction_tools/runners/extraction/extraction_orchestrator.py
@@ -0,0 +1,202 @@
"""Extraction Orchestrator.

This module contains the logic to coordinate the flow of data through the
extraction pipeline. It manages parallel processing and asynchronous
concurrency to maximize throughput.
"""

import asyncio
import contextvars
import logging
from collections.abc import Callable
from concurrent.futures import ThreadPoolExecutor
from typing import Generic, TypeVar

from document_extraction_tools.base.converter.base_converter import BaseConverter
from document_extraction_tools.base.exporter.base_extraction_exporter import (
    BaseExtractionExporter,
)
from document_extraction_tools.base.extractor.base_extractor import BaseExtractor
from document_extraction_tools.base.reader.base_reader import BaseReader
from document_extraction_tools.config.extraction_orchestrator_config import (
    ExtractionOrchestratorConfig,
)
from document_extraction_tools.config.extraction_pipeline_config import (
    ExtractionPipelineConfig,
)
from document_extraction_tools.types.document import Document
from document_extraction_tools.types.document_bytes import DocumentBytes
from document_extraction_tools.types.path_identifier import PathIdentifier
from document_extraction_tools.types.schema import ExtractionSchema

logger = logging.getLogger(__name__)
T = TypeVar("T")


class ExtractionOrchestrator(Generic[ExtractionSchema]):
    """Coordinates the document extraction pipeline.

    This class manages the lifecycle of document processing, ensuring that
    CPU-bound tasks (Reading/Converting) are offloaded to a thread pool while
    I/O-bound tasks (Extracting/Exporting) run concurrently in the async event
    loop.
    """

    def __init__(
        self,
        config: ExtractionOrchestratorConfig,
        reader: BaseReader,
        converter: BaseConverter,
        extractor: BaseExtractor,
        exporter: BaseExtractionExporter,
        schema: type[ExtractionSchema],
    ) -> None:
        """Initialize the orchestrator with pipeline components.

        Args:
            config (ExtractionOrchestratorConfig): Configuration for the orchestrator.
            reader (BaseReader): Component to read raw file bytes.
            converter (BaseConverter): Component to transform bytes into Document objects.
            extractor (BaseExtractor): Component to extract structured data via LLM.
            exporter (BaseExtractionExporter): Component to persist the results.
            schema (type[ExtractionSchema]): The target Pydantic model definition for extraction.
        """
        self.config = config
        self.reader = reader
        self.converter = converter
        self.extractor = extractor
        self.exporter = exporter
        self.schema = schema

    @classmethod
    def from_config(
        cls,
        config: ExtractionPipelineConfig,
        schema: type[ExtractionSchema],
        reader_cls: type[BaseReader],
        converter_cls: type[BaseConverter],
        extractor_cls: type[BaseExtractor],
        exporter_cls: type[BaseExtractionExporter],
    ) -> "ExtractionOrchestrator[ExtractionSchema]":
        """Factory method to create an Orchestrator from a PipelineConfig.

        Args:
            config (ExtractionPipelineConfig): The full pipeline configuration.
            schema (type[ExtractionSchema]): The target Pydantic model definition for extraction.
            reader_cls (type[BaseReader]): The concrete Reader class to instantiate.
            converter_cls (type[BaseConverter]): The concrete Converter class to instantiate.
            extractor_cls (type[BaseExtractor]): The concrete Extractor class to instantiate.
            exporter_cls (type[BaseExtractionExporter]): The concrete Exporter class to instantiate.

        Returns:
            ExtractionOrchestrator[ExtractionSchema]: The configured orchestrator instance.
        """
        reader_instance = reader_cls(config.reader)
        converter_instance = converter_cls(config.converter)
        extractor_instance = extractor_cls(config.extractor)
        exporter_instance = exporter_cls(config.exporter)

        return cls(
            config=config.orchestrator,
            reader=reader_instance,
            converter=converter_instance,
            extractor=extractor_instance,
            exporter=exporter_instance,
            schema=schema,
        )

    @staticmethod
    def _ingest(
        path_identifier: PathIdentifier, reader: BaseReader, converter: BaseConverter
    ) -> Document:
        """Performs the CPU-bound ingestion phase.

        Args:
            path_identifier (PathIdentifier): The path identifier to the source file.
            reader (BaseReader): The reader instance to use.
            converter (BaseConverter): The converter instance to use.

        Returns:
            Document: The fully parsed document object.
        """
        doc_bytes: DocumentBytes = reader.read(path_identifier)
        return converter.convert(doc_bytes)

    @staticmethod
    async def _run_in_executor_with_context(
        loop: asyncio.AbstractEventLoop,
        pool: ThreadPoolExecutor,
        func: Callable[..., T],
        *args: object,
    ) -> T:
        """Run a function in an executor while preserving contextvars.

        Args:
            loop (asyncio.AbstractEventLoop): The event loop to use.
            pool (ThreadPoolExecutor): The thread pool to run the function in.
            func (Callable[..., T]): The function to execute.
            *args (object): Arguments to pass to the function.

        Returns:
            The result of the function execution.
        """
        ctx = contextvars.copy_context()
        return await loop.run_in_executor(pool, ctx.run, func, *args)

    async def process_document(
        self,
        path_identifier: PathIdentifier,
        pool: ThreadPoolExecutor,
        semaphore: asyncio.Semaphore,
    ) -> None:
        """Runs the full processing lifecycle for a single document.

        1. Ingest (Read+Convert) -> Offloaded to ThreadPool (CPU).
        2. Extract -> Async Wait (I/O).
        3. Export -> Async Wait (I/O).

        Args:
            path_identifier (PathIdentifier): The input file to process.
            pool (ThreadPoolExecutor): The shared pool for CPU tasks.
            semaphore (asyncio.Semaphore): The shared limiter for I/O tasks.
        """
        loop = asyncio.get_running_loop()

        document: Document = await self._run_in_executor_with_context(
            loop, pool, self._ingest, path_identifier, self.reader, self.converter
        )

        async with semaphore:
            extracted_data: ExtractionSchema = await self.extractor.extract(
                document, self.schema
            )
            await self.exporter.export(document, extracted_data)

        logger.info("Completed extraction for %s", document.id)

    async def run(self, file_paths_to_process: list[PathIdentifier]) -> None:
        """Main entry point. Orchestrates the execution of the provided file list.

        Args:
            file_paths_to_process (list[PathIdentifier]): The list of file paths to process.
        """
        semaphore = asyncio.Semaphore(self.config.max_concurrency)

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as pool:
            tasks = [
                self.process_document(path_identifier, pool, semaphore)
                for path_identifier in file_paths_to_process
            ]

            results = await asyncio.gather(*tasks, return_exceptions=True)

            for path_identifier, result in zip(
                file_paths_to_process, results, strict=True
            ):
                if isinstance(result, BaseException):
                    logger.error(
                        "Extraction pipeline failed for %s",
                        path_identifier,
                        exc_info=result,
                    )
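For illustration only (not part of the wheel): the orchestrator above combines two standard asyncio techniques, offloading CPU-bound work to a `ThreadPoolExecutor` via a copied `contextvars` context and bounding concurrent I/O with a semaphore. The stdlib-only sketch below shows the same pattern in isolation; `request_id`, `cpu_bound_ingest`, and `io_bound_extract` are hypothetical stand-ins, not names from this package.

```python
import asyncio
import contextvars
from concurrent.futures import ThreadPoolExecutor

# Hypothetical per-run state (e.g. a trace id) carried into worker threads.
request_id: contextvars.ContextVar[str] = contextvars.ContextVar("request_id", default="-")


def cpu_bound_ingest(name: str) -> str:
    """Stand-in for the Read+Convert phase; runs inside the thread pool."""
    return f"[{request_id.get()}] parsed {name}"


async def io_bound_extract(parsed: str) -> str:
    """Stand-in for the awaited extraction call (I/O-bound)."""
    await asyncio.sleep(0.1)
    return parsed.upper()


async def process(name: str, pool: ThreadPoolExecutor, semaphore: asyncio.Semaphore) -> str:
    loop = asyncio.get_running_loop()
    # Copy the current context so contextvars set in the event loop are visible
    # inside the worker thread, mirroring _run_in_executor_with_context above.
    ctx = contextvars.copy_context()
    parsed = await loop.run_in_executor(pool, ctx.run, cpu_bound_ingest, name)
    # The semaphore caps how many extraction calls are in flight at once.
    async with semaphore:
        return await io_bound_extract(parsed)


async def main() -> None:
    request_id.set("demo-run")
    semaphore = asyncio.Semaphore(2)
    with ThreadPoolExecutor(max_workers=4) as pool:
        tasks = [process(f"doc_{i}.pdf", pool, semaphore) for i in range(5)]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    for result in results:
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
```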
document_extraction_tools/types/__init__.py
@@ -0,0 +1,20 @@
"""Public types for document extraction tools."""

from document_extraction_tools.types.document import Document, ImageData, Page, TextData
from document_extraction_tools.types.document_bytes import DocumentBytes
from document_extraction_tools.types.evaluation_example import EvaluationExample
from document_extraction_tools.types.evaluation_result import EvaluationResult
from document_extraction_tools.types.path_identifier import PathIdentifier
from document_extraction_tools.types.schema import ExtractionSchema

__all__ = [
    "Document",
    "DocumentBytes",
    "EvaluationResult",
    "ExtractionSchema",
    "ImageData",
    "Page",
    "PathIdentifier",
    "EvaluationExample",
    "TextData",
]
document_extraction_tools/types/document.py
@@ -0,0 +1,79 @@
"""Domain models representing the structured document state."""

from typing import Any, Literal, TypeAlias

import numpy as np
from PIL import Image as PILImage
from pydantic import BaseModel, ConfigDict, Field, model_validator

from document_extraction_tools.types.path_identifier import PathIdentifier

PILImageType: TypeAlias = PILImage.Image
NumpyArrayType: TypeAlias = np.ndarray


class TextData(BaseModel):
    """Encapsulates textual content."""

    content: str = Field(..., description="The extracted text string.")


class ImageData(BaseModel):
    """Encapsulates image content in various formats."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    content: bytes | PILImageType | NumpyArrayType = Field(
        ...,
        description="The image payload. Can be raw bytes, a PIL Image, or a NumPy array.",
    )


class Page(BaseModel):
    """Represents a single page within a document."""

    page_number: int = Field(
        ..., ge=1, description="The 1-based index of the page in the original document."
    )

    data: ImageData | TextData = Field(
        ...,
        description="The payload for the page.",
    )


class Document(BaseModel):
    """The master object representing a fully parsed document."""

    id: str = Field(..., description="A unique identifier for this document.")

    content_type: Literal["image", "text"] = Field(
        ..., description="The type of content extracted."
    )

    pages: list[Page] = Field(
        default_factory=list,
        description="Ordered list of pages belonging to this document.",
    )

    path_identifier: PathIdentifier = Field(
        ..., description="Traceability link to the original source."
    )

    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Arbitrary metadata."
    )

    @model_validator(mode="after")
    def check_content_consistency(self) -> "Document":
        """Ensures page data types match the declared content_type."""
        expected_type = ImageData if self.content_type == "image" else TextData

        for page in self.pages:
            if not isinstance(page.data, expected_type):
                raise ValueError(
                    f"Document declared as '{self.content_type}' but Page {page.page_number} "
                    f"contains incompatible '{type(page.data).__name__}'."
                )

        return self
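For illustration only (not part of the wheel): a minimal sketch of how these models compose, using the public re-exports from `document_extraction_tools.types`. The document id, file path, and metadata values are made up.

```python
from pydantic import ValidationError

from document_extraction_tools.types import Document, Page, PathIdentifier, TextData

# A two-page text document; content_type="text" requires every page to carry TextData.
doc = Document(
    id="invoice-0001",
    content_type="text",
    pages=[
        Page(page_number=1, data=TextData(content="Invoice total: 42.00 EUR")),
        Page(page_number=2, data=TextData(content="Terms and conditions...")),
    ],
    path_identifier=PathIdentifier(path="data/invoice-0001.pdf"),
    metadata={"source": "example"},
)

# The model validator rejects mismatched declarations, e.g. an "image" document holding TextData.
try:
    Document(
        id="bad-doc",
        content_type="image",
        pages=[Page(page_number=1, data=TextData(content="not an image"))],
        path_identifier=PathIdentifier(path="data/bad.pdf"),
    )
except ValidationError as exc:
    print(exc)
```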
document_extraction_tools/types/document_bytes.py
@@ -0,0 +1,27 @@
"""Input models representing raw data ingestion.

These models act as containers for file content before any parsing occurs.
"""

from pydantic import BaseModel, Field

from document_extraction_tools.types.path_identifier import PathIdentifier


class DocumentBytes(BaseModel):
    """A standardized container for raw document data in memory.

    This model decouples the extraction logic from the storage source.
    It guarantees that the processor receives raw bytes regardless of origin.
    """

    file_bytes: bytes = Field(..., description="The raw binary content of the file.")

    path_identifier: PathIdentifier = Field(
        ..., description="Path identifier for the original source."
    )

    mime_type: str = Field(
        default="application/pdf",
        description="The standard MIME type of the file content.",
    )
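For illustration only (not part of the wheel): a sketch of wrapping a local file in `DocumentBytes`. The file path is hypothetical; any storage backend works as long as raw bytes are supplied.

```python
from pathlib import Path

from document_extraction_tools.types import DocumentBytes, PathIdentifier

source = Path("data/invoice-0001.pdf")  # hypothetical local file

doc_bytes = DocumentBytes(
    file_bytes=source.read_bytes(),
    path_identifier=PathIdentifier(path=source),
    # mime_type defaults to "application/pdf"; override it for other formats.
)
```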
document_extraction_tools/types/evaluation_example.py
@@ -0,0 +1,21 @@
"""Evaluation test example model.

This model defines the input schema used by the evaluation pipeline.
"""

from typing import Generic

from pydantic import BaseModel, Field

from document_extraction_tools.types.path_identifier import PathIdentifier
from document_extraction_tools.types.schema import ExtractionSchema


class EvaluationExample(BaseModel, Generic[ExtractionSchema]):
    """Pairs a ground-truth schema with a source document."""

    id: str = Field(..., description="Identifier for the test example.")
    path_identifier: PathIdentifier = Field(
        ..., description="Source location for the test example."
    )
    true: ExtractionSchema = Field(..., description="Ground-truth data.")
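For illustration only (not part of the wheel): a sketch of parametrizing the generic example with a concrete schema, assuming `ExtractionSchema` (defined in `types/schema.py`, not shown in this diff view) is a TypeVar bound to a Pydantic `BaseModel`. The `Invoice` schema and test paths are hypothetical.

```python
from pydantic import BaseModel

from document_extraction_tools.types import EvaluationExample, PathIdentifier


class Invoice(BaseModel):
    """Hypothetical extraction schema used as ground truth."""

    invoice_number: str
    total: float


example = EvaluationExample[Invoice](
    id="case-001",
    path_identifier=PathIdentifier(path="tests/data/invoice-0001.pdf"),
    true=Invoice(invoice_number="INV-0001", total=42.0),
)
```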
document_extraction_tools/types/evaluation_result.py
@@ -0,0 +1,16 @@
"""Evaluation result model.

This model defines the output schema produced by evaluators.
"""

from typing import Any

from pydantic import BaseModel, Field


class EvaluationResult(BaseModel):
    """Represents a single evaluation result for one document."""

    name: str = Field(..., description="Name of the evaluator or metric.")
    result: Any = Field(..., description="Computed metric value.")
    description: str = Field(..., description="Human-readable description.")
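For illustration only (not part of the wheel): an evaluator is expected to emit one of these per metric, for example a hypothetical exact-match score.

```python
from document_extraction_tools.types import EvaluationResult

result = EvaluationResult(
    name="exact_match",
    result=0.87,
    description="Fraction of fields where the predicted value equals the ground truth.",
)
```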
document_extraction_tools/types/path_identifier.py
@@ -0,0 +1,16 @@
"""Models for identifying the source of a document."""

from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field


class PathIdentifier(BaseModel):
    """A unified reference to a document source."""

    path: str | Path = Field(..., description="The primary path identifier.")

    context: dict[str, Any] = Field(
        default_factory=dict, description="Optional execution context."
    )
document_extraction_tools-0.0.1rc1.dist-info/METADATA
@@ -0,0 +1,15 @@
Metadata-Version: 2.3
Name: document-extraction-tools
Version: 0.0.1rc1
Summary: A modular, high-performance toolkit for extracting structured data from documents.
Author: Ollie Kemp, Nikolas Moatsos
Author-email: Ollie Kemp <oliver.kemp@artefact.com>, Nikolas Moatsos <nikolas.moatsos@artefact.com>
Requires-Dist: pydantic>=2.0.0
Requires-Dist: pyyaml>=6.0.3
Requires-Dist: numpy>=2.4.1
Requires-Dist: pillow>=12.1.0
Requires-Dist: pytest>=8.0,<9.0 ; extra == 'dev'
Requires-Dist: pytest-asyncio>=1.3.0 ; extra == 'dev'
Requires-Dist: pre-commit>=3.3,<4.0 ; extra == 'dev'
Requires-Python: >=3.12
Provides-Extra: dev
document_extraction_tools-0.0.1rc1.dist-info/RECORD
@@ -0,0 +1,47 @@
document_extraction_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/base/__init__.py,sha256=0WcmjJNjXn1lr2kyJDtI6YNLAG-SeWkgU8tgyNzbhu4,979
document_extraction_tools/base/converter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/base/converter/base_converter.py,sha256=cgIyjuEIUtvbGTaPF827C8Xw-vJTHhi6xy1mUzR5ud8,1475
document_extraction_tools/base/evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/base/evaluator/base_evaluator.py,sha256=nJrILH9YujIGBbMTaHtUnxqOSZXm_yhPWoZw-853Wns,1328
document_extraction_tools/base/exporter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/base/exporter/base_evaluation_exporter.py,sha256=FmcAonBVJqHp34cEWEfkumKOxIBkeXrg81SFb_dtGYA,1515
document_extraction_tools/base/exporter/base_extraction_exporter.py,sha256=kglZLG4liLaO6bDSRA9xRwjSFA9kXpCFd7hkaYJvgQw,1480
document_extraction_tools/base/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/base/extractor/base_extractor.py,sha256=MEgGwT-AtAzCsRCZLfrcjr2e0XbrbYFWrarTxW9VfjU,1470
document_extraction_tools/base/file_lister/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/base/file_lister/base_file_lister.py,sha256=Nbrei7xfC_J-b1vuGOT2ft-BqC12P9iiNZrUB7FxbNc,1237
document_extraction_tools/base/reader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/base/reader/base_reader.py,sha256=E1iaWRwlNL05ErOd2OVrOiXummljSI1H95qgVzY8pdo,1248
document_extraction_tools/base/test_data_loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/base/test_data_loader/base_test_data_loader.py,sha256=8uOYe4SBHS678pN3HNlprRnrUyX7AyDxQ-Fo3i_wrC4,1621
document_extraction_tools/config/__init__.py,sha256=zAW5d-_16gWEFFOzABybYxU619GGQxh3ZWtf5eOr8qw,1805
document_extraction_tools/config/base_converter_config.py,sha256=Bh1Wf--D0YVUa8e_qBKmb0XeTRzhaF7jMIWjSB_GlSk,308
document_extraction_tools/config/base_evaluation_exporter_config.py,sha256=10RPVETm4-KLnpLz5FB3pd5fqtb7l9Wzt2UQ6bzIsOU,347
document_extraction_tools/config/base_evaluator_config.py,sha256=6cxK55uuFFy4pu9CmHo79rki_qm2dgN8nYYP_uaLFA4,308
document_extraction_tools/config/base_extraction_exporter_config.py,sha256=w-pFgZ8ulSbnLjrXWgnhE_3cOPoK21s43TPdXJG_1ew,336
document_extraction_tools/config/base_extractor_config.py,sha256=ZQg_I05z5EMpgCspl4iwconLKNjO1-KHM0mFYZwkVZk,308
document_extraction_tools/config/base_file_lister_config.py,sha256=OEzMF_NiFiWlFwpAbW2nO9swPPLKpS2nWYukBg1WdbA,315
document_extraction_tools/config/base_reader_config.py,sha256=tcPE7AQcbcFi7TStDU2ZUU-GK3I3_pAgNiFjBcy2RsA,296
document_extraction_tools/config/base_test_data_loader_config.py,sha256=ZDo9vMuqluNwHLpYSt4yNR39JjTJJM24Y0LeVnUTBBs,334
document_extraction_tools/config/config_loader.py,sha256=AVmiCebMED8njkmgoVfqamwkQUAfY7qwGU0LQ5g1iQI,8344
document_extraction_tools/config/evaluation_orchestrator_config.py,sha256=xwccAfA5Mnw81orLRWmY5lrUcYymIXbbU58UPZI4PrU,560
document_extraction_tools/config/evaluation_pipeline_config.py,sha256=vW-f1Q0992i47ZODFQxWSUC7iEcqnrty50NiNEkHxBw,1263
document_extraction_tools/config/extraction_orchestrator_config.py,sha256=obvrwLAApKHWSzrUfNDO7_JgnoGYvLA8ruln6hFBU9o,558
document_extraction_tools/config/extraction_pipeline_config.py,sha256=P_STMZ4NKJgPJQEfE8Mw34iacy6VxBhl-ovEXYudZ64,1094
document_extraction_tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/runners/__init__.py,sha256=mnXk0OYtTmcm7AZD62BBQsWvRoJd1-bKC_5Tek0Fg6g,321
document_extraction_tools/runners/evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/runners/evaluation/evaluation_orchestrator.py,sha256=EWRfCDk0pB9apw35dgspSKlLDXlTlw9UDemz9oJ4d0w,10615
document_extraction_tools/runners/extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
document_extraction_tools/runners/extraction/extraction_orchestrator.py,sha256=7f-bcfDbLzBrwrBnRDzhub0i9e2vPd5aep4Zsm0O6h4,7894
document_extraction_tools/types/__init__.py,sha256=_8OndtXKj2c9N_63lEDZ4tt_iI7mM4fB0-PI8lOFQMw,708
document_extraction_tools/types/document.py,sha256=l35GW7h4-cPnNiGrCVTXX5SoRQLQPdDUFKw101ipKxw,2399
document_extraction_tools/types/document_bytes.py,sha256=4ydlOEj7H4qe1q596bWe0GHZ8xIUIcTlrkiNpl4v9Ow,845
document_extraction_tools/types/evaluation_example.py,sha256=16XVnOcVSKM6fsz7Ov6K50HD5oce5DJLc84JkyHS2io,710
document_extraction_tools/types/evaluation_result.py,sha256=Zuz1-s3FGcvu21RpnMoKZqLDDa7PyvQhgTIDQBQXW6k,480
document_extraction_tools/types/path_identifier.py,sha256=iGAEB2Xqg9OY6XfyyUohkN6NgmiEE2ZhrjfX-YRoRuE,425
document_extraction_tools/types/schema.py,sha256=crWb9YcOnTCAwhL0HI8X-KfHlyw4v7GFO8ivMBlNcm8,177
document_extraction_tools-0.0.1rc1.dist-info/WHEEL,sha256=fAguSjoiATBe7TNBkJwOjyL1Tt4wwiaQGtNtjRPNMQA,80
document_extraction_tools-0.0.1rc1.dist-info/METADATA,sha256=4NwuM5zT8Ie7G_SVoy6ftj2BkQNOypbmPib9ABaMfUM,618
document_extraction_tools-0.0.1rc1.dist-info/RECORD,,