document-extraction-tools 0.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. document_extraction_tools/__init__.py +0 -0
  2. document_extraction_tools/base/__init__.py +27 -0
  3. document_extraction_tools/base/converter/__init__.py +0 -0
  4. document_extraction_tools/base/converter/base_converter.py +40 -0
  5. document_extraction_tools/base/evaluator/__init__.py +0 -0
  6. document_extraction_tools/base/evaluator/base_evaluator.py +40 -0
  7. document_extraction_tools/base/exporter/__init__.py +0 -0
  8. document_extraction_tools/base/exporter/base_evaluation_exporter.py +43 -0
  9. document_extraction_tools/base/exporter/base_extraction_exporter.py +41 -0
  10. document_extraction_tools/base/extractor/__init__.py +0 -0
  11. document_extraction_tools/base/extractor/base_extractor.py +41 -0
  12. document_extraction_tools/base/file_lister/__init__.py +0 -0
  13. document_extraction_tools/base/file_lister/base_file_lister.py +37 -0
  14. document_extraction_tools/base/reader/__init__.py +0 -0
  15. document_extraction_tools/base/reader/base_reader.py +36 -0
  16. document_extraction_tools/base/test_data_loader/__init__.py +0 -0
  17. document_extraction_tools/base/test_data_loader/base_test_data_loader.py +44 -0
  18. document_extraction_tools/config/__init__.py +51 -0
  19. document_extraction_tools/config/base_converter_config.py +14 -0
  20. document_extraction_tools/config/base_evaluation_exporter_config.py +14 -0
  21. document_extraction_tools/config/base_evaluator_config.py +14 -0
  22. document_extraction_tools/config/base_extraction_exporter_config.py +14 -0
  23. document_extraction_tools/config/base_extractor_config.py +14 -0
  24. document_extraction_tools/config/base_file_lister_config.py +14 -0
  25. document_extraction_tools/config/base_reader_config.py +14 -0
  26. document_extraction_tools/config/base_test_data_loader_config.py +14 -0
  27. document_extraction_tools/config/config_loader.py +201 -0
  28. document_extraction_tools/config/evaluation_orchestrator_config.py +20 -0
  29. document_extraction_tools/config/evaluation_pipeline_config.py +32 -0
  30. document_extraction_tools/config/extraction_orchestrator_config.py +20 -0
  31. document_extraction_tools/config/extraction_pipeline_config.py +30 -0
  32. document_extraction_tools/py.typed +0 -0
  33. document_extraction_tools/runners/__init__.py +10 -0
  34. document_extraction_tools/runners/evaluation/__init__.py +0 -0
  35. document_extraction_tools/runners/evaluation/evaluation_orchestrator.py +260 -0
  36. document_extraction_tools/runners/extraction/__init__.py +0 -0
  37. document_extraction_tools/runners/extraction/extraction_orchestrator.py +202 -0
  38. document_extraction_tools/types/__init__.py +20 -0
  39. document_extraction_tools/types/document.py +79 -0
  40. document_extraction_tools/types/document_bytes.py +27 -0
  41. document_extraction_tools/types/evaluation_example.py +21 -0
  42. document_extraction_tools/types/evaluation_result.py +16 -0
  43. document_extraction_tools/types/path_identifier.py +16 -0
  44. document_extraction_tools/types/schema.py +7 -0
  45. document_extraction_tools-0.0.1rc1.dist-info/METADATA +15 -0
  46. document_extraction_tools-0.0.1rc1.dist-info/RECORD +47 -0
  47. document_extraction_tools-0.0.1rc1.dist-info/WHEEL +4 -0
File without changes
@@ -0,0 +1,27 @@
1
+ """Public base component interfaces."""
2
+
3
+ from document_extraction_tools.base.converter.base_converter import BaseConverter
4
+ from document_extraction_tools.base.evaluator.base_evaluator import BaseEvaluator
5
+ from document_extraction_tools.base.exporter.base_evaluation_exporter import (
6
+ BaseEvaluationExporter,
7
+ )
8
+ from document_extraction_tools.base.exporter.base_extraction_exporter import (
9
+ BaseExtractionExporter,
10
+ )
11
+ from document_extraction_tools.base.extractor.base_extractor import BaseExtractor
12
+ from document_extraction_tools.base.file_lister.base_file_lister import BaseFileLister
13
+ from document_extraction_tools.base.reader.base_reader import BaseReader
14
+ from document_extraction_tools.base.test_data_loader.base_test_data_loader import (
15
+ BaseTestDataLoader,
16
+ )
17
+
18
+ __all__ = [
19
+ "BaseConverter",
20
+ "BaseEvaluationExporter",
21
+ "BaseEvaluator",
22
+ "BaseExtractionExporter",
23
+ "BaseExtractor",
24
+ "BaseFileLister",
25
+ "BaseReader",
26
+ "BaseTestDataLoader",
27
+ ]
File without changes
@@ -0,0 +1,40 @@
1
+ """Abstract Base Class for Document Converters.
2
+
3
+ This module defines the interface that all converter implementations must satisfy.
4
+ Converters are responsible for transforming raw binary data (DocumentBytes)
5
+ into a structured Document object containing pages and content.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+
10
+ from document_extraction_tools.config.base_converter_config import BaseConverterConfig
11
+ from document_extraction_tools.types.document import Document
12
+ from document_extraction_tools.types.document_bytes import DocumentBytes
13
+
14
+
15
class BaseConverter(ABC):
    """Contract for components that turn raw document bytes into a Document.

    Attributes:
        config (BaseConverterConfig): The configuration object supplied at construction.
    """

    def __init__(self, config: BaseConverterConfig) -> None:
        """Store the converter configuration on the instance.

        Args:
            config (BaseConverterConfig): Settings for the concrete converter implementation.
        """
        self.config = config

    @abstractmethod
    def convert(self, document_bytes: DocumentBytes) -> Document:
        """Build a structured Document from raw document bytes.

        Implementations are expected to do the parsing and to carry the
        metadata of the input bytes over to the resulting document.

        Args:
            document_bytes (DocumentBytes): Standardized raw input holding the
                file bytes and source metadata.

        Returns:
            Document: The structured document model, ready for extraction.
        """
File without changes
@@ -0,0 +1,40 @@
1
+ """Abstract Base Class for Evaluators.
2
+
3
+ This module defines the interface that all evaluator implementations must satisfy.
4
+ Evaluators are responsible for computing evaluation metrics by comparing
5
+ predicted data against ground-truth data.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import Generic
10
+
11
+ from document_extraction_tools.config.base_evaluator_config import BaseEvaluatorConfig
12
+ from document_extraction_tools.types.evaluation_result import EvaluationResult
13
+ from document_extraction_tools.types.schema import ExtractionSchema
14
+
15
+
16
class BaseEvaluator(ABC, Generic[ExtractionSchema]):
    """Contract for components that score predictions against ground truth.

    Attributes:
        config (BaseEvaluatorConfig): The configuration object supplied at construction.
    """

    def __init__(self, config: BaseEvaluatorConfig) -> None:
        """Store the evaluator configuration on the instance.

        Args:
            config (BaseEvaluatorConfig): Settings for the concrete evaluator implementation.
        """
        self.config = config

    @abstractmethod
    def evaluate(
        self,
        true: ExtractionSchema,
        pred: ExtractionSchema,
    ) -> EvaluationResult:
        """Score one document's prediction against its ground truth.

        Args:
            true (ExtractionSchema): The ground-truth data.
            pred (ExtractionSchema): The predicted data.

        Returns:
            EvaluationResult: The metric result computed for this document.
        """
File without changes
@@ -0,0 +1,43 @@
1
+ """Abstract Base Class for Evaluation Exporters.
2
+
3
+ This module defines the interface that all evaluation exporter implementations must satisfy.
4
+ Evaluation Exporters are responsible for taking evaluation results and persisting them
5
+ to a target destination.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+
10
+ from document_extraction_tools.config.base_evaluation_exporter_config import (
11
+ BaseEvaluationExporterConfig,
12
+ )
13
+ from document_extraction_tools.types.document import Document
14
+ from document_extraction_tools.types.evaluation_result import EvaluationResult
15
+
16
+
17
class BaseEvaluationExporter(ABC):
    """Contract for components that persist evaluation results.

    Attributes:
        config (BaseEvaluationExporterConfig): The configuration object supplied at construction.
    """

    def __init__(self, config: BaseEvaluationExporterConfig) -> None:
        """Store the evaluation exporter configuration on the instance.

        Args:
            config (BaseEvaluationExporterConfig): Settings for the concrete
                evaluation exporter implementation.
        """
        self.config = config

    @abstractmethod
    async def export(
        self,
        results: list[tuple[Document, list[EvaluationResult]]],
    ) -> None:
        """Write evaluation results to the target destination.

        Declared as a coroutine so implementations can perform
        non-blocking I/O writes.

        Args:
            results (list[tuple[Document, list[EvaluationResult]]]):
                Pairs of a document and the evaluation results associated with it.

        Returns:
            None: Implementations should raise an exception if the export fails.
        """
@@ -0,0 +1,41 @@
1
+ """Abstract Base Class for Extraction Exporters.
2
+
3
+ This module defines the interface that all exporter implementations must satisfy.
4
+ Extraction Exporters are responsible for taking the extracted, structured Pydantic data
5
+ and persisting it to a target destination.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+
10
+ from document_extraction_tools.config.base_extraction_exporter_config import (
11
+ BaseExtractionExporterConfig,
12
+ )
13
+ from document_extraction_tools.types.document import Document
14
+ from document_extraction_tools.types.schema import ExtractionSchema
15
+
16
+
17
class BaseExtractionExporter(ABC):
    """Contract for components that persist extracted data.

    Attributes:
        config (BaseExtractionExporterConfig): The configuration object supplied at construction.
    """

    def __init__(self, config: BaseExtractionExporterConfig) -> None:
        """Store the extraction exporter configuration on the instance.

        Args:
            config (BaseExtractionExporterConfig): Settings for the concrete
                exporter implementation.
        """
        self.config = config

    @abstractmethod
    async def export(
        self,
        document: Document,
        data: ExtractionSchema,
    ) -> None:
        """Write the extracted data to the configured destination.

        Declared as a coroutine so implementations can perform
        non-blocking I/O writes.

        Args:
            document (Document): The source document the data was extracted from.
            data (ExtractionSchema): The populated Pydantic model holding the
                extracted information.

        Returns:
            None: Implementations should raise an exception if the export fails.
        """
File without changes
@@ -0,0 +1,41 @@
1
+ """Abstract Base Class for Information Extractors.
2
+
3
+ This module defines the interface that all extractor implementations must satisfy.
4
+ Extractors are responsible for analyzing the structured Document
5
+ and populating a target Pydantic schema with specific data points.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+
10
+ from document_extraction_tools.config.base_extractor_config import BaseExtractorConfig
11
+ from document_extraction_tools.types.document import Document
12
+ from document_extraction_tools.types.schema import ExtractionSchema
13
+
14
+
15
class BaseExtractor(ABC):
    """Contract for components that extract structured data from a Document.

    Attributes:
        config (BaseExtractorConfig): The configuration object supplied at construction.
    """

    def __init__(self, config: BaseExtractorConfig) -> None:
        """Store the extractor configuration on the instance.

        Args:
            config (BaseExtractorConfig): Settings for the concrete extractor implementation.
        """
        self.config = config

    @abstractmethod
    async def extract(
        self,
        document: Document,
        schema: type[ExtractionSchema],
    ) -> ExtractionSchema:
        """Populate the given schema with data extracted from a Document.

        Declared as a coroutine so implementations can run I/O-bound work.

        Args:
            document (Document): The fully parsed document.
            schema (type[ExtractionSchema]): The Pydantic model class that
                defines the target structure.

        Returns:
            ExtractionSchema: An instance of ``schema`` filled with the extracted data.
        """
File without changes
@@ -0,0 +1,37 @@
1
+ """Abstract Base Class for File Listers.
2
+
3
+ This module defines the interface that all file lister implementations must satisfy.
4
+ File Listers are responsible for scanning a source
5
+ and returning a list of standardized identifiers to be processed.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+
10
+ from document_extraction_tools.config.base_file_lister_config import (
11
+ BaseFileListerConfig,
12
+ )
13
+ from document_extraction_tools.types.path_identifier import PathIdentifier
14
+
15
+
16
class BaseFileLister(ABC):
    """Contract for components that discover the files to process.

    Attributes:
        config (BaseFileListerConfig): The configuration object supplied at construction.
    """

    def __init__(self, config: BaseFileListerConfig) -> None:
        """Store the file lister configuration on the instance.

        Args:
            config (BaseFileListerConfig): Settings for the concrete file lister implementation.
        """
        self.config = config

    @abstractmethod
    def list_files(self) -> list[PathIdentifier]:
        """Scan the target source and return the file identifiers found.

        Implementations own whatever logic is needed to produce a clean
        list of work items.

        Returns:
            list[PathIdentifier]: Standardized objects carrying the path and
                any execution context that is needed.
        """
File without changes
@@ -0,0 +1,36 @@
1
+ """Abstract Base Class for Document Readers.
2
+
3
+ This module defines the interface that all reader implementations must satisfy.
4
+ Readers are responsible for fetching raw file content from a source
5
+ and returning it as a standardized DocumentBytes object.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+
10
+ from document_extraction_tools.config.base_reader_config import BaseReaderConfig
11
+ from document_extraction_tools.types.document_bytes import DocumentBytes
12
+ from document_extraction_tools.types.path_identifier import PathIdentifier
13
+
14
+
15
class BaseReader(ABC):
    """Contract for components that fetch raw document content.

    Attributes:
        config (BaseReaderConfig): The configuration object supplied at construction.
    """

    def __init__(self, config: BaseReaderConfig) -> None:
        """Store the reader configuration on the instance.

        Args:
            config (BaseReaderConfig): Settings for the concrete reader implementation.
        """
        self.config = config

    @abstractmethod
    def read(self, path_identifier: PathIdentifier) -> DocumentBytes:
        """Fetch one document from its source as raw bytes.

        Args:
            path_identifier (PathIdentifier): Identifies the file to read.

        Returns:
            DocumentBytes: Standardized container holding the raw bytes and
                source metadata.
        """
@@ -0,0 +1,44 @@
1
+ """Abstract Base Class for Test Data Loaders.
2
+
3
+ This module defines the interface that all test data loader implementations must satisfy.
4
+ Test Data Loaders are responsible for loading evaluation test examples from a specified source.
5
+ """
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Generic
9
+
10
+ from document_extraction_tools.config.base_test_data_loader_config import (
11
+ BaseTestDataLoaderConfig,
12
+ )
13
+ from document_extraction_tools.types.evaluation_example import EvaluationExample
14
+ from document_extraction_tools.types.path_identifier import PathIdentifier
15
+ from document_extraction_tools.types.schema import ExtractionSchema
16
+
17
+
18
class BaseTestDataLoader(ABC, Generic[ExtractionSchema]):
    """Contract for components that load evaluation test examples.

    Attributes:
        config (BaseTestDataLoaderConfig): The configuration object supplied at construction.
    """

    def __init__(self, config: BaseTestDataLoaderConfig) -> None:
        """Store the test data loader configuration on the instance.

        Args:
            config (BaseTestDataLoaderConfig): Settings for the concrete
                test data loader implementation.
        """
        self.config = config

    @abstractmethod
    def load_test_data(
        self,
        path_identifier: PathIdentifier,
    ) -> list[EvaluationExample[ExtractionSchema]]:
        """Retrieve the evaluation examples found at the given location.

        Args:
            path_identifier (PathIdentifier): The source location to load
                evaluation examples from.

        Returns:
            list[EvaluationExample[ExtractionSchema]]: The examples to be
                used for evaluation.
        """
@@ -0,0 +1,51 @@
1
+ """Public config helpers and models."""
2
+
3
+ from document_extraction_tools.config.base_converter_config import BaseConverterConfig
4
+ from document_extraction_tools.config.base_evaluation_exporter_config import (
5
+ BaseEvaluationExporterConfig,
6
+ )
7
+ from document_extraction_tools.config.base_evaluator_config import BaseEvaluatorConfig
8
+ from document_extraction_tools.config.base_extraction_exporter_config import (
9
+ BaseExtractionExporterConfig,
10
+ )
11
+ from document_extraction_tools.config.base_extractor_config import BaseExtractorConfig
12
+ from document_extraction_tools.config.base_file_lister_config import (
13
+ BaseFileListerConfig,
14
+ )
15
+ from document_extraction_tools.config.base_reader_config import BaseReaderConfig
16
+ from document_extraction_tools.config.base_test_data_loader_config import (
17
+ BaseTestDataLoaderConfig,
18
+ )
19
+ from document_extraction_tools.config.config_loader import (
20
+ load_config,
21
+ load_evaluation_config,
22
+ )
23
+ from document_extraction_tools.config.evaluation_orchestrator_config import (
24
+ EvaluationOrchestratorConfig,
25
+ )
26
+ from document_extraction_tools.config.evaluation_pipeline_config import (
27
+ EvaluationPipelineConfig,
28
+ )
29
+ from document_extraction_tools.config.extraction_orchestrator_config import (
30
+ ExtractionOrchestratorConfig,
31
+ )
32
+ from document_extraction_tools.config.extraction_pipeline_config import (
33
+ ExtractionPipelineConfig,
34
+ )
35
+
36
+ __all__ = [
37
+ "BaseConverterConfig",
38
+ "BaseEvaluationExporterConfig",
39
+ "BaseEvaluatorConfig",
40
+ "BaseExtractionExporterConfig",
41
+ "BaseExtractorConfig",
42
+ "BaseFileListerConfig",
43
+ "BaseReaderConfig",
44
+ "BaseTestDataLoaderConfig",
45
+ "EvaluationOrchestratorConfig",
46
+ "EvaluationPipelineConfig",
47
+ "ExtractionOrchestratorConfig",
48
+ "ExtractionPipelineConfig",
49
+ "load_config",
50
+ "load_evaluation_config",
51
+ ]
@@ -0,0 +1,14 @@
1
+ """Configuration for Converter components."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class BaseConverterConfig(BaseModel):
    """Root configuration model for Converter components.

    Concrete converters subclass this model and declare their own parameters.
    """

    # Declared as a ClassVar, so it is a class-level constant rather than a
    # model field. Presumably the YAML file name the config loader looks up
    # for this component -- confirm against the config loader.
    filename: ClassVar[str] = "converter.yaml"
@@ -0,0 +1,14 @@
1
+ """Configuration for Evaluation Exporter components."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class BaseEvaluationExporterConfig(BaseModel):
    """Root configuration model for Evaluation Exporter components.

    Concrete evaluation exporters subclass this model and declare their own parameters.
    """

    # Declared as a ClassVar, so it is a class-level constant rather than a
    # model field. Presumably the YAML file name the config loader looks up
    # for this component -- confirm against the config loader.
    filename: ClassVar[str] = "evaluation_exporter.yaml"
@@ -0,0 +1,14 @@
1
+ """Configuration for Evaluator components."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class BaseEvaluatorConfig(BaseModel):
    """Root configuration model for Evaluator components.

    Concrete evaluators subclass this model and declare their own parameters.
    """

    # Declared as a ClassVar, so it is a class-level constant rather than a
    # model field. Presumably the YAML file name the config loader looks up
    # for this component -- confirm against the config loader.
    filename: ClassVar[str] = "evaluator.yaml"
@@ -0,0 +1,14 @@
1
+ """Configuration for Extraction Exporter components."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class BaseExtractionExporterConfig(BaseModel):
    """Root configuration model for Extraction Exporter components.

    Concrete extraction exporters subclass this model and declare their own parameters.
    """

    # Declared as a ClassVar, so it is a class-level constant rather than a
    # model field. Presumably the YAML file name the config loader looks up
    # for this component -- confirm against the config loader.
    filename: ClassVar[str] = "extraction_exporter.yaml"
@@ -0,0 +1,14 @@
1
+ """Configuration for Extractor components."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class BaseExtractorConfig(BaseModel):
    """Root configuration model for Extractor components.

    Concrete extractors subclass this model and declare their own parameters.
    """

    # Declared as a ClassVar, so it is a class-level constant rather than a
    # model field. Presumably the YAML file name the config loader looks up
    # for this component -- confirm against the config loader.
    filename: ClassVar[str] = "extractor.yaml"
@@ -0,0 +1,14 @@
1
+ """Configuration for File Lister components."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class BaseFileListerConfig(BaseModel):
    """Root configuration model for File Lister components.

    Concrete file listers subclass this model and declare their own parameters.
    """

    # Declared as a ClassVar, so it is a class-level constant rather than a
    # model field. Presumably the YAML file name the config loader looks up
    # for this component -- confirm against the config loader.
    filename: ClassVar[str] = "file_lister.yaml"
@@ -0,0 +1,14 @@
1
+ """Configuration for Reader components."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class BaseReaderConfig(BaseModel):
    """Root configuration model for Reader components.

    Concrete readers subclass this model and declare their own parameters.
    """

    # Declared as a ClassVar, so it is a class-level constant rather than a
    # model field. Presumably the YAML file name the config loader looks up
    # for this component -- confirm against the config loader.
    filename: ClassVar[str] = "reader.yaml"
@@ -0,0 +1,14 @@
1
+ """Configuration for Test Data Loader components."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class BaseTestDataLoaderConfig(BaseModel):
    """Root configuration model for Test Data Loader components.

    Concrete test data loaders subclass this model and declare their own parameters.
    """

    # Declared as a ClassVar, so it is a class-level constant rather than a
    # model field. Presumably the YAML file name the config loader looks up
    # for this component -- confirm against the config loader.
    filename: ClassVar[str] = "test_data_loader.yaml"