document-extraction-tools 0.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. document_extraction_tools/__init__.py +0 -0
  2. document_extraction_tools/base/__init__.py +27 -0
  3. document_extraction_tools/base/converter/__init__.py +0 -0
  4. document_extraction_tools/base/converter/base_converter.py +40 -0
  5. document_extraction_tools/base/evaluator/__init__.py +0 -0
  6. document_extraction_tools/base/evaluator/base_evaluator.py +40 -0
  7. document_extraction_tools/base/exporter/__init__.py +0 -0
  8. document_extraction_tools/base/exporter/base_evaluation_exporter.py +43 -0
  9. document_extraction_tools/base/exporter/base_extraction_exporter.py +41 -0
  10. document_extraction_tools/base/extractor/__init__.py +0 -0
  11. document_extraction_tools/base/extractor/base_extractor.py +41 -0
  12. document_extraction_tools/base/file_lister/__init__.py +0 -0
  13. document_extraction_tools/base/file_lister/base_file_lister.py +37 -0
  14. document_extraction_tools/base/reader/__init__.py +0 -0
  15. document_extraction_tools/base/reader/base_reader.py +36 -0
  16. document_extraction_tools/base/test_data_loader/__init__.py +0 -0
  17. document_extraction_tools/base/test_data_loader/base_test_data_loader.py +44 -0
  18. document_extraction_tools/config/__init__.py +51 -0
  19. document_extraction_tools/config/base_converter_config.py +14 -0
  20. document_extraction_tools/config/base_evaluation_exporter_config.py +14 -0
  21. document_extraction_tools/config/base_evaluator_config.py +14 -0
  22. document_extraction_tools/config/base_extraction_exporter_config.py +14 -0
  23. document_extraction_tools/config/base_extractor_config.py +14 -0
  24. document_extraction_tools/config/base_file_lister_config.py +14 -0
  25. document_extraction_tools/config/base_reader_config.py +14 -0
  26. document_extraction_tools/config/base_test_data_loader_config.py +14 -0
  27. document_extraction_tools/config/config_loader.py +201 -0
  28. document_extraction_tools/config/evaluation_orchestrator_config.py +20 -0
  29. document_extraction_tools/config/evaluation_pipeline_config.py +32 -0
  30. document_extraction_tools/config/extraction_orchestrator_config.py +20 -0
  31. document_extraction_tools/config/extraction_pipeline_config.py +30 -0
  32. document_extraction_tools/py.typed +0 -0
  33. document_extraction_tools/runners/__init__.py +10 -0
  34. document_extraction_tools/runners/evaluation/__init__.py +0 -0
  35. document_extraction_tools/runners/evaluation/evaluation_orchestrator.py +260 -0
  36. document_extraction_tools/runners/extraction/__init__.py +0 -0
  37. document_extraction_tools/runners/extraction/extraction_orchestrator.py +202 -0
  38. document_extraction_tools/types/__init__.py +20 -0
  39. document_extraction_tools/types/document.py +79 -0
  40. document_extraction_tools/types/document_bytes.py +27 -0
  41. document_extraction_tools/types/evaluation_example.py +21 -0
  42. document_extraction_tools/types/evaluation_result.py +16 -0
  43. document_extraction_tools/types/path_identifier.py +16 -0
  44. document_extraction_tools/types/schema.py +7 -0
  45. document_extraction_tools-0.0.1rc1.dist-info/METADATA +15 -0
  46. document_extraction_tools-0.0.1rc1.dist-info/RECORD +47 -0
  47. document_extraction_tools-0.0.1rc1.dist-info/WHEEL +4 -0
@@ -0,0 +1,202 @@
1
+ """Extraction Orchestrator.
2
+
3
+ This module contains the logic to coordinate the flow of data through the
4
+ extraction pipeline. It manages parallel processing and asynchronous
5
+ concurrency to maximize throughput.
6
+ """
7
+
8
+ import asyncio
9
+ import contextvars
10
+ import logging
11
+ from collections.abc import Callable
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from typing import Generic, TypeVar
14
+
15
+ from document_extraction_tools.base.converter.base_converter import BaseConverter
16
+ from document_extraction_tools.base.exporter.base_extraction_exporter import (
17
+ BaseExtractionExporter,
18
+ )
19
+ from document_extraction_tools.base.extractor.base_extractor import BaseExtractor
20
+ from document_extraction_tools.base.reader.base_reader import BaseReader
21
+ from document_extraction_tools.config.extraction_orchestrator_config import (
22
+ ExtractionOrchestratorConfig,
23
+ )
24
+ from document_extraction_tools.config.extraction_pipeline_config import (
25
+ ExtractionPipelineConfig,
26
+ )
27
+ from document_extraction_tools.types.document import Document
28
+ from document_extraction_tools.types.document_bytes import DocumentBytes
29
+ from document_extraction_tools.types.path_identifier import PathIdentifier
30
+ from document_extraction_tools.types.schema import ExtractionSchema
31
+
32
+ logger = logging.getLogger(__name__)
33
+ T = TypeVar("T")
34
+
35
+
36
class ExtractionOrchestrator(Generic[ExtractionSchema]):
    """Drives documents through the extraction pipeline.

    CPU-heavy ingestion (read + convert) is dispatched to a shared
    ``ThreadPoolExecutor``, while the I/O-heavy extract and export steps
    await concurrently on the event loop, throttled by a semaphore.
    """

    def __init__(
        self,
        config: ExtractionOrchestratorConfig,
        reader: BaseReader,
        converter: BaseConverter,
        extractor: BaseExtractor,
        exporter: BaseExtractionExporter,
        schema: type[ExtractionSchema],
    ) -> None:
        """Store the pipeline components used to process each document.

        Args:
            config (ExtractionOrchestratorConfig): Orchestrator settings
                (worker and concurrency limits).
            reader (BaseReader): Component that reads raw file bytes.
            converter (BaseConverter): Component that turns bytes into Documents.
            extractor (BaseExtractor): Component that extracts structured data.
            exporter (BaseExtractionExporter): Component that persists results.
            schema (type[ExtractionSchema]): Target Pydantic model for extraction.
        """
        self.config = config
        self.reader = reader
        self.converter = converter
        self.extractor = extractor
        self.exporter = exporter
        self.schema = schema

    @classmethod
    def from_config(
        cls,
        config: ExtractionPipelineConfig,
        schema: type[ExtractionSchema],
        reader_cls: type[BaseReader],
        converter_cls: type[BaseConverter],
        extractor_cls: type[BaseExtractor],
        exporter_cls: type[BaseExtractionExporter],
    ) -> "ExtractionOrchestrator[ExtractionSchema]":
        """Build an orchestrator by instantiating each component from config.

        Args:
            config (ExtractionPipelineConfig): The full pipeline configuration.
            schema (type[ExtractionSchema]): Target Pydantic model for extraction.
            reader_cls (type[BaseReader]): Concrete Reader class to instantiate.
            converter_cls (type[BaseConverter]): Concrete Converter class to instantiate.
            extractor_cls (type[BaseExtractor]): Concrete Extractor class to instantiate.
            exporter_cls (type[BaseExtractionExporter]): Concrete Exporter class to instantiate.

        Returns:
            ExtractionOrchestrator[ExtractionSchema]: The configured instance.
        """
        # Each component receives only its own slice of the pipeline config.
        return cls(
            config=config.orchestrator,
            reader=reader_cls(config.reader),
            converter=converter_cls(config.converter),
            extractor=extractor_cls(config.extractor),
            exporter=exporter_cls(config.exporter),
            schema=schema,
        )

    @staticmethod
    def _ingest(
        path_identifier: PathIdentifier, reader: BaseReader, converter: BaseConverter
    ) -> Document:
        """Read and convert one source file (the CPU-bound phase).

        Args:
            path_identifier (PathIdentifier): Location of the source file.
            reader (BaseReader): Reader used to fetch the raw bytes.
            converter (BaseConverter): Converter used to parse the bytes.

        Returns:
            Document: The fully parsed document object.
        """
        raw: DocumentBytes = reader.read(path_identifier)
        return converter.convert(raw)

    @staticmethod
    async def _run_in_executor_with_context(
        loop: asyncio.AbstractEventLoop,
        pool: ThreadPoolExecutor,
        func: Callable[..., T],
        *args: object,
    ) -> T:
        """Execute ``func`` in ``pool`` while preserving contextvars.

        ``loop.run_in_executor`` does not carry the caller's context into the
        worker thread by itself, so the current context is snapshotted and the
        call is routed through ``Context.run``.

        Args:
            loop (asyncio.AbstractEventLoop): The running event loop.
            pool (ThreadPoolExecutor): Executor that performs the work.
            func (Callable[..., T]): Function to execute.
            *args (object): Positional arguments forwarded to ``func``.

        Returns:
            The value returned by ``func``.
        """
        snapshot = contextvars.copy_context()
        return await loop.run_in_executor(pool, snapshot.run, func, *args)

    async def process_document(
        self,
        path_identifier: PathIdentifier,
        pool: ThreadPoolExecutor,
        semaphore: asyncio.Semaphore,
    ) -> None:
        """Run the full lifecycle for a single document.

        Steps: ingest (read + convert) on the thread pool, then extract and
        export as awaited I/O gated by ``semaphore``.

        Args:
            path_identifier (PathIdentifier): The input file to process.
            pool (ThreadPoolExecutor): Shared pool for CPU-bound work.
            semaphore (asyncio.Semaphore): Shared limiter for I/O-bound work.
        """
        event_loop = asyncio.get_running_loop()

        # CPU-bound ingestion runs off the event loop.
        document: Document = await self._run_in_executor_with_context(
            event_loop, pool, self._ingest, path_identifier, self.reader, self.converter
        )

        # Cap the number of in-flight extract/export operations.
        async with semaphore:
            extracted: ExtractionSchema = await self.extractor.extract(
                document, self.schema
            )
            await self.exporter.export(document, extracted)

        logger.info("Completed extraction for %s", document.id)

    async def run(self, file_paths_to_process: list[PathIdentifier]) -> None:
        """Main entry point: process every listed file concurrently.

        A failure in one document is logged and does not abort the others.

        Args:
            file_paths_to_process (list[PathIdentifier]): The files to process.
        """
        semaphore = asyncio.Semaphore(self.config.max_concurrency)

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as pool:
            # return_exceptions=True keeps one failing document from
            # cancelling the rest of the batch.
            outcomes = await asyncio.gather(
                *(
                    self.process_document(path_identifier, pool, semaphore)
                    for path_identifier in file_paths_to_process
                ),
                return_exceptions=True,
            )

        for path_identifier, outcome in zip(
            file_paths_to_process, outcomes, strict=True
        ):
            if isinstance(outcome, BaseException):
                logger.error(
                    "Extraction pipeline failed for %s",
                    path_identifier,
                    exc_info=outcome,
                )
@@ -0,0 +1,20 @@
1
+ """Public types for document extraction tools."""
2
+
3
+ from document_extraction_tools.types.document import Document, ImageData, Page, TextData
4
+ from document_extraction_tools.types.document_bytes import DocumentBytes
5
+ from document_extraction_tools.types.evaluation_example import EvaluationExample
6
+ from document_extraction_tools.types.evaluation_result import EvaluationResult
7
+ from document_extraction_tools.types.path_identifier import PathIdentifier
8
+ from document_extraction_tools.types.schema import ExtractionSchema
9
+
10
# Public API surface of document_extraction_tools.types, alphabetically
# sorted (the original list misplaced "EvaluationExample").
__all__ = [
    "Document",
    "DocumentBytes",
    "EvaluationExample",
    "EvaluationResult",
    "ExtractionSchema",
    "ImageData",
    "Page",
    "PathIdentifier",
    "TextData",
]
@@ -0,0 +1,79 @@
1
+ """Domain models representing the structured document state."""
2
+
3
+ from typing import Any, Literal, TypeAlias
4
+
5
+ import numpy as np
6
+ from PIL import Image as PILImage
7
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
8
+
9
+ from document_extraction_tools.types.path_identifier import PathIdentifier
10
+
11
# Aliases for the non-pydantic image payload types accepted by ImageData.
PILImageType: TypeAlias = PILImage.Image
NumpyArrayType: TypeAlias = np.ndarray
13
+
14
+
15
class TextData(BaseModel):
    """Encapsulates textual content."""

    # Required: the page's text payload as a plain string.
    content: str = Field(..., description="The extracted text string.")
19
+
20
+
21
class ImageData(BaseModel):
    """Encapsulates image content in various formats."""

    # arbitrary_types_allowed lets pydantic accept PIL images and NumPy
    # arrays, which it cannot validate natively.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Required: the image payload; no conversion between the three
    # accepted representations is performed here.
    content: bytes | PILImageType | NumpyArrayType = Field(
        ...,
        description="The image payload. Can be raw bytes, a PIL Image, or a NumPy array.",
    )
30
+
31
+
32
class Page(BaseModel):
    """Represents a single page within a document."""

    # 1-based position in the original document; ge=1 rejects 0/negatives.
    page_number: int = Field(
        ..., ge=1, description="The 1-based index of the page in the original document."
    )

    # Exactly one payload kind per page; Document's validator checks this
    # matches the parent document's declared content_type.
    data: ImageData | TextData = Field(
        ...,
        description="The payload for the page.",
    )
43
+
44
+
45
class Document(BaseModel):
    """The master object representing a fully parsed document."""

    id: str = Field(..., description="A unique identifier for this document.")

    # Declares the payload kind shared by all pages; enforced below by
    # check_content_consistency.
    content_type: Literal["image", "text"] = Field(
        ..., description="The type of content extracted."
    )

    # Pages in order; defaults to an empty list rather than requiring pages.
    pages: list[Page] = Field(
        default_factory=list,
        description="Ordered list of pages belonging to this document.",
    )

    # Link back to the original source file.
    path_identifier: PathIdentifier = Field(
        ..., description="Traceability link to the original source."
    )

    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Arbitrary metadata."
    )

    @model_validator(mode="after")
    def check_content_consistency(self) -> "Document":
        """Ensures page data types match the declared content_type."""
        # content_type is a two-value Literal, so anything that is not
        # "image" must be "text".
        expected_type = ImageData if self.content_type == "image" else TextData

        for page in self.pages:
            if not isinstance(page.data, expected_type):
                raise ValueError(
                    f"Document declared as '{self.content_type}' but Page {page.page_number} "
                    f"contains incompatible '{type(page.data).__name__}'."
                )

        return self
@@ -0,0 +1,27 @@
1
+ """Input models representing raw data ingestion.
2
+
3
+ These models act as containers for file content before any parsing occurs.
4
+ """
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ from document_extraction_tools.types.path_identifier import PathIdentifier
9
+
10
+
11
+ class DocumentBytes(BaseModel):
12
+ """A standardized container for raw document data in memory.
13
+
14
+ This model decouples the extraction logic from the storage source.
15
+ It guarantees that the processor receives raw bytes regardless of origin.
16
+ """
17
+
18
+ file_bytes: bytes = Field(..., description="The raw binary content of the file.")
19
+
20
+ path_identifier: PathIdentifier = Field(
21
+ ..., description="Path identifier for the original source."
22
+ )
23
+
24
+ mime_type: str = Field(
25
+ default="application/pdf",
26
+ description="The standard MIME type of the file content.",
27
+ )
@@ -0,0 +1,21 @@
1
+ """Evaluation test example model.
2
+
3
+ This model defines the input schema used by the evaluation pipeline.
4
+ """
5
+
6
+ from typing import Generic
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ from document_extraction_tools.types.path_identifier import PathIdentifier
11
+ from document_extraction_tools.types.schema import ExtractionSchema
12
+
13
+
14
class EvaluationExample(BaseModel, Generic[ExtractionSchema]):
    """Pairs a ground-truth schema with a source document."""

    # Identifier used to correlate evaluation results with this example.
    id: str = Field(..., description="Identifier for the test example.")
    # Where the document under evaluation lives.
    path_identifier: PathIdentifier = Field(
        ..., description="Source location for the test example."
    )
    # Expected extraction output, typed by the user-supplied schema.
    true: ExtractionSchema = Field(..., description="Ground-truth data.")
@@ -0,0 +1,16 @@
1
+ """Evaluation result model.
2
+
3
+ This model defines the output schema produced by evaluators.
4
+ """
5
+
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class EvaluationResult(BaseModel):
    """Represents a single evaluation result for one document."""

    # Name of the evaluator or metric that produced this result.
    name: str = Field(..., description="Name of the evaluator or metric.")
    # Typed Any so evaluators may report arbitrary value shapes.
    result: Any = Field(..., description="Computed metric value.")
    description: str = Field(..., description="Human-readable description.")
@@ -0,0 +1,16 @@
1
+ """Models for identifying the source of a document."""
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
class PathIdentifier(BaseModel):
    """A unified reference to a document source."""

    # Accepts both plain strings and pathlib.Path objects.
    path: str | Path = Field(..., description="The primary path identifier.")

    # Free-form extra data carried alongside the path.
    context: dict[str, Any] = Field(
        default_factory=dict, description="Optional execution context."
    )
@@ -0,0 +1,7 @@
1
+ """Common schema definitions and type variables."""
2
+
3
+ from typing import TypeVar
4
+
5
+ from pydantic import BaseModel
6
+
7
# Type variable bound to pydantic's BaseModel; pipeline components and
# evaluation types are parameterized over the user-supplied schema via it.
ExtractionSchema = TypeVar("ExtractionSchema", bound=BaseModel)
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.3
2
+ Name: document-extraction-tools
3
+ Version: 0.0.1rc1
4
+ Summary: A modular, high-performance toolkit for extracting structured data from documents.
5
+ Author: Ollie Kemp, Nikolas Moatsos
6
+ Author-email: Ollie Kemp <oliver.kemp@artefact.com>, Nikolas Moatsos <nikolas.moatsos@artefact.com>
7
+ Requires-Dist: pydantic>=2.0.0
8
+ Requires-Dist: pyyaml>=6.0.3
9
+ Requires-Dist: numpy>=2.4.1
10
+ Requires-Dist: pillow>=12.1.0
11
+ Requires-Dist: pytest>=8.0,<9.0 ; extra == 'dev'
12
+ Requires-Dist: pytest-asyncio>=1.3.0 ; extra == 'dev'
13
+ Requires-Dist: pre-commit>=3.3,<4.0 ; extra == 'dev'
14
+ Requires-Python: >=3.12
15
+ Provides-Extra: dev
@@ -0,0 +1,47 @@
1
+ document_extraction_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ document_extraction_tools/base/__init__.py,sha256=0WcmjJNjXn1lr2kyJDtI6YNLAG-SeWkgU8tgyNzbhu4,979
3
+ document_extraction_tools/base/converter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ document_extraction_tools/base/converter/base_converter.py,sha256=cgIyjuEIUtvbGTaPF827C8Xw-vJTHhi6xy1mUzR5ud8,1475
5
+ document_extraction_tools/base/evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ document_extraction_tools/base/evaluator/base_evaluator.py,sha256=nJrILH9YujIGBbMTaHtUnxqOSZXm_yhPWoZw-853Wns,1328
7
+ document_extraction_tools/base/exporter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ document_extraction_tools/base/exporter/base_evaluation_exporter.py,sha256=FmcAonBVJqHp34cEWEfkumKOxIBkeXrg81SFb_dtGYA,1515
9
+ document_extraction_tools/base/exporter/base_extraction_exporter.py,sha256=kglZLG4liLaO6bDSRA9xRwjSFA9kXpCFd7hkaYJvgQw,1480
10
+ document_extraction_tools/base/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ document_extraction_tools/base/extractor/base_extractor.py,sha256=MEgGwT-AtAzCsRCZLfrcjr2e0XbrbYFWrarTxW9VfjU,1470
12
+ document_extraction_tools/base/file_lister/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ document_extraction_tools/base/file_lister/base_file_lister.py,sha256=Nbrei7xfC_J-b1vuGOT2ft-BqC12P9iiNZrUB7FxbNc,1237
14
+ document_extraction_tools/base/reader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ document_extraction_tools/base/reader/base_reader.py,sha256=E1iaWRwlNL05ErOd2OVrOiXummljSI1H95qgVzY8pdo,1248
16
+ document_extraction_tools/base/test_data_loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ document_extraction_tools/base/test_data_loader/base_test_data_loader.py,sha256=8uOYe4SBHS678pN3HNlprRnrUyX7AyDxQ-Fo3i_wrC4,1621
18
+ document_extraction_tools/config/__init__.py,sha256=zAW5d-_16gWEFFOzABybYxU619GGQxh3ZWtf5eOr8qw,1805
19
+ document_extraction_tools/config/base_converter_config.py,sha256=Bh1Wf--D0YVUa8e_qBKmb0XeTRzhaF7jMIWjSB_GlSk,308
20
+ document_extraction_tools/config/base_evaluation_exporter_config.py,sha256=10RPVETm4-KLnpLz5FB3pd5fqtb7l9Wzt2UQ6bzIsOU,347
21
+ document_extraction_tools/config/base_evaluator_config.py,sha256=6cxK55uuFFy4pu9CmHo79rki_qm2dgN8nYYP_uaLFA4,308
22
+ document_extraction_tools/config/base_extraction_exporter_config.py,sha256=w-pFgZ8ulSbnLjrXWgnhE_3cOPoK21s43TPdXJG_1ew,336
23
+ document_extraction_tools/config/base_extractor_config.py,sha256=ZQg_I05z5EMpgCspl4iwconLKNjO1-KHM0mFYZwkVZk,308
24
+ document_extraction_tools/config/base_file_lister_config.py,sha256=OEzMF_NiFiWlFwpAbW2nO9swPPLKpS2nWYukBg1WdbA,315
25
+ document_extraction_tools/config/base_reader_config.py,sha256=tcPE7AQcbcFi7TStDU2ZUU-GK3I3_pAgNiFjBcy2RsA,296
26
+ document_extraction_tools/config/base_test_data_loader_config.py,sha256=ZDo9vMuqluNwHLpYSt4yNR39JjTJJM24Y0LeVnUTBBs,334
27
+ document_extraction_tools/config/config_loader.py,sha256=AVmiCebMED8njkmgoVfqamwkQUAfY7qwGU0LQ5g1iQI,8344
28
+ document_extraction_tools/config/evaluation_orchestrator_config.py,sha256=xwccAfA5Mnw81orLRWmY5lrUcYymIXbbU58UPZI4PrU,560
29
+ document_extraction_tools/config/evaluation_pipeline_config.py,sha256=vW-f1Q0992i47ZODFQxWSUC7iEcqnrty50NiNEkHxBw,1263
30
+ document_extraction_tools/config/extraction_orchestrator_config.py,sha256=obvrwLAApKHWSzrUfNDO7_JgnoGYvLA8ruln6hFBU9o,558
31
+ document_extraction_tools/config/extraction_pipeline_config.py,sha256=P_STMZ4NKJgPJQEfE8Mw34iacy6VxBhl-ovEXYudZ64,1094
32
+ document_extraction_tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ document_extraction_tools/runners/__init__.py,sha256=mnXk0OYtTmcm7AZD62BBQsWvRoJd1-bKC_5Tek0Fg6g,321
34
+ document_extraction_tools/runners/evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
+ document_extraction_tools/runners/evaluation/evaluation_orchestrator.py,sha256=EWRfCDk0pB9apw35dgspSKlLDXlTlw9UDemz9oJ4d0w,10615
36
+ document_extraction_tools/runners/extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ document_extraction_tools/runners/extraction/extraction_orchestrator.py,sha256=7f-bcfDbLzBrwrBnRDzhub0i9e2vPd5aep4Zsm0O6h4,7894
38
+ document_extraction_tools/types/__init__.py,sha256=_8OndtXKj2c9N_63lEDZ4tt_iI7mM4fB0-PI8lOFQMw,708
39
+ document_extraction_tools/types/document.py,sha256=l35GW7h4-cPnNiGrCVTXX5SoRQLQPdDUFKw101ipKxw,2399
40
+ document_extraction_tools/types/document_bytes.py,sha256=4ydlOEj7H4qe1q596bWe0GHZ8xIUIcTlrkiNpl4v9Ow,845
41
+ document_extraction_tools/types/evaluation_example.py,sha256=16XVnOcVSKM6fsz7Ov6K50HD5oce5DJLc84JkyHS2io,710
42
+ document_extraction_tools/types/evaluation_result.py,sha256=Zuz1-s3FGcvu21RpnMoKZqLDDa7PyvQhgTIDQBQXW6k,480
43
+ document_extraction_tools/types/path_identifier.py,sha256=iGAEB2Xqg9OY6XfyyUohkN6NgmiEE2ZhrjfX-YRoRuE,425
44
+ document_extraction_tools/types/schema.py,sha256=crWb9YcOnTCAwhL0HI8X-KfHlyw4v7GFO8ivMBlNcm8,177
45
+ document_extraction_tools-0.0.1rc1.dist-info/WHEEL,sha256=fAguSjoiATBe7TNBkJwOjyL1Tt4wwiaQGtNtjRPNMQA,80
46
+ document_extraction_tools-0.0.1rc1.dist-info/METADATA,sha256=4NwuM5zT8Ie7G_SVoy6ftj2BkQNOypbmPib9ABaMfUM,618
47
+ document_extraction_tools-0.0.1rc1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.28
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any