document_extraction_tools-0.0.1rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. document_extraction_tools/__init__.py +0 -0
  2. document_extraction_tools/base/__init__.py +27 -0
  3. document_extraction_tools/base/converter/__init__.py +0 -0
  4. document_extraction_tools/base/converter/base_converter.py +40 -0
  5. document_extraction_tools/base/evaluator/__init__.py +0 -0
  6. document_extraction_tools/base/evaluator/base_evaluator.py +40 -0
  7. document_extraction_tools/base/exporter/__init__.py +0 -0
  8. document_extraction_tools/base/exporter/base_evaluation_exporter.py +43 -0
  9. document_extraction_tools/base/exporter/base_extraction_exporter.py +41 -0
  10. document_extraction_tools/base/extractor/__init__.py +0 -0
  11. document_extraction_tools/base/extractor/base_extractor.py +41 -0
  12. document_extraction_tools/base/file_lister/__init__.py +0 -0
  13. document_extraction_tools/base/file_lister/base_file_lister.py +37 -0
  14. document_extraction_tools/base/reader/__init__.py +0 -0
  15. document_extraction_tools/base/reader/base_reader.py +36 -0
  16. document_extraction_tools/base/test_data_loader/__init__.py +0 -0
  17. document_extraction_tools/base/test_data_loader/base_test_data_loader.py +44 -0
  18. document_extraction_tools/config/__init__.py +51 -0
  19. document_extraction_tools/config/base_converter_config.py +14 -0
  20. document_extraction_tools/config/base_evaluation_exporter_config.py +14 -0
  21. document_extraction_tools/config/base_evaluator_config.py +14 -0
  22. document_extraction_tools/config/base_extraction_exporter_config.py +14 -0
  23. document_extraction_tools/config/base_extractor_config.py +14 -0
  24. document_extraction_tools/config/base_file_lister_config.py +14 -0
  25. document_extraction_tools/config/base_reader_config.py +14 -0
  26. document_extraction_tools/config/base_test_data_loader_config.py +14 -0
  27. document_extraction_tools/config/config_loader.py +201 -0
  28. document_extraction_tools/config/evaluation_orchestrator_config.py +20 -0
  29. document_extraction_tools/config/evaluation_pipeline_config.py +32 -0
  30. document_extraction_tools/config/extraction_orchestrator_config.py +20 -0
  31. document_extraction_tools/config/extraction_pipeline_config.py +30 -0
  32. document_extraction_tools/py.typed +0 -0
  33. document_extraction_tools/runners/__init__.py +10 -0
  34. document_extraction_tools/runners/evaluation/__init__.py +0 -0
  35. document_extraction_tools/runners/evaluation/evaluation_orchestrator.py +260 -0
  36. document_extraction_tools/runners/extraction/__init__.py +0 -0
  37. document_extraction_tools/runners/extraction/extraction_orchestrator.py +202 -0
  38. document_extraction_tools/types/__init__.py +20 -0
  39. document_extraction_tools/types/document.py +79 -0
  40. document_extraction_tools/types/document_bytes.py +27 -0
  41. document_extraction_tools/types/evaluation_example.py +21 -0
  42. document_extraction_tools/types/evaluation_result.py +16 -0
  43. document_extraction_tools/types/path_identifier.py +16 -0
  44. document_extraction_tools/types/schema.py +7 -0
  45. document_extraction_tools-0.0.1rc1.dist-info/METADATA +15 -0
  46. document_extraction_tools-0.0.1rc1.dist-info/RECORD +47 -0
  47. document_extraction_tools-0.0.1rc1.dist-info/WHEEL +4 -0
document_extraction_tools/config/config_loader.py
@@ -0,0 +1,201 @@
+"""Configuration Loader."""
+
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from document_extraction_tools.config.base_converter_config import BaseConverterConfig
+from document_extraction_tools.config.base_evaluation_exporter_config import (
+    BaseEvaluationExporterConfig,
+)
+from document_extraction_tools.config.base_evaluator_config import BaseEvaluatorConfig
+from document_extraction_tools.config.base_extraction_exporter_config import (
+    BaseExtractionExporterConfig,
+)
+from document_extraction_tools.config.base_extractor_config import BaseExtractorConfig
+from document_extraction_tools.config.base_file_lister_config import (
+    BaseFileListerConfig,
+)
+from document_extraction_tools.config.base_reader_config import BaseReaderConfig
+from document_extraction_tools.config.base_test_data_loader_config import (
+    BaseTestDataLoaderConfig,
+)
+from document_extraction_tools.config.evaluation_orchestrator_config import (
+    EvaluationOrchestratorConfig,
+)
+from document_extraction_tools.config.evaluation_pipeline_config import (
+    EvaluationPipelineConfig,
+)
+from document_extraction_tools.config.extraction_orchestrator_config import (
+    ExtractionOrchestratorConfig,
+)
+from document_extraction_tools.config.extraction_pipeline_config import (
+    ExtractionPipelineConfig,
+)
+
+
+def _load_yaml(path: Path) -> dict[str, Any]:
+    """Helper to load a YAML file into a dictionary.
+
+    Args:
+        path (Path): Path to the .yaml file.
+
+    Returns:
+        dict[str, Any]: The parsed YAML data. Returns an empty dict if the
+            file is empty.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+    """
+    if not path.exists():
+        raise FileNotFoundError(f"Config file not found: {path.absolute()}")
+
+    with open(path) as f:
+        return yaml.safe_load(f) or {}
+
+
+def load_config(
+    lister_config_cls: type[BaseFileListerConfig],
+    reader_config_cls: type[BaseReaderConfig],
+    converter_config_cls: type[BaseConverterConfig],
+    extractor_config_cls: type[BaseExtractorConfig],
+    exporter_config_cls: type[BaseExtractionExporterConfig],
+    orchestrator_config_cls: type[
+        ExtractionOrchestratorConfig
+    ] = ExtractionOrchestratorConfig,
+    config_dir: Path = Path("config/yaml"),
+) -> ExtractionPipelineConfig:
+    """Loads extraction configuration based on default filenames.
+
+    Args:
+        lister_config_cls (type[BaseFileListerConfig]): The FileListerConfig subclass to use.
+        reader_config_cls (type[BaseReaderConfig]): The ReaderConfig subclass to use.
+        converter_config_cls (type[BaseConverterConfig]): The ConverterConfig subclass to use.
+        extractor_config_cls (type[BaseExtractorConfig]): The ExtractorConfig subclass to use.
+        exporter_config_cls (type[BaseExtractionExporterConfig]): The ExporterConfig subclass to use.
+        orchestrator_config_cls (type[ExtractionOrchestratorConfig]): The ExtractionOrchestratorConfig class to use.
+        config_dir (Path): Directory containing the configs.
+
+    Returns:
+        ExtractionPipelineConfig: The fully validated configuration.
+
+    Raises:
+        FileNotFoundError: If the config directory or a config file is missing.
+    """
+    if not config_dir.exists():
+        raise FileNotFoundError(f"Config directory not found: {config_dir.absolute()}")
+
+    return ExtractionPipelineConfig(
+        orchestrator=orchestrator_config_cls(
+            **_load_yaml(config_dir / orchestrator_config_cls.filename)
+        ),
+        file_lister=lister_config_cls(
+            **_load_yaml(config_dir / lister_config_cls.filename)
+        ),
+        reader=reader_config_cls(**_load_yaml(config_dir / reader_config_cls.filename)),
+        converter=converter_config_cls(
+            **_load_yaml(config_dir / converter_config_cls.filename)
+        ),
+        extractor=extractor_config_cls(
+            **_load_yaml(config_dir / extractor_config_cls.filename)
+        ),
+        exporter=exporter_config_cls(
+            **_load_yaml(config_dir / exporter_config_cls.filename)
+        ),
+    )
+
+
+def load_evaluation_config(
+    test_data_loader_config_cls: type[BaseTestDataLoaderConfig],
+    evaluator_config_classes: list[type[BaseEvaluatorConfig]],
+    reader_config_cls: type[BaseReaderConfig],
+    converter_config_cls: type[BaseConverterConfig],
+    extractor_config_cls: type[BaseExtractorConfig],
+    evaluation_exporter_config_cls: type[BaseEvaluationExporterConfig],
+    orchestrator_config_cls: type[
+        EvaluationOrchestratorConfig
+    ] = EvaluationOrchestratorConfig,
+    config_dir: Path = Path("config/yaml"),
+) -> EvaluationPipelineConfig:
+    """Loads evaluation configuration based on default filenames.
+
+    Args:
+        test_data_loader_config_cls (type[BaseTestDataLoaderConfig]): The TestDataLoaderConfig subclass to use.
+        evaluator_config_classes (list[type[BaseEvaluatorConfig]]): EvaluatorConfig
+            subclasses to load using the top-level keys in evaluator.yaml.
+        reader_config_cls (type[BaseReaderConfig]): The ReaderConfig subclass to use.
+        converter_config_cls (type[BaseConverterConfig]): The ConverterConfig subclass to use.
+        extractor_config_cls (type[BaseExtractorConfig]): The ExtractorConfig subclass to use.
+        evaluation_exporter_config_cls (type[BaseEvaluationExporterConfig]): The EvaluationExporterConfig
+            subclass to use.
+        orchestrator_config_cls (type[EvaluationOrchestratorConfig]): The EvaluationOrchestratorConfig class to use.
+        config_dir (Path): Directory containing the configs.
+
+    Returns:
+        EvaluationPipelineConfig: The fully validated configuration.
+
+    Raises:
+        FileNotFoundError: If the config directory or a config file is missing.
+    """
+    if not config_dir.exists():
+        raise FileNotFoundError(f"Config directory not found: {config_dir.absolute()}")
+
+    return EvaluationPipelineConfig(
+        orchestrator=orchestrator_config_cls(
+            **_load_yaml(config_dir / orchestrator_config_cls.filename)
+        ),
+        test_data_loader=test_data_loader_config_cls(
+            **_load_yaml(config_dir / test_data_loader_config_cls.filename)
+        ),
+        evaluators=_load_evaluator_configs(config_dir, evaluator_config_classes),
+        reader=reader_config_cls(**_load_yaml(config_dir / reader_config_cls.filename)),
+        converter=converter_config_cls(
+            **_load_yaml(config_dir / converter_config_cls.filename)
+        ),
+        extractor=extractor_config_cls(
+            **_load_yaml(config_dir / extractor_config_cls.filename)
+        ),
+        evaluation_exporter=evaluation_exporter_config_cls(
+            **_load_yaml(config_dir / evaluation_exporter_config_cls.filename)
+        ),
+    )
+
+
+def _load_evaluator_configs(
+    config_dir: Path, evaluator_config_classes: list[type[BaseEvaluatorConfig]]
+) -> list[BaseEvaluatorConfig]:
+    """Helper to load multiple evaluator configs from evaluator.yaml.
+
+    Args:
+        config_dir (Path): Directory containing the configs.
+        evaluator_config_classes (list[type[BaseEvaluatorConfig]]): EvaluatorConfig
+            subclasses keyed by their class names.
+
+    Returns:
+        list[BaseEvaluatorConfig]: The loaded evaluator configurations.
+    """
+    evaluator_lookup = {cls.__name__: cls for cls in evaluator_config_classes}
+    evaluator_yaml = _load_yaml(config_dir / BaseEvaluatorConfig.filename)
+    if not evaluator_yaml:
+        raise ValueError("No evaluator configuration found in evaluator.yaml.")
+    if not isinstance(evaluator_yaml, dict):
+        raise ValueError(
+            "Expected evaluator.yaml to contain a mapping of config class names."
+        )
+
+    evaluators: list[BaseEvaluatorConfig] = []
+    for evaluator_key, evaluator_data in evaluator_yaml.items():
+        evaluator_cls = evaluator_lookup.get(evaluator_key)
+        if evaluator_cls is None:
+            raise ValueError(
+                f"Unknown evaluator config class '{evaluator_key}' in evaluator.yaml."
+            )
+        if evaluator_data is None:
+            evaluator_data = {}
+        if not isinstance(evaluator_data, dict):
+            raise ValueError(
+                f"Expected evaluator data for '{evaluator_key}' to be a mapping."
+            )
+        evaluators.append(evaluator_cls(**evaluator_data))
+    return evaluators
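For orientation, here is a minimal sketch of the evaluator.yaml convention that _load_evaluator_configs consumes: each top-level key must equal the __name__ of one of the supplied config classes, and its value is passed to that class as keyword arguments. The subclasses, field names, and values below are hypothetical, and the sketch assumes BaseEvaluatorConfig is a Pydantic model with a filename class attribute (only the latter is visible in this diff).

import yaml

from document_extraction_tools.config.base_evaluator_config import BaseEvaluatorConfig


class ExactMatchEvaluatorConfig(BaseEvaluatorConfig):  # hypothetical subclass
    case_sensitive: bool = True


class FuzzyMatchEvaluatorConfig(BaseEvaluatorConfig):  # hypothetical subclass
    threshold: float = 0.9


# Hypothetical contents of config/yaml/evaluator.yaml: one key per config class name.
EVALUATOR_YAML = """\
ExactMatchEvaluatorConfig:
  case_sensitive: false
FuzzyMatchEvaluatorConfig:
  threshold: 0.8
"""

# Mirrors the lookup performed by _load_evaluator_configs.
lookup = {
    cls.__name__: cls
    for cls in (ExactMatchEvaluatorConfig, FuzzyMatchEvaluatorConfig)
}
configs = [
    lookup[name](**(data or {}))
    for name, data in yaml.safe_load(EVALUATOR_YAML).items()
]

A key with no matching class, or a value that is not a mapping, would hit the ValueError branches shown in the helper above.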
document_extraction_tools/config/evaluation_orchestrator_config.py
@@ -0,0 +1,20 @@
+"""Configuration for the Evaluation Orchestrator component."""
+
+from typing import ClassVar
+
+from pydantic import BaseModel, Field
+
+
+class EvaluationOrchestratorConfig(BaseModel):
+    """Configuration for the Evaluation Orchestrator."""
+
+    filename: ClassVar[str] = "evaluation_orchestrator.yaml"
+
+    max_workers: int = Field(
+        default=4,
+        description="Number of worker threads to use for CPU-bound tasks.",
+    )
+    max_concurrency: int = Field(
+        default=10,
+        description="Maximum number of concurrent I/O requests allowed.",
+    )
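As a quick illustration of how this model is filled in by the loader above: load_evaluation_config reads the file named by the filename class attribute (config/yaml/evaluation_orchestrator.yaml by default) and passes the parsed mapping as keyword arguments, so any omitted key falls back to its Field default. The values below are hypothetical.

import yaml

from document_extraction_tools.config.evaluation_orchestrator_config import (
    EvaluationOrchestratorConfig,
)

# Equivalent of _load_yaml applied to a hypothetical evaluation_orchestrator.yaml.
raw = yaml.safe_load("max_workers: 8\nmax_concurrency: 25") or {}
config = EvaluationOrchestratorConfig(**raw)
assert (config.max_workers, config.max_concurrency) == (8, 25)

# An empty file parses to None, so the loader's `or {}` leaves every default in place.
defaults = EvaluationOrchestratorConfig(**(yaml.safe_load("") or {}))
assert (defaults.max_workers, defaults.max_concurrency) == (4, 10)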
document_extraction_tools/config/evaluation_pipeline_config.py
@@ -0,0 +1,32 @@
+"""Master Evaluation Pipeline Configuration."""
+
+from pydantic import BaseModel
+
+from document_extraction_tools.config.base_converter_config import BaseConverterConfig
+from document_extraction_tools.config.base_evaluation_exporter_config import (
+    BaseEvaluationExporterConfig,
+)
+from document_extraction_tools.config.base_evaluator_config import BaseEvaluatorConfig
+from document_extraction_tools.config.base_extractor_config import BaseExtractorConfig
+from document_extraction_tools.config.base_reader_config import BaseReaderConfig
+from document_extraction_tools.config.base_test_data_loader_config import (
+    BaseTestDataLoaderConfig,
+)
+from document_extraction_tools.config.evaluation_orchestrator_config import (
+    EvaluationOrchestratorConfig,
+)
+
+
+class EvaluationPipelineConfig(BaseModel):
+    """Master container for evaluation pipeline component configurations.
+
+    This class aggregates the configurations for all evaluation pipeline components.
+    """
+
+    orchestrator: EvaluationOrchestratorConfig
+    test_data_loader: BaseTestDataLoaderConfig
+    evaluators: list[BaseEvaluatorConfig]
+    reader: BaseReaderConfig
+    converter: BaseConverterConfig
+    extractor: BaseExtractorConfig
+    evaluation_exporter: BaseEvaluationExporterConfig
document_extraction_tools/config/extraction_orchestrator_config.py
@@ -0,0 +1,20 @@
+"""Configuration for the Extraction Orchestrator component."""
+
+from typing import ClassVar
+
+from pydantic import BaseModel, Field
+
+
+class ExtractionOrchestratorConfig(BaseModel):
+    """Configuration for the Extraction Orchestrator."""
+
+    filename: ClassVar[str] = "extraction_orchestrator.yaml"
+
+    max_workers: int = Field(
+        default=4,
+        description="Number of processes to use for CPU-bound tasks.",
+    )
+    max_concurrency: int = Field(
+        default=10,
+        description="Maximum number of concurrent I/O requests allowed.",
+    )
document_extraction_tools/config/extraction_pipeline_config.py
@@ -0,0 +1,30 @@
+"""Master Extraction Pipeline Configuration."""
+
+from pydantic import BaseModel
+
+from document_extraction_tools.config.base_converter_config import BaseConverterConfig
+from document_extraction_tools.config.base_extraction_exporter_config import (
+    BaseExtractionExporterConfig,
+)
+from document_extraction_tools.config.base_extractor_config import BaseExtractorConfig
+from document_extraction_tools.config.base_file_lister_config import (
+    BaseFileListerConfig,
+)
+from document_extraction_tools.config.base_reader_config import BaseReaderConfig
+from document_extraction_tools.config.extraction_orchestrator_config import (
+    ExtractionOrchestratorConfig,
+)
+
+
+class ExtractionPipelineConfig(BaseModel):
+    """Master container for extraction pipeline component configurations.
+
+    This class aggregates the configurations for all pipeline components.
+    """
+
+    orchestrator: ExtractionOrchestratorConfig
+    file_lister: BaseFileListerConfig
+    reader: BaseReaderConfig
+    converter: BaseConverterConfig
+    extractor: BaseExtractorConfig
+    exporter: BaseExtractionExporterConfig
document_extraction_tools/py.typed: file without changes
document_extraction_tools/runners/__init__.py
@@ -0,0 +1,10 @@
+"""Pipeline orchestrators."""
+
+from document_extraction_tools.runners.evaluation.evaluation_orchestrator import (
+    EvaluationOrchestrator,
+)
+from document_extraction_tools.runners.extraction.extraction_orchestrator import (
+    ExtractionOrchestrator,
+)
+
+__all__ = ["EvaluationOrchestrator", "ExtractionOrchestrator"]
document_extraction_tools/runners/evaluation/evaluation_orchestrator.py
@@ -0,0 +1,260 @@
+"""Evaluation orchestrator.
+
+This module defines the EvaluationOrchestrator class, which coordinates
+the evaluation of extraction models against ground-truth data using multiple
+evaluators. It handles loading evaluation examples, reading and converting documents,
+running extraction, applying evaluators, and exporting results.
+"""
+
+import asyncio
+import contextvars
+import logging
+from collections.abc import Callable, Iterable
+from concurrent.futures import ThreadPoolExecutor
+from typing import Generic, TypeVar
+
+from document_extraction_tools.base.converter.base_converter import BaseConverter
+from document_extraction_tools.base.evaluator.base_evaluator import BaseEvaluator
+from document_extraction_tools.base.exporter.base_evaluation_exporter import (
+    BaseEvaluationExporter,
+)
+from document_extraction_tools.base.extractor.base_extractor import BaseExtractor
+from document_extraction_tools.base.reader.base_reader import BaseReader
+from document_extraction_tools.base.test_data_loader.base_test_data_loader import (
+    BaseTestDataLoader,
+)
+from document_extraction_tools.config.evaluation_orchestrator_config import (
+    EvaluationOrchestratorConfig,
+)
+from document_extraction_tools.config.evaluation_pipeline_config import (
+    EvaluationPipelineConfig,
+)
+from document_extraction_tools.types.document import Document
+from document_extraction_tools.types.document_bytes import DocumentBytes
+from document_extraction_tools.types.evaluation_example import EvaluationExample
+from document_extraction_tools.types.evaluation_result import EvaluationResult
+from document_extraction_tools.types.path_identifier import PathIdentifier
+from document_extraction_tools.types.schema import ExtractionSchema
+
+logger = logging.getLogger(__name__)
+T = TypeVar("T")
+
+
+class EvaluationOrchestrator(Generic[ExtractionSchema]):
+    """Coordinates evaluation across multiple evaluators."""
+
+    def __init__(
+        self,
+        config: EvaluationOrchestratorConfig,
+        test_data_loader: BaseTestDataLoader[ExtractionSchema],
+        reader: BaseReader,
+        converter: BaseConverter,
+        extractor: BaseExtractor,
+        evaluators: Iterable[BaseEvaluator[ExtractionSchema]],
+        exporter: BaseEvaluationExporter,
+        schema: type[ExtractionSchema],
+    ) -> None:
+        """Initialize the evaluation orchestrator with pipeline components.
+
+        Args:
+            config (EvaluationOrchestratorConfig): Configuration for evaluation orchestration.
+            test_data_loader (BaseTestDataLoader[ExtractionSchema]): Component to load evaluation examples.
+            reader (BaseReader): Component to read raw file bytes.
+            converter (BaseConverter): Component to transform bytes into Document objects.
+            extractor (BaseExtractor): Component to generate predictions.
+            evaluators (Iterable[BaseEvaluator[ExtractionSchema]]): Metrics to apply to each example.
+            exporter (BaseEvaluationExporter): Component to persist evaluation results.
+            schema (type[ExtractionSchema]): The target Pydantic model definition for extraction.
+        """
+        self.config = config
+        self.test_data_loader = test_data_loader
+        self.reader = reader
+        self.converter = converter
+        self.extractor = extractor
+        self.evaluators = list(evaluators)
+        self.exporter = exporter
+        self.schema = schema
+
+    @classmethod
+    def from_config(
+        cls,
+        config: EvaluationPipelineConfig,
+        schema: type[ExtractionSchema],
+        reader_cls: type[BaseReader],
+        converter_cls: type[BaseConverter],
+        extractor_cls: type[BaseExtractor],
+        test_data_loader_cls: type[BaseTestDataLoader[ExtractionSchema]],
+        evaluator_classes: list[type[BaseEvaluator[ExtractionSchema]]],
+        evaluation_exporter_cls: type[BaseEvaluationExporter],
+    ) -> "EvaluationOrchestrator[ExtractionSchema]":
+        """Factory method to create an EvaluationOrchestrator from config.
+
+        Args:
+            config (EvaluationPipelineConfig): The full evaluation pipeline configuration.
+            schema (type[ExtractionSchema]): The target Pydantic model definition for extraction.
+            reader_cls (type[BaseReader]): The concrete Reader class to instantiate.
+            converter_cls (type[BaseConverter]): The concrete Converter class to instantiate.
+            extractor_cls (type[BaseExtractor]): The concrete Extractor class to instantiate.
+            test_data_loader_cls (type[BaseTestDataLoader[ExtractionSchema]]): The
+                concrete TestDataLoader class to instantiate.
+            evaluator_classes (list[type[BaseEvaluator[ExtractionSchema]]]): The
+                evaluator classes available for instantiation.
+            evaluation_exporter_cls (type[BaseEvaluationExporter]): The concrete
+                EvaluationExporter class to instantiate.
+
+        Returns:
+            EvaluationOrchestrator[ExtractionSchema]: The configured orchestrator.
+        """
+        reader_instance = reader_cls(config.reader)
+        converter_instance = converter_cls(config.converter)
+        extractor_instance = extractor_cls(config.extractor)
+        test_data_loader_instance = test_data_loader_cls(config.test_data_loader)
+        evaluation_exporter_instance = evaluation_exporter_cls(
+            config.evaluation_exporter
+        )
+
+        config_lookup = {
+            item.__class__.__name__.replace("Config", ""): item
+            for item in config.evaluators
+        }
+
+        evaluators = []
+        for evaluator_cls in evaluator_classes:
+            evaluator_key = evaluator_cls.__name__
+            evaluator_config = config_lookup.get(evaluator_key)
+
+            if evaluator_config is not None:
+                evaluators.append(evaluator_cls(evaluator_config))
+            else:
+                raise ValueError(
+                    f"No configuration found for evaluator '{evaluator_key}'."
+                )
+        if not evaluators:
+            raise ValueError("No valid evaluators configured.")
+
+        return cls(
+            config=config.orchestrator,
+            test_data_loader=test_data_loader_instance,
+            reader=reader_instance,
+            converter=converter_instance,
+            extractor=extractor_instance,
+            evaluators=evaluators,
+            exporter=evaluation_exporter_instance,
+            schema=schema,
+        )
+
+    @staticmethod
+    def _ingest(
+        path_identifier: PathIdentifier,
+        reader: BaseReader,
+        converter: BaseConverter,
+    ) -> Document:
+        """Performs the CPU-bound ingestion phase.
+
+        Args:
+            path_identifier (PathIdentifier): The path identifier to the source file.
+            reader (BaseReader): The reader instance to use.
+            converter (BaseConverter): The converter instance to use.
+
+        Returns:
+            Document: The fully parsed document object.
+        """
+        doc_bytes: DocumentBytes = reader.read(path_identifier)
+        return converter.convert(doc_bytes)
+
+    @staticmethod
+    async def _run_in_executor_with_context(
+        loop: asyncio.AbstractEventLoop,
+        pool: ThreadPoolExecutor,
+        func: Callable[..., T],
+        *args: object,
+    ) -> T:
+        """Run a function in an executor while preserving contextvars.
+
+        Args:
+            loop (asyncio.AbstractEventLoop): The event loop to use.
+            pool (ThreadPoolExecutor): The thread pool to run the function in.
+            func (Callable[..., T]): The function to execute.
+            *args (object): Arguments to pass to the function.
+
+        Returns:
+            T: The result of the function execution.
+        """
+        ctx = contextvars.copy_context()
+        return await loop.run_in_executor(pool, ctx.run, func, *args)
+
+    async def process_example(
+        self,
+        example: EvaluationExample[ExtractionSchema],
+        pool: ThreadPoolExecutor,
+        semaphore: asyncio.Semaphore,
+    ) -> tuple[Document, list[EvaluationResult]]:
+        """Runs ingestion, extraction, and evaluation for a single example.
+
+        Args:
+            example (EvaluationExample[ExtractionSchema]): The evaluation example to process.
+            pool (ThreadPoolExecutor): The thread pool for CPU-bound tasks.
+            semaphore (asyncio.Semaphore): Semaphore to limit concurrency.
+
+        Returns:
+            tuple[Document, list[EvaluationResult]]: The document and its evaluation results.
+        """
+        loop = asyncio.get_running_loop()
+
+        document: Document = await self._run_in_executor_with_context(
+            loop,
+            pool,
+            self._ingest,
+            example.path_identifier,
+            self.reader,
+            self.converter,
+        )
+
+        async with semaphore:
+            pred: ExtractionSchema = await self.extractor.extract(document, self.schema)
+
+        evaluation_tasks = [
+            self._run_in_executor_with_context(
+                loop, pool, evaluator.evaluate, example.true, pred
+            )
+            for evaluator in self.evaluators
+        ]
+        results: list[EvaluationResult] = list(
+            await asyncio.gather(*evaluation_tasks)
+        )
+
+        logger.info("Completed evaluation for %s", document.id)
+        return document, results
+
+    async def run(
+        self,
+        examples: list[EvaluationExample[ExtractionSchema]],
+    ) -> None:
+        """Run all evaluators and export results for the provided examples.
+
+        Args:
+            examples (list[EvaluationExample[ExtractionSchema]]): The examples to evaluate.
+        """
+        semaphore = asyncio.Semaphore(self.config.max_concurrency)
+
+        with ThreadPoolExecutor(max_workers=self.config.max_workers) as pool:
+            tasks = [
+                self.process_example(example, pool, semaphore) for example in examples
+            ]
+
+            results_or_exceptions = await asyncio.gather(*tasks, return_exceptions=True)
+
+        valid_results: list[tuple[Document, list[EvaluationResult]]] = []
+
+        for example, result in zip(examples, results_or_exceptions, strict=True):
+            if isinstance(result, BaseException):
+                logger.error(
+                    "Evaluation pipeline failed for %s",
+                    example.path_identifier,
+                    exc_info=result,
+                )
+            else:
+                valid_results.append(result)
+
+        if valid_results:
+            await self.exporter.export(valid_results)
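To show how the pieces above are meant to fit together, here is a hedged wiring sketch. It is not runnable as written: every My* and ExactMatch* name is a hypothetical stand-in for the concrete subclasses a consumer would implement against the Base* interfaces, the test data loader is assumed to expose a load() method (its interface is not shown in this diff), and ExtractionSchema is assumed to be a Pydantic-model type variable.

import asyncio

from pydantic import BaseModel

from document_extraction_tools.config.config_loader import load_evaluation_config
from document_extraction_tools.runners import EvaluationOrchestrator


class InvoiceSchema(BaseModel):  # hypothetical extraction target
    vendor: str
    total: float


async def main() -> None:
    # Each *Config class is read from config/yaml/<filename>; the evaluator config
    # class name minus "Config" must match the evaluator class name (see from_config).
    config = load_evaluation_config(
        test_data_loader_config_cls=MyTestDataLoaderConfig,
        evaluator_config_classes=[ExactMatchEvaluatorConfig],
        reader_config_cls=MyReaderConfig,
        converter_config_cls=MyConverterConfig,
        extractor_config_cls=MyExtractorConfig,
        evaluation_exporter_config_cls=MyEvaluationExporterConfig,
    )
    orchestrator = EvaluationOrchestrator.from_config(
        config=config,
        schema=InvoiceSchema,
        reader_cls=MyReader,
        converter_cls=MyConverter,
        extractor_cls=MyExtractor,
        test_data_loader_cls=MyTestDataLoader,
        evaluator_classes=[ExactMatchEvaluator],
        evaluation_exporter_cls=MyEvaluationExporter,
    )
    # Hypothetical: how examples are produced is up to the BaseTestDataLoader subclass.
    examples = MyTestDataLoader(config.test_data_loader).load()
    await orchestrator.run(examples)


if __name__ == "__main__":
    asyncio.run(main())

At runtime, run() bounds concurrent extraction calls with a semaphore of max_concurrency, offloads ingestion and evaluator scoring to a ThreadPoolExecutor of max_workers, logs and skips any example whose pipeline raises, and exports the remaining (document, results) pairs in one call.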