kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/__init__.py +8 -2
- kiln_ai/adapters/adapter_registry.py +43 -208
- kiln_ai/adapters/chat/chat_formatter.py +8 -12
- kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/docker_model_runner_tools.py +119 -0
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/base_eval.py +2 -2
- kiln_ai/adapters/eval/eval_runner.py +9 -3
- kiln_ai/adapters/eval/g_eval.py +2 -2
- kiln_ai/adapters/eval/test_base_eval.py +2 -4
- kiln_ai/adapters/eval/test_g_eval.py +4 -5
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
- kiln_ai/adapters/fine_tune/__init__.py +1 -1
- kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +192 -0
- kiln_ai/adapters/ml_model_list.py +761 -37
- kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
- kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
- kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
- kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/parsers/__init__.py +1 -1
- kiln_ai/adapters/provider_tools.py +205 -47
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/repair/test_repair_task.py +12 -9
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +657 -85
- kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
- kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
- kiln_ai/adapters/test_ml_model_list.py +251 -1
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_adaptors.py +13 -6
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +254 -8
- kiln_ai/adapters/test_remote_config.py +651 -58
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +39 -34
- kiln_ai/datamodel/basemodel.py +170 -1
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +28 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/eval.py +1 -1
- kiln_ai/datamodel/external_tool_server.py +298 -0
- kiln_ai/datamodel/extraction.py +303 -0
- kiln_ai/datamodel/json_schema.py +25 -10
- kiln_ai/datamodel/project.py +40 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/registry.py +0 -15
- kiln_ai/datamodel/run_config.py +62 -0
- kiln_ai/datamodel/task.py +2 -77
- kiln_ai/datamodel/task_output.py +6 -1
- kiln_ai/datamodel/task_run.py +41 -0
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +4 -4
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_example_models.py +175 -0
- kiln_ai/datamodel/test_external_tool_server.py +691 -0
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +470 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_registry.py +8 -3
- kiln_ai/datamodel/test_task.py +15 -47
- kiln_ai/datamodel/test_tool_id.py +320 -0
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +105 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/__init__.py +8 -0
- kiln_ai/tools/base_tool.py +82 -0
- kiln_ai/tools/built_in_tools/__init__.py +13 -0
- kiln_ai/tools/built_in_tools/math_tools.py +124 -0
- kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
- kiln_ai/tools/mcp_server_tool.py +95 -0
- kiln_ai/tools/mcp_session_manager.py +246 -0
- kiln_ai/tools/rag_tools.py +157 -0
- kiln_ai/tools/test_base_tools.py +199 -0
- kiln_ai/tools/test_mcp_server_tool.py +457 -0
- kiln_ai/tools/test_mcp_session_manager.py +1585 -0
- kiln_ai/tools/test_rag_tools.py +848 -0
- kiln_ai/tools/test_tool_registry.py +562 -0
- kiln_ai/tools/tool_registry.py +85 -0
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +24 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/open_ai_types.py +94 -0
- kiln_ai/utils/pdf_utils.py +38 -0
- kiln_ai/utils/project_utils.py +17 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_config.py +138 -1
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_open_ai_types.py +131 -0
- kiln_ai/utils/test_pdf_utils.py +73 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
- kiln_ai-0.21.0.dist-info/RECORD +211 -0
- kiln_ai-0.19.0.dist-info/RECORD +0 -115
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -7,7 +7,7 @@ from kiln_ai.adapters.ml_model_list import ModelProviderName
|
|
|
7
7
|
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
|
|
8
8
|
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
|
|
9
9
|
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
|
|
10
|
-
from kiln_ai.datamodel.task import
|
|
10
|
+
from kiln_ai.datamodel.task import RunConfigProperties, TaskOutputRatingType, TaskRun
|
|
11
11
|
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
|
|
12
12
|
|
|
13
13
|
|
|
@@ -18,7 +18,7 @@ class BaseEval:
|
|
|
18
18
|
Should be subclassed, and the run_eval method implemented.
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
|
-
def __init__(self, eval_config: EvalConfig, run_config:
|
|
21
|
+
def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
|
|
22
22
|
self.eval_config = eval_config
|
|
23
23
|
eval = eval_config.parent_eval()
|
|
24
24
|
if not eval:
|
|
@@ -160,8 +160,12 @@ class EvalRunner:
|
|
|
160
160
|
"""
|
|
161
161
|
jobs = self.collect_tasks()
|
|
162
162
|
|
|
163
|
-
runner = AsyncJobRunner(
|
|
164
|
-
|
|
163
|
+
runner = AsyncJobRunner(
|
|
164
|
+
concurrency=concurrency,
|
|
165
|
+
jobs=jobs,
|
|
166
|
+
run_job_fn=self.run_job,
|
|
167
|
+
)
|
|
168
|
+
async for progress in runner.run():
|
|
165
169
|
yield progress
|
|
166
170
|
|
|
167
171
|
async def run_job(self, job: EvalJob) -> bool:
|
|
@@ -169,7 +173,9 @@ class EvalRunner:
|
|
|
169
173
|
# Create the evaluator for this eval config/run config pair
|
|
170
174
|
evaluator = eval_adapter_from_type(job.eval_config.config_type)(
|
|
171
175
|
job.eval_config,
|
|
172
|
-
job.task_run_config.
|
|
176
|
+
job.task_run_config.run_config_properties
|
|
177
|
+
if job.task_run_config
|
|
178
|
+
else None,
|
|
173
179
|
)
|
|
174
180
|
if not isinstance(evaluator, BaseEval):
|
|
175
181
|
raise ValueError("Not able to create evaluator from eval config")
|
kiln_ai/adapters/eval/g_eval.py
CHANGED
|
@@ -12,7 +12,7 @@ from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutpu
|
|
|
12
12
|
from kiln_ai.adapters.prompt_builders import PromptGenerators
|
|
13
13
|
from kiln_ai.datamodel import Project, Task, TaskRun
|
|
14
14
|
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
|
|
15
|
-
from kiln_ai.datamodel.task import
|
|
15
|
+
from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode
|
|
16
16
|
|
|
17
17
|
# all the tokens we score for, and their float scores.
|
|
18
18
|
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
|
|
@@ -89,7 +89,7 @@ class GEval(BaseEval):
|
|
|
89
89
|
}
|
|
90
90
|
"""
|
|
91
91
|
|
|
92
|
-
def __init__(self, eval_config: EvalConfig, run_config:
|
|
92
|
+
def __init__(self, eval_config: EvalConfig, run_config: RunConfigProperties | None):
|
|
93
93
|
if (
|
|
94
94
|
eval_config.config_type != EvalConfigType.g_eval
|
|
95
95
|
and eval_config.config_type != EvalConfigType.llm_as_judge
|
|
@@ -307,9 +307,7 @@ async def test_run_method():
|
|
|
307
307
|
evaluator = EvalTester(eval_config, run_config.run_config())
|
|
308
308
|
|
|
309
309
|
# Run the evaluation
|
|
310
|
-
task_run, eval_scores,
|
|
311
|
-
"test input"
|
|
312
|
-
)
|
|
310
|
+
task_run, eval_scores, _ = await evaluator.run_task_and_eval("test input")
|
|
313
311
|
|
|
314
312
|
# Verify task run was created
|
|
315
313
|
assert task_run.input == "test input"
|
|
@@ -380,7 +378,7 @@ async def test_run_task_and_eval():
|
|
|
380
378
|
async def run_eval(self, task_run):
|
|
381
379
|
return {"overall_rating": 5, "quality": 4}, {"thinking": "test thinking"}
|
|
382
380
|
|
|
383
|
-
evaluator = MockEval(eval_config, run_config.
|
|
381
|
+
evaluator = MockEval(eval_config, run_config.run_config_properties)
|
|
384
382
|
|
|
385
383
|
# Mock dependencies
|
|
386
384
|
mock_adapter = AsyncMock()
|
|
@@ -19,7 +19,7 @@ from kiln_ai.datamodel import (
|
|
|
19
19
|
TaskRun,
|
|
20
20
|
)
|
|
21
21
|
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalOutputScore
|
|
22
|
-
from kiln_ai.datamodel.task import
|
|
22
|
+
from kiln_ai.datamodel.task import RunConfigProperties
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
@pytest.fixture
|
|
@@ -93,11 +93,10 @@ def test_eval_config(test_task):
|
|
|
93
93
|
|
|
94
94
|
|
|
95
95
|
@pytest.fixture
|
|
96
|
-
def test_run_config(
|
|
97
|
-
return
|
|
96
|
+
def test_run_config():
|
|
97
|
+
return RunConfigProperties(
|
|
98
98
|
model_name="llama_3_1_8b",
|
|
99
99
|
model_provider_name="groq",
|
|
100
|
-
task=test_task,
|
|
101
100
|
prompt_id="simple_prompt_builder",
|
|
102
101
|
structured_output_mode="json_schema",
|
|
103
102
|
)
|
|
@@ -189,7 +188,7 @@ async def test_run_g_eval_e2e(
|
|
|
189
188
|
g_eval = GEval(test_eval_config, test_run_config)
|
|
190
189
|
|
|
191
190
|
# Run the evaluation
|
|
192
|
-
|
|
191
|
+
_, scores, intermediate_outputs = await g_eval.run_task_and_eval("chickens")
|
|
193
192
|
|
|
194
193
|
# Verify the evaluation results
|
|
195
194
|
assert isinstance(scores, dict)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File extractors for processing different document types.
|
|
3
|
+
|
|
4
|
+
This package provides a framework for extracting content from files
|
|
5
|
+
using different extraction methods.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from . import base_extractor, extractor_registry, extractor_runner, litellm_extractor
|
|
9
|
+
from .base_extractor import ExtractionInput, ExtractionOutput
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"ExtractionInput",
|
|
13
|
+
"ExtractionOutput",
|
|
14
|
+
"base_extractor",
|
|
15
|
+
"extractor_registry",
|
|
16
|
+
"extractor_runner",
|
|
17
|
+
"litellm_extractor",
|
|
18
|
+
]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from kiln_ai.datamodel.extraction import ExtractorConfig, OutputFormat
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ExtractionInput(BaseModel):
    # Describes a single file handed to an extractor. Documented with comments
    # rather than a docstring so the generated model schema stays unchanged.

    # Location of the file on disk; accepted either as a Path or a plain string.
    path: Path | str = Field(
        description="The absolute path to the file to extract.",
    )
    # Mime type of the file; BaseExtractor compares it case-insensitively
    # against the configured passthrough mime types.
    mime_type: str = Field(
        description="The mime type of the file.",
    )
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ExtractionOutput(BaseModel):
    """
    The output of an extraction. This is the data that will be saved to the data store.
    """

    # True when the source file was returned unchanged instead of being
    # processed by the concrete extractor.
    is_passthrough: bool = Field(
        default=False,
        description="Whether the extractor returned the file as is.",
    )
    # Format of `content`, taken from the extractor configuration.
    content_format: OutputFormat = Field(
        description="The format of the extracted data.",
    )
    # The extracted (or passed-through) text itself.
    content: str = Field(
        description="The extracted data.",
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class BaseExtractor(ABC):
    """
    Common foundation for every extractor.

    Subclasses supply the extraction logic by implementing `_extract`; callers
    go through `extract`, which adds passthrough handling and error wrapping.
    """

    def __init__(self, extractor_config: ExtractorConfig):
        self.extractor_config = extractor_config

    @abstractmethod
    async def _extract(self, extraction_input: ExtractionInput) -> ExtractionOutput:
        """Extractor-specific implementation, provided by each subclass."""
        pass

    async def extract(self, extraction_input: ExtractionInput) -> ExtractionOutput:
        """
        Extract the content of a file.

        When the file's mime type is one of the configured passthrough mime
        types, the file is read as UTF-8 text and returned as is; otherwise the
        work is delegated to the subclass's `_extract`.

        Raises:
            ValueError: wraps any exception raised while reading or extracting,
                with the original exception attached as the cause.
        """
        try:
            if not self._should_passthrough(extraction_input.mime_type):
                return await self._extract(extraction_input)

            # Passthrough: hand back the file's own text, labelled with the
            # configured output format.
            raw_text = Path(extraction_input.path).read_text(encoding="utf-8")
            return ExtractionOutput(
                is_passthrough=True,
                content=raw_text,
                content_format=self.extractor_config.output_format,
            )
        except Exception as e:
            raise ValueError(f"Error extracting {extraction_input.path}: {e}") from e

    def _should_passthrough(self, mime_type: str) -> bool:
        """Return True if `mime_type` matches a configured passthrough mime type, ignoring case."""
        wanted = mime_type.lower()
        passthrough = {
            candidate.lower()
            for candidate in self.extractor_config.passthrough_mimetypes
        }
        return wanted in passthrough

    def output_format(self) -> OutputFormat:
        """Return the output format declared in the extractor configuration."""
        return self.extractor_config.output_format
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def to_base64_url(mime_type: str, bytes: bytes) -> str:
    """
    Build a base64 data URL of the form ``data:<mime_type>;base64,<payload>``.

    Args:
        mime_type: Mime type placed in the URL header.
        bytes: Raw data to encode (the name shadows the builtin; it is kept
            because it is part of the public signature).

    Returns:
        The data URL as a string.
    """
    encoded_payload = base64.b64encode(bytes).decode("utf-8")
    return f"data:{mime_type};base64,{encoded_payload}"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def from_base64_url(base64_url: str) -> bytes:
    """
    Decode the payload of a base64 data URL.

    The URL must start with ``data:`` and contain exactly one comma, which
    separates the header from the base64 payload.

    Args:
        base64_url: A string such as ``data:text/plain;base64,aGk=``.

    Returns:
        The decoded payload bytes.

    Raises:
        ValueError: if the URL is not in the expected shape, or if the payload
            cannot be base64-decoded (the decoding error is attached as the
            cause).
    """
    if not base64_url.startswith("data:") or "," not in base64_url:
        raise ValueError("Invalid base64 URL format")

    parts = base64_url.split(",")
    if len(parts) != 2:
        # More than one comma: ambiguous header/payload split.
        raise ValueError("Invalid base64 URL format")

    try:
        return base64.b64decode(parts[1])
    except ValueError as e:
        # binascii.Error (bad padding/length) is a ValueError subclass, and
        # non-ASCII input raises ValueError directly. Chain the cause so the
        # original failure stays visible in tracebacks.
        raise ValueError(f"Failed to decode base64 data: {e}") from e
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from kiln_ai.adapters.extractors.base_extractor import BaseExtractor
|
|
2
|
+
from kiln_ai.adapters.extractors.litellm_extractor import LitellmExtractor
|
|
3
|
+
from kiln_ai.adapters.ml_model_list import ModelProviderName
|
|
4
|
+
from kiln_ai.adapters.provider_tools import (
|
|
5
|
+
core_provider,
|
|
6
|
+
lite_llm_core_config_for_provider,
|
|
7
|
+
)
|
|
8
|
+
from kiln_ai.datamodel.extraction import ExtractorConfig, ExtractorType
|
|
9
|
+
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
|
|
10
|
+
from kiln_ai.utils.filesystem_cache import FilesystemCache
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extractor_adapter_from_type(
|
|
14
|
+
extractor_type: ExtractorType,
|
|
15
|
+
extractor_config: ExtractorConfig,
|
|
16
|
+
filesystem_cache: FilesystemCache | None = None,
|
|
17
|
+
) -> BaseExtractor:
|
|
18
|
+
match extractor_type:
|
|
19
|
+
case ExtractorType.LITELLM:
|
|
20
|
+
try:
|
|
21
|
+
provider_enum = ModelProviderName(extractor_config.model_provider_name)
|
|
22
|
+
except ValueError:
|
|
23
|
+
raise ValueError(
|
|
24
|
+
f"Unsupported model provider name: {extractor_config.model_provider_name}. "
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
core_provider_name = core_provider(
|
|
28
|
+
extractor_config.model_name, provider_enum
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
provider_config = lite_llm_core_config_for_provider(core_provider_name)
|
|
32
|
+
if provider_config is None:
|
|
33
|
+
raise ValueError(
|
|
34
|
+
f"No configuration found for core provider: {core_provider_name.value}. "
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
return LitellmExtractor(
|
|
38
|
+
extractor_config,
|
|
39
|
+
provider_config,
|
|
40
|
+
filesystem_cache,
|
|
41
|
+
)
|
|
42
|
+
case _:
|
|
43
|
+
# type checking will catch missing cases
|
|
44
|
+
raise_exhaustive_enum_error(extractor_type)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import AsyncGenerator, Dict, List, Set
|
|
6
|
+
|
|
7
|
+
from kiln_ai.adapters.extractors.base_extractor import BaseExtractor, ExtractionInput
|
|
8
|
+
from kiln_ai.adapters.extractors.extractor_registry import extractor_adapter_from_type
|
|
9
|
+
from kiln_ai.datamodel.basemodel import ID_TYPE, KilnAttachmentModel
|
|
10
|
+
from kiln_ai.datamodel.extraction import (
|
|
11
|
+
Document,
|
|
12
|
+
Extraction,
|
|
13
|
+
ExtractionSource,
|
|
14
|
+
ExtractorConfig,
|
|
15
|
+
)
|
|
16
|
+
from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ExtractorJob:
    # One unit of extraction work: a document paired with the extractor
    # config that should be applied to it.

    # The document whose original file will be extracted.
    doc: Document
    # The extractor configuration to run against `doc`.
    extractor_config: ExtractorConfig
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExtractorRunner:
    """
    Runs extractor configs over documents, skipping pairs already extracted.

    A (document, extractor config) pair is considered done when the document
    already has an extraction whose `extractor_config_id` matches the config.
    """

    def __init__(
        self,
        documents: List[Document],
        extractor_configs: List[ExtractorConfig],
    ):
        """
        Raises:
            ValueError: if `extractor_configs` is empty.
        """
        if not len(extractor_configs):
            raise ValueError("Extractor runner requires at least one extractor config")

        self.extractor_configs = extractor_configs
        self.documents = documents

    def collect_jobs(self) -> List[ExtractorJob]:
        """Return one job per (extractor config, document) pair that has no extraction yet."""
        # Map of extractor config id -> ids of documents already extracted
        # with that config, so the same pair is never re-run.
        done_by_config: Dict[ID_TYPE, Set[ID_TYPE]] = defaultdict(set)
        for document in self.documents:
            for extraction in document.extractions():
                done_by_config[extraction.extractor_config_id].add(document.id)

        # Jobs are ordered by extractor config first, then by document.
        return [
            ExtractorJob(doc=document, extractor_config=extractor_config)
            for extractor_config in self.extractor_configs
            for document in self.documents
            if document.id not in done_by_config[extractor_config.id]
        ]

    async def run(self, concurrency: int = 25) -> AsyncGenerator[Progress, None]:
        """Run all pending jobs through an AsyncJobRunner, yielding each progress update it reports."""
        pending_jobs = self.collect_jobs()
        job_runner = AsyncJobRunner(
            concurrency=concurrency,
            jobs=pending_jobs,
            run_job_fn=self.run_job,
        )
        async for update in job_runner.run():
            yield update

    async def run_job(self, job: ExtractorJob) -> bool:
        """
        Extract one document with one extractor config and save the result.

        Returns:
            True when the extraction was saved; False when anything failed
            (the error is logged rather than raised).
        """
        try:
            config = job.extractor_config
            extractor = extractor_adapter_from_type(config.extractor_type, config)
            if not isinstance(extractor, BaseExtractor):
                raise ValueError("Not able to create extractor from extractor config")

            if job.doc.path is None:
                raise ValueError("Document path is not set")

            # Resolve the attachment relative to the directory that holds the
            # document file.
            source_file = job.doc.original_file
            resolved_path = Path(source_file.attachment.resolve_path(job.doc.path.parent))
            request = ExtractionInput(
                path=resolved_path,
                mime_type=source_file.mime_type,
            )
            output = await extractor.extract(extraction_input=request)

            saved_output = KilnAttachmentModel.from_data(
                data=output.content,
                mime_type=output.content_format,
            )
            if output.is_passthrough:
                origin = ExtractionSource.PASSTHROUGH
            else:
                origin = ExtractionSource.PROCESSED

            extraction = Extraction(
                parent=job.doc,
                extractor_config_id=config.id,
                output=saved_output,
                source=origin,
            )
            extraction.save_to_file()

            return True
        except Exception as e:
            logger.error(
                f"Error running extraction job for dataset item {job.doc.id}: {e}"
            )
            return False
|