kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff compares publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- kiln_ai/adapters/__init__.py +8 -2
- kiln_ai/adapters/adapter_registry.py +43 -208
- kiln_ai/adapters/chat/chat_formatter.py +8 -12
- kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/docker_model_runner_tools.py +119 -0
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/base_eval.py +2 -2
- kiln_ai/adapters/eval/eval_runner.py +9 -3
- kiln_ai/adapters/eval/g_eval.py +2 -2
- kiln_ai/adapters/eval/test_base_eval.py +2 -4
- kiln_ai/adapters/eval/test_g_eval.py +4 -5
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
- kiln_ai/adapters/fine_tune/__init__.py +1 -1
- kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +192 -0
- kiln_ai/adapters/ml_model_list.py +761 -37
- kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
- kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
- kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
- kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/parsers/__init__.py +1 -1
- kiln_ai/adapters/provider_tools.py +205 -47
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/repair/test_repair_task.py +12 -9
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +657 -85
- kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
- kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
- kiln_ai/adapters/test_ml_model_list.py +251 -1
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_adaptors.py +13 -6
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +254 -8
- kiln_ai/adapters/test_remote_config.py +651 -58
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +39 -34
- kiln_ai/datamodel/basemodel.py +170 -1
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +28 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/eval.py +1 -1
- kiln_ai/datamodel/external_tool_server.py +298 -0
- kiln_ai/datamodel/extraction.py +303 -0
- kiln_ai/datamodel/json_schema.py +25 -10
- kiln_ai/datamodel/project.py +40 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/registry.py +0 -15
- kiln_ai/datamodel/run_config.py +62 -0
- kiln_ai/datamodel/task.py +2 -77
- kiln_ai/datamodel/task_output.py +6 -1
- kiln_ai/datamodel/task_run.py +41 -0
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +4 -4
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_example_models.py +175 -0
- kiln_ai/datamodel/test_external_tool_server.py +691 -0
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +470 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_registry.py +8 -3
- kiln_ai/datamodel/test_task.py +15 -47
- kiln_ai/datamodel/test_tool_id.py +320 -0
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +105 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/__init__.py +8 -0
- kiln_ai/tools/base_tool.py +82 -0
- kiln_ai/tools/built_in_tools/__init__.py +13 -0
- kiln_ai/tools/built_in_tools/math_tools.py +124 -0
- kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
- kiln_ai/tools/mcp_server_tool.py +95 -0
- kiln_ai/tools/mcp_session_manager.py +246 -0
- kiln_ai/tools/rag_tools.py +157 -0
- kiln_ai/tools/test_base_tools.py +199 -0
- kiln_ai/tools/test_mcp_server_tool.py +457 -0
- kiln_ai/tools/test_mcp_session_manager.py +1585 -0
- kiln_ai/tools/test_rag_tools.py +848 -0
- kiln_ai/tools/test_tool_registry.py +562 -0
- kiln_ai/tools/tool_registry.py +85 -0
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +24 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/open_ai_types.py +94 -0
- kiln_ai/utils/pdf_utils.py +38 -0
- kiln_ai/utils/project_utils.py +17 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_config.py +138 -1
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_open_ai_types.py +131 -0
- kiln_ai/utils/test_pdf_utils.py +73 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
- kiln_ai-0.21.0.dist-info/RECORD +211 -0
- kiln_ai-0.19.0.dist-info/RECORD +0 -115
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/extractors/test_extractor_registry.py
@@ -0,0 +1,181 @@
+from unittest.mock import patch
+
+import pytest
+
+from kiln_ai.adapters.extractors.extractor_registry import extractor_adapter_from_type
+from kiln_ai.adapters.extractors.litellm_extractor import LitellmExtractor
+from kiln_ai.adapters.ml_model_list import ModelProviderName
+from kiln_ai.adapters.provider_tools import LiteLlmCoreConfig
+from kiln_ai.datamodel.extraction import ExtractorConfig, ExtractorType
+
+
+@pytest.fixture
+def mock_provider_configs():
+    with patch("kiln_ai.utils.config.Config.shared") as mock_config:
+        mock_config.return_value.open_ai_api_key = "test-openai-key"
+        mock_config.return_value.gemini_api_key = "test-gemini-key"
+        mock_config.return_value.anthropic_api_key = "test-anthropic-key"
+        mock_config.return_value.bedrock_access_key = "test-amazon-bedrock-key"
+        mock_config.return_value.bedrock_secret_key = "test-amazon-bedrock-secret-key"
+        mock_config.return_value.fireworks_api_key = "test-fireworks-key"
+        mock_config.return_value.groq_api_key = "test-groq-key"
+        mock_config.return_value.huggingface_api_key = "test-huggingface-key"
+        yield mock_config
+
+
+def test_extractor_adapter_from_type(mock_provider_configs):
+    extractor = extractor_adapter_from_type(
+        ExtractorType.LITELLM,
+        ExtractorConfig(
+            name="test-extractor",
+            extractor_type=ExtractorType.LITELLM,
+            model_provider_name="gemini_api",
+            model_name="gemini-2.0-flash",
+            properties={
+                "prompt_document": "Extract the text from the document",
+                "prompt_image": "Extract the text from the image",
+                "prompt_video": "Extract the text from the video",
+                "prompt_audio": "Extract the text from the audio",
+            },
+        ),
+    )
+    assert isinstance(extractor, LitellmExtractor)
+    assert extractor.extractor_config.model_name == "gemini-2.0-flash"
+    assert extractor.extractor_config.model_provider_name == "gemini_api"
+
+
+@patch(
+    "kiln_ai.adapters.extractors.extractor_registry.lite_llm_core_config_for_provider"
+)
+def test_extractor_adapter_from_type_uses_litellm_core_config(
+    mock_get_litellm_core_config,
+):
+    """Test that extractor receives auth details from provider_tools."""
+    mock_litellm_core_config = LiteLlmCoreConfig(
+        base_url="https://test.com",
+        additional_body_options={"api_key": "test-key"},
+        default_headers={},
+    )
+    mock_get_litellm_core_config.return_value = mock_litellm_core_config
+
+    extractor = extractor_adapter_from_type(
+        ExtractorType.LITELLM,
+        ExtractorConfig(
+            name="test-extractor",
+            extractor_type=ExtractorType.LITELLM,
+            model_provider_name="openai",
+            model_name="gpt-4",
+            properties={
+                "prompt_document": "Extract the text from the document",
+                "prompt_image": "Extract the text from the image",
+                "prompt_video": "Extract the text from the video",
+                "prompt_audio": "Extract the text from the audio",
+            },
+        ),
+    )
+
+    assert isinstance(extractor, LitellmExtractor)
+    assert extractor.litellm_core_config == mock_litellm_core_config
+    mock_get_litellm_core_config.assert_called_once_with(ModelProviderName.openai)
+
+
+def test_extractor_adapter_from_type_invalid_provider():
+    """Test that invalid model provider names raise a clear error."""
+    with pytest.raises(
+        ValueError, match="Unsupported model provider name: invalid_provider"
+    ):
+        extractor_adapter_from_type(
+            ExtractorType.LITELLM,
+            ExtractorConfig(
+                name="test-extractor",
+                extractor_type=ExtractorType.LITELLM,
+                model_provider_name="invalid_provider",
+                model_name="some-model",
+                properties={
+                    "prompt_document": "Extract the text from the document",
+                    "prompt_image": "Extract the text from the image",
+                    "prompt_video": "Extract the text from the video",
+                    "prompt_audio": "Extract the text from the audio",
+                },
+            ),
+        )
+
+
+def test_extractor_adapter_from_type_invalid():
+    with pytest.raises(ValueError, match="Unhandled enum value: fake_type"):
+        extractor_adapter_from_type(
+            "fake_type",
+            ExtractorConfig(
+                name="test-extractor",
+                extractor_type=ExtractorType.LITELLM,
+                model_provider_name="invalid_provider",
+                model_name="some-model",
+                properties={
+                    "prompt_document": "Extract the text from the document",
+                    "prompt_image": "Extract the text from the image",
+                    "prompt_video": "Extract the text from the video",
+                    "prompt_audio": "Extract the text from the audio",
+                },
+            ),
+        )
+
+
+@pytest.mark.parametrize(
+    "provider_name",
+    [
+        "openai",
+        "anthropic",
+        "gemini_api",
+        "amazon_bedrock",
+        "fireworks_ai",
+        "groq",
+        "huggingface",
+    ],
+)
+def test_extractor_adapter_from_type_different_providers(
+    provider_name, mock_provider_configs
+):
+    """Test that different providers work correctly."""
+    extractor = extractor_adapter_from_type(
+        ExtractorType.LITELLM,
+        ExtractorConfig(
+            name="test-extractor",
+            extractor_type=ExtractorType.LITELLM,
+            model_provider_name=provider_name,
+            model_name="test-model",
+            properties={
+                "prompt_document": "Extract the text from the document",
+                "prompt_image": "Extract the text from the image",
+                "prompt_video": "Extract the text from the video",
+                "prompt_audio": "Extract the text from the audio",
+            },
+        ),
+    )
+
+    assert isinstance(extractor, LitellmExtractor)
+    assert extractor.extractor_config.model_provider_name == provider_name
+
+
+def test_extractor_adapter_from_type_no_config_found(mock_provider_configs):
+    with patch(
+        "kiln_ai.adapters.extractors.extractor_registry.lite_llm_core_config_for_provider"
+    ) as mock_lite_llm_core_config_for_provider:
+        mock_lite_llm_core_config_for_provider.return_value = None
+        with pytest.raises(
+            ValueError, match="No configuration found for core provider: openai"
+        ):
+            extractor_adapter_from_type(
+                ExtractorType.LITELLM,
+                ExtractorConfig(
+                    name="test-extractor",
+                    extractor_type=ExtractorType.LITELLM,
+                    model_provider_name="openai",
+                    model_name="gpt-4",
+                    properties={
+                        "prompt_document": "Extract the text from the document",
+                        "prompt_image": "Extract the text from the image",
+                        "prompt_video": "Extract the text from the video",
+                        "prompt_audio": "Extract the text from the audio",
+                    },
+                ),
+            )
kiln_ai/adapters/extractors/test_extractor_runner.py
@@ -0,0 +1,181 @@
+from unittest.mock import AsyncMock
+
+import pytest
+
+from conftest import MockFileFactoryMimeType
+from kiln_ai.adapters.extractors.extractor_runner import ExtractorRunner
+from kiln_ai.datamodel.basemodel import KilnAttachmentModel
+from kiln_ai.datamodel.extraction import (
+    Document,
+    Extraction,
+    ExtractionSource,
+    ExtractorConfig,
+    ExtractorType,
+    FileInfo,
+    Kind,
+    OutputFormat,
+)
+from kiln_ai.datamodel.project import Project
+
+
+@pytest.fixture
+def mock_project(tmp_path):
+    project = Project(
+        name="test",
+        description="test",
+        path=tmp_path / "project.kiln",
+    )
+    project.save_to_file()
+    return project
+
+
+@pytest.fixture
+def mock_extractor_config(mock_project):
+    extractor_config = ExtractorConfig(
+        name="test",
+        description="test",
+        output_format=OutputFormat.TEXT,
+        passthrough_mimetypes=[],
+        extractor_type=ExtractorType.LITELLM,
+        model_provider_name="gemini_api",
+        model_name="gemini-2.0-flash",
+        parent=mock_project,
+        properties={
+            "prompt_document": "Extract the text from the document",
+            "prompt_image": "Extract the text from the image",
+            "prompt_video": "Extract the text from the video",
+            "prompt_audio": "Extract the text from the audio",
+        },
+    )
+    extractor_config.save_to_file()
+    return extractor_config
+
+
+@pytest.fixture
+def mock_document(mock_project, mock_file_factory) -> Document:
+    test_pdf_file = mock_file_factory(MockFileFactoryMimeType.PDF)
+    document = Document(
+        name="test",
+        description="test",
+        kind=Kind.DOCUMENT,
+        original_file=FileInfo(
+            filename="test.pdf",
+            size=100,
+            mime_type="application/pdf",
+            attachment=KilnAttachmentModel.from_file(test_pdf_file),
+        ),
+        parent=mock_project,
+    )
+    document.save_to_file()
+    return document
+
+
+@pytest.fixture
+def mock_extractor_runner(mock_extractor_config, mock_document):
+    return ExtractorRunner(
+        extractor_configs=[mock_extractor_config],
+        documents=[mock_document],
+    )
+
+
+# Test with and without concurrency
+@pytest.mark.parametrize("concurrency", [1, 25])
+@pytest.mark.asyncio
+async def test_async_extractor_runner_status_updates(
+    mock_extractor_runner, concurrency
+):
+    # Real async testing!
+
+    job_count = 50
+    # Job objects are not the right type, but since we're mocking run_job, it doesn't matter
+    jobs = [{} for _ in range(job_count)]
+
+    # Mock collect_jobs to return our fake jobs
+    mock_extractor_runner.collect_jobs = lambda: jobs
+
+    # Mock run_job to return True immediately
+    mock_extractor_runner.run_job = AsyncMock(return_value=True)
+
+    # Expect the status updates in order, and 1 for each job
+    expected_completed_count = 0
+    async for progress in mock_extractor_runner.run(concurrency=concurrency):
+        assert progress.complete == expected_completed_count
+        expected_completed_count += 1
+        assert progress.errors == 0
+        assert progress.total == job_count
+
+    # Verify last status update was complete
+    assert expected_completed_count == job_count + 1
+
+    # Verify run_job was called for each job
+    assert mock_extractor_runner.run_job.call_count == job_count
+
+
+def test_collect_jobs_excludes_already_run_extraction(
+    mock_extractor_runner, mock_document, mock_extractor_config
+):
+    """Test that already run documents are excluded"""
+    Extraction(
+        parent=mock_document,
+        source=ExtractionSource.PROCESSED,
+        extractor_config_id="other-extractor-config-id",
+        output=KilnAttachmentModel.from_data("test extraction output", "text/plain"),
+    ).save_to_file()
+
+    # should get the one job, since the document was not already extracted with this extractor config
+    jobs = mock_extractor_runner.collect_jobs()
+    assert len(jobs) == 1
+    assert jobs[0].doc.id == mock_document.id
+    assert jobs[0].extractor_config.id == mock_extractor_config.id
+
+    # Create an extraction for this document
+    Extraction(
+        parent=mock_document,
+        source=ExtractionSource.PROCESSED,
+        extractor_config_id=mock_extractor_config.id,
+        output=KilnAttachmentModel.from_data("test extraction output", "text/plain"),
+    ).save_to_file()
+
+    jobs = mock_extractor_runner.collect_jobs()
+
+    # should now get no jobs since the document was already extracted with this extractor config
+    assert len(jobs) == 0
+
+
+def test_collect_jobs_multiple_extractor_configs(
+    mock_extractor_runner,
+    mock_document,
+    mock_extractor_config,
+    mock_project,
+):
+    """Test handling multiple extractor configs"""
+    second_config = ExtractorConfig(
+        name="test2",
+        description="test2",
+        output_format=OutputFormat.TEXT,
+        passthrough_mimetypes=[],
+        extractor_type=ExtractorType.LITELLM,
+        parent=mock_project,
+        model_provider_name="gemini_api",
+        model_name="gemini-2.0-flash",
+        properties={
+            "prompt_document": "Extract the text from the document",
+            "prompt_image": "Extract the text from the image",
+            "prompt_video": "Extract the text from the video",
+            "prompt_audio": "Extract the text from the audio",
+        },
+    )
+    second_config.save_to_file()
+
+    runner = ExtractorRunner(
+        extractor_configs=[mock_extractor_config, second_config],
+        documents=[mock_document],
+    )
+    jobs = runner.collect_jobs()
+
+    # Should get 2 jobs, one for each config
+    assert len(jobs) == 2
+    assert {job.extractor_config.id for job in jobs} == {
+        second_config.id,
+        mock_extractor_config.id,
+    }