kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic.
- kiln_ai/adapters/__init__.py +7 -7
- kiln_ai/adapters/adapter_registry.py +81 -10
- kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
- kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
- kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
- kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
- kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
- kiln_ai/adapters/ml_model_list.py +434 -93
- kiln_ai/adapters/model_adapters/__init__.py +18 -0
- kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
- kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
- kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
- kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
- kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/parsers/__init__.py +10 -0
- kiln_ai/adapters/parsers/base_parser.py +12 -0
- kiln_ai/adapters/parsers/json_parser.py +37 -0
- kiln_ai/adapters/parsers/parser_registry.py +19 -0
- kiln_ai/adapters/parsers/r1_parser.py +69 -0
- kiln_ai/adapters/parsers/test_json_parser.py +81 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
- kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
- kiln_ai/adapters/prompt_builders.py +193 -49
- kiln_ai/adapters/provider_tools.py +91 -36
- kiln_ai/adapters/repair/repair_task.py +18 -19
- kiln_ai/adapters/repair/test_repair_task.py +7 -7
- kiln_ai/adapters/run_output.py +11 -0
- kiln_ai/adapters/test_adapter_registry.py +177 -0
- kiln_ai/adapters/test_generate_docs.py +69 -0
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +25 -18
- kiln_ai/adapters/test_prompt_builders.py +265 -44
- kiln_ai/adapters/test_provider_tools.py +268 -46
- kiln_ai/datamodel/__init__.py +51 -772
- kiln_ai/datamodel/basemodel.py +31 -11
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +14 -3
- kiln_ai/datamodel/model_cache.py +8 -3
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +80 -2
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +127 -6
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +34 -17
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_model_cache.py +24 -0
- kiln_ai/datamodel/test_model_perf.py +125 -0
- kiln_ai/datamodel/test_models.py +131 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- kiln_ai/utils/exhaustive_error.py +6 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai/adapters/base_adapter.py +0 -191
- kiln_ai/adapters/langchain_adapters.py +0 -256
- kiln_ai-0.8.1.dist-info/RECORD +0 -58
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
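As the listing above shows, the adapter implementations move under a new model_adapters package (the old top-level base_adapter.py and langchain_adapters.py are deleted), and new eval and parsers packages are added. A minimal sketch of the import change implied by those moves, using only names that appear in the test diffs below:

# Before (0.8.1)
# from kiln_ai.adapters.base_adapter import BaseAdapter, RunOutput

# After (0.12.0)
from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter, RunOutput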
kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py}
RENAMED
@@ -2,13 +2,17 @@ from unittest.mock import patch
 
 import pytest
 
-from kiln_ai.adapters.base_adapter import
+from kiln_ai.adapters.model_adapters.base_adapter import (
+    BaseAdapter,
+    RunOutput,
+)
 from kiln_ai.datamodel import (
     DataSource,
     DataSourceType,
     Project,
     Task,
 )
+from kiln_ai.datamodel.task import RunConfig
 from kiln_ai.utils.config import Config
 
 
@@ -16,13 +20,8 @@ class MockAdapter(BaseAdapter):
     async def _run(self, input: dict | str) -> dict | str:
         return RunOutput(output="Test output", intermediate_outputs=None)
 
-    def
-        return
-            adapter_name="mock_adapter",
-            model_name="mock_model",
-            model_provider="mock_provider",
-            prompt_builder_name="mock_prompt_builder",
-        )
+    def adapter_name(self) -> str:
+        return "mock_adapter"
 
 
 @pytest.fixture
@@ -38,8 +37,19 @@ def test_task(tmp_path):
     return task
 
 
-
-
+@pytest.fixture
+def adapter(test_task):
+    return MockAdapter(
+        run_config=RunConfig(
+            task=test_task,
+            model_name="phi_3_5",
+            model_provider_name="ollama",
+            prompt_id="simple_chain_of_thought_prompt_builder",
+        ),
+    )
+
+
+def test_save_run_isolation(test_task, adapter):
     input_data = "Test input"
     output_data = "Test output"
     run_output = RunOutput(
@@ -85,13 +95,12 @@ def test_save_run_isolation(test_task):
     assert reloaded_output.source.type == DataSourceType.synthetic
     assert reloaded_output.rating is None
     assert reloaded_output.source.properties["adapter_name"] == "mock_adapter"
-    assert reloaded_output.source.properties["model_name"] == "
-    assert reloaded_output.source.properties["model_provider"] == "
+    assert reloaded_output.source.properties["model_name"] == "phi_3_5"
+    assert reloaded_output.source.properties["model_provider"] == "ollama"
     assert (
-        reloaded_output.source.properties["
-        == "
+        reloaded_output.source.properties["prompt_id"]
+        == "simple_chain_of_thought_prompt_builder"
     )
-
     # Run again, with same input and different output. Should create a new TaskRun.
     different_run_output = RunOutput(
         output="Different output", intermediate_outputs=None
@@ -101,13 +110,6 @@ def test_save_run_isolation(test_task):
     assert len(test_task.runs()) == 2
     assert "Different output" in set(run.output.output for run in test_task.runs())
 
-    # run again with same input and same output. Should not create a new TaskRun.
-    task_output = adapter.generate_run(input_data, None, run_output)
-    task_output.save_to_file()
-    assert len(test_task.runs()) == 2
-    assert "Different output" in set(run.output.output for run in test_task.runs())
-    assert output_data in set(run.output.output for run in test_task.runs())
-
     # run again with input of different type. Should create a new TaskRun and TaskOutput.
     task_output = adapter.generate_run(
         input_data,
@@ -116,7 +118,7 @@ def test_save_run_isolation(test_task):
             properties={
                 "model_name": "mock_model",
                 "model_provider": "mock_provider",
-                "
+                "prompt_id": "mock_prompt_builder",
                 "adapter_name": "mock_adapter",
             },
         ),
@@ -130,14 +132,41 @@ def test_save_run_isolation(test_task):
     assert output_data in set(run.output.output for run in test_task.runs())
 
 
+def test_generate_run_non_ascii(test_task, adapter):
+    input_data = {"key": "input with non-ascii character: 你好"}
+    output_data = {"key": "output with non-ascii character: 你好"}
+    run_output = RunOutput(
+        output=output_data,
+        intermediate_outputs=None,
+    )
+
+    task_run = adapter.generate_run(
+        input=input_data, input_source=None, run_output=run_output
+    )
+    task_run.save_to_file()
+
+    # as these values are saved as strings, they should properly represent the non-ascii characters
+    assert task_run.input == '{"key": "input with non-ascii character: 你好"}'
+    assert task_run.output.output == '{"key": "output with non-ascii character: 你好"}'
+
+    # check that the stringified unicode strings can be read back from the file
+    reloaded_task = Task.load_from_file(test_task.path)
+    reloaded_runs = reloaded_task.runs()
+    assert len(reloaded_runs) == 1
+    reloaded_run = reloaded_runs[0]
+    assert reloaded_run.input == '{"key": "input with non-ascii character: 你好"}'
+    assert (
+        reloaded_run.output.output == '{"key": "output with non-ascii character: 你好"}'
+    )
+
+
 @pytest.mark.asyncio
-async def test_autosave_false(test_task):
+async def test_autosave_false(test_task, adapter):
     with patch("kiln_ai.utils.config.Config.shared") as mock_shared:
         mock_config = mock_shared.return_value
         mock_config.autosave_runs = False
         mock_config.user_id = "test_user"
 
-        adapter = MockAdapter(test_task)
         input_data = "Test input"
 
         run = await adapter.invoke(input_data)
@@ -150,13 +179,31 @@ async def test_autosave_false(test_task):
 
 
 @pytest.mark.asyncio
-async def
+async def test_autosave_true_with_disabled(test_task, adapter):
+    with patch("kiln_ai.utils.config.Config.shared") as mock_shared:
+        mock_config = mock_shared.return_value
+        mock_config.autosave_runs = True
+        mock_config.user_id = "test_user"
+
+        input_data = "Test input"
+
+        adapter.base_adapter_config.allow_saving = False
+        run = await adapter.invoke(input_data)
+
+        # Check that no runs were saved
+        assert len(test_task.runs()) == 0
+
+        # Check that the run ID is not set
+        assert run.id is None
+
+
+@pytest.mark.asyncio
+async def test_autosave_true(test_task, adapter):
     with patch("kiln_ai.utils.config.Config.shared") as mock_shared:
         mock_config = mock_shared.return_value
         mock_config.autosave_runs = True
         mock_config.user_id = "test_user"
 
-        adapter = MockAdapter(test_task)
         input_data = "Test input"
 
         run = await adapter.invoke(input_data)
@@ -174,6 +221,9 @@ async def test_autosave_true(test_task):
         assert output.output == "Test output"
         assert output.source.type == DataSourceType.synthetic
         assert output.source.properties["adapter_name"] == "mock_adapter"
-        assert output.source.properties["model_name"] == "
-        assert output.source.properties["model_provider"] == "
-        assert
+        assert output.source.properties["model_name"] == "phi_3_5"
+        assert output.source.properties["model_provider"] == "ollama"
+        assert (
+            output.source.properties["prompt_id"]
+            == "simple_chain_of_thought_prompt_builder"
+        )
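The hunks above replace the old per-argument adapter constructor with a single RunConfig plus a simple adapter_name() override. A minimal sketch of the new construction pattern, assuming only the BaseAdapter, RunOutput and RunConfig signatures visible in this diff; the EchoAdapter class and the demo wrapper are hypothetical and only for illustration:

from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter, RunOutput
from kiln_ai.datamodel.task import RunConfig


class EchoAdapter(BaseAdapter):
    """Hypothetical adapter used only for illustration: echoes its input back."""

    def adapter_name(self) -> str:
        return "echo_adapter"

    async def _run(self, input: dict | str) -> RunOutput:
        # No model call; just return the input as the output.
        return RunOutput(output=input, intermediate_outputs=None)


async def demo(task):  # `task` is an existing kiln_ai.datamodel.Task
    adapter = EchoAdapter(
        run_config=RunConfig(
            task=task,
            model_name="phi_3_5",
            model_provider_name="ollama",
            prompt_id="simple_chain_of_thought_prompt_builder",
        ),
    )
    # Whether the run is saved depends on Config.autosave_runs, as the tests above exercise.
    return await adapter.invoke("Test input")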
kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py}
RENAMED

@@ -1,22 +1,22 @@
+import json
 from pathlib import Path
 from typing import Dict
 
-import jsonschema
-import jsonschema.exceptions
 import pytest
 
 import kiln_ai.datamodel as datamodel
 from kiln_ai.adapters.adapter_registry import adapter_for_task
-from kiln_ai.adapters.base_adapter import AdapterInfo, BaseAdapter, RunOutput
 from kiln_ai.adapters.ml_model_list import (
     built_in_models,
 )
-from kiln_ai.adapters.
-
-
-    SimpleChainOfThoughtPromptBuilder,
+from kiln_ai.adapters.model_adapters.base_adapter import (
+    BaseAdapter,
+    RunOutput,
 )
+from kiln_ai.adapters.ollama_tools import ollama_online
 from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers
+from kiln_ai.datamodel import PromptId
+from kiln_ai.datamodel.task import RunConfig
 from kiln_ai.datamodel.test_json_schema import json_joke_schema, json_triangle_schema
 
 
@@ -34,9 +34,9 @@ async def test_structured_output_gpt_4o_mini(tmp_path):
     await run_structured_output_test(tmp_path, "gpt_4o_mini", "openai")
 
 
-@pytest.mark.parametrize("model_name", ["llama_3_1_8b"])
+@pytest.mark.parametrize("model_name", ["llama_3_1_8b", "gemma_2_2b"])
 @pytest.mark.ollama
-async def
+async def test_structured_output_ollama(tmp_path, model_name):
     if not await ollama_online():
         pytest.skip("Ollama API not running. Expect it running on localhost:11434")
     await run_structured_output_test(tmp_path, model_name, "ollama")
@@ -44,19 +44,21 @@ async def test_structured_output_ollama_llama(tmp_path, model_name):
 
 class MockAdapter(BaseAdapter):
     def __init__(self, kiln_task: datamodel.Task, response: Dict | str | None):
-        super().__init__(
+        super().__init__(
+            run_config=RunConfig(
+                task=kiln_task,
+                model_name="phi_3_5",
+                model_provider_name="ollama",
+                prompt_id="simple_chain_of_thought_prompt_builder",
+            ),
+        )
         self.response = response
 
     async def _run(self, input: str) -> RunOutput:
         return RunOutput(output=self.response, intermediate_outputs=None)
 
-    def
-        return
-            adapter_name="mock_adapter",
-            model_name="mock_model",
-            model_provider="mock_provider",
-            prompt_builder_name="mock_prompt_builder",
-        )
+    def adapter_name(self) -> str:
+        return "mock_adapter"
 
 
 async def test_mock_unstructred_response(tmp_path):
@@ -93,19 +95,10 @@ async def test_mock_unstructred_response(tmp_path):
     answer = await adapter.invoke("You are a mock, send me the response!")
 
 
-
-@pytest.mark.ollama
-@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
-async def test_all_built_in_models_structured_output(
-    tmp_path, model_name, provider_name
-):
+def check_supports_structured_output(model_name: str, provider_name: str):
     for model in built_in_models:
         if model.name != model_name:
             continue
-        if not model.supports_structured_output:
-            pytest.skip(
-                f"Skipping {model.name} because it does not support structured output"
-            )
         for provider in model.providers:
             if provider.name != provider_name:
                 continue
@@ -113,11 +106,20 @@ async def test_all_built_in_models_structured_output(
                 pytest.skip(
                     f"Skipping {model.name} {provider.name} because it does not support structured output"
                 )
-            await run_structured_output_test(tmp_path, model.name, provider.name)
             return
     raise RuntimeError(f"No model {model_name} {provider_name} found")
 
 
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_all_built_in_models_structured_output(
+    tmp_path, model_name, provider_name
+):
+    check_supports_structured_output(model_name, provider_name)
+    await run_structured_output_test(tmp_path, model_name, provider_name)
+
+
 def build_structured_output_test_task(tmp_path: Path):
     project = datamodel.Project(name="test", path=tmp_path / "test.kiln")
     project.save_to_file()
@@ -140,7 +142,14 @@ def build_structured_output_test_task(tmp_path: Path):
 async def run_structured_output_test(tmp_path: Path, model_name: str, provider: str):
     task = build_structured_output_test_task(tmp_path)
     a = adapter_for_task(task, model_name=model_name, provider=provider)
-
+    try:
+        parsed = await a.invoke_returning_raw("Cows")  # a joke about cows
+    except ValueError as e:
+        if str(e) == "Failed to connect to Ollama. Ensure Ollama is running.":
+            pytest.skip(
+                f"Skipping {model_name} {provider} because Ollama is not running"
+            )
+        raise e
     if parsed is None or not isinstance(parsed, Dict):
         raise RuntimeError(f"structured response is not a dict: {parsed}")
     assert parsed["setup"] is not None
@@ -161,6 +170,7 @@ def build_structured_input_test_task(tmp_path: Path):
         parent=project,
         name="test task",
         instruction="You are an assistant which classifies a triangle given the lengths of its sides. If all sides are of equal length, the triangle is equilateral. If two sides are equal, the triangle is isosceles. Otherwise, it is scalene.\n\nAt the end of your response return the result in double square brackets. It should be plain text. It should be exactly one of the three following strings: '[[equilateral]]', or '[[isosceles]]', or '[[scalene]]'.",
+        thinking_prompt="Think step by step.",
     )
     task.input_json_schema = json_triangle_schema
     schema = task.input_schema()
@@ -177,37 +187,59 @@ def build_structured_input_test_task(tmp_path: Path):
 
 async def run_structured_input_test(tmp_path: Path, model_name: str, provider: str):
     task = build_structured_input_test_task(tmp_path)
-
+    try:
+        await run_structured_input_task(task, model_name, provider)
+    except ValueError as e:
+        if str(e) == "Failed to connect to Ollama. Ensure Ollama is running.":
+            pytest.skip(
+                f"Skipping {model_name} {provider} because Ollama is not running"
+            )
+        raise e
 
 
 async def run_structured_input_task(
     task: datamodel.Task,
     model_name: str,
     provider: str,
-
+    prompt_id: PromptId | None = None,
 ):
     a = adapter_for_task(
-        task,
+        task,
+        model_name=model_name,
+        provider=provider,
+        prompt_id=prompt_id,
     )
     with pytest.raises(ValueError):
         # not structured input in dictionary
         await a.invoke("a=1, b=2, c=3")
-    with pytest.raises(
+    with pytest.raises(
+        ValueError,
+        match="This task requires a specific output schema. While the model produced JSON, that JSON didn't meet the schema.",
+    ):
         # invalid structured input
         await a.invoke({"a": 1, "b": 2, "d": 3})
 
-
+    try:
+        response = await a.invoke_returning_raw({"a": 2, "b": 2, "c": 2})
+    except ValueError as e:
+        if str(e) == "Failed to connect to Ollama. Ensure Ollama is running.":
+            pytest.skip(
+                f"Skipping {model_name} {provider} because Ollama is not running"
+            )
        raise e
     assert response is not None
-
-
-
+    if isinstance(response, str):
+        assert "[[equilateral]]" in response
+    else:
+        assert response["is_equilateral"] is True
+
     expected_pb_name = "simple_prompt_builder"
-    if
-        expected_pb_name =
-    assert
-
-    assert
-    assert
+    if prompt_id is not None:
+        expected_pb_name = prompt_id
+    assert a.run_config.prompt_id == expected_pb_name
+
+    assert a.run_config.model_name == model_name
+    assert a.run_config.model_provider_name == provider
 
 
 @pytest.mark.paid
@@ -227,7 +259,54 @@ async def test_all_built_in_models_structured_input(
 @pytest.mark.paid
 @pytest.mark.ollama
 @pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
-async def
+async def test_structured_input_cot_prompt_builder(tmp_path, model_name, provider_name):
     task = build_structured_input_test_task(tmp_path)
-
-
+    await run_structured_input_task(
+        task, model_name, provider_name, "simple_chain_of_thought_prompt_builder"
+    )
+
+
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_structured_output_cot_prompt_builder(
+    tmp_path, model_name, provider_name
+):
+    check_supports_structured_output(model_name, provider_name)
+    triangle_schema = {
+        "type": "object",
+        "properties": {
+            "is_equilateral": {
+                "type": "boolean",
+                "description": "True if all sides of the triangle are equal in length",
+            },
+            "is_scalene": {
+                "type": "boolean",
+                "description": "True if all sides of the triangle have different lengths",
+            },
+            "is_obtuse": {
+                "type": "boolean",
+                "description": "True if one of the angles is greater than 90 degrees",
+            },
+        },
+        "required": ["is_equilateral", "is_scalene", "is_obtuse"],
+        "additionalProperties": False,
+    }
+    task = build_structured_input_test_task(tmp_path)
+    task.instruction = """
+You are an assistant which classifies a triangle given the lengths of its sides. If all sides are of equal length, the triangle is equilateral. If two sides are equal, the triangle is isosceles. Otherwise, it is scalene.\n\n"
+
+When asked for a final result, this is the format (for an equilateral example):
+```json
+{
+    "is_equilateral": true,
+    "is_scalene": false,
+    "is_obtuse": false
+}
+```
+"""
+    task.output_json_schema = json.dumps(triangle_schema)
+    task.save_to_file()
+    await run_structured_input_task(
+        task, model_name, provider_name, "simple_chain_of_thought_prompt_builder"
+    )
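These tests also exercise the new adapter_for_task signature (with prompt_id) and the invoke_returning_raw helper. A rough usage sketch under the same assumptions; the model/provider values and the classify_triangle wrapper are illustrative only:

from kiln_ai.adapters.adapter_registry import adapter_for_task


async def classify_triangle(task):  # a Task with the triangle schemas built as in the test above
    adapter = adapter_for_task(
        task,
        model_name="gpt_4o_mini",
        provider="openai",
        prompt_id="simple_chain_of_thought_prompt_builder",
    )
    raw = await adapter.invoke_returning_raw({"a": 2, "b": 2, "c": 2})

    # The resolved configuration is now exposed on the adapter's run_config
    assert adapter.run_config.model_name == "gpt_4o_mini"
    assert adapter.run_config.model_provider_name == "openai"
    return raw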
kiln_ai/adapters/parsers/base_parser.py
ADDED
@@ -0,0 +1,12 @@
+from kiln_ai.adapters.run_output import RunOutput
+
+
+class BaseParser:
+    def __init__(self, structured_output: bool = False):
+        self.structured_output = structured_output
+
+    def parse_output(self, original_output: RunOutput) -> RunOutput:
+        """
+        Method for parsing the output of a model. Typically overridden by subclasses.
+        """
+        return original_output
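BaseParser is a pass-through by default; concrete parsers override parse_output. A minimal sketch of a custom subclass, assuming only the classes shown above (UppercaseParser is a made-up example, not part of the package):

from kiln_ai.adapters.parsers.base_parser import BaseParser
from kiln_ai.adapters.run_output import RunOutput


class UppercaseParser(BaseParser):
    # Hypothetical parser: upper-cases string outputs, passes everything else through.
    def parse_output(self, original_output: RunOutput) -> RunOutput:
        if isinstance(original_output.output, str):
            return RunOutput(
                output=original_output.output.upper(),
                intermediate_outputs=original_output.intermediate_outputs,
            )
        return original_output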
kiln_ai/adapters/parsers/json_parser.py
ADDED

@@ -0,0 +1,37 @@
+import json
+from typing import Any, Dict
+
+
+def parse_json_string(json_string: str) -> Dict[str, Any]:
+    """
+    Parse a JSON string into a dictionary. Handles multiple formats:
+    - Plain JSON
+    - JSON wrapped in ```json code blocks
+    - JSON wrapped in ``` code blocks
+
+    Args:
+        json_string: String containing JSON data, possibly wrapped in code blocks
+
+    Returns:
+        Dict containing parsed JSON data
+
+    Raises:
+        ValueError: If JSON parsing fails
+    """
+    # Remove code block markers if present
+    cleaned_string = json_string.strip()
+    if cleaned_string.startswith("```"):
+        # Split by newlines and remove first/last lines if they contain ```
+        lines = cleaned_string.split("\n")
+        if lines[0].startswith("```"):
+            lines = lines[1:]
+        if lines and lines[-1].strip() == "```":
+            lines = lines[:-1]
+        cleaned_string = "\n".join(lines)
+
+    try:
+        return json.loads(cleaned_string)
+    except json.JSONDecodeError as e:
+        raise ValueError(
+            f"This task requires JSON output but the model didn't return valid JSON. Search 'Troubleshooting Structured Data Issues' in our docs for more information. The model produced the following: {cleaned_string}"
+        ) from e
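A short usage sketch for parse_json_string, based only on the code above (the sample strings are invented):

from kiln_ai.adapters.parsers.json_parser import parse_json_string

# A fenced response, as many models return for JSON tasks
raw = """```json
{"setup": "Why did the cow cross the road?", "punchline": "To get to the udder side."}
```"""
parsed = parse_json_string(raw)
assert parsed["punchline"].endswith("side.")

# Invalid JSON raises a ValueError with a troubleshooting hint
try:
    parse_json_string("not json at all")
except ValueError as e:
    print(e)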
kiln_ai/adapters/parsers/parser_registry.py
ADDED

@@ -0,0 +1,19 @@
+from typing import Type
+
+from kiln_ai.adapters.ml_model_list import ModelParserID
+from kiln_ai.adapters.parsers.base_parser import BaseParser
+from kiln_ai.adapters.parsers.r1_parser import R1ThinkingParser
+from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+
+def model_parser_from_id(parser_id: ModelParserID | None) -> Type[BaseParser]:
+    """
+    Get a model parser from its ID.
+    """
+    match parser_id:
+        case None:
+            return BaseParser
+        case ModelParserID.r1_thinking:
+            return R1ThinkingParser
+        case _:
+            raise_exhaustive_enum_error(parser_id)
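A sketch of resolving a parser class through the registry; it assumes ModelParserID.r1_thinking exists on the enum exactly as referenced above:

from kiln_ai.adapters.ml_model_list import ModelParserID
from kiln_ai.adapters.parsers.parser_registry import model_parser_from_id

parser_class = model_parser_from_id(ModelParserID.r1_thinking)  # R1ThinkingParser
parser = parser_class(structured_output=False)

default_parser = model_parser_from_id(None)()  # BaseParser: pass-through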
kiln_ai/adapters/parsers/r1_parser.py
ADDED

@@ -0,0 +1,69 @@
+from kiln_ai.adapters.parsers.base_parser import BaseParser
+from kiln_ai.adapters.parsers.json_parser import parse_json_string
+from kiln_ai.adapters.run_output import RunOutput
+
+
+class R1ThinkingParser(BaseParser):
+    START_TAG = "<think>"
+    END_TAG = "</think>"
+
+    def parse_output(self, original_output: RunOutput) -> RunOutput:
+        """
+        Parse the <think> </think> tags from the response into the intermediate and final outputs.
+
+        Args:
+            original_output: RunOutput containing the raw response string
+
+        Returns:
+            ParsedOutput containing the intermediate content (thinking content) and final result
+
+        Raises:
+            ValueError: If response format is invalid (missing tags, multiple tags, or no content after closing tag)
+        """
+        # This parser only works for strings
+        if not isinstance(original_output.output, str):
+            raise ValueError("Response must be a string for R1 parser")
+
+        # Strip whitespace and validate basic structure
+        cleaned_response = original_output.output.strip()
+        if not cleaned_response.startswith(self.START_TAG):
+            raise ValueError("Response must start with <think> tag")
+
+        # Find the thinking tags
+        think_start = cleaned_response.find(self.START_TAG)
+        think_end = cleaned_response.find(self.END_TAG)
+
+        if think_start == -1 or think_end == -1:
+            raise ValueError("Missing thinking tags")
+
+        # Check for multiple tags
+        if (
+            cleaned_response.count(self.START_TAG) > 1
+            or cleaned_response.count(self.END_TAG) > 1
+        ):
+            raise ValueError("Multiple thinking tags found")
+
+        # Extract thinking content
+        thinking_content = cleaned_response[
+            think_start + len(self.START_TAG) : think_end
+        ].strip()
+
+        # Extract result (everything after </think>)
+        result = cleaned_response[think_end + len(self.END_TAG) :].strip()
+
+        if not result or len(result) == 0:
+            raise ValueError("No content found after </think> tag")
+
+        # Parse JSON if needed
+        output = result
+        if self.structured_output:
+            output = parse_json_string(result)
+
+        # Add thinking content to intermediate outputs if it exists
+        intermediate_outputs = original_output.intermediate_outputs or {}
+        intermediate_outputs["reasoning"] = thinking_content
+
+        return RunOutput(
+            output=output,
+            intermediate_outputs=intermediate_outputs,
+        )
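Finally, a usage sketch for the R1 parser based on the behavior defined above; the response text is made up:

from kiln_ai.adapters.parsers.r1_parser import R1ThinkingParser
from kiln_ai.adapters.run_output import RunOutput

raw = RunOutput(
    output="<think>Two equal sides and one different: isosceles.</think>[[isosceles]]",
    intermediate_outputs=None,
)
parsed = R1ThinkingParser(structured_output=False).parse_output(raw)
assert parsed.output == "[[isosceles]]"
assert parsed.intermediate_outputs["reasoning"].startswith("Two equal sides")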