kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of kiln-ai is flagged as potentially problematic; see the registry listing for details.
Files changed (88)
  1. kiln_ai/adapters/__init__.py +7 -7
  2. kiln_ai/adapters/adapter_registry.py +81 -10
  3. kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +267 -0
  7. kiln_ai/adapters/eval/g_eval.py +367 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
  16. kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
  21. kiln_ai/adapters/ml_model_list.py +434 -93
  22. kiln_ai/adapters/model_adapters/__init__.py +18 -0
  23. kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
  24. kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
  25. kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
  26. kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
  27. kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
  28. kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
  29. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
  30. kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
  31. kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
  32. kiln_ai/adapters/ollama_tools.py +0 -1
  33. kiln_ai/adapters/parsers/__init__.py +10 -0
  34. kiln_ai/adapters/parsers/base_parser.py +12 -0
  35. kiln_ai/adapters/parsers/json_parser.py +37 -0
  36. kiln_ai/adapters/parsers/parser_registry.py +19 -0
  37. kiln_ai/adapters/parsers/r1_parser.py +69 -0
  38. kiln_ai/adapters/parsers/test_json_parser.py +81 -0
  39. kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
  40. kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
  41. kiln_ai/adapters/prompt_builders.py +193 -49
  42. kiln_ai/adapters/provider_tools.py +91 -36
  43. kiln_ai/adapters/repair/repair_task.py +18 -19
  44. kiln_ai/adapters/repair/test_repair_task.py +7 -7
  45. kiln_ai/adapters/run_output.py +11 -0
  46. kiln_ai/adapters/test_adapter_registry.py +177 -0
  47. kiln_ai/adapters/test_generate_docs.py +69 -0
  48. kiln_ai/adapters/test_ollama_tools.py +0 -1
  49. kiln_ai/adapters/test_prompt_adaptors.py +25 -18
  50. kiln_ai/adapters/test_prompt_builders.py +265 -44
  51. kiln_ai/adapters/test_provider_tools.py +268 -46
  52. kiln_ai/datamodel/__init__.py +51 -772
  53. kiln_ai/datamodel/basemodel.py +31 -11
  54. kiln_ai/datamodel/datamodel_enums.py +58 -0
  55. kiln_ai/datamodel/dataset_filters.py +114 -0
  56. kiln_ai/datamodel/dataset_split.py +170 -0
  57. kiln_ai/datamodel/eval.py +298 -0
  58. kiln_ai/datamodel/finetune.py +105 -0
  59. kiln_ai/datamodel/json_schema.py +14 -3
  60. kiln_ai/datamodel/model_cache.py +8 -3
  61. kiln_ai/datamodel/project.py +23 -0
  62. kiln_ai/datamodel/prompt.py +37 -0
  63. kiln_ai/datamodel/prompt_id.py +83 -0
  64. kiln_ai/datamodel/strict_mode.py +24 -0
  65. kiln_ai/datamodel/task.py +181 -0
  66. kiln_ai/datamodel/task_output.py +321 -0
  67. kiln_ai/datamodel/task_run.py +164 -0
  68. kiln_ai/datamodel/test_basemodel.py +80 -2
  69. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  70. kiln_ai/datamodel/test_dataset_split.py +127 -6
  71. kiln_ai/datamodel/test_datasource.py +3 -2
  72. kiln_ai/datamodel/test_eval_model.py +635 -0
  73. kiln_ai/datamodel/test_example_models.py +34 -17
  74. kiln_ai/datamodel/test_json_schema.py +23 -0
  75. kiln_ai/datamodel/test_model_cache.py +24 -0
  76. kiln_ai/datamodel/test_model_perf.py +125 -0
  77. kiln_ai/datamodel/test_models.py +131 -2
  78. kiln_ai/datamodel/test_prompt_id.py +129 -0
  79. kiln_ai/datamodel/test_task.py +159 -0
  80. kiln_ai/utils/config.py +6 -1
  81. kiln_ai/utils/exhaustive_error.py +6 -0
  82. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
  83. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  84. kiln_ai/adapters/base_adapter.py +0 -191
  85. kiln_ai/adapters/langchain_adapters.py +0 -256
  86. kiln_ai-0.8.1.dist-info/RECORD +0 -58
  87. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  88. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py}

@@ -2,13 +2,17 @@ from unittest.mock import patch
 
 import pytest
 
-from kiln_ai.adapters.base_adapter import AdapterInfo, BaseAdapter, RunOutput
+from kiln_ai.adapters.model_adapters.base_adapter import (
+    BaseAdapter,
+    RunOutput,
+)
 from kiln_ai.datamodel import (
     DataSource,
     DataSourceType,
     Project,
     Task,
 )
+from kiln_ai.datamodel.task import RunConfig
 from kiln_ai.utils.config import Config
 
 
@@ -16,13 +20,8 @@ class MockAdapter(BaseAdapter):
     async def _run(self, input: dict | str) -> dict | str:
         return RunOutput(output="Test output", intermediate_outputs=None)
 
-    def adapter_info(self) -> AdapterInfo:
-        return AdapterInfo(
-            adapter_name="mock_adapter",
-            model_name="mock_model",
-            model_provider="mock_provider",
-            prompt_builder_name="mock_prompt_builder",
-        )
+    def adapter_name(self) -> str:
+        return "mock_adapter"
 
 
 @pytest.fixture
@@ -38,8 +37,19 @@ def test_task(tmp_path):
     return task
 
 
-def test_save_run_isolation(test_task):
-    adapter = MockAdapter(test_task)
+@pytest.fixture
+def adapter(test_task):
+    return MockAdapter(
+        run_config=RunConfig(
+            task=test_task,
+            model_name="phi_3_5",
+            model_provider_name="ollama",
+            prompt_id="simple_chain_of_thought_prompt_builder",
+        ),
+    )
+
+
+def test_save_run_isolation(test_task, adapter):
     input_data = "Test input"
     output_data = "Test output"
     run_output = RunOutput(
@@ -85,13 +95,12 @@ def test_save_run_isolation(test_task):
     assert reloaded_output.source.type == DataSourceType.synthetic
     assert reloaded_output.rating is None
     assert reloaded_output.source.properties["adapter_name"] == "mock_adapter"
-    assert reloaded_output.source.properties["model_name"] == "mock_model"
-    assert reloaded_output.source.properties["model_provider"] == "mock_provider"
+    assert reloaded_output.source.properties["model_name"] == "phi_3_5"
+    assert reloaded_output.source.properties["model_provider"] == "ollama"
     assert (
-        reloaded_output.source.properties["prompt_builder_name"]
-        == "mock_prompt_builder"
+        reloaded_output.source.properties["prompt_id"]
+        == "simple_chain_of_thought_prompt_builder"
     )
-
     # Run again, with same input and different output. Should create a new TaskRun.
     different_run_output = RunOutput(
         output="Different output", intermediate_outputs=None
@@ -101,13 +110,6 @@ def test_save_run_isolation(test_task):
     assert len(test_task.runs()) == 2
     assert "Different output" in set(run.output.output for run in test_task.runs())
 
-    # run again with same input and same output. Should not create a new TaskRun.
-    task_output = adapter.generate_run(input_data, None, run_output)
-    task_output.save_to_file()
-    assert len(test_task.runs()) == 2
-    assert "Different output" in set(run.output.output for run in test_task.runs())
-    assert output_data in set(run.output.output for run in test_task.runs())
-
     # run again with input of different type. Should create a new TaskRun and TaskOutput.
     task_output = adapter.generate_run(
         input_data,
@@ -116,7 +118,7 @@ def test_save_run_isolation(test_task):
             properties={
                 "model_name": "mock_model",
                 "model_provider": "mock_provider",
-                "prompt_builder_name": "mock_prompt_builder",
+                "prompt_id": "mock_prompt_builder",
                 "adapter_name": "mock_adapter",
             },
         ),
@@ -130,14 +132,41 @@ def test_save_run_isolation(test_task):
     assert output_data in set(run.output.output for run in test_task.runs())
 
 
+def test_generate_run_non_ascii(test_task, adapter):
+    input_data = {"key": "input with non-ascii character: 你好"}
+    output_data = {"key": "output with non-ascii character: 你好"}
+    run_output = RunOutput(
+        output=output_data,
+        intermediate_outputs=None,
+    )
+
+    task_run = adapter.generate_run(
+        input=input_data, input_source=None, run_output=run_output
+    )
+    task_run.save_to_file()
+
+    # as these values are saved as strings, they should properly represent the non-ascii characters
+    assert task_run.input == '{"key": "input with non-ascii character: 你好"}'
+    assert task_run.output.output == '{"key": "output with non-ascii character: 你好"}'
+
+    # check that the stringified unicode strings can be read back from the file
+    reloaded_task = Task.load_from_file(test_task.path)
+    reloaded_runs = reloaded_task.runs()
+    assert len(reloaded_runs) == 1
+    reloaded_run = reloaded_runs[0]
+    assert reloaded_run.input == '{"key": "input with non-ascii character: 你好"}'
+    assert (
+        reloaded_run.output.output == '{"key": "output with non-ascii character: 你好"}'
+    )
+
+
 @pytest.mark.asyncio
-async def test_autosave_false(test_task):
+async def test_autosave_false(test_task, adapter):
     with patch("kiln_ai.utils.config.Config.shared") as mock_shared:
         mock_config = mock_shared.return_value
         mock_config.autosave_runs = False
         mock_config.user_id = "test_user"
 
-        adapter = MockAdapter(test_task)
         input_data = "Test input"
 
         run = await adapter.invoke(input_data)
@@ -150,13 +179,31 @@ async def test_autosave_false(test_task):
 
 
 @pytest.mark.asyncio
-async def test_autosave_true(test_task):
+async def test_autosave_true_with_disabled(test_task, adapter):
+    with patch("kiln_ai.utils.config.Config.shared") as mock_shared:
+        mock_config = mock_shared.return_value
+        mock_config.autosave_runs = True
+        mock_config.user_id = "test_user"
+
+        input_data = "Test input"
+
+        adapter.base_adapter_config.allow_saving = False
+        run = await adapter.invoke(input_data)
+
+        # Check that no runs were saved
+        assert len(test_task.runs()) == 0
+
+        # Check that the run ID is not set
+        assert run.id is None
+
+
+@pytest.mark.asyncio
+async def test_autosave_true(test_task, adapter):
     with patch("kiln_ai.utils.config.Config.shared") as mock_shared:
         mock_config = mock_shared.return_value
         mock_config.autosave_runs = True
         mock_config.user_id = "test_user"
 
-        adapter = MockAdapter(test_task)
         input_data = "Test input"
 
         run = await adapter.invoke(input_data)
@@ -174,6 +221,9 @@ async def test_autosave_true(test_task):
         assert output.output == "Test output"
         assert output.source.type == DataSourceType.synthetic
         assert output.source.properties["adapter_name"] == "mock_adapter"
-        assert output.source.properties["model_name"] == "mock_model"
-        assert output.source.properties["model_provider"] == "mock_provider"
-        assert output.source.properties["prompt_builder_name"] == "mock_prompt_builder"
+        assert output.source.properties["model_name"] == "phi_3_5"
+        assert output.source.properties["model_provider"] == "ollama"
+        assert (
+            output.source.properties["prompt_id"]
+            == "simple_chain_of_thought_prompt_builder"
+        )
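The recurring change in this file is the constructor: adapters are now built from a RunConfig up front rather than reporting an AdapterInfo after the fact. A rough sketch of the new shape follows; it is not code from the package. The EchoAdapter class and the demo project/task are made up for illustration, while the RunConfig fields (task, model_name, model_provider_name, prompt_id) are the ones visible in the diff above.

from pathlib import Path

from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter, RunOutput
from kiln_ai.datamodel import Project, Task
from kiln_ai.datamodel.task import RunConfig


class EchoAdapter(BaseAdapter):
    # Hypothetical adapter used only to illustrate the new constructor shape.
    async def _run(self, input: dict | str) -> RunOutput:
        return RunOutput(output="echo", intermediate_outputs=None)

    def adapter_name(self) -> str:
        return "echo_adapter"


def build_echo_adapter(tmp_path: Path) -> EchoAdapter:
    project = Project(name="demo", path=tmp_path / "demo.kiln")
    project.save_to_file()
    task = Task(parent=project, name="demo task", instruction="Echo the input.")
    task.save_to_file()

    # The run configuration now travels with the adapter instead of being
    # reported afterwards via adapter_info().
    return EchoAdapter(
        run_config=RunConfig(
            task=task,
            model_name="phi_3_5",
            model_provider_name="ollama",
            prompt_id="simple_chain_of_thought_prompt_builder",
        ),
    )

The model_name, model_provider_name and prompt_id set here are what end up in a saved run's source.properties, which is exactly what the assertions above check.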
kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py}

@@ -1,22 +1,22 @@
+import json
 from pathlib import Path
 from typing import Dict
 
-import jsonschema
-import jsonschema.exceptions
 import pytest
 
 import kiln_ai.datamodel as datamodel
 from kiln_ai.adapters.adapter_registry import adapter_for_task
-from kiln_ai.adapters.base_adapter import AdapterInfo, BaseAdapter, RunOutput
 from kiln_ai.adapters.ml_model_list import (
     built_in_models,
 )
-from kiln_ai.adapters.ollama_tools import ollama_online
-from kiln_ai.adapters.prompt_builders import (
-    BasePromptBuilder,
-    SimpleChainOfThoughtPromptBuilder,
+from kiln_ai.adapters.model_adapters.base_adapter import (
+    BaseAdapter,
+    RunOutput,
 )
+from kiln_ai.adapters.ollama_tools import ollama_online
 from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers
+from kiln_ai.datamodel import PromptId
+from kiln_ai.datamodel.task import RunConfig
 from kiln_ai.datamodel.test_json_schema import json_joke_schema, json_triangle_schema
 
 
@@ -34,9 +34,9 @@ async def test_structured_output_gpt_4o_mini(tmp_path):
     await run_structured_output_test(tmp_path, "gpt_4o_mini", "openai")
 
 
-@pytest.mark.parametrize("model_name", ["llama_3_1_8b"])
+@pytest.mark.parametrize("model_name", ["llama_3_1_8b", "gemma_2_2b"])
 @pytest.mark.ollama
-async def test_structured_output_ollama_llama(tmp_path, model_name):
+async def test_structured_output_ollama(tmp_path, model_name):
     if not await ollama_online():
         pytest.skip("Ollama API not running. Expect it running on localhost:11434")
     await run_structured_output_test(tmp_path, model_name, "ollama")
@@ -44,19 +44,21 @@ async def test_structured_output_ollama_llama(tmp_path, model_name):
 
 class MockAdapter(BaseAdapter):
     def __init__(self, kiln_task: datamodel.Task, response: Dict | str | None):
-        super().__init__(kiln_task)
+        super().__init__(
+            run_config=RunConfig(
+                task=kiln_task,
+                model_name="phi_3_5",
+                model_provider_name="ollama",
+                prompt_id="simple_chain_of_thought_prompt_builder",
+            ),
+        )
         self.response = response
 
     async def _run(self, input: str) -> RunOutput:
         return RunOutput(output=self.response, intermediate_outputs=None)
 
-    def adapter_info(self) -> AdapterInfo:
-        return AdapterInfo(
-            adapter_name="mock_adapter",
-            model_name="mock_model",
-            model_provider="mock_provider",
-            prompt_builder_name="mock_prompt_builder",
-        )
+    def adapter_name(self) -> str:
+        return "mock_adapter"
 
 
 async def test_mock_unstructred_response(tmp_path):
@@ -93,19 +95,10 @@ async def test_mock_unstructred_response(tmp_path):
     answer = await adapter.invoke("You are a mock, send me the response!")
 
 
-@pytest.mark.paid
-@pytest.mark.ollama
-@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
-async def test_all_built_in_models_structured_output(
-    tmp_path, model_name, provider_name
-):
+def check_supports_structured_output(model_name: str, provider_name: str):
     for model in built_in_models:
         if model.name != model_name:
             continue
-        if not model.supports_structured_output:
-            pytest.skip(
-                f"Skipping {model.name} because it does not support structured output"
-            )
         for provider in model.providers:
             if provider.name != provider_name:
                 continue
@@ -113,11 +106,20 @@
                 pytest.skip(
                     f"Skipping {model.name} {provider.name} because it does not support structured output"
                 )
-            await run_structured_output_test(tmp_path, model.name, provider.name)
             return
     raise RuntimeError(f"No model {model_name} {provider_name} found")
 
 
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_all_built_in_models_structured_output(
+    tmp_path, model_name, provider_name
+):
+    check_supports_structured_output(model_name, provider_name)
+    await run_structured_output_test(tmp_path, model_name, provider_name)
+
+
 def build_structured_output_test_task(tmp_path: Path):
     project = datamodel.Project(name="test", path=tmp_path / "test.kiln")
     project.save_to_file()
@@ -140,7 +142,14 @@ def build_structured_output_test_task(tmp_path: Path):
 async def run_structured_output_test(tmp_path: Path, model_name: str, provider: str):
     task = build_structured_output_test_task(tmp_path)
     a = adapter_for_task(task, model_name=model_name, provider=provider)
-    parsed = await a.invoke_returning_raw("Cows")  # a joke about cows
+    try:
+        parsed = await a.invoke_returning_raw("Cows")  # a joke about cows
+    except ValueError as e:
+        if str(e) == "Failed to connect to Ollama. Ensure Ollama is running.":
+            pytest.skip(
+                f"Skipping {model_name} {provider} because Ollama is not running"
+            )
+        raise e
     if parsed is None or not isinstance(parsed, Dict):
         raise RuntimeError(f"structured response is not a dict: {parsed}")
     assert parsed["setup"] is not None
@@ -161,6 +170,7 @@ def build_structured_input_test_task(tmp_path: Path):
         parent=project,
         name="test task",
         instruction="You are an assistant which classifies a triangle given the lengths of its sides. If all sides are of equal length, the triangle is equilateral. If two sides are equal, the triangle is isosceles. Otherwise, it is scalene.\n\nAt the end of your response return the result in double square brackets. It should be plain text. It should be exactly one of the three following strings: '[[equilateral]]', or '[[isosceles]]', or '[[scalene]]'.",
+        thinking_prompt="Think step by step.",
     )
     task.input_json_schema = json_triangle_schema
     schema = task.input_schema()
@@ -177,37 +187,59 @@ def build_structured_input_test_task(tmp_path: Path):
 
 async def run_structured_input_test(tmp_path: Path, model_name: str, provider: str):
     task = build_structured_input_test_task(tmp_path)
-    await run_structured_input_task(task, model_name, provider)
+    try:
+        await run_structured_input_task(task, model_name, provider)
+    except ValueError as e:
+        if str(e) == "Failed to connect to Ollama. Ensure Ollama is running.":
+            pytest.skip(
+                f"Skipping {model_name} {provider} because Ollama is not running"
+            )
+        raise e
 
 
 async def run_structured_input_task(
     task: datamodel.Task,
     model_name: str,
     provider: str,
-    pb: BasePromptBuilder | None = None,
+    prompt_id: PromptId | None = None,
 ):
     a = adapter_for_task(
-        task, model_name=model_name, provider=provider, prompt_builder=pb
+        task,
+        model_name=model_name,
+        provider=provider,
+        prompt_id=prompt_id,
     )
     with pytest.raises(ValueError):
         # not structured input in dictionary
         await a.invoke("a=1, b=2, c=3")
-    with pytest.raises(jsonschema.exceptions.ValidationError):
+    with pytest.raises(
+        ValueError,
+        match="This task requires a specific output schema. While the model produced JSON, that JSON didn't meet the schema.",
+    ):
        # invalid structured input
        await a.invoke({"a": 1, "b": 2, "d": 3})
 
-    response = await a.invoke_returning_raw({"a": 2, "b": 2, "c": 2})
+    try:
+        response = await a.invoke_returning_raw({"a": 2, "b": 2, "c": 2})
+    except ValueError as e:
+        if str(e) == "Failed to connect to Ollama. Ensure Ollama is running.":
+            pytest.skip(
+                f"Skipping {model_name} {provider} because Ollama is not running"
+            )
+        raise e
     assert response is not None
-    assert isinstance(response, str)
-    assert "[[equilateral]]" in response
-    adapter_info = a.adapter_info()
+    if isinstance(response, str):
+        assert "[[equilateral]]" in response
+    else:
+        assert response["is_equilateral"] is True
+
     expected_pb_name = "simple_prompt_builder"
-    if pb is not None:
-        expected_pb_name = pb.__class__.prompt_builder_name()
-    assert adapter_info.prompt_builder_name == expected_pb_name
-    assert adapter_info.model_name == model_name
-    assert adapter_info.model_provider == provider
-    assert adapter_info.adapter_name == "kiln_langchain_adapter"
+    if prompt_id is not None:
+        expected_pb_name = prompt_id
+    assert a.run_config.prompt_id == expected_pb_name
+
+    assert a.run_config.model_name == model_name
+    assert a.run_config.model_provider_name == provider
 
 
 @pytest.mark.paid
@@ -227,7 +259,54 @@ async def test_all_built_in_models_structured_input(
 @pytest.mark.paid
 @pytest.mark.ollama
 @pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
-async def test_structured_cot_prompt_builder(tmp_path, model_name, provider_name):
+async def test_structured_input_cot_prompt_builder(tmp_path, model_name, provider_name):
     task = build_structured_input_test_task(tmp_path)
-    pb = SimpleChainOfThoughtPromptBuilder(task)
-    await run_structured_input_task(task, model_name, provider_name, pb)
+    await run_structured_input_task(
+        task, model_name, provider_name, "simple_chain_of_thought_prompt_builder"
+    )
+
+
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_structured_output_cot_prompt_builder(
+    tmp_path, model_name, provider_name
+):
+    check_supports_structured_output(model_name, provider_name)
+    triangle_schema = {
+        "type": "object",
+        "properties": {
+            "is_equilateral": {
+                "type": "boolean",
+                "description": "True if all sides of the triangle are equal in length",
+            },
+            "is_scalene": {
+                "type": "boolean",
+                "description": "True if all sides of the triangle have different lengths",
+            },
+            "is_obtuse": {
+                "type": "boolean",
+                "description": "True if one of the angles is greater than 90 degrees",
+            },
+        },
+        "required": ["is_equilateral", "is_scalene", "is_obtuse"],
+        "additionalProperties": False,
+    }
+    task = build_structured_input_test_task(tmp_path)
+    task.instruction = """
+    You are an assistant which classifies a triangle given the lengths of its sides. If all sides are of equal length, the triangle is equilateral. If two sides are equal, the triangle is isosceles. Otherwise, it is scalene.\n\n"
+
+    When asked for a final result, this is the format (for an equilateral example):
+    ```json
+    {
+        "is_equilateral": true,
+        "is_scalene": false,
+        "is_obtuse": false
+    }
+    ```
+    """
+    task.output_json_schema = json.dumps(triangle_schema)
+    task.save_to_file()
+    await run_structured_input_task(
+        task, model_name, provider_name, "simple_chain_of_thought_prompt_builder"
+    )
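For callers outside the test suite, the important signature change above is in adapter_for_task: the prompt_builder object argument is replaced by a prompt_id string. A minimal sketch follows, assuming a reachable Ollama provider; the project and task setup is illustrative and mirrors build_structured_output_test_task, not package code.

from pathlib import Path

import kiln_ai.datamodel as datamodel
from kiln_ai.adapters.adapter_registry import adapter_for_task


async def joke_about(tmp_path: Path, topic: str):
    project = datamodel.Project(name="demo", path=tmp_path / "demo.kiln")
    project.save_to_file()
    task = datamodel.Task(
        parent=project,
        name="joke generator",
        instruction="Generate a joke about the given topic.",
    )
    task.save_to_file()

    # prompt_id (a string ID) replaces the old prompt_builder instance argument.
    adapter = adapter_for_task(
        task,
        model_name="llama_3_1_8b",
        provider="ollama",
        prompt_id="simple_chain_of_thought_prompt_builder",
    )
    return await adapter.invoke_returning_raw(topic)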
kiln_ai/adapters/ollama_tools.py

@@ -1,4 +1,3 @@
-import os
 from typing import Any, List
 
 import httpx
kiln_ai/adapters/parsers/__init__.py

@@ -0,0 +1,10 @@
+"""
+# Parsers
+
+Parsing utilities for JSON and models with custom output formats (R1, etc.)
+
+"""
+
+from . import base_parser, json_parser, r1_parser
+
+__all__ = ["r1_parser", "base_parser", "json_parser"]
kiln_ai/adapters/parsers/base_parser.py

@@ -0,0 +1,12 @@
+from kiln_ai.adapters.run_output import RunOutput
+
+
+class BaseParser:
+    def __init__(self, structured_output: bool = False):
+        self.structured_output = structured_output
+
+    def parse_output(self, original_output: RunOutput) -> RunOutput:
+        """
+        Method for parsing the output of a model. Typically overridden by subclasses.
+        """
+        return original_output
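BaseParser is deliberately a pass-through default; model-specific parsers override parse_output and return a new RunOutput. A hypothetical subclass is sketched below to illustrate the contract; AnswerPrefixParser is not part of the package.

from kiln_ai.adapters.parsers.base_parser import BaseParser
from kiln_ai.adapters.run_output import RunOutput


class AnswerPrefixParser(BaseParser):
    # Strips a leading "Answer:" label from string outputs; leaves everything else untouched.
    PREFIX = "Answer:"

    def parse_output(self, original_output: RunOutput) -> RunOutput:
        output = original_output.output
        if isinstance(output, str) and output.startswith(self.PREFIX):
            output = output[len(self.PREFIX) :].strip()
        return RunOutput(
            output=output,
            intermediate_outputs=original_output.intermediate_outputs,
        )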
kiln_ai/adapters/parsers/json_parser.py

@@ -0,0 +1,37 @@
+import json
+from typing import Any, Dict
+
+
+def parse_json_string(json_string: str) -> Dict[str, Any]:
+    """
+    Parse a JSON string into a dictionary. Handles multiple formats:
+    - Plain JSON
+    - JSON wrapped in ```json code blocks
+    - JSON wrapped in ``` code blocks
+
+    Args:
+        json_string: String containing JSON data, possibly wrapped in code blocks
+
+    Returns:
+        Dict containing parsed JSON data
+
+    Raises:
+        ValueError: If JSON parsing fails
+    """
+    # Remove code block markers if present
+    cleaned_string = json_string.strip()
+    if cleaned_string.startswith("```"):
+        # Split by newlines and remove first/last lines if they contain ```
+        lines = cleaned_string.split("\n")
+        if lines[0].startswith("```"):
+            lines = lines[1:]
+        if lines and lines[-1].strip() == "```":
+            lines = lines[:-1]
+        cleaned_string = "\n".join(lines)
+
+    try:
+        return json.loads(cleaned_string)
+    except json.JSONDecodeError as e:
+        raise ValueError(
+            f"This task requires JSON output but the model didn't return valid JSON. Search 'Troubleshooting Structured Data Issues' in our docs for more information. The model produced the following: {cleaned_string}"
+        ) from e
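In practice this means a model can answer with bare JSON or with a fenced code block and both parse to the same dictionary; anything that still fails json.loads surfaces as the ValueError above. A quick usage sketch:

from kiln_ai.adapters.parsers.json_parser import parse_json_string

plain = '{"setup": "Why did the cow cross the road?", "punchline": "To get to the udder side."}'
fenced = "```json\n" + plain + "\n```"

# Both inputs produce the same dict; code-fence markers are stripped first.
assert parse_json_string(plain) == parse_json_string(fenced)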
kiln_ai/adapters/parsers/parser_registry.py

@@ -0,0 +1,19 @@
+from typing import Type
+
+from kiln_ai.adapters.ml_model_list import ModelParserID
+from kiln_ai.adapters.parsers.base_parser import BaseParser
+from kiln_ai.adapters.parsers.r1_parser import R1ThinkingParser
+from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+
+def model_parser_from_id(parser_id: ModelParserID | None) -> Type[BaseParser]:
+    """
+    Get a model parser from its ID.
+    """
+    match parser_id:
+        case None:
+            return BaseParser
+        case ModelParserID.r1_thinking:
+            return R1ThinkingParser
+        case _:
+            raise_exhaustive_enum_error(parser_id)
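The registry maps a ModelParserID from ml_model_list to a parser class; callers instantiate the returned class themselves, passing structured_output when the task expects a schema-conforming answer. For example:

from kiln_ai.adapters.ml_model_list import ModelParserID
from kiln_ai.adapters.parsers.parser_registry import model_parser_from_id

# None falls back to the pass-through BaseParser; R1-style models get R1ThinkingParser.
parser_class = model_parser_from_id(ModelParserID.r1_thinking)
parser = parser_class(structured_output=True)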
kiln_ai/adapters/parsers/r1_parser.py

@@ -0,0 +1,69 @@
+from kiln_ai.adapters.parsers.base_parser import BaseParser
+from kiln_ai.adapters.parsers.json_parser import parse_json_string
+from kiln_ai.adapters.run_output import RunOutput
+
+
+class R1ThinkingParser(BaseParser):
+    START_TAG = "<think>"
+    END_TAG = "</think>"
+
+    def parse_output(self, original_output: RunOutput) -> RunOutput:
+        """
+        Parse the <think> </think> tags from the response into the intermediate and final outputs.
+
+        Args:
+            original_output: RunOutput containing the raw response string
+
+        Returns:
+            ParsedOutput containing the intermediate content (thinking content) and final result
+
+        Raises:
+            ValueError: If response format is invalid (missing tags, multiple tags, or no content after closing tag)
+        """
+        # This parser only works for strings
+        if not isinstance(original_output.output, str):
+            raise ValueError("Response must be a string for R1 parser")
+
+        # Strip whitespace and validate basic structure
+        cleaned_response = original_output.output.strip()
+        if not cleaned_response.startswith(self.START_TAG):
+            raise ValueError("Response must start with <think> tag")
+
+        # Find the thinking tags
+        think_start = cleaned_response.find(self.START_TAG)
+        think_end = cleaned_response.find(self.END_TAG)
+
+        if think_start == -1 or think_end == -1:
+            raise ValueError("Missing thinking tags")
+
+        # Check for multiple tags
+        if (
+            cleaned_response.count(self.START_TAG) > 1
+            or cleaned_response.count(self.END_TAG) > 1
+        ):
+            raise ValueError("Multiple thinking tags found")
+
+        # Extract thinking content
+        thinking_content = cleaned_response[
+            think_start + len(self.START_TAG) : think_end
+        ].strip()
+
+        # Extract result (everything after </think>)
+        result = cleaned_response[think_end + len(self.END_TAG) :].strip()
+
+        if not result or len(result) == 0:
+            raise ValueError("No content found after </think> tag")
+
+        # Parse JSON if needed
+        output = result
+        if self.structured_output:
+            output = parse_json_string(result)
+
+        # Add thinking content to intermediate outputs if it exists
+        intermediate_outputs = original_output.intermediate_outputs or {}
+        intermediate_outputs["reasoning"] = thinking_content
+
+        return RunOutput(
+            output=output,
+            intermediate_outputs=intermediate_outputs,
+        )
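Putting the pieces together, an R1-style response is split into its reasoning and its answer, and the answer is run through parse_json_string when structured output is expected. A small end-to-end sketch of the parser above:

from kiln_ai.adapters.parsers.r1_parser import R1ThinkingParser
from kiln_ai.adapters.run_output import RunOutput

raw = RunOutput(
    output='<think>All three sides are equal.</think>{"is_equilateral": true}',
    intermediate_outputs=None,
)

parsed = R1ThinkingParser(structured_output=True).parse_output(raw)
assert parsed.intermediate_outputs["reasoning"] == "All three sides are equal."
assert parsed.output == {"is_equilateral": True}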