kiln-ai 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic.

Files changed (63)
  1. kiln_ai/adapters/adapter_registry.py +12 -13
  2. kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
  3. kiln_ai/adapters/eval/base_eval.py +164 -0
  4. kiln_ai/adapters/eval/eval_runner.py +267 -0
  5. kiln_ai/adapters/eval/g_eval.py +367 -0
  6. kiln_ai/adapters/eval/registry.py +16 -0
  7. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  8. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  9. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  10. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  11. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
  12. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
  13. kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
  14. kiln_ai/adapters/ml_model_list.py +141 -29
  15. kiln_ai/adapters/model_adapters/base_adapter.py +50 -35
  16. kiln_ai/adapters/model_adapters/langchain_adapters.py +27 -20
  17. kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -1
  18. kiln_ai/adapters/model_adapters/openai_model_adapter.py +93 -50
  19. kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
  20. kiln_ai/adapters/model_adapters/test_langchain_adapter.py +7 -14
  21. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +55 -64
  22. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
  23. kiln_ai/adapters/model_adapters/test_structured_output.py +36 -30
  24. kiln_ai/adapters/ollama_tools.py +0 -1
  25. kiln_ai/adapters/prompt_builders.py +80 -42
  26. kiln_ai/adapters/repair/repair_task.py +9 -21
  27. kiln_ai/adapters/repair/test_repair_task.py +3 -3
  28. kiln_ai/adapters/run_output.py +3 -0
  29. kiln_ai/adapters/test_adapter_registry.py +10 -10
  30. kiln_ai/adapters/test_generate_docs.py +6 -6
  31. kiln_ai/adapters/test_ollama_tools.py +0 -1
  32. kiln_ai/adapters/test_prompt_adaptors.py +17 -14
  33. kiln_ai/adapters/test_prompt_builders.py +91 -31
  34. kiln_ai/datamodel/__init__.py +50 -952
  35. kiln_ai/datamodel/datamodel_enums.py +58 -0
  36. kiln_ai/datamodel/dataset_filters.py +114 -0
  37. kiln_ai/datamodel/dataset_split.py +170 -0
  38. kiln_ai/datamodel/eval.py +298 -0
  39. kiln_ai/datamodel/finetune.py +105 -0
  40. kiln_ai/datamodel/json_schema.py +6 -0
  41. kiln_ai/datamodel/project.py +23 -0
  42. kiln_ai/datamodel/prompt.py +37 -0
  43. kiln_ai/datamodel/prompt_id.py +83 -0
  44. kiln_ai/datamodel/strict_mode.py +24 -0
  45. kiln_ai/datamodel/task.py +181 -0
  46. kiln_ai/datamodel/task_output.py +321 -0
  47. kiln_ai/datamodel/task_run.py +164 -0
  48. kiln_ai/datamodel/test_basemodel.py +10 -11
  49. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  50. kiln_ai/datamodel/test_dataset_split.py +32 -8
  51. kiln_ai/datamodel/test_datasource.py +3 -2
  52. kiln_ai/datamodel/test_eval_model.py +635 -0
  53. kiln_ai/datamodel/test_example_models.py +9 -13
  54. kiln_ai/datamodel/test_json_schema.py +23 -0
  55. kiln_ai/datamodel/test_models.py +2 -2
  56. kiln_ai/datamodel/test_prompt_id.py +129 -0
  57. kiln_ai/datamodel/test_task.py +159 -0
  58. kiln_ai/utils/config.py +6 -1
  59. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +37 -1
  60. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  61. kiln_ai-0.11.1.dist-info/RECORD +0 -76
  62. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  63. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/test_base_eval.py
@@ -0,0 +1,324 @@
+ import json
+
+ import pytest
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType
+ from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore
+ from kiln_ai.datamodel.task import (
+     RunConfigProperties,
+     Task,
+     TaskOutputRatingType,
+     TaskRequirement,
+     TaskRunConfig,
+ )
+
+
+ def test_score_schema_five_star():
+     # Create an eval with a five-star score
+     eval = Eval(
+         name="Test Eval",
+         eval_set_filter_id="tag::tag1",
+         eval_configs_filter_id="tag::tag2",
+         output_scores=[
+             EvalOutputScore(
+                 name="Quality Score",
+                 instruction="Rate the quality",
+                 type=TaskOutputRatingType.five_star,
+             ),
+             EvalOutputScore(
+                 name="Overall Rating",
+                 instruction="The overall rating for the task output",
+                 type=TaskOutputRatingType.five_star,
+             ),
+         ],
+     )
+
+     schema_str = BaseEval.build_score_schema(eval)
+     schema = json.loads(schema_str)
+
+     # Check basic schema structure
+     assert schema["type"] == "object"
+     assert schema["required"] == ["quality_score", "overall_rating"]
+
+     # Check score property, and that it's an enum of 1-5
+     score_prop = schema["properties"]["quality_score"]
+     assert score_prop["enum"] == [1, 2, 3, 4, 5]
+     assert "Quality Score" in score_prop["title"]
+     assert "Rate the quality" in score_prop["description"]
+     assert "between 1 and 5" in score_prop["description"]
+
+     # Check overall rating property, and that it's an enum of 1-5
+     assert "overall_rating" in schema["properties"]
+     overall = schema["properties"]["overall_rating"]
+     assert overall["enum"] == [1, 2, 3, 4, 5]
+     assert "Overall Rating" in overall["title"]
+     assert "The overall rating for the task output" in overall["description"]
+     assert "between 1 and 5" in overall["description"]
+
+
+ def test_score_schema_five_star_float():
+     # Create an eval with a five-star score
+     eval = Eval(
+         name="Test Eval",
+         eval_set_filter_id="tag::tag1",
+         eval_configs_filter_id="tag::tag2",
+         output_scores=[
+             EvalOutputScore(
+                 name="Quality Score",
+                 instruction="Rate the quality",
+                 type=TaskOutputRatingType.five_star,
+             ),
+             EvalOutputScore(
+                 name="Overall Rating",
+                 instruction="The overall rating for the task output",
+                 type=TaskOutputRatingType.five_star,
+             ),
+         ],
+     )
+
+     schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True)
+     schema = json.loads(schema_str)
+
+     # Check basic schema structure
+     assert schema["type"] == "object"
+     assert schema["required"] == ["quality_score", "overall_rating"]
+
+     # Check score property
+     score_prop = schema["properties"]["quality_score"]
+     assert score_prop["type"] == "number"
+     assert score_prop["minimum"] == 1
+     assert score_prop["maximum"] == 5
+     assert "Quality Score" in score_prop["title"]
+     assert "Rate the quality" in score_prop["description"]
+     assert "between 1 and 5" in score_prop["description"]
+
+     # Check overall rating property
+     assert "overall_rating" in schema["properties"]
+     overall = schema["properties"]["overall_rating"]
+     assert overall["type"] == "number"
+     assert overall["minimum"] == 1
+     assert overall["maximum"] == 5
+     assert "Overall Rating" in overall["title"]
+     assert "The overall rating for the task output" in overall["description"]
+     assert "between 1 and 5" in overall["description"]
+
+
+ def test_score_schema_pass_fail():
+     eval = Eval(
+         name="Test Eval",
+         eval_set_filter_id="tag::tag1",
+         eval_configs_filter_id="tag::tag2",
+         output_scores=[
+             EvalOutputScore(
+                 name="Pass Fail Test",
+                 instruction="Check if it passes",
+                 type=TaskOutputRatingType.pass_fail,
+             ),
+             EvalOutputScore(
+                 name="Overall Rating",
+                 instruction="The overall rating for the task output",
+                 type=TaskOutputRatingType.five_star,
+             ),
+         ],
+     )
+
+     schema_str = BaseEval.build_score_schema(eval)
+     schema = json.loads(schema_str)
+
+     score_prop = schema["properties"]["pass_fail_test"]
+     assert score_prop["enum"] == ["pass", "fail"]
+     assert "Pass Fail Test" in score_prop["title"]
+     assert "Check if it passes" in score_prop["description"]
+     assert "'pass' or 'fail'" in score_prop["description"]
+
+     assert schema["properties"]["overall_rating"] is not None
+
+     # Now check that we can allow float scores with the proper float structure
+     schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True)
+     schema = json.loads(schema_str)
+
+     score_prop = schema["properties"]["pass_fail_test"]
+     assert score_prop["type"] == "number"
+     assert score_prop["minimum"] == 0
+     assert score_prop["maximum"] == 1
+     assert (
+         "between 0 and 1, with 0 being a failure and 1 being a pass"
+         in score_prop["description"]
+     )
+
+
+ def test_score_schema_pass_fail_critical():
+     eval = Eval(
+         name="Test Eval",
+         eval_set_filter_id="tag::tag1",
+         eval_configs_filter_id="tag::tag2",
+         output_scores=[
+             EvalOutputScore(
+                 name="Critical Test",
+                 instruction="Check for critical issues",
+                 type=TaskOutputRatingType.pass_fail_critical,
+             ),
+             EvalOutputScore(
+                 name="Overall Rating",
+                 instruction="The overall rating for the task output",
+                 type=TaskOutputRatingType.five_star,
+             ),
+         ],
+     )
+
+     schema_str = BaseEval.build_score_schema(eval)
+     schema = json.loads(schema_str)
+
+     score_prop = schema["properties"]["critical_test"]
+     assert "enum" in score_prop
+     assert score_prop["enum"] == ["pass", "fail", "critical"]
+     assert "'pass', 'fail', or 'critical'" in score_prop["description"]
+
+     assert schema["properties"]["overall_rating"] is not None
+
+     # Now check that we can allow float scores with the proper float structure
+     schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True)
+     schema = json.loads(schema_str)
+
+     score_prop = schema["properties"]["critical_test"]
+     assert score_prop["type"] == "number"
+     assert score_prop["minimum"] == -1
+     assert score_prop["maximum"] == 1
+     assert "between -1 and 1, with 1 being a pass" in score_prop["description"]
+
+
+ def test_score_schema_multiple_scores():
+     eval = Eval(
+         name="Test Eval",
+         eval_set_filter_id="tag::tag1",
+         eval_configs_filter_id="tag::tag2",
+         output_scores=[
+             EvalOutputScore(
+                 name="Quality",
+                 instruction="Rate quality",
+                 type=TaskOutputRatingType.five_star,
+             ),
+             EvalOutputScore(
+                 name="Pass Check",
+                 instruction="Basic pass check",
+                 type=TaskOutputRatingType.pass_fail,
+             ),
+             EvalOutputScore(
+                 name="Security",
+                 instruction="Check security",
+                 type=TaskOutputRatingType.pass_fail_critical,
+             ),
+             EvalOutputScore(
+                 name="Overall Rating",
+                 instruction="The overall rating for the task output",
+                 type=TaskOutputRatingType.five_star,
+             ),
+         ],
+     )
+
+     schema_str = BaseEval.build_score_schema(eval)
+     schema = json.loads(schema_str)
+
+     # Verify order is maintained
+     assert list(schema["properties"].keys()) == [
+         "quality",
+         "pass_check",
+         "security",
+         "overall_rating",
+     ]
+
+
+ def test_score_schema_no_scores():
+     # This should raise an error since at least one score is required
+     with pytest.raises(ValueError, match="output_scores are required"):
+         eval = Eval(
+             name="Test Eval",
+             eval_set_filter_id="tag::tag1",
+             eval_configs_filter_id="tag::tag2",
+             output_scores=[],
+         )
+         BaseEval.build_score_schema(eval)
+
+
+ class EvalTester(BaseEval):
+     """Test implementation of BaseEval"""
+
+     async def run_eval(self, task_run):
+         return {"overall_rating": 5, "quality": 4}
+
+
+ @pytest.mark.paid
+ @pytest.mark.asyncio
+ async def test_run_method():
+     task = Task(
+         name="Test Task",
+         instruction="Test instruction",
+         requirements=[
+             TaskRequirement(
+                 name="Quality",
+                 instruction="Rate quality",
+                 type=TaskOutputRatingType.five_star,
+             ),
+         ],
+     )
+
+     eval_config = EvalConfig(
+         name="Test Eval Config",
+         model=DataSource(
+             type=DataSourceType.synthetic,
+             properties={
+                 "model_name": "gpt-4o",
+                 "model_provider": "openai",
+                 "adapter_name": "test",
+             },
+         ),
+         parent=Eval(
+             name="Test Eval",
+             parent=task,
+             eval_set_filter_id="all",
+             eval_configs_filter_id="all",
+             output_scores=[
+                 EvalOutputScore(
+                     name="Quality",
+                     instruction="Rate quality",
+                     type=TaskOutputRatingType.five_star,
+                 ),
+                 EvalOutputScore(
+                     name="Overall Rating",
+                     instruction="The overall rating for the task output",
+                     type=TaskOutputRatingType.five_star,
+                 ),
+             ],
+         ),
+         prompt=BasePrompt(
+             name="Test Prompt",
+             prompt="Test prompt",
+         ),
+         properties={"eval_steps": ["test_step"]},
+     )
+
+     run_config = TaskRunConfig(
+         name="Test Run Config",
+         run_config_properties=RunConfigProperties(
+             model_name="llama_3_1_8b",
+             model_provider_name="groq",
+             prompt_id="simple_prompt_builder",
+         ),
+         parent=task,
+     )
+
+     evaluator = EvalTester(eval_config, run_config.run_config())
+
+     # Run the evaluation
+     task_run, eval_scores = await evaluator.run("test input")
+
+     # Verify task run was created
+     assert task_run.input == "test input"
+     assert isinstance(task_run.output.output, str)
+
+     # Verify eval scores match schema and contain expected values
+     assert eval_scores["overall_rating"] == 5
+     assert eval_scores["quality"] == 4
+
+     # Verify schema validation worked (these keys should exist per schema)
+     assert set(eval_scores.keys()) == {"overall_rating", "quality"}
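
For context, the assertions in these tests imply that BaseEval.build_score_schema returns a JSON object schema keyed by snake_case versions of the score names. The sketch below is illustrative only and reflects what the tests check, not the library's exact output; the precise title and description wording is an assumption.

    # Approximate shape of json.loads(BaseEval.build_score_schema(eval))
    # for the five-star case with allow_float_scores=False (illustrative).
    expected_shape = {
        "type": "object",
        "properties": {
            "quality_score": {
                "title": "Quality Score",  # derived from the score's name
                "description": "Rate the quality (between 1 and 5)",  # instruction plus range hint
                "enum": [1, 2, 3, 4, 5],  # integer ratings only
            },
            "overall_rating": {
                "title": "Overall Rating",
                "description": "The overall rating for the task output (between 1 and 5)",
                "enum": [1, 2, 3, 4, 5],
            },
        },
        "required": ["quality_score", "overall_rating"],
    }

With allow_float_scores=True, the tests instead expect each property to use "type": "number" with "minimum"/"maximum" bounds: 1 to 5 for five-star, 0 to 1 for pass/fail, and -1 to 1 for pass/fail/critical.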