kiln-ai 0.11.1__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of kiln-ai might be problematic.

Files changed (80)
  1. kiln_ai/adapters/__init__.py +4 -0
  2. kiln_ai/adapters/adapter_registry.py +163 -39
  3. kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
  4. kiln_ai/adapters/eval/__init__.py +28 -0
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +270 -0
  7. kiln_ai/adapters/eval/g_eval.py +368 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +325 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +641 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +498 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +16 -2
  14. kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
  15. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
  16. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
  17. kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
  18. kiln_ai/adapters/fine_tune/test_together_finetune.py +531 -0
  19. kiln_ai/adapters/fine_tune/together_finetune.py +325 -0
  20. kiln_ai/adapters/ml_model_list.py +758 -163
  21. kiln_ai/adapters/model_adapters/__init__.py +2 -4
  22. kiln_ai/adapters/model_adapters/base_adapter.py +61 -43
  23. kiln_ai/adapters/model_adapters/litellm_adapter.py +391 -0
  24. kiln_ai/adapters/model_adapters/litellm_config.py +13 -0
  25. kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
  26. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -0
  27. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
  28. kiln_ai/adapters/model_adapters/test_structured_output.py +59 -35
  29. kiln_ai/adapters/ollama_tools.py +3 -3
  30. kiln_ai/adapters/parsers/r1_parser.py +19 -14
  31. kiln_ai/adapters/parsers/test_r1_parser.py +17 -5
  32. kiln_ai/adapters/prompt_builders.py +80 -42
  33. kiln_ai/adapters/provider_tools.py +50 -58
  34. kiln_ai/adapters/repair/repair_task.py +9 -21
  35. kiln_ai/adapters/repair/test_repair_task.py +6 -6
  36. kiln_ai/adapters/run_output.py +3 -0
  37. kiln_ai/adapters/test_adapter_registry.py +26 -29
  38. kiln_ai/adapters/test_generate_docs.py +4 -4
  39. kiln_ai/adapters/test_ollama_tools.py +0 -1
  40. kiln_ai/adapters/test_prompt_adaptors.py +47 -33
  41. kiln_ai/adapters/test_prompt_builders.py +91 -31
  42. kiln_ai/adapters/test_provider_tools.py +26 -81
  43. kiln_ai/datamodel/__init__.py +50 -952
  44. kiln_ai/datamodel/basemodel.py +2 -0
  45. kiln_ai/datamodel/datamodel_enums.py +60 -0
  46. kiln_ai/datamodel/dataset_filters.py +114 -0
  47. kiln_ai/datamodel/dataset_split.py +170 -0
  48. kiln_ai/datamodel/eval.py +298 -0
  49. kiln_ai/datamodel/finetune.py +105 -0
  50. kiln_ai/datamodel/json_schema.py +7 -1
  51. kiln_ai/datamodel/project.py +23 -0
  52. kiln_ai/datamodel/prompt.py +37 -0
  53. kiln_ai/datamodel/prompt_id.py +83 -0
  54. kiln_ai/datamodel/strict_mode.py +24 -0
  55. kiln_ai/datamodel/task.py +181 -0
  56. kiln_ai/datamodel/task_output.py +328 -0
  57. kiln_ai/datamodel/task_run.py +164 -0
  58. kiln_ai/datamodel/test_basemodel.py +19 -11
  59. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  60. kiln_ai/datamodel/test_dataset_split.py +32 -8
  61. kiln_ai/datamodel/test_datasource.py +22 -2
  62. kiln_ai/datamodel/test_eval_model.py +635 -0
  63. kiln_ai/datamodel/test_example_models.py +9 -13
  64. kiln_ai/datamodel/test_json_schema.py +23 -0
  65. kiln_ai/datamodel/test_models.py +2 -2
  66. kiln_ai/datamodel/test_prompt_id.py +129 -0
  67. kiln_ai/datamodel/test_task.py +159 -0
  68. kiln_ai/utils/config.py +43 -1
  69. kiln_ai/utils/dataset_import.py +232 -0
  70. kiln_ai/utils/test_dataset_import.py +596 -0
  71. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/METADATA +86 -6
  72. kiln_ai-0.13.0.dist-info/RECORD +103 -0
  73. kiln_ai/adapters/model_adapters/langchain_adapters.py +0 -302
  74. kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -11
  75. kiln_ai/adapters/model_adapters/openai_model_adapter.py +0 -246
  76. kiln_ai/adapters/model_adapters/test_langchain_adapter.py +0 -350
  77. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +0 -225
  78. kiln_ai-0.11.1.dist-info/RECORD +0 -76
  79. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/WHEEL +0 -0
  80. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/test_base_eval.py (new file)
@@ -0,0 +1,325 @@
+import json
+
+import pytest
+
+from kiln_ai.adapters.eval.base_eval import BaseEval
+from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType
+from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore
+from kiln_ai.datamodel.task import (
+    RunConfigProperties,
+    Task,
+    TaskOutputRatingType,
+    TaskRequirement,
+    TaskRunConfig,
+)
+
+
+def test_score_schema_five_star():
+    # Create an eval with a five-star score
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="Quality Score",
+                instruction="Rate the quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+            EvalOutputScore(
+                name="Overall Rating",
+                instruction="The overall rating for the task output",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    schema_str = BaseEval.build_score_schema(eval)
+    schema = json.loads(schema_str)
+
+    # Check basic schema structure
+    assert schema["type"] == "object"
+    assert schema["required"] == ["quality_score", "overall_rating"]
+
+    # Check score property, and that it's an enum of 1-5
+    score_prop = schema["properties"]["quality_score"]
+    assert score_prop["enum"] == [1, 2, 3, 4, 5]
+    assert "Quality Score" in score_prop["title"]
+    assert "Rate the quality" in score_prop["description"]
+    assert "between 1 and 5" in score_prop["description"]
+
+    # Check overall rating property, and that it's an enum of 1-5
+    assert "overall_rating" in schema["properties"]
+    overall = schema["properties"]["overall_rating"]
+    assert overall["enum"] == [1, 2, 3, 4, 5]
+    assert "Overall Rating" in overall["title"]
+    assert "The overall rating for the task output" in overall["description"]
+    assert "between 1 and 5" in overall["description"]
+
+
+def test_score_schema_five_star_float():
+    # Create an eval with a five-star score
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="Quality Score",
+                instruction="Rate the quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+            EvalOutputScore(
+                name="Overall Rating",
+                instruction="The overall rating for the task output",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True)
+    schema = json.loads(schema_str)
+
+    # Check basic schema structure
+    assert schema["type"] == "object"
+    assert schema["required"] == ["quality_score", "overall_rating"]
+
+    # Check score property
+    score_prop = schema["properties"]["quality_score"]
+    assert score_prop["type"] == "number"
+    assert score_prop["minimum"] == 1
+    assert score_prop["maximum"] == 5
+    assert "Quality Score" in score_prop["title"]
+    assert "Rate the quality" in score_prop["description"]
+    assert "between 1 and 5" in score_prop["description"]
+
+    # Check overall rating property
+    assert "overall_rating" in schema["properties"]
+    overall = schema["properties"]["overall_rating"]
+    assert overall["type"] == "number"
+    assert overall["minimum"] == 1
+    assert overall["maximum"] == 5
+    assert "Overall Rating" in overall["title"]
+    assert "The overall rating for the task output" in overall["description"]
+    assert "between 1 and 5" in overall["description"]
+
+
+def test_score_schema_pass_fail():
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="Pass Fail Test",
+                instruction="Check if it passes",
+                type=TaskOutputRatingType.pass_fail,
+            ),
+            EvalOutputScore(
+                name="Overall Rating",
+                instruction="The overall rating for the task output",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    schema_str = BaseEval.build_score_schema(eval)
+    schema = json.loads(schema_str)
+
+    score_prop = schema["properties"]["pass_fail_test"]
+    assert score_prop["enum"] == ["pass", "fail"]
+    assert "Pass Fail Test" in score_prop["title"]
+    assert "Check if it passes" in score_prop["description"]
+    assert "'pass' or 'fail'" in score_prop["description"]
+
+    assert schema["properties"]["overall_rating"] is not None
+
+    # Now check that we can allow float scores with the proper float structure
+    schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True)
+    schema = json.loads(schema_str)
+
+    score_prop = schema["properties"]["pass_fail_test"]
+    assert score_prop["type"] == "number"
+    assert score_prop["minimum"] == 0
+    assert score_prop["maximum"] == 1
+    assert (
+        "between 0 and 1, with 0 being a failure and 1 being a pass"
+        in score_prop["description"]
+    )
+
+
+def test_score_schema_pass_fail_critical():
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="Critical Test",
+                instruction="Check for critical issues",
+                type=TaskOutputRatingType.pass_fail_critical,
+            ),
+            EvalOutputScore(
+                name="Overall Rating",
+                instruction="The overall rating for the task output",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    schema_str = BaseEval.build_score_schema(eval)
+    schema = json.loads(schema_str)
+
+    score_prop = schema["properties"]["critical_test"]
+    assert "enum" in score_prop
+    assert score_prop["enum"] == ["pass", "fail", "critical"]
+    assert "'pass', 'fail', or 'critical'" in score_prop["description"]
+
+    assert schema["properties"]["overall_rating"] is not None
+
+    # Now check that we can allow float scores with the proper float structure
+    schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True)
+    schema = json.loads(schema_str)
+
+    score_prop = schema["properties"]["critical_test"]
+    assert score_prop["type"] == "number"
+    assert score_prop["minimum"] == -1
+    assert score_prop["maximum"] == 1
+    assert "between -1 and 1, with 1 being a pass" in score_prop["description"]
+
+
+def test_score_schema_multiple_scores():
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+            EvalOutputScore(
+                name="Pass Check",
+                instruction="Basic pass check",
+                type=TaskOutputRatingType.pass_fail,
+            ),
+            EvalOutputScore(
+                name="Security",
+                instruction="Check security",
+                type=TaskOutputRatingType.pass_fail_critical,
+            ),
+            EvalOutputScore(
+                name="Overall Rating",
+                instruction="The overall rating for the task output",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    schema_str = BaseEval.build_score_schema(eval)
+    schema = json.loads(schema_str)
+
+    # Verify order is maintained
+    assert list(schema["properties"].keys()) == [
+        "quality",
+        "pass_check",
+        "security",
+        "overall_rating",
+    ]
+
+
+def test_score_schema_no_scores():
+    # This should raise an error since at least one score is required
+    with pytest.raises(ValueError, match="output_scores are required"):
+        eval = Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[],
+        )
+        BaseEval.build_score_schema(eval)
+
+
+class EvalTester(BaseEval):
+    """Test implementation of BaseEval"""
+
+    async def run_eval(self, task_run):
+        return {"overall_rating": 5, "quality": 4}
+
+
+@pytest.mark.paid
+@pytest.mark.asyncio
+async def test_run_method():
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model=DataSource(
+            type=DataSourceType.synthetic,
+            properties={
+                "model_name": "gpt-4o",
+                "model_provider": "openai",
+                "adapter_name": "test",
+            },
+        ),
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+                EvalOutputScore(
+                    name="Overall Rating",
+                    instruction="The overall rating for the task output",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        prompt=BasePrompt(
+            name="Test Prompt",
+            prompt="Test prompt",
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    run_config = TaskRunConfig(
+        name="Test Run Config",
+        run_config_properties=RunConfigProperties(
+            model_name="llama_3_1_8b",
+            model_provider_name="groq",
+            prompt_id="simple_prompt_builder",
+        ),
+        parent=task,
+    )
+
+    evaluator = EvalTester(eval_config, run_config.run_config())
+
+    # Run the evaluation
+    task_run, eval_scores = await evaluator.run("test input")
+
+    # Verify task run was created
+    assert task_run.input == "test input"
+    assert isinstance(task_run.output.output, str)
+
+    # Verify eval scores match schema and contain expected values
+    assert eval_scores["overall_rating"] == 5
+    assert eval_scores["quality"] == 4
+
+    # Verify schema validation worked (these keys should exist per schema)
+    assert set(eval_scores.keys()) == {"overall_rating", "quality"}
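For readers skimming the diff: the assertions in test_score_schema_five_star imply roughly the following shape for the JSON schema string returned by BaseEval.build_score_schema when given two five-star scores. This is a sketch inferred only from what the tests check; the exact title and description wording comes from the library and is abbreviated below.

# Sketch of the expected schema shape, inferred from the test assertions above.
# Title/description strings are abbreviated; only the checked fields are shown.
expected_shape = {
    "type": "object",
    "required": ["quality_score", "overall_rating"],
    "properties": {
        "quality_score": {
            "title": "Quality Score",  # title contains the score name
            "description": "Rate the quality ... between 1 and 5 ...",  # abbreviated
            "enum": [1, 2, 3, 4, 5],
        },
        "overall_rating": {
            "title": "Overall Rating",
            "description": "The overall rating for the task output ... between 1 and 5 ...",
            "enum": [1, 2, 3, 4, 5],
        },
    },
}

With allow_float_scores=True, the later tests expect each score property to become a "number" with minimum/maximum bounds instead of an enum: 1 to 5 for five_star, 0 to 1 for pass_fail, and -1 to 1 for pass_fail_critical.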