kiln-ai 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- kiln_ai/adapters/__init__.py +2 -0
- kiln_ai/adapters/adapter_registry.py +22 -44
- kiln_ai/adapters/chat/__init__.py +8 -0
- kiln_ai/adapters/chat/chat_formatter.py +233 -0
- kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
- kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
- kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
- kiln_ai/adapters/data_gen/test_data_gen_task.py +330 -40
- kiln_ai/adapters/eval/base_eval.py +7 -6
- kiln_ai/adapters/eval/eval_runner.py +9 -2
- kiln_ai/adapters/eval/g_eval.py +40 -17
- kiln_ai/adapters/eval/test_base_eval.py +174 -17
- kiln_ai/adapters/eval/test_eval_runner.py +3 -0
- kiln_ai/adapters/eval/test_g_eval.py +116 -5
- kiln_ai/adapters/fine_tune/base_finetune.py +3 -8
- kiln_ai/adapters/fine_tune/dataset_formatter.py +135 -273
- kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +287 -353
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +6 -11
- kiln_ai/adapters/fine_tune/together_finetune.py +13 -2
- kiln_ai/adapters/ml_model_list.py +370 -84
- kiln_ai/adapters/model_adapters/base_adapter.py +73 -26
- kiln_ai/adapters/model_adapters/litellm_adapter.py +88 -97
- kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
- kiln_ai/adapters/model_adapters/test_base_adapter.py +235 -61
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +104 -21
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -0
- kiln_ai/adapters/model_adapters/test_structured_output.py +44 -12
- kiln_ai/adapters/parsers/parser_registry.py +0 -2
- kiln_ai/adapters/parsers/r1_parser.py +0 -1
- kiln_ai/adapters/prompt_builders.py +0 -16
- kiln_ai/adapters/provider_tools.py +27 -9
- kiln_ai/adapters/remote_config.py +66 -0
- kiln_ai/adapters/repair/repair_task.py +1 -6
- kiln_ai/adapters/repair/test_repair_task.py +24 -3
- kiln_ai/adapters/test_adapter_registry.py +88 -28
- kiln_ai/adapters/test_ml_model_list.py +176 -0
- kiln_ai/adapters/test_prompt_adaptors.py +17 -7
- kiln_ai/adapters/test_prompt_builders.py +3 -16
- kiln_ai/adapters/test_provider_tools.py +69 -20
- kiln_ai/adapters/test_remote_config.py +100 -0
- kiln_ai/datamodel/__init__.py +0 -2
- kiln_ai/datamodel/datamodel_enums.py +38 -13
- kiln_ai/datamodel/eval.py +32 -0
- kiln_ai/datamodel/finetune.py +12 -8
- kiln_ai/datamodel/task.py +68 -7
- kiln_ai/datamodel/task_output.py +0 -2
- kiln_ai/datamodel/task_run.py +0 -2
- kiln_ai/datamodel/test_basemodel.py +2 -1
- kiln_ai/datamodel/test_dataset_split.py +0 -8
- kiln_ai/datamodel/test_eval_model.py +146 -4
- kiln_ai/datamodel/test_models.py +33 -10
- kiln_ai/datamodel/test_task.py +168 -2
- kiln_ai/utils/config.py +3 -2
- kiln_ai/utils/dataset_import.py +1 -1
- kiln_ai/utils/logging.py +166 -0
- kiln_ai/utils/test_config.py +23 -0
- kiln_ai/utils/test_dataset_import.py +30 -0
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
- kiln_ai-0.18.0.dist-info/RECORD +115 -0
- kiln_ai-0.16.0.dist-info/RECORD +0 -108
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0

kiln_ai/adapters/data_gen/test_data_gen_task.py

@@ -3,6 +3,10 @@ import json
 import pytest

 from kiln_ai.adapters.adapter_registry import adapter_for_task
+from kiln_ai.adapters.data_gen.data_gen_prompts import (
+    generate_sample_generation_prompt,
+    generate_topic_tree_prompt,
+)
 from kiln_ai.adapters.data_gen.data_gen_task import (
     DataGenCategoriesTask,
     DataGenCategoriesTaskInput,
@@ -14,6 +18,7 @@ from kiln_ai.adapters.data_gen.data_gen_task import (
 from kiln_ai.adapters.provider_tools import get_model_and_provider
 from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers
 from kiln_ai.datamodel import Project, Task
+from kiln_ai.datamodel.task import RunConfigProperties


 @pytest.fixture
@@ -32,22 +37,19 @@ def test_data_gen_categories_task_input_initialization(base_task):
     # Arrange
     node_path = ["root", "branch", "leaf"]
     num_subtopics = 4
-    human_guidance = "Test guidance"

     # Act
     input_model = DataGenCategoriesTaskInput.from_task(
         task=base_task,
         node_path=node_path,
         num_subtopics=num_subtopics,
-        human_guidance=human_guidance,
     )

     # Assert
-    assert input_model.
-    assert input_model.
-    assert input_model.
-    assert
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == node_path
+    assert input_model.kiln_data_gen_num_subtopics == num_subtopics
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt


 def test_data_gen_categories_task_input_default_values(base_task):
@@ -55,14 +57,13 @@ def test_data_gen_categories_task_input_default_values(base_task):
     input_model = DataGenCategoriesTaskInput.from_task(task=base_task)

     # Assert
-    assert input_model.
-    assert input_model.
-    assert input_model.node_path == []
+    assert input_model.kiln_data_gen_num_subtopics == 6
+    assert input_model.kiln_data_gen_topic_path == []


 def test_data_gen_categories_task_initialization():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="training", guidance="Test guidance")

     # Assert
     assert task.name == "DataGen"
@@ -71,11 +72,16 @@ def test_data_gen_categories_task_initialization():
     assert task.instruction is not None
     assert isinstance(task.input_json_schema, str)
     assert isinstance(task.output_json_schema, str)
+    assert "I want to train a large language model" in task.instruction
+    assert "Test guidance" in task.instruction


 def test_data_gen_categories_task_schemas():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="eval", guidance="Test guidance")
+
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction

     # Assert
     input_schema = json.loads(task.input_json_schema)
@@ -85,12 +91,14 @@ def test_data_gen_categories_task_schemas():
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["subtopics"]["type"] == "array"
-    assert input_schema["properties"]["
-    assert
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert (
+        input_schema["properties"]["kiln_data_gen_num_subtopics"]["type"] == "integer"
+    )
     assert set(input_schema["required"]) == {
-        "
-        "
-        "
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_subtopics",
+        "kiln_data_gen_system_prompt",
     }


@@ -105,13 +113,17 @@ async def test_data_gen_all_models_providers(
         # pass if the model doesn't support data gen (testing the support flag is part of this)
         return

-    data_gen_task = DataGenCategoriesTask()
+    data_gen_task = DataGenCategoriesTask(gen_type="training", guidance=None)
     data_gen_input = DataGenCategoriesTaskInput.from_task(base_task, num_subtopics=6)

     adapter = adapter_for_task(
         data_gen_task,
-
-
+        run_config_properties=RunConfigProperties(
+            model_name=model_name,
+            model_provider_name=provider_name,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="unknown",
+        ),
     )

     input_dict = data_gen_input.model_dump()
@@ -126,22 +138,19 @@ def test_data_gen_sample_task_input_initialization(base_task):
     # Arrange
     topic = ["cowboys", "hats"]
     num_samples = 4
-    human_guidance = "Test guidance"

     # Act
     input_model = DataGenSampleTaskInput.from_task(
         task=base_task,
         topic=topic,
         num_samples=num_samples,
-        human_guidance=human_guidance,
     )

     # Assert
-    assert input_model.
-    assert input_model.
-    assert input_model.
-    assert
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == topic
+    assert input_model.kiln_data_gen_num_samples == num_samples
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt


 def test_data_gen_sample_task_input_default_values(base_task):
@@ -149,20 +158,23 @@ def test_data_gen_sample_task_input_default_values(base_task):
     input_model = DataGenSampleTaskInput.from_task(task=base_task)

     # Assert
-    assert input_model.
-    assert input_model.
-    assert input_model.topic == []
+    assert input_model.kiln_data_gen_num_samples == 8
+    assert input_model.kiln_data_gen_topic_path == []


 def test_data_gen_sample_task_initialization(base_task):
     # Act
-    task = DataGenSampleTask(
+    task = DataGenSampleTask(
+        target_task=base_task, gen_type="eval", guidance="Test guidance"
+    )

     # Assert
     assert task.name == "DataGenSample"
     assert isinstance(task.parent, Project)
     assert task.description is not None
     assert task.instruction is not None
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction

     input_schema = json.loads(task.input_json_schema)
     output_schema = json.loads(task.output_json_schema)
@@ -171,12 +183,12 @@ def test_data_gen_sample_task_initialization(base_task):
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["generated_samples"]["type"] == "array"
-    assert input_schema["properties"]["
-    assert input_schema["properties"]["
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert input_schema["properties"]["kiln_data_gen_num_samples"]["type"] == "integer"
     assert set(input_schema["required"]) == {
-        "
-        "
-        "
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_samples",
+        "kiln_data_gen_system_prompt",
     }


@@ -254,8 +266,12 @@ async def test_data_gen_sample_all_models_providers(

     adapter = adapter_for_task(
         data_gen_task,
-
-
+        run_config_properties=RunConfigProperties(
+            model_name=model_name,
+            model_provider_name=provider_name,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="unknown",
+        ),
     )

     input_dict = data_gen_input.model_dump()
@@ -304,8 +320,12 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(

     adapter = adapter_for_task(
         data_gen_task,
-
-
+        run_config_properties=RunConfigProperties(
+            model_name=model_name,
+            model_provider_name=provider_name,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="unknown",
+        ),
     )

     input_dict = data_gen_input.model_dump()
@@ -319,3 +339,273 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
         assert "tweet" in sample
         assert isinstance(sample["username"], str)
         assert isinstance(sample["tweet"], str)
+
+
+def test_generate_topic_tree_prompt_training_type():
+    """Test generate_topic_tree_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_eval_type():
+    """Test generate_topic_tree_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_with_guidance():
+    """Test generate_topic_tree_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on technical topics related to artificial intelligence and machine learning"
+
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." not in prompt
+    )  # Should not have default guidance
+
+
+def test_generate_topic_tree_prompt_with_empty_guidance():
+    """Test generate_topic_tree_prompt with empty string guidance"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." in prompt
+    )  # Should have default guidance
+
+
+def test_generate_topic_tree_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    # Check for news examples
+    assert "News Topics" in prompt
+    assert "Sports" in prompt
+    assert "Football" in prompt
+    assert "College Football" in prompt
+    assert "Entertainment" in prompt
+    assert "Tom Hanks" in prompt
+
+    # Check for smalltalk examples
+    assert "Small Talk Topics" in prompt
+    assert "Weather" in prompt
+    assert "Family" in prompt
+    assert "Hobbies" in prompt
+    assert "Cooking" in prompt
+    assert "Asian Food" in prompt
+
+
+def test_generate_topic_tree_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "## Next Step" in prompt
+    assert "system_prompt" in prompt
+    assert "kiln_data_gen_topic_path" in prompt
+    assert "kiln_data_gen_num_subtopics" in prompt
+    assert "existing_topics" in prompt
+
+
+def test_generate_topic_tree_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_topic_tree_prompt(gen_type="training")
+    eval_prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+    assert "## Next Step" in training_prompt
+    assert "## Next Step" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_training_type():
+    """Test generate_sample_generation_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_eval_type():
+    """Test generate_sample_generation_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_with_guidance():
+    """Test generate_sample_generation_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on generating diverse examples with varying complexity levels"
+
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+
+
+def test_generate_sample_generation_prompt_with_empty_guidance():
+    """Test generate_sample_generation_prompt with empty string guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+
+
+def test_generate_sample_generation_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    # Check for the tweet classification example
+    assert "You are an assistant that classifies the tone of a tweet" in prompt
+    assert "positive" in prompt
+    assert "negative" in prompt
+    assert "neutral" in prompt
+    assert "Technology" in prompt
+    assert "New iPhone Event" in prompt
+    assert "New iPhone looks amazing! I need that camera." in prompt
+    assert "Another boring event from Apple." in prompt
+
+
+def test_generate_sample_generation_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "system_prompt" in prompt
+    assert "topic" in prompt
+    assert "num_samples" in prompt
+    assert "generated_samples" in prompt
+    assert "The output must be formatted:" in prompt
+    assert "Do not include any other text or break the schema in any way." in prompt
+    assert (
+        "Note how the output of this task is data to input into the system prompt"
+        in prompt
+    )
+
+
+def test_generate_sample_generation_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_sample_generation_prompt(gen_type="training")
+    eval_prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+    # Both should have the same core content
+    assert "Your job is to generate a list of potential inputs" in training_prompt
+    assert "Your job is to generate a list of potential inputs" in eval_prompt
+    assert "generated_samples" in training_prompt
+    assert "generated_samples" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_with_none_guidance():
+    """Test generate_sample_generation_prompt with None guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=None)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert "The guidance is:" not in prompt
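
The tests above exercise the reworked data-gen API in 0.18.0: `DataGenCategoriesTask` and `DataGenSampleTask` now take a `gen_type` ("training" or "eval") plus an optional `guidance` string, the per-input `human_guidance` argument is gone, the input fields are prefixed with `kiln_data_gen_`, and `adapter_for_task` accepts a single `RunConfigProperties` object instead of separate model and provider arguments. Below is a minimal sketch of the new prompt helpers, inferred only from the test calls shown above; it is not documentation of the package API, and the guidance text is a made-up placeholder.

```python
# Sketch based on the 0.18.0 tests above; assumes kiln-ai >= 0.18.0 is installed.
from kiln_ai.adapters.data_gen.data_gen_prompts import (
    generate_sample_generation_prompt,
    generate_topic_tree_prompt,
)

# gen_type switches the framing between training-data and eval-data generation.
topic_prompt = generate_topic_tree_prompt(gen_type="training")

# Passing guidance adds a "## Custom Guidance" section wrapping the text in
# <guidance> tags and drops the default "remain somewhat vague" instruction.
guided_prompt = generate_topic_tree_prompt(
    gen_type="eval",
    guidance="Focus on edge cases the model tends to get wrong",  # placeholder text
)

# Same pattern for the per-topic sample generation prompt.
sample_prompt = generate_sample_generation_prompt(gen_type="eval", guidance=None)

print(topic_prompt[:120])
```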

kiln_ai/adapters/eval/base_eval.py

@@ -2,8 +2,6 @@ import json
 from abc import abstractmethod
 from typing import Dict

-import jsonschema
-
 from kiln_ai.adapters.adapter_registry import adapter_for_task
 from kiln_ai.adapters.ml_model_list import ModelProviderName
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
@@ -60,14 +58,13 @@ class BaseEval:

         run_adapter = adapter_for_task(
             self.target_task,
-            self.run_config
-            ModelProviderName(self.run_config.model_provider_name),
+            self.run_config,
             base_adapter_config=AdapterConfig(allow_saving=False),
         )

         # Parse structured input if needed
         parsed_input = input
-        if self.target_task.
+        if self.target_task.input_json_schema is not None:
             parsed_input = json.loads(input)

         # we don't save by default here. We'll save manually after validating the output
@@ -122,7 +119,9 @@ class BaseEval:
                 property["minimum"] = 1
                 property["maximum"] = 5
             else:
-                property["
+                property["type"] = "integer"
+                property["minimum"] = 1
+                property["maximum"] = 5

             property["description"] = (
                 f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
@@ -137,6 +136,7 @@ class BaseEval:
                 )
             else:
                 property["enum"] = ["pass", "fail"]
+                property["type"] = "string"
                 property["description"] = (
                     f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                 )
@@ -150,6 +150,7 @@ class BaseEval:
                 )
             else:
                 property["enum"] = ["pass", "fail", "critical"]
+                property["type"] = "string"
                 property["description"] = (
                     f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure."
                 )
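
The `base_eval.py` change above makes every rubric property in the generated score schema carry an explicit JSON-Schema `type`: the non-float 1-5 rating branch now emits `"type": "integer"` with the 1-5 bounds, and the pass/fail branches add `"type": "string"` next to their enums. A rough illustration of the resulting property shapes follows; the property names and instruction text are placeholders, and this is not the library's schema-builder code.

```python
import json

# Hypothetical rubric instruction; in kiln this comes from output_score.instruction.
instruction = "Rate the overall quality of the answer."

five_star_property = {
    "type": "integer",  # now set explicitly even when float scores are disabled
    "minimum": 1,
    "maximum": 5,
    "description": f"{instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.",
}

pass_fail_property = {
    "type": "string",  # newly added alongside the existing enum
    "enum": ["pass", "fail"],
    "description": f"{instruction}\n\nThe rating should be either 'pass' or 'fail'.",
}

# Assembled into an object schema for illustration only.
schema = {
    "type": "object",
    "properties": {
        "overall_rating": five_star_property,  # placeholder property names
        "passed": pass_fail_property,
    },
    "required": ["overall_rating", "passed"],
}
print(json.dumps(schema, indent=2))
```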

kiln_ai/adapters/eval/eval_runner.py

@@ -8,7 +8,7 @@ from kiln_ai.datamodel.basemodel import ID_TYPE
 from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
 from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
 from kiln_ai.datamodel.task import TaskRunConfig
-from kiln_ai.datamodel.task_run import TaskRun
+from kiln_ai.datamodel.task_run import TaskRun, Usage
 from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress

 logger = logging.getLogger(__name__)
@@ -177,10 +177,12 @@ class EvalRunner:
             task_output: str | None = None
             scores: EvalScores | None = None
             intermediate_outputs: Dict[str, str] | None = None
+            task_run_usage: Usage | None = None
             if job.type == "eval_config_eval":
                 # Eval config eval, we use the saved input from the task run, not invoking the task again
                 scores, intermediate_outputs = await evaluator.run_eval(job.item)
                 task_output = job.item.output.output
+                task_run_usage = job.item.usage
             else:
                 # Task run eval, we invoke the task again to get a fresh output
                 (
@@ -189,6 +191,7 @@ class EvalRunner:
                     intermediate_outputs,
                 ) = await evaluator.run_task_and_eval(job.item.input)
                 task_output = result_task_run.output.output
+                task_run_usage = result_task_run.usage

             # Save the job result
             eval_run = EvalRun(
@@ -202,10 +205,14 @@ class EvalRunner:
                 input=job.item.input,
                 output=task_output,
                 intermediate_outputs=intermediate_outputs,
+                task_run_usage=task_run_usage,
             )
             eval_run.save_to_file()

             return True
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Error running eval job for dataset item {job.item.id}: {e}",
+                exc_info=True,
+            )
             return False