kiln-ai 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiln_ai/adapters/chat/chat_formatter.py +0 -1
- kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
- kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
- kiln_ai/adapters/data_gen/test_data_gen_task.py +311 -34
- kiln_ai/adapters/eval/base_eval.py +6 -7
- kiln_ai/adapters/eval/eval_runner.py +5 -1
- kiln_ai/adapters/eval/g_eval.py +17 -12
- kiln_ai/adapters/eval/test_base_eval.py +8 -2
- kiln_ai/adapters/eval/test_g_eval.py +115 -5
- kiln_ai/adapters/fine_tune/base_finetune.py +1 -6
- kiln_ai/adapters/fine_tune/dataset_formatter.py +1 -5
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +1 -1
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +2 -7
- kiln_ai/adapters/fine_tune/together_finetune.py +1 -1
- kiln_ai/adapters/ml_model_list.py +293 -44
- kiln_ai/adapters/model_adapters/litellm_adapter.py +9 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +0 -1
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +48 -0
- kiln_ai/adapters/model_adapters/test_structured_output.py +3 -3
- kiln_ai/adapters/parsers/parser_registry.py +0 -2
- kiln_ai/adapters/parsers/r1_parser.py +0 -1
- kiln_ai/adapters/remote_config.py +66 -0
- kiln_ai/adapters/repair/repair_task.py +1 -6
- kiln_ai/adapters/test_ml_model_list.py +18 -0
- kiln_ai/adapters/test_prompt_adaptors.py +0 -4
- kiln_ai/adapters/test_remote_config.py +100 -0
- kiln_ai/datamodel/eval.py +32 -0
- kiln_ai/datamodel/finetune.py +0 -1
- kiln_ai/datamodel/task_output.py +0 -2
- kiln_ai/datamodel/task_run.py +0 -2
- kiln_ai/datamodel/test_eval_model.py +146 -4
- kiln_ai/utils/logging.py +4 -3
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/RECORD +36 -34
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/data_gen/test_data_gen_task.py CHANGED

@@ -3,6 +3,10 @@ import json
 import pytest

 from kiln_ai.adapters.adapter_registry import adapter_for_task
+from kiln_ai.adapters.data_gen.data_gen_prompts import (
+    generate_sample_generation_prompt,
+    generate_topic_tree_prompt,
+)
 from kiln_ai.adapters.data_gen.data_gen_task import (
     DataGenCategoriesTask,
     DataGenCategoriesTaskInput,
@@ -33,22 +37,19 @@ def test_data_gen_categories_task_input_initialization(base_task):
     # Arrange
     node_path = ["root", "branch", "leaf"]
     num_subtopics = 4
-    human_guidance = "Test guidance"

     # Act
     input_model = DataGenCategoriesTaskInput.from_task(
         task=base_task,
         node_path=node_path,
         num_subtopics=num_subtopics,
-        human_guidance=human_guidance,
     )

     # Assert
-    assert input_model.
-    assert input_model.
-    assert input_model.
-    assert
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == node_path
+    assert input_model.kiln_data_gen_num_subtopics == num_subtopics
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt


 def test_data_gen_categories_task_input_default_values(base_task):
@@ -56,14 +57,13 @@ def test_data_gen_categories_task_input_default_values(base_task):
     input_model = DataGenCategoriesTaskInput.from_task(task=base_task)

     # Assert
-    assert input_model.
-    assert input_model.
-    assert input_model.node_path == []
+    assert input_model.kiln_data_gen_num_subtopics == 6
+    assert input_model.kiln_data_gen_topic_path == []


 def test_data_gen_categories_task_initialization():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="training", guidance="Test guidance")

     # Assert
     assert task.name == "DataGen"
@@ -72,11 +72,16 @@ def test_data_gen_categories_task_initialization():
     assert task.instruction is not None
     assert isinstance(task.input_json_schema, str)
     assert isinstance(task.output_json_schema, str)
+    assert "I want to train a large language model" in task.instruction
+    assert "Test guidance" in task.instruction


 def test_data_gen_categories_task_schemas():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="eval", guidance="Test guidance")
+
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction

     # Assert
     input_schema = json.loads(task.input_json_schema)
@@ -86,12 +91,14 @@ def test_data_gen_categories_task_schemas():
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["subtopics"]["type"] == "array"
-    assert input_schema["properties"]["
-    assert
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert (
+        input_schema["properties"]["kiln_data_gen_num_subtopics"]["type"] == "integer"
+    )
     assert set(input_schema["required"]) == {
-        "
-        "
-        "
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_subtopics",
+        "kiln_data_gen_system_prompt",
     }

@@ -106,7 +113,7 @@ async def test_data_gen_all_models_providers(
         # pass if the model doesn't support data gen (testing the support flag is part of this)
         return

-    data_gen_task = DataGenCategoriesTask()
+    data_gen_task = DataGenCategoriesTask(gen_type="training", guidance=None)
     data_gen_input = DataGenCategoriesTaskInput.from_task(base_task, num_subtopics=6)

     adapter = adapter_for_task(
@@ -131,22 +138,19 @@ def test_data_gen_sample_task_input_initialization(base_task):
     # Arrange
     topic = ["cowboys", "hats"]
     num_samples = 4
-    human_guidance = "Test guidance"

     # Act
     input_model = DataGenSampleTaskInput.from_task(
         task=base_task,
         topic=topic,
         num_samples=num_samples,
-        human_guidance=human_guidance,
     )

     # Assert
-    assert input_model.
-    assert input_model.
-    assert input_model.
-    assert
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == topic
+    assert input_model.kiln_data_gen_num_samples == num_samples
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt


 def test_data_gen_sample_task_input_default_values(base_task):
@@ -154,20 +158,23 @@ def test_data_gen_sample_task_input_default_values(base_task):
     input_model = DataGenSampleTaskInput.from_task(task=base_task)

     # Assert
-    assert input_model.
-    assert input_model.
-    assert input_model.topic == []
+    assert input_model.kiln_data_gen_num_samples == 8
+    assert input_model.kiln_data_gen_topic_path == []


 def test_data_gen_sample_task_initialization(base_task):
     # Act
-    task = DataGenSampleTask(
+    task = DataGenSampleTask(
+        target_task=base_task, gen_type="eval", guidance="Test guidance"
+    )

     # Assert
     assert task.name == "DataGenSample"
     assert isinstance(task.parent, Project)
     assert task.description is not None
     assert task.instruction is not None
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction

     input_schema = json.loads(task.input_json_schema)
     output_schema = json.loads(task.output_json_schema)
@@ -176,12 +183,12 @@ def test_data_gen_sample_task_initialization(base_task):
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["generated_samples"]["type"] == "array"
-    assert input_schema["properties"]["
-    assert input_schema["properties"]["
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert input_schema["properties"]["kiln_data_gen_num_samples"]["type"] == "integer"
     assert set(input_schema["required"]) == {
-        "
-        "
-        "
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_samples",
+        "kiln_data_gen_system_prompt",
     }

@@ -332,3 +339,273 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
         assert "tweet" in sample
         assert isinstance(sample["username"], str)
         assert isinstance(sample["tweet"], str)
+
+
+def test_generate_topic_tree_prompt_training_type():
+    """Test generate_topic_tree_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_eval_type():
+    """Test generate_topic_tree_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_with_guidance():
+    """Test generate_topic_tree_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on technical topics related to artificial intelligence and machine learning"
+
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." not in prompt
+    )  # Should not have default guidance
+
+
+def test_generate_topic_tree_prompt_with_empty_guidance():
+    """Test generate_topic_tree_prompt with empty string guidance"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." in prompt
+    )  # Should have default guidance
+
+
+def test_generate_topic_tree_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    # Check for news examples
+    assert "News Topics" in prompt
+    assert "Sports" in prompt
+    assert "Football" in prompt
+    assert "College Football" in prompt
+    assert "Entertainment" in prompt
+    assert "Tom Hanks" in prompt
+
+    # Check for smalltalk examples
+    assert "Small Talk Topics" in prompt
+    assert "Weather" in prompt
+    assert "Family" in prompt
+    assert "Hobbies" in prompt
+    assert "Cooking" in prompt
+    assert "Asian Food" in prompt
+
+
+def test_generate_topic_tree_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "## Next Step" in prompt
+    assert "system_prompt" in prompt
+    assert "kiln_data_gen_topic_path" in prompt
+    assert "kiln_data_gen_num_subtopics" in prompt
+    assert "existing_topics" in prompt
+
+
+def test_generate_topic_tree_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_topic_tree_prompt(gen_type="training")
+    eval_prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+    assert "## Next Step" in training_prompt
+    assert "## Next Step" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_training_type():
+    """Test generate_sample_generation_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_eval_type():
+    """Test generate_sample_generation_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_with_guidance():
+    """Test generate_sample_generation_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on generating diverse examples with varying complexity levels"
+
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+
+
+def test_generate_sample_generation_prompt_with_empty_guidance():
+    """Test generate_sample_generation_prompt with empty string guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+
+
+def test_generate_sample_generation_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    # Check for the tweet classification example
+    assert "You are an assistant that classifies the tone of a tweet" in prompt
+    assert "positive" in prompt
+    assert "negative" in prompt
+    assert "neutral" in prompt
+    assert "Technology" in prompt
+    assert "New iPhone Event" in prompt
+    assert "New iPhone looks amazing! I need that camera." in prompt
+    assert "Another boring event from Apple." in prompt
+
+
+def test_generate_sample_generation_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "system_prompt" in prompt
+    assert "topic" in prompt
+    assert "num_samples" in prompt
+    assert "generated_samples" in prompt
+    assert "The output must be formatted:" in prompt
+    assert "Do not include any other text or break the schema in any way." in prompt
+    assert (
+        "Note how the output of this task is data to input into the system prompt"
+        in prompt
+    )
+
+
+def test_generate_sample_generation_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_sample_generation_prompt(gen_type="training")
+    eval_prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+    # Both should have the same core content
+    assert "Your job is to generate a list of potential inputs" in training_prompt
+    assert "Your job is to generate a list of potential inputs" in eval_prompt
+    assert "generated_samples" in training_prompt
+    assert "generated_samples" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_with_none_guidance():
+    """Test generate_sample_generation_prompt with None guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=None)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert "The guidance is:" not in prompt
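
Taken together, the test changes above show the reworked data generation API in 0.18.0: the generation goal (gen_type) and optional guidance now live on the task objects rather than being passed per input as human_guidance, and the structured input fields are prefixed with kiln_data_gen_. The following is an illustrative sketch assembled only from calls exercised in the tests above, not library documentation; base_task stands in for an existing Kiln Task and the guidance strings are placeholders.

from kiln_ai.adapters.data_gen.data_gen_prompts import (
    generate_sample_generation_prompt,
    generate_topic_tree_prompt,
)
from kiln_ai.adapters.data_gen.data_gen_task import (
    DataGenCategoriesTask,
    DataGenCategoriesTaskInput,
    DataGenSampleTask,
)

# Topic-tree generation: gen_type and guidance are set when the task is built.
topic_task = DataGenCategoriesTask(gen_type="training", guidance="Test guidance")
topic_input = DataGenCategoriesTaskInput.from_task(
    task=base_task,  # assumed: an existing kiln_ai Task
    node_path=["root", "branch"],
    num_subtopics=4,  # defaults to 6 when omitted, per the tests
)
assert topic_input.kiln_data_gen_topic_path == ["root", "branch"]

# Sample generation for evals targets an existing task.
sample_task = DataGenSampleTask(target_task=base_task, gen_type="eval", guidance=None)

# The underlying prompt builders are importable on their own as well.
tree_prompt = generate_topic_tree_prompt(gen_type="eval", guidance="placeholder guidance")
sample_prompt = generate_sample_generation_prompt(gen_type="training")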
kiln_ai/adapters/eval/base_eval.py CHANGED

@@ -7,12 +7,7 @@ from kiln_ai.adapters.ml_model_list import ModelProviderName
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
 from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
-from kiln_ai.datamodel.task import (
-    RunConfig,
-    RunConfigProperties,
-    TaskOutputRatingType,
-    TaskRun,
-)
+from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error

@@ -124,7 +119,9 @@ class BaseEval:
            property["minimum"] = 1
            property["maximum"] = 5
        else:
-            property["
+            property["type"] = "integer"
+            property["minimum"] = 1
+            property["maximum"] = 5

        property["description"] = (
            f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
@@ -139,6 +136,7 @@ class BaseEval:
            )
        else:
            property["enum"] = ["pass", "fail"]
+            property["type"] = "string"
        property["description"] = (
            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
        )
@@ -152,6 +150,7 @@ class BaseEval:
            )
        else:
            property["enum"] = ["pass", "fail", "critical"]
+            property["type"] = "string"
        property["description"] = (
            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure."
        )
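
For orientation, the base_eval.py hunks above mean every generated score property now carries an explicit JSON "type" (previously some branches only set an enum or left the type implicit). Below is a sketch of the resulting property shapes; the property names and "<instruction>" placeholders are hypothetical, while the type, bounds, enum, and description wording follow the assertions in this diff. The surrounding schema is omitted.

# Hypothetical property dicts; shapes follow the diff above.
five_star_property = {
    "type": "integer",
    "minimum": 1,
    "maximum": 5,
    "description": "<instruction>\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.",
}

pass_fail_property = {
    "type": "string",  # newly added alongside the existing enum
    "enum": ["pass", "fail"],
    "description": "<instruction>\n\nThe rating should be either 'pass' or 'fail'.",
}

pass_fail_critical_property = {
    "type": "string",  # newly added alongside the existing enum
    "enum": ["pass", "fail", "critical"],
}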
kiln_ai/adapters/eval/eval_runner.py CHANGED

@@ -8,7 +8,7 @@ from kiln_ai.datamodel.basemodel import ID_TYPE
 from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
 from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
 from kiln_ai.datamodel.task import TaskRunConfig
-from kiln_ai.datamodel.task_run import TaskRun
+from kiln_ai.datamodel.task_run import TaskRun, Usage
 from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress

 logger = logging.getLogger(__name__)
@@ -177,10 +177,12 @@ class EvalRunner:
         task_output: str | None = None
         scores: EvalScores | None = None
         intermediate_outputs: Dict[str, str] | None = None
+        task_run_usage: Usage | None = None
         if job.type == "eval_config_eval":
             # Eval config eval, we use the saved input from the task run, not invoking the task again
             scores, intermediate_outputs = await evaluator.run_eval(job.item)
             task_output = job.item.output.output
+            task_run_usage = job.item.usage
         else:
             # Task run eval, we invoke the task again to get a fresh output
             (
@@ -189,6 +191,7 @@ class EvalRunner:
                 intermediate_outputs,
             ) = await evaluator.run_task_and_eval(job.item.input)
             task_output = result_task_run.output.output
+            task_run_usage = result_task_run.usage

         # Save the job result
         eval_run = EvalRun(
@@ -202,6 +205,7 @@ class EvalRunner:
             input=job.item.input,
             output=task_output,
             intermediate_outputs=intermediate_outputs,
+            task_run_usage=task_run_usage,
         )
         eval_run.save_to_file()

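
Per the eval_runner.py hunks above, each saved EvalRun now records the token usage of the task run it scored: copied from the stored run for eval-config evals, or taken from the freshly invoked run otherwise. A minimal sketch, assuming eval_run is an already-loaded EvalRun; the field is optional and may be None, for example on runs saved by older versions.

# task_run_usage is typed Usage | None in this release.
usage = eval_run.task_run_usage
if usage is not None:
    print(usage)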
kiln_ai/adapters/eval/g_eval.py CHANGED

@@ -102,6 +102,18 @@ class GEval(BaseEval):

         self.geval_task = GEvalTask(eval_config)

+    def generate_run_description(self, eval_input: str, eval_output: str) -> str:
+        return f"""The model was given the following input for the task:
+<eval_data>
+{eval_input}
+</eval_data>
+
+The model produced the following output for the task:
+<eval_data>
+{eval_output}
+</eval_data>
+"""
+
     async def run_eval(
         self, task_run: TaskRun
     ) -> tuple[EvalScores, Dict[str, str] | None]:
@@ -145,19 +157,12 @@ class GEval(BaseEval):
             ),
         )

-
-
-
-</eval_data>
-
-The model produced the following output for the task:
-<eval_data>
-{task_run.output}
-</eval_data>
-"""
+        run_description = self.generate_run_description(
+            task_run.input, task_run.output.output
+        )

         # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
-        _, run_output = await adapter.invoke_returning_run_output(
+        _, run_output = await adapter.invoke_returning_run_output(run_description)

         if self.eval_config.config_type == EvalConfigType.llm_as_judge:
             return self.build_llm_as_judge_score(
@@ -310,7 +315,7 @@ The model produced the following output for the task:
         """
         primary_token_score = self.score_from_token_string(token_logprob.token)
         # check this is a real rating token, it could just be the ": ", "," or whitespace
-        if
+        if primary_token_score is None:
             return None

         total_score = 0.0
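
The g_eval.py hunks above factor the judge prompt out of run_eval into a generate_run_description() helper, which wraps the task input and output in <eval_data> tags. A hypothetical usage sketch; evaluator stands in for an already-constructed GEval and task_run for the TaskRun being judged.

# Builds the same string run_eval now passes to the judge adapter.
run_description = evaluator.generate_run_description(
    eval_input=task_run.input,
    eval_output=task_run.output.output,
)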
kiln_ai/adapters/eval/test_base_eval.py CHANGED

@@ -43,7 +43,9 @@ def test_score_schema_five_star():

     # Check score property, and that it's an enum of 1-5
     score_prop = schema["properties"]["quality_score"]
-    assert score_prop["
+    assert score_prop["type"] == "integer"
+    assert score_prop["minimum"] == 1
+    assert score_prop["maximum"] == 5
     assert "Quality Score" in score_prop["title"]
     assert "Rate the quality" in score_prop["description"]
     assert "between 1 and 5" in score_prop["description"]
@@ -51,7 +53,9 @@ def test_score_schema_five_star():
     # Check overall rating property, and that it's an enum of 1-5
     assert "overall_rating" in schema["properties"]
     overall = schema["properties"]["overall_rating"]
-    assert overall["
+    assert overall["type"] == "integer"
+    assert overall["minimum"] == 1
+    assert overall["maximum"] == 5
     assert "Overall Rating" in overall["title"]
     assert "The overall rating for the task output" in overall["description"]
     assert "between 1 and 5" in overall["description"]
@@ -127,6 +131,7 @@ def test_score_schema_pass_fail():
     schema = json.loads(schema_str)

     score_prop = schema["properties"]["pass_fail_test"]
+    assert score_prop["type"] == "string"
     assert score_prop["enum"] == ["pass", "fail"]
     assert "Pass Fail Test" in score_prop["title"]
     assert "Check if it passes" in score_prop["description"]
@@ -173,6 +178,7 @@ def test_score_schema_pass_fail_critical():
     score_prop = schema["properties"]["critical_test"]
     assert "enum" in score_prop
     assert score_prop["enum"] == ["pass", "fail", "critical"]
+    assert score_prop["type"] == "string"
     assert "'pass', 'fail', or 'critical'" in score_prop["description"]

     assert schema["properties"]["overall_rating"] is not None