kiln-ai 0.17.0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.

Potentially problematic release.

This version of kiln-ai might be problematic.

Files changed (58)
  1. kiln_ai/adapters/adapter_registry.py +28 -0
  2. kiln_ai/adapters/chat/chat_formatter.py +0 -1
  3. kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
  4. kiln_ai/adapters/data_gen/data_gen_task.py +51 -38
  5. kiln_ai/adapters/data_gen/test_data_gen_task.py +318 -37
  6. kiln_ai/adapters/eval/base_eval.py +6 -7
  7. kiln_ai/adapters/eval/eval_runner.py +5 -1
  8. kiln_ai/adapters/eval/g_eval.py +17 -12
  9. kiln_ai/adapters/eval/test_base_eval.py +8 -2
  10. kiln_ai/adapters/eval/test_eval_runner.py +6 -12
  11. kiln_ai/adapters/eval/test_g_eval.py +115 -5
  12. kiln_ai/adapters/eval/test_g_eval_data.py +1 -1
  13. kiln_ai/adapters/fine_tune/base_finetune.py +2 -6
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +1 -5
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +32 -20
  16. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +1 -1
  17. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +30 -21
  18. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +2 -7
  19. kiln_ai/adapters/fine_tune/together_finetune.py +1 -1
  20. kiln_ai/adapters/ml_model_list.py +926 -125
  21. kiln_ai/adapters/model_adapters/base_adapter.py +11 -7
  22. kiln_ai/adapters/model_adapters/litellm_adapter.py +23 -1
  23. kiln_ai/adapters/model_adapters/test_base_adapter.py +1 -2
  24. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +70 -3
  25. kiln_ai/adapters/model_adapters/test_structured_output.py +13 -13
  26. kiln_ai/adapters/parsers/parser_registry.py +0 -2
  27. kiln_ai/adapters/parsers/r1_parser.py +0 -1
  28. kiln_ai/adapters/parsers/test_r1_parser.py +1 -1
  29. kiln_ai/adapters/provider_tools.py +20 -19
  30. kiln_ai/adapters/remote_config.py +113 -0
  31. kiln_ai/adapters/repair/repair_task.py +2 -7
  32. kiln_ai/adapters/test_adapter_registry.py +30 -2
  33. kiln_ai/adapters/test_ml_model_list.py +30 -0
  34. kiln_ai/adapters/test_prompt_adaptors.py +0 -4
  35. kiln_ai/adapters/test_provider_tools.py +18 -12
  36. kiln_ai/adapters/test_remote_config.py +456 -0
  37. kiln_ai/datamodel/basemodel.py +54 -28
  38. kiln_ai/datamodel/datamodel_enums.py +2 -0
  39. kiln_ai/datamodel/dataset_split.py +5 -3
  40. kiln_ai/datamodel/eval.py +35 -3
  41. kiln_ai/datamodel/finetune.py +2 -3
  42. kiln_ai/datamodel/project.py +3 -3
  43. kiln_ai/datamodel/prompt.py +2 -2
  44. kiln_ai/datamodel/prompt_id.py +4 -4
  45. kiln_ai/datamodel/task.py +6 -6
  46. kiln_ai/datamodel/task_output.py +1 -3
  47. kiln_ai/datamodel/task_run.py +0 -2
  48. kiln_ai/datamodel/test_basemodel.py +210 -18
  49. kiln_ai/datamodel/test_eval_model.py +152 -10
  50. kiln_ai/datamodel/test_model_perf.py +1 -1
  51. kiln_ai/datamodel/test_prompt_id.py +5 -1
  52. kiln_ai/datamodel/test_task.py +5 -0
  53. kiln_ai/utils/config.py +10 -0
  54. kiln_ai/utils/logging.py +4 -3
  55. {kiln_ai-0.17.0.dist-info → kiln_ai-0.19.0.dist-info}/METADATA +33 -3
  56. {kiln_ai-0.17.0.dist-info → kiln_ai-0.19.0.dist-info}/RECORD +58 -56
  57. {kiln_ai-0.17.0.dist-info → kiln_ai-0.19.0.dist-info}/WHEEL +0 -0
  58. {kiln_ai-0.17.0.dist-info → kiln_ai-0.19.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/data_gen/test_data_gen_task.py

@@ -3,6 +3,10 @@ import json
 import pytest
 
 from kiln_ai.adapters.adapter_registry import adapter_for_task
+from kiln_ai.adapters.data_gen.data_gen_prompts import (
+    generate_sample_generation_prompt,
+    generate_topic_tree_prompt,
+)
 from kiln_ai.adapters.data_gen.data_gen_task import (
     DataGenCategoriesTask,
     DataGenCategoriesTaskInput,
@@ -33,22 +37,19 @@ def test_data_gen_categories_task_input_initialization(base_task):
     # Arrange
     node_path = ["root", "branch", "leaf"]
     num_subtopics = 4
-    human_guidance = "Test guidance"
 
     # Act
     input_model = DataGenCategoriesTaskInput.from_task(
         task=base_task,
         node_path=node_path,
         num_subtopics=num_subtopics,
-        human_guidance=human_guidance,
     )
 
     # Assert
-    assert input_model.node_path == node_path
-    assert input_model.num_subtopics == num_subtopics
-    assert input_model.human_guidance == human_guidance
-    assert isinstance(input_model.system_prompt, str)
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == node_path
+    assert input_model.kiln_data_gen_num_subtopics == num_subtopics
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt
 
 
 def test_data_gen_categories_task_input_default_values(base_task):
@@ -56,14 +57,13 @@ def test_data_gen_categories_task_input_default_values(base_task):
     input_model = DataGenCategoriesTaskInput.from_task(task=base_task)
 
     # Assert
-    assert input_model.num_subtopics == 6
-    assert input_model.human_guidance is None
-    assert input_model.node_path == []
+    assert input_model.kiln_data_gen_num_subtopics == 6
+    assert input_model.kiln_data_gen_topic_path == []
 
 
 def test_data_gen_categories_task_initialization():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="training", guidance="Test guidance")
 
     # Assert
     assert task.name == "DataGen"
@@ -72,11 +72,16 @@ def test_data_gen_categories_task_initialization():
     assert task.instruction is not None
     assert isinstance(task.input_json_schema, str)
     assert isinstance(task.output_json_schema, str)
+    assert "I want to train a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
 
 def test_data_gen_categories_task_schemas():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="eval", guidance="Test guidance")
+
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
 
     # Assert
@@ -86,12 +91,14 @@ def test_data_gen_categories_task_schemas():
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["subtopics"]["type"] == "array"
-    assert input_schema["properties"]["node_path"]["type"] == "array"
-    assert input_schema["properties"]["num_subtopics"]["type"] == "integer"
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert (
+        input_schema["properties"]["kiln_data_gen_num_subtopics"]["type"] == "integer"
+    )
     assert set(input_schema["required"]) == {
-        "node_path",
-        "num_subtopics",
-        "system_prompt",
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_subtopics",
+        "kiln_data_gen_system_prompt",
     }
 
 
@@ -106,7 +113,7 @@ async def test_data_gen_all_models_providers(
         # pass if the model doesn't support data gen (testing the support flag is part of this)
         return
 
-    data_gen_task = DataGenCategoriesTask()
+    data_gen_task = DataGenCategoriesTask(gen_type="training", guidance=None)
    data_gen_input = DataGenCategoriesTaskInput.from_task(base_task, num_subtopics=6)
 
     adapter = adapter_for_task(
@@ -131,22 +138,19 @@ def test_data_gen_sample_task_input_initialization(base_task):
     # Arrange
     topic = ["cowboys", "hats"]
     num_samples = 4
-    human_guidance = "Test guidance"
 
     # Act
     input_model = DataGenSampleTaskInput.from_task(
         task=base_task,
         topic=topic,
         num_samples=num_samples,
-        human_guidance=human_guidance,
     )
 
     # Assert
-    assert input_model.topic == topic
-    assert input_model.num_samples == num_samples
-    assert input_model.human_guidance == human_guidance
-    assert isinstance(input_model.system_prompt, str)
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == topic
+    assert input_model.kiln_data_gen_num_samples == num_samples
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt
 
 
 def test_data_gen_sample_task_input_default_values(base_task):
@@ -154,20 +158,23 @@ def test_data_gen_sample_task_input_default_values(base_task):
     input_model = DataGenSampleTaskInput.from_task(task=base_task)
 
     # Assert
-    assert input_model.num_samples == 8
-    assert input_model.human_guidance is None
-    assert input_model.topic == []
+    assert input_model.kiln_data_gen_num_samples == 8
+    assert input_model.kiln_data_gen_topic_path == []
 
 
 def test_data_gen_sample_task_initialization(base_task):
     # Act
-    task = DataGenSampleTask(target_task=base_task)
+    task = DataGenSampleTask(
+        target_task=base_task, gen_type="eval", guidance="Test guidance"
+    )
 
     # Assert
     assert task.name == "DataGenSample"
     assert isinstance(task.parent, Project)
     assert task.description is not None
     assert task.instruction is not None
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
     input_schema = json.loads(task.input_json_schema)
     output_schema = json.loads(task.output_json_schema)
@@ -176,12 +183,12 @@ def test_data_gen_sample_task_initialization(base_task):
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["generated_samples"]["type"] == "array"
-    assert input_schema["properties"]["topic"]["type"] == "array"
-    assert input_schema["properties"]["num_samples"]["type"] == "integer"
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert input_schema["properties"]["kiln_data_gen_num_samples"]["type"] == "integer"
     assert set(input_schema["required"]) == {
-        "topic",
-        "num_samples",
-        "system_prompt",
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_samples",
+        "kiln_data_gen_system_prompt",
     }
 
 
@@ -248,11 +255,13 @@ async def test_data_gen_sample_all_models_providers(
     tmp_path, model_name, provider_name, base_task
 ):
     _, provider = get_model_and_provider(model_name, provider_name)
-    if not provider.supports_data_gen:
+    if provider is None or not provider.supports_data_gen:
         # pass if the model doesn't support data gen (testing the support flag is part of this)
         return
 
-    data_gen_task = DataGenSampleTask(target_task=base_task)
+    data_gen_task = DataGenSampleTask(
+        target_task=base_task, gen_type="training", guidance=None
+    )
     data_gen_input = DataGenSampleTaskInput.from_task(
         base_task, topic=["riding horses"], num_samples=4
     )
@@ -306,7 +315,9 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
         # pass if the model doesn't support data gen (testing the support flag is part of this)
         return
 
-    data_gen_task = DataGenSampleTask(target_task=task)
+    data_gen_task = DataGenSampleTask(
+        target_task=task, gen_type="training", guidance=None
+    )
     data_gen_input = DataGenSampleTaskInput.from_task(
         task, topic=["Food"], num_samples=4
     )
@@ -332,3 +343,273 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
         assert "tweet" in sample
         assert isinstance(sample["username"], str)
         assert isinstance(sample["tweet"], str)
+
+
+def test_generate_topic_tree_prompt_training_type():
+    """Test generate_topic_tree_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_eval_type():
+    """Test generate_topic_tree_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_with_guidance():
+    """Test generate_topic_tree_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on technical topics related to artificial intelligence and machine learning"
+
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." not in prompt
+    )  # Should not have default guidance
+
+
+def test_generate_topic_tree_prompt_with_empty_guidance():
+    """Test generate_topic_tree_prompt with empty string guidance"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." in prompt
+    )  # Should have default guidance
+
+
+def test_generate_topic_tree_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    # Check for news examples
+    assert "News Topics" in prompt
+    assert "Sports" in prompt
+    assert "Football" in prompt
+    assert "College Football" in prompt
+    assert "Entertainment" in prompt
+    assert "Tom Hanks" in prompt
+
+    # Check for smalltalk examples
+    assert "Small Talk Topics" in prompt
+    assert "Weather" in prompt
+    assert "Family" in prompt
+    assert "Hobbies" in prompt
+    assert "Cooking" in prompt
+    assert "Asian Food" in prompt
+
+
+def test_generate_topic_tree_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "## Next Step" in prompt
+    assert "system_prompt" in prompt
+    assert "kiln_data_gen_topic_path" in prompt
+    assert "kiln_data_gen_num_subtopics" in prompt
+    assert "existing_topics" in prompt
+
+
+def test_generate_topic_tree_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_topic_tree_prompt(gen_type="training")
+    eval_prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+    assert "## Next Step" in training_prompt
+    assert "## Next Step" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_training_type():
+    """Test generate_sample_generation_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_eval_type():
+    """Test generate_sample_generation_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_with_guidance():
+    """Test generate_sample_generation_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on generating diverse examples with varying complexity levels"
+
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+
+
+def test_generate_sample_generation_prompt_with_empty_guidance():
+    """Test generate_sample_generation_prompt with empty string guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+
+
+def test_generate_sample_generation_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    # Check for the tweet classification example
+    assert "You are an assistant that classifies the tone of a tweet" in prompt
+    assert "positive" in prompt
+    assert "negative" in prompt
+    assert "neutral" in prompt
+    assert "Technology" in prompt
+    assert "New iPhone Event" in prompt
+    assert "New iPhone looks amazing! I need that camera." in prompt
+    assert "Another boring event from Apple." in prompt
+
+
+def test_generate_sample_generation_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "system_prompt" in prompt
+    assert "topic" in prompt
+    assert "num_samples" in prompt
+    assert "generated_samples" in prompt
+    assert "The output must be formatted:" in prompt
+    assert "Do not include any other text or break the schema in any way." in prompt
+    assert (
+        "Note how the output of this task is data to input into the system prompt"
+        in prompt
+    )
+
+
+def test_generate_sample_generation_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_sample_generation_prompt(gen_type="training")
+    eval_prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+    # Both should have the same core content
+    assert "Your job is to generate a list of potential inputs" in training_prompt
+    assert "Your job is to generate a list of potential inputs" in eval_prompt
+    assert "generated_samples" in training_prompt
+    assert "generated_samples" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_with_none_guidance():
+    """Test generate_sample_generation_prompt with None guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=None)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert "The guidance is:" not in prompt
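The hunks above reflect the reworked data-gen API in this release: prompt text is built per generation goal, tasks take an explicit gen_type plus optional guidance in place of the old human_guidance input, and input fields are namespaced with a kiln_data_gen_ prefix. A minimal usage sketch inferred from the calls these tests make (the "Focus on edge cases" guidance string is illustrative, not from the package):

    # Sketch based on the test calls above; not an official example.
    from kiln_ai.adapters.data_gen.data_gen_prompts import generate_topic_tree_prompt
    from kiln_ai.adapters.data_gen.data_gen_task import DataGenCategoriesTask

    # Prompt text depends on the generation goal ("training" or "eval") and on
    # optional custom guidance; custom guidance replaces the default vagueness hint.
    prompt = generate_topic_tree_prompt(gen_type="eval", guidance="Focus on edge cases")
    assert "## Custom Guidance" in prompt

    # Tasks now take gen_type and guidance at construction time.
    task = DataGenCategoriesTask(gen_type="training", guidance="Test guidance")
    assert "I want to train a large language model" in task.instruction
    assert "Test guidance" in task.instruction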

kiln_ai/adapters/eval/base_eval.py

@@ -7,12 +7,7 @@ from kiln_ai.adapters.ml_model_list import ModelProviderName
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
 from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
-from kiln_ai.datamodel.task import (
-    RunConfig,
-    RunConfigProperties,
-    TaskOutputRatingType,
-    TaskRun,
-)
+from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 
 
@@ -124,7 +119,9 @@ class BaseEval:
                     property["minimum"] = 1
                     property["maximum"] = 5
                 else:
-                    property["enum"] = [1, 2, 3, 4, 5]
+                    property["type"] = "integer"
+                    property["minimum"] = 1
+                    property["maximum"] = 5
 
                 property["description"] = (
                     f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
@@ -139,6 +136,7 @@ class BaseEval:
                     )
                 else:
                     property["enum"] = ["pass", "fail"]
+                    property["type"] = "string"
                     property["description"] = (
                         f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                     )
@@ -152,6 +150,7 @@
                     )
                 else:
                     property["enum"] = ["pass", "fail", "critical"]
+                    property["type"] = "string"
                     property["description"] = (
                         f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure."
                     )
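These base_eval.py changes alter the shape of the generated score schema: five-star ratings move from an integer enum to a bounded integer, and the categorical ratings declare an explicit string type alongside their enum. A sketch of the resulting property fragments, matching the assertions in test_g_eval.py further down (titles and descriptions omitted):

    # Five-star score property (was: {"enum": [1, 2, 3, 4, 5]})
    five_star_property = {"type": "integer", "minimum": 1, "maximum": 5}

    # Pass/fail score property now carries an explicit type
    pass_fail_property = {"type": "string", "enum": ["pass", "fail"]}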

kiln_ai/adapters/eval/eval_runner.py

@@ -8,7 +8,7 @@ from kiln_ai.datamodel.basemodel import ID_TYPE
 from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
 from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
 from kiln_ai.datamodel.task import TaskRunConfig
-from kiln_ai.datamodel.task_run import TaskRun
+from kiln_ai.datamodel.task_run import TaskRun, Usage
 from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress
 
 logger = logging.getLogger(__name__)
@@ -177,10 +177,12 @@ class EvalRunner:
         task_output: str | None = None
         scores: EvalScores | None = None
         intermediate_outputs: Dict[str, str] | None = None
+        task_run_usage: Usage | None = None
         if job.type == "eval_config_eval":
             # Eval config eval, we use the saved input from the task run, not invoking the task again
             scores, intermediate_outputs = await evaluator.run_eval(job.item)
             task_output = job.item.output.output
+            task_run_usage = job.item.usage
         else:
             # Task run eval, we invoke the task again to get a fresh output
             (
@@ -189,6 +191,7 @@
                 intermediate_outputs,
             ) = await evaluator.run_task_and_eval(job.item.input)
             task_output = result_task_run.output.output
+            task_run_usage = result_task_run.usage
 
         # Save the job result
         eval_run = EvalRun(
@@ -202,6 +205,7 @@
             input=job.item.input,
             output=task_output,
             intermediate_outputs=intermediate_outputs,
+            task_run_usage=task_run_usage,
         )
         eval_run.save_to_file()
 

kiln_ai/adapters/eval/g_eval.py

@@ -102,6 +102,18 @@ class GEval(BaseEval):
 
         self.geval_task = GEvalTask(eval_config)
 
+    def generate_run_description(self, eval_input: str, eval_output: str) -> str:
+        return f"""The model was given the following input for the task:
+<eval_data>
+{eval_input}
+</eval_data>
+
+The model produced the following output for the task:
+<eval_data>
+{eval_output}
+</eval_data>
+"""
+
     async def run_eval(
         self, task_run: TaskRun
     ) -> tuple[EvalScores, Dict[str, str] | None]:
@@ -145,19 +157,12 @@
             ),
         )
 
-        input = f"""The model was given the following input for the task:
-<eval_data>
-{task_run.input}
-</eval_data>
-
-The model produced the following output for the task:
-<eval_data>
-{task_run.output}
-</eval_data>
-"""
+        run_description = self.generate_run_description(
+            task_run.input, task_run.output.output
+        )
 
         # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
-        _, run_output = await adapter.invoke_returning_run_output(input)
+        _, run_output = await adapter.invoke_returning_run_output(run_description)
 
         if self.eval_config.config_type == EvalConfigType.llm_as_judge:
             return self.build_llm_as_judge_score(
@@ -310,7 +315,7 @@ The model produced the following output for the task:
         """
         primary_token_score = self.score_from_token_string(token_logprob.token)
         # check this is a real rating token, it could just be the ": ", "," or whitespace
-        if not primary_token_score:
+        if primary_token_score is None:
             return None
 
         total_score = 0.0

kiln_ai/adapters/eval/test_g_eval.py

@@ -43,7 +43,9 @@ def test_score_schema_five_star():
 
     # Check score property, and that it's an enum of 1-5
     score_prop = schema["properties"]["quality_score"]
-    assert score_prop["enum"] == [1, 2, 3, 4, 5]
+    assert score_prop["type"] == "integer"
+    assert score_prop["minimum"] == 1
+    assert score_prop["maximum"] == 5
     assert "Quality Score" in score_prop["title"]
     assert "Rate the quality" in score_prop["description"]
     assert "between 1 and 5" in score_prop["description"]
@@ -51,7 +53,9 @@
     # Check overall rating property, and that it's an enum of 1-5
     assert "overall_rating" in schema["properties"]
     overall = schema["properties"]["overall_rating"]
-    assert overall["enum"] == [1, 2, 3, 4, 5]
+    assert overall["type"] == "integer"
+    assert overall["minimum"] == 1
+    assert overall["maximum"] == 5
     assert "Overall Rating" in overall["title"]
     assert "The overall rating for the task output" in overall["description"]
     assert "between 1 and 5" in overall["description"]
@@ -127,6 +131,7 @@ def test_score_schema_pass_fail():
     schema = json.loads(schema_str)
 
     score_prop = schema["properties"]["pass_fail_test"]
+    assert score_prop["type"] == "string"
     assert score_prop["enum"] == ["pass", "fail"]
     assert "Pass Fail Test" in score_prop["title"]
     assert "Check if it passes" in score_prop["description"]
@@ -173,6 +178,7 @@ def test_score_schema_pass_fail_critical():
     score_prop = schema["properties"]["critical_test"]
     assert "enum" in score_prop
     assert score_prop["enum"] == ["pass", "fail", "critical"]
+    assert score_prop["type"] == "string"
     assert "'pass', 'fail', or 'critical'" in score_prop["description"]
 
     assert schema["properties"]["overall_rating"] is not None

kiln_ai/adapters/eval/test_eval_runner.py

@@ -485,18 +485,17 @@ async def test_run_job_success_task_run_eval(
     )
 
     # Mock the evaluator
-    mock_result_run = TaskRun(
-        input="test input",
-        input_source=data_source,
-        output=TaskOutput(output="evaluated output"),
-        intermediate_outputs={"intermediate_output": "intermediate output"},
-    )
     mock_scores = {"accuracy": 0.95}
 
     class MockEvaluator(BaseEval):
         async def run_task_and_eval(self, input_text):
             return (
-                mock_result_run,
+                TaskRun(
+                    input="test input",
+                    input_source=data_source,
+                    output=TaskOutput(output="evaluated output"),
+                    intermediate_outputs={"intermediate_output": "intermediate output"},
+                ),
                 mock_scores,
                 {"intermediate_output": "intermediate output"},
             )
@@ -546,11 +545,6 @@ async def test_run_job_success_eval_config_eval(
     )
 
     # Mock the evaluator
-    mock_result_run = TaskRun(
-        input="test input",
-        input_source=data_source,
-        output=TaskOutput(output="evaluated output"),
-    )
     mock_scores: EvalScores = {"accuracy": 0.95}
 
     class MockEvaluator(BaseEval):