kiln-ai 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
Files changed (66)
  1. kiln_ai/adapters/__init__.py +2 -0
  2. kiln_ai/adapters/adapter_registry.py +22 -44
  3. kiln_ai/adapters/chat/__init__.py +8 -0
  4. kiln_ai/adapters/chat/chat_formatter.py +233 -0
  5. kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
  6. kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
  7. kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
  8. kiln_ai/adapters/data_gen/test_data_gen_task.py +330 -40
  9. kiln_ai/adapters/eval/base_eval.py +7 -6
  10. kiln_ai/adapters/eval/eval_runner.py +9 -2
  11. kiln_ai/adapters/eval/g_eval.py +40 -17
  12. kiln_ai/adapters/eval/test_base_eval.py +174 -17
  13. kiln_ai/adapters/eval/test_eval_runner.py +3 -0
  14. kiln_ai/adapters/eval/test_g_eval.py +116 -5
  15. kiln_ai/adapters/fine_tune/base_finetune.py +3 -8
  16. kiln_ai/adapters/fine_tune/dataset_formatter.py +135 -273
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +287 -353
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
  21. kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
  22. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +6 -11
  23. kiln_ai/adapters/fine_tune/together_finetune.py +13 -2
  24. kiln_ai/adapters/ml_model_list.py +370 -84
  25. kiln_ai/adapters/model_adapters/base_adapter.py +73 -26
  26. kiln_ai/adapters/model_adapters/litellm_adapter.py +88 -97
  27. kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
  28. kiln_ai/adapters/model_adapters/test_base_adapter.py +235 -61
  29. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +104 -21
  30. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -0
  31. kiln_ai/adapters/model_adapters/test_structured_output.py +44 -12
  32. kiln_ai/adapters/parsers/parser_registry.py +0 -2
  33. kiln_ai/adapters/parsers/r1_parser.py +0 -1
  34. kiln_ai/adapters/prompt_builders.py +0 -16
  35. kiln_ai/adapters/provider_tools.py +27 -9
  36. kiln_ai/adapters/remote_config.py +66 -0
  37. kiln_ai/adapters/repair/repair_task.py +1 -6
  38. kiln_ai/adapters/repair/test_repair_task.py +24 -3
  39. kiln_ai/adapters/test_adapter_registry.py +88 -28
  40. kiln_ai/adapters/test_ml_model_list.py +176 -0
  41. kiln_ai/adapters/test_prompt_adaptors.py +17 -7
  42. kiln_ai/adapters/test_prompt_builders.py +3 -16
  43. kiln_ai/adapters/test_provider_tools.py +69 -20
  44. kiln_ai/adapters/test_remote_config.py +100 -0
  45. kiln_ai/datamodel/__init__.py +0 -2
  46. kiln_ai/datamodel/datamodel_enums.py +38 -13
  47. kiln_ai/datamodel/eval.py +32 -0
  48. kiln_ai/datamodel/finetune.py +12 -8
  49. kiln_ai/datamodel/task.py +68 -7
  50. kiln_ai/datamodel/task_output.py +0 -2
  51. kiln_ai/datamodel/task_run.py +0 -2
  52. kiln_ai/datamodel/test_basemodel.py +2 -1
  53. kiln_ai/datamodel/test_dataset_split.py +0 -8
  54. kiln_ai/datamodel/test_eval_model.py +146 -4
  55. kiln_ai/datamodel/test_models.py +33 -10
  56. kiln_ai/datamodel/test_task.py +168 -2
  57. kiln_ai/utils/config.py +3 -2
  58. kiln_ai/utils/dataset_import.py +1 -1
  59. kiln_ai/utils/logging.py +166 -0
  60. kiln_ai/utils/test_config.py +23 -0
  61. kiln_ai/utils/test_dataset_import.py +30 -0
  62. {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
  63. kiln_ai-0.18.0.dist-info/RECORD +115 -0
  64. kiln_ai-0.16.0.dist-info/RECORD +0 -108
  65. {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
  66. {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
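The API change that runs through most of these files is the adapter_for_task signature: callers now pass a single RunConfigProperties bundle instead of separate model_name/provider keyword arguments. A minimal sketch of the new call pattern, mirroring the updated tests below (the model and provider names are placeholders, not values from this diff):

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.data_gen.data_gen_task import DataGenCategoriesTask
from kiln_ai.datamodel.task import RunConfigProperties

# Data-gen tasks now take gen_type and guidance explicitly (see the test changes below).
data_gen_task = DataGenCategoriesTask(gen_type="training", guidance=None)

# adapter_for_task now takes one RunConfigProperties object rather than
# separate model_name / provider keyword arguments.
adapter = adapter_for_task(
    data_gen_task,
    run_config_properties=RunConfigProperties(
        model_name="gpt_4o_mini",  # placeholder model id
        model_provider_name="openai",  # placeholder provider id
        prompt_id="simple_prompt_builder",
        structured_output_mode="unknown",
    ),
)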
kiln_ai/adapters/data_gen/test_data_gen_task.py
@@ -3,6 +3,10 @@ import json
 import pytest
 
 from kiln_ai.adapters.adapter_registry import adapter_for_task
+from kiln_ai.adapters.data_gen.data_gen_prompts import (
+    generate_sample_generation_prompt,
+    generate_topic_tree_prompt,
+)
 from kiln_ai.adapters.data_gen.data_gen_task import (
     DataGenCategoriesTask,
     DataGenCategoriesTaskInput,
@@ -14,6 +18,7 @@ from kiln_ai.adapters.data_gen.data_gen_task import (
 from kiln_ai.adapters.provider_tools import get_model_and_provider
 from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers
 from kiln_ai.datamodel import Project, Task
+from kiln_ai.datamodel.task import RunConfigProperties
 
 
 @pytest.fixture
@@ -32,22 +37,19 @@ def test_data_gen_categories_task_input_initialization(base_task):
     # Arrange
     node_path = ["root", "branch", "leaf"]
     num_subtopics = 4
-    human_guidance = "Test guidance"
 
     # Act
     input_model = DataGenCategoriesTaskInput.from_task(
         task=base_task,
         node_path=node_path,
         num_subtopics=num_subtopics,
-        human_guidance=human_guidance,
     )
 
     # Assert
-    assert input_model.node_path == node_path
-    assert input_model.num_subtopics == num_subtopics
-    assert input_model.human_guidance == human_guidance
-    assert isinstance(input_model.system_prompt, str)
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == node_path
+    assert input_model.kiln_data_gen_num_subtopics == num_subtopics
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt
 
 
 def test_data_gen_categories_task_input_default_values(base_task):
@@ -55,14 +57,13 @@ def test_data_gen_categories_task_input_default_values(base_task):
     input_model = DataGenCategoriesTaskInput.from_task(task=base_task)
 
     # Assert
-    assert input_model.num_subtopics == 6
-    assert input_model.human_guidance is None
-    assert input_model.node_path == []
+    assert input_model.kiln_data_gen_num_subtopics == 6
+    assert input_model.kiln_data_gen_topic_path == []
 
 
 def test_data_gen_categories_task_initialization():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="training", guidance="Test guidance")
 
     # Assert
     assert task.name == "DataGen"
@@ -71,11 +72,16 @@ def test_data_gen_categories_task_initialization():
     assert task.instruction is not None
     assert isinstance(task.input_json_schema, str)
     assert isinstance(task.output_json_schema, str)
+    assert "I want to train a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
 
 def test_data_gen_categories_task_schemas():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="eval", guidance="Test guidance")
+
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
     # Assert
     input_schema = json.loads(task.input_json_schema)
@@ -85,12 +91,14 @@ def test_data_gen_categories_task_schemas():
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["subtopics"]["type"] == "array"
-    assert input_schema["properties"]["node_path"]["type"] == "array"
-    assert input_schema["properties"]["num_subtopics"]["type"] == "integer"
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert (
+        input_schema["properties"]["kiln_data_gen_num_subtopics"]["type"] == "integer"
+    )
     assert set(input_schema["required"]) == {
-        "node_path",
-        "num_subtopics",
-        "system_prompt",
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_subtopics",
+        "kiln_data_gen_system_prompt",
     }
 
 
@@ -105,13 +113,17 @@ async def test_data_gen_all_models_providers(
         # pass if the model doesn't support data gen (testing the support flag is part of this)
         return
 
-    data_gen_task = DataGenCategoriesTask()
+    data_gen_task = DataGenCategoriesTask(gen_type="training", guidance=None)
    data_gen_input = DataGenCategoriesTaskInput.from_task(base_task, num_subtopics=6)
 
     adapter = adapter_for_task(
         data_gen_task,
-        model_name=model_name,
-        provider=provider_name,
+        run_config_properties=RunConfigProperties(
+            model_name=model_name,
+            model_provider_name=provider_name,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="unknown",
+        ),
     )
 
     input_dict = data_gen_input.model_dump()
@@ -126,22 +138,19 @@ def test_data_gen_sample_task_input_initialization(base_task):
     # Arrange
     topic = ["cowboys", "hats"]
     num_samples = 4
-    human_guidance = "Test guidance"
 
     # Act
     input_model = DataGenSampleTaskInput.from_task(
         task=base_task,
         topic=topic,
         num_samples=num_samples,
-        human_guidance=human_guidance,
     )
 
     # Assert
-    assert input_model.topic == topic
-    assert input_model.num_samples == num_samples
-    assert input_model.human_guidance == human_guidance
-    assert isinstance(input_model.system_prompt, str)
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == topic
+    assert input_model.kiln_data_gen_num_samples == num_samples
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt
 
 
 def test_data_gen_sample_task_input_default_values(base_task):
@@ -149,20 +158,23 @@ def test_data_gen_sample_task_input_default_values(base_task):
     input_model = DataGenSampleTaskInput.from_task(task=base_task)
 
     # Assert
-    assert input_model.num_samples == 8
-    assert input_model.human_guidance is None
-    assert input_model.topic == []
+    assert input_model.kiln_data_gen_num_samples == 8
+    assert input_model.kiln_data_gen_topic_path == []
 
 
 def test_data_gen_sample_task_initialization(base_task):
     # Act
-    task = DataGenSampleTask(target_task=base_task)
+    task = DataGenSampleTask(
+        target_task=base_task, gen_type="eval", guidance="Test guidance"
+    )
 
     # Assert
     assert task.name == "DataGenSample"
     assert isinstance(task.parent, Project)
     assert task.description is not None
     assert task.instruction is not None
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
     input_schema = json.loads(task.input_json_schema)
     output_schema = json.loads(task.output_json_schema)
@@ -171,12 +183,12 @@ def test_data_gen_sample_task_initialization(base_task):
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["generated_samples"]["type"] == "array"
-    assert input_schema["properties"]["topic"]["type"] == "array"
-    assert input_schema["properties"]["num_samples"]["type"] == "integer"
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert input_schema["properties"]["kiln_data_gen_num_samples"]["type"] == "integer"
     assert set(input_schema["required"]) == {
-        "topic",
-        "num_samples",
-        "system_prompt",
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_samples",
+        "kiln_data_gen_system_prompt",
     }
 
 
@@ -254,8 +266,12 @@ async def test_data_gen_sample_all_models_providers(
 
     adapter = adapter_for_task(
         data_gen_task,
-        model_name=model_name,
-        provider=provider_name,
+        run_config_properties=RunConfigProperties(
+            model_name=model_name,
+            model_provider_name=provider_name,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="unknown",
+        ),
     )
 
     input_dict = data_gen_input.model_dump()
@@ -304,8 +320,12 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
 
     adapter = adapter_for_task(
         data_gen_task,
-        model_name=model_name,
-        provider=provider_name,
+        run_config_properties=RunConfigProperties(
+            model_name=model_name,
+            model_provider_name=provider_name,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="unknown",
+        ),
     )
 
     input_dict = data_gen_input.model_dump()
@@ -319,3 +339,273 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
         assert "tweet" in sample
         assert isinstance(sample["username"], str)
         assert isinstance(sample["tweet"], str)
+
+
+def test_generate_topic_tree_prompt_training_type():
+    """Test generate_topic_tree_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_eval_type():
+    """Test generate_topic_tree_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_with_guidance():
+    """Test generate_topic_tree_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on technical topics related to artificial intelligence and machine learning"
+
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." not in prompt
+    )  # Should not have default guidance
+
+
+def test_generate_topic_tree_prompt_with_empty_guidance():
+    """Test generate_topic_tree_prompt with empty string guidance"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." in prompt
+    )  # Should have default guidance
+
+
+def test_generate_topic_tree_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    # Check for news examples
+    assert "News Topics" in prompt
+    assert "Sports" in prompt
+    assert "Football" in prompt
+    assert "College Football" in prompt
+    assert "Entertainment" in prompt
+    assert "Tom Hanks" in prompt
+
+    # Check for smalltalk examples
+    assert "Small Talk Topics" in prompt
+    assert "Weather" in prompt
+    assert "Family" in prompt
+    assert "Hobbies" in prompt
+    assert "Cooking" in prompt
+    assert "Asian Food" in prompt
+
+
+def test_generate_topic_tree_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "## Next Step" in prompt
+    assert "system_prompt" in prompt
+    assert "kiln_data_gen_topic_path" in prompt
+    assert "kiln_data_gen_num_subtopics" in prompt
+    assert "existing_topics" in prompt
+
+
+def test_generate_topic_tree_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_topic_tree_prompt(gen_type="training")
+    eval_prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+    assert "## Next Step" in training_prompt
+    assert "## Next Step" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_training_type():
+    """Test generate_sample_generation_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_eval_type():
+    """Test generate_sample_generation_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_with_guidance():
+    """Test generate_sample_generation_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on generating diverse examples with varying complexity levels"
+
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+
+
+def test_generate_sample_generation_prompt_with_empty_guidance():
+    """Test generate_sample_generation_prompt with empty string guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+
+
+def test_generate_sample_generation_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    # Check for the tweet classification example
+    assert "You are an assistant that classifies the tone of a tweet" in prompt
+    assert "positive" in prompt
+    assert "negative" in prompt
+    assert "neutral" in prompt
+    assert "Technology" in prompt
+    assert "New iPhone Event" in prompt
+    assert "New iPhone looks amazing! I need that camera." in prompt
+    assert "Another boring event from Apple." in prompt
+
+
+def test_generate_sample_generation_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "system_prompt" in prompt
+    assert "topic" in prompt
+    assert "num_samples" in prompt
+    assert "generated_samples" in prompt
+    assert "The output must be formatted:" in prompt
+    assert "Do not include any other text or break the schema in any way." in prompt
+    assert (
+        "Note how the output of this task is data to input into the system prompt"
+        in prompt
+    )
+
+
+def test_generate_sample_generation_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_sample_generation_prompt(gen_type="training")
+    eval_prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+    # Both should have the same core content
+    assert "Your job is to generate a list of potential inputs" in training_prompt
+    assert "Your job is to generate a list of potential inputs" in eval_prompt
+    assert "generated_samples" in training_prompt
+    assert "generated_samples" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_with_none_guidance():
+    """Test generate_sample_generation_prompt with None guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=None)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert "The guidance is:" not in prompt
kiln_ai/adapters/eval/base_eval.py
@@ -2,8 +2,6 @@ import json
 from abc import abstractmethod
 from typing import Dict
 
-import jsonschema
-
 from kiln_ai.adapters.adapter_registry import adapter_for_task
 from kiln_ai.adapters.ml_model_list import ModelProviderName
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
@@ -60,14 +58,13 @@ class BaseEval:
 
         run_adapter = adapter_for_task(
             self.target_task,
-            self.run_config.model_name,
-            ModelProviderName(self.run_config.model_provider_name),
+            self.run_config,
             base_adapter_config=AdapterConfig(allow_saving=False),
         )
 
         # Parse structured input if needed
         parsed_input = input
-        if self.target_task.output_json_schema is not None:
+        if self.target_task.input_json_schema is not None:
             parsed_input = json.loads(input)
 
         # we don't save by default here. We'll save manually after validating the output
@@ -122,7 +119,9 @@ class BaseEval:
                         property["minimum"] = 1
                         property["maximum"] = 5
                     else:
-                        property["enum"] = [1, 2, 3, 4, 5]
+                        property["type"] = "integer"
+                        property["minimum"] = 1
+                        property["maximum"] = 5
 
                     property["description"] = (
                         f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
@@ -137,6 +136,7 @@ class BaseEval:
                         )
                     else:
                         property["enum"] = ["pass", "fail"]
+                        property["type"] = "string"
                         property["description"] = (
                             f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                         )
@@ -150,6 +150,7 @@ class BaseEval:
                         )
                     else:
                         property["enum"] = ["pass", "fail", "critical"]
+                        property["type"] = "string"
                         property["description"] = (
                             f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure."
                         )
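
Taken together, the base_eval.py hunks above change the generated score schema from bare enum values to explicitly typed properties. A hedged illustration of the result (the title and instruction strings are placeholders; the type/minimum/maximum/enum/description wording comes from the diff):

# Shape of a five-star score property after this change
# (previously this branch emitted only {"enum": [1, 2, 3, 4, 5]} with no explicit type).
five_star_property = {
    "title": "Overall Rating",  # placeholder score name
    "type": "integer",
    "minimum": 1,
    "maximum": 5,
    "description": (
        "Rate the overall answer quality.\n\n"  # placeholder instruction text
        "The rating should be between 1 and 5, with 1 being the worst and 5 being the best."
    ),
}

# Pass/fail properties keep their enum but now also declare an explicit string type.
pass_fail_property = {
    "title": "Pass or Fail",  # placeholder score name
    "type": "string",
    "enum": ["pass", "fail"],
}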
kiln_ai/adapters/eval/eval_runner.py
@@ -8,7 +8,7 @@ from kiln_ai.datamodel.basemodel import ID_TYPE
 from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
 from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
 from kiln_ai.datamodel.task import TaskRunConfig
-from kiln_ai.datamodel.task_run import TaskRun
+from kiln_ai.datamodel.task_run import TaskRun, Usage
 from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress
 
 logger = logging.getLogger(__name__)
@@ -177,10 +177,12 @@ class EvalRunner:
             task_output: str | None = None
             scores: EvalScores | None = None
             intermediate_outputs: Dict[str, str] | None = None
+            task_run_usage: Usage | None = None
             if job.type == "eval_config_eval":
                 # Eval config eval, we use the saved input from the task run, not invoking the task again
                 scores, intermediate_outputs = await evaluator.run_eval(job.item)
                 task_output = job.item.output.output
+                task_run_usage = job.item.usage
             else:
                 # Task run eval, we invoke the task again to get a fresh output
                 (
@@ -189,6 +191,7 @@ class EvalRunner:
                     intermediate_outputs,
                 ) = await evaluator.run_task_and_eval(job.item.input)
                 task_output = result_task_run.output.output
+                task_run_usage = result_task_run.usage
 
             # Save the job result
             eval_run = EvalRun(
@@ -202,10 +205,14 @@ class EvalRunner:
                 input=job.item.input,
                 output=task_output,
                 intermediate_outputs=intermediate_outputs,
+                task_run_usage=task_run_usage,
             )
             eval_run.save_to_file()
 
             return True
         except Exception as e:
-            logger.error(f"Error running eval job for dataset item {job.item.id}: {e}")
+            logger.error(
+                f"Error running eval job for dataset item {job.item.id}: {e}",
+                exc_info=True,
+            )
             return False