kiln-ai 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (36)
  1. kiln_ai/adapters/chat/chat_formatter.py +0 -1
  2. kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
  3. kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +311 -34
  5. kiln_ai/adapters/eval/base_eval.py +6 -7
  6. kiln_ai/adapters/eval/eval_runner.py +5 -1
  7. kiln_ai/adapters/eval/g_eval.py +17 -12
  8. kiln_ai/adapters/eval/test_base_eval.py +8 -2
  9. kiln_ai/adapters/eval/test_g_eval.py +115 -5
  10. kiln_ai/adapters/fine_tune/base_finetune.py +1 -6
  11. kiln_ai/adapters/fine_tune/dataset_formatter.py +1 -5
  12. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +1 -1
  13. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +2 -7
  14. kiln_ai/adapters/fine_tune/together_finetune.py +1 -1
  15. kiln_ai/adapters/ml_model_list.py +293 -44
  16. kiln_ai/adapters/model_adapters/litellm_adapter.py +9 -0
  17. kiln_ai/adapters/model_adapters/test_base_adapter.py +0 -1
  18. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +48 -0
  19. kiln_ai/adapters/model_adapters/test_structured_output.py +3 -3
  20. kiln_ai/adapters/parsers/parser_registry.py +0 -2
  21. kiln_ai/adapters/parsers/r1_parser.py +0 -1
  22. kiln_ai/adapters/remote_config.py +66 -0
  23. kiln_ai/adapters/repair/repair_task.py +1 -6
  24. kiln_ai/adapters/test_ml_model_list.py +18 -0
  25. kiln_ai/adapters/test_prompt_adaptors.py +0 -4
  26. kiln_ai/adapters/test_remote_config.py +100 -0
  27. kiln_ai/datamodel/eval.py +32 -0
  28. kiln_ai/datamodel/finetune.py +0 -1
  29. kiln_ai/datamodel/task_output.py +0 -2
  30. kiln_ai/datamodel/task_run.py +0 -2
  31. kiln_ai/datamodel/test_eval_model.py +146 -4
  32. kiln_ai/utils/logging.py +4 -3
  33. {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
  34. {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/RECORD +36 -34
  35. {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
  36. {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -3,6 +3,10 @@ import json
 import pytest
 
 from kiln_ai.adapters.adapter_registry import adapter_for_task
+from kiln_ai.adapters.data_gen.data_gen_prompts import (
+    generate_sample_generation_prompt,
+    generate_topic_tree_prompt,
+)
 from kiln_ai.adapters.data_gen.data_gen_task import (
     DataGenCategoriesTask,
     DataGenCategoriesTaskInput,
@@ -33,22 +37,19 @@ def test_data_gen_categories_task_input_initialization(base_task):
     # Arrange
     node_path = ["root", "branch", "leaf"]
     num_subtopics = 4
-    human_guidance = "Test guidance"
 
     # Act
     input_model = DataGenCategoriesTaskInput.from_task(
         task=base_task,
         node_path=node_path,
         num_subtopics=num_subtopics,
-        human_guidance=human_guidance,
     )
 
     # Assert
-    assert input_model.node_path == node_path
-    assert input_model.num_subtopics == num_subtopics
-    assert input_model.human_guidance == human_guidance
-    assert isinstance(input_model.system_prompt, str)
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == node_path
+    assert input_model.kiln_data_gen_num_subtopics == num_subtopics
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt
 
 
 def test_data_gen_categories_task_input_default_values(base_task):
@@ -56,14 +57,13 @@ def test_data_gen_categories_task_input_default_values(base_task):
     input_model = DataGenCategoriesTaskInput.from_task(task=base_task)
 
     # Assert
-    assert input_model.num_subtopics == 6
-    assert input_model.human_guidance is None
-    assert input_model.node_path == []
+    assert input_model.kiln_data_gen_num_subtopics == 6
+    assert input_model.kiln_data_gen_topic_path == []
 
 
 def test_data_gen_categories_task_initialization():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="training", guidance="Test guidance")
 
     # Assert
     assert task.name == "DataGen"
@@ -72,11 +72,16 @@ def test_data_gen_categories_task_initialization():
     assert task.instruction is not None
     assert isinstance(task.input_json_schema, str)
     assert isinstance(task.output_json_schema, str)
+    assert "I want to train a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
 
 def test_data_gen_categories_task_schemas():
     # Act
-    task = DataGenCategoriesTask()
+    task = DataGenCategoriesTask(gen_type="eval", guidance="Test guidance")
+
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
 
     # Assert
@@ -86,12 +91,14 @@ def test_data_gen_categories_task_schemas():
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["subtopics"]["type"] == "array"
-    assert input_schema["properties"]["node_path"]["type"] == "array"
-    assert input_schema["properties"]["num_subtopics"]["type"] == "integer"
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert (
+        input_schema["properties"]["kiln_data_gen_num_subtopics"]["type"] == "integer"
+    )
     assert set(input_schema["required"]) == {
-        "node_path",
-        "num_subtopics",
-        "system_prompt",
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_subtopics",
+        "kiln_data_gen_system_prompt",
     }
 
 
@@ -106,7 +113,7 @@ async def test_data_gen_all_models_providers(
         # pass if the model doesn't support data gen (testing the support flag is part of this)
         return
 
-    data_gen_task = DataGenCategoriesTask()
+    data_gen_task = DataGenCategoriesTask(gen_type="training", guidance=None)
    data_gen_input = DataGenCategoriesTaskInput.from_task(base_task, num_subtopics=6)
 
     adapter = adapter_for_task(
@@ -131,22 +138,19 @@ def test_data_gen_sample_task_input_initialization(base_task):
     # Arrange
     topic = ["cowboys", "hats"]
     num_samples = 4
-    human_guidance = "Test guidance"
 
     # Act
     input_model = DataGenSampleTaskInput.from_task(
         task=base_task,
         topic=topic,
         num_samples=num_samples,
-        human_guidance=human_guidance,
     )
 
     # Assert
-    assert input_model.topic == topic
-    assert input_model.num_samples == num_samples
-    assert input_model.human_guidance == human_guidance
-    assert isinstance(input_model.system_prompt, str)
-    assert "Reply like a cowboy" in input_model.system_prompt
+    assert input_model.kiln_data_gen_topic_path == topic
+    assert input_model.kiln_data_gen_num_samples == num_samples
+    assert isinstance(input_model.kiln_data_gen_system_prompt, str)
+    assert "Reply like a cowboy" in input_model.kiln_data_gen_system_prompt
 
 
 def test_data_gen_sample_task_input_default_values(base_task):
@@ -154,20 +158,23 @@ def test_data_gen_sample_task_input_default_values(base_task):
     input_model = DataGenSampleTaskInput.from_task(task=base_task)
 
     # Assert
-    assert input_model.num_samples == 8
-    assert input_model.human_guidance is None
-    assert input_model.topic == []
+    assert input_model.kiln_data_gen_num_samples == 8
+    assert input_model.kiln_data_gen_topic_path == []
 
 
 def test_data_gen_sample_task_initialization(base_task):
     # Act
-    task = DataGenSampleTask(target_task=base_task)
+    task = DataGenSampleTask(
+        target_task=base_task, gen_type="eval", guidance="Test guidance"
+    )
 
     # Assert
     assert task.name == "DataGenSample"
     assert isinstance(task.parent, Project)
     assert task.description is not None
     assert task.instruction is not None
+    assert "I want to evaluate a large language model" in task.instruction
+    assert "Test guidance" in task.instruction
 
 
     input_schema = json.loads(task.input_json_schema)
@@ -176,12 +183,12 @@ def test_data_gen_sample_task_initialization(base_task):
     assert isinstance(output_schema, dict)
     assert output_schema["type"] == "object"
     assert output_schema["properties"]["generated_samples"]["type"] == "array"
-    assert input_schema["properties"]["topic"]["type"] == "array"
-    assert input_schema["properties"]["num_samples"]["type"] == "integer"
+    assert input_schema["properties"]["kiln_data_gen_topic_path"]["type"] == "array"
+    assert input_schema["properties"]["kiln_data_gen_num_samples"]["type"] == "integer"
     assert set(input_schema["required"]) == {
-        "topic",
-        "num_samples",
-        "system_prompt",
+        "kiln_data_gen_topic_path",
+        "kiln_data_gen_num_samples",
+        "kiln_data_gen_system_prompt",
    }
 
 
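Note on the hunks above: the data gen input fields are renamed (node_path becomes kiln_data_gen_topic_path, num_subtopics becomes kiln_data_gen_num_subtopics, num_samples becomes kiln_data_gen_num_samples, system_prompt becomes kiln_data_gen_system_prompt), and human_guidance moves off the input model onto the task constructor as gen_type and guidance. A minimal usage sketch inferred only from these test calls; the constructor and from_task signatures are assumptions, not verified against the 0.18.0 sources, and base_task stands in for the pytest fixture the tests use:

# Sketch only; signatures inferred from the tests above.
from kiln_ai.adapters.data_gen.data_gen_task import (
    DataGenCategoriesTask,
    DataGenCategoriesTaskInput,
)

data_gen_task = DataGenCategoriesTask(gen_type="training", guidance="Test guidance")

def example(base_task):
    # base_task: an existing kiln Task (the tests supply it via a fixture)
    input_model = DataGenCategoriesTaskInput.from_task(
        task=base_task,
        node_path=["root", "branch", "leaf"],
        num_subtopics=4,
    )
    # Renamed fields on the input model:
    assert input_model.kiln_data_gen_topic_path == ["root", "branch", "leaf"]
    assert input_model.kiln_data_gen_num_subtopics == 4
    assert isinstance(input_model.kiln_data_gen_system_prompt, str)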
@@ -332,3 +339,273 @@ async def test_data_gen_sample_all_models_providers_with_structured_output(
         assert "tweet" in sample
         assert isinstance(sample["username"], str)
         assert isinstance(sample["tweet"], str)
+
+
+def test_generate_topic_tree_prompt_training_type():
+    """Test generate_topic_tree_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_eval_type():
+    """Test generate_topic_tree_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is the following:" in prompt
+    assert "## Next Step" in prompt
+    assert "When generating subtopics, remain somewhat vague." in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_topic_tree_prompt_with_guidance():
+    """Test generate_topic_tree_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on technical topics related to artificial intelligence and machine learning"
+
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." not in prompt
+    )  # Should not have default guidance
+
+
+def test_generate_topic_tree_prompt_with_empty_guidance():
+    """Test generate_topic_tree_prompt with empty string guidance"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert (
+        "When generating subtopics, remain somewhat vague." in prompt
+    )  # Should have default guidance
+
+
+def test_generate_topic_tree_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    # Check for news examples
+    assert "News Topics" in prompt
+    assert "Sports" in prompt
+    assert "Football" in prompt
+    assert "College Football" in prompt
+    assert "Entertainment" in prompt
+    assert "Tom Hanks" in prompt
+
+    # Check for smalltalk examples
+    assert "Small Talk Topics" in prompt
+    assert "Weather" in prompt
+    assert "Family" in prompt
+    assert "Hobbies" in prompt
+    assert "Cooking" in prompt
+    assert "Asian Food" in prompt
+
+
+def test_generate_topic_tree_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_topic_tree_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "## Next Step" in prompt
+    assert "system_prompt" in prompt
+    assert "kiln_data_gen_topic_path" in prompt
+    assert "kiln_data_gen_num_subtopics" in prompt
+    assert "existing_topics" in prompt
+
+
+def test_generate_topic_tree_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_topic_tree_prompt(gen_type="training")
+    eval_prompt = generate_topic_tree_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+    assert "## Next Step" in training_prompt
+    assert "## Next Step" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_training_type():
+    """Test generate_sample_generation_prompt with gen_type='training'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_eval_type():
+    """Test generate_sample_generation_prompt with gen_type='eval'"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Task Description" in prompt
+    assert "Your job is to generate a list of potential inputs" in prompt
+    assert "The guidance is:" not in prompt  # Should not have specific guidance
+
+
+def test_generate_sample_generation_prompt_with_guidance():
+    """Test generate_sample_generation_prompt with guidance provided"""
+    # Arrange
+    guidance = "Focus on generating diverse examples with varying complexity levels"
+
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=guidance)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Custom Guidance" in prompt
+    assert f"<guidance>\n{guidance}\n</guidance>" in prompt
+
+
+def test_generate_sample_generation_prompt_with_empty_guidance():
+    """Test generate_sample_generation_prompt with empty string guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="eval", guidance="")
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to evaluate a large language model and you should help me generate eval data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+
+
+def test_generate_sample_generation_prompt_contains_examples():
+    """Test that the prompt contains the expected examples"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    # Check for the tweet classification example
+    assert "You are an assistant that classifies the tone of a tweet" in prompt
+    assert "positive" in prompt
+    assert "negative" in prompt
+    assert "neutral" in prompt
+    assert "Technology" in prompt
+    assert "New iPhone Event" in prompt
+    assert "New iPhone looks amazing! I need that camera." in prompt
+    assert "Another boring event from Apple." in prompt
+
+
+def test_generate_sample_generation_prompt_contains_required_sections():
+    """Test that the prompt contains all required sections"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training")
+
+    # Assert
+    assert "## Task Description" in prompt
+    assert "system_prompt" in prompt
+    assert "topic" in prompt
+    assert "num_samples" in prompt
+    assert "generated_samples" in prompt
+    assert "The output must be formatted:" in prompt
+    assert "Do not include any other text or break the schema in any way." in prompt
+    assert (
+        "Note how the output of this task is data to input into the system prompt"
+        in prompt
+    )
+
+
+def test_generate_sample_generation_prompt_structure_consistency():
+    """Test that the prompt structure is consistent between training and eval types"""
+    # Act
+    training_prompt = generate_sample_generation_prompt(gen_type="training")
+    eval_prompt = generate_sample_generation_prompt(gen_type="eval")
+
+    # Assert
+    # Both should have the same structure, just different goal descriptions
+    assert "## Task Description" in training_prompt
+    assert "## Task Description" in eval_prompt
+
+    # The main difference should be in the goal description
+    assert "train a large language model" in training_prompt
+    assert "evaluate a large language model" in eval_prompt
+    assert "generate training data" in training_prompt
+    assert "generate eval data" in eval_prompt
+
+    # Both should have the same core content
+    assert "Your job is to generate a list of potential inputs" in training_prompt
+    assert "Your job is to generate a list of potential inputs" in eval_prompt
+    assert "generated_samples" in training_prompt
+    assert "generated_samples" in eval_prompt
+
+
+def test_generate_sample_generation_prompt_with_none_guidance():
+    """Test generate_sample_generation_prompt with None guidance"""
+    # Act
+    prompt = generate_sample_generation_prompt(gen_type="training", guidance=None)
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert (
+        "I want to train a large language model and you should help me generate training data for it."
+        in prompt
+    )
+    assert "## Specific Guidance" not in prompt
+    assert "The guidance is:" not in prompt
@@ -7,12 +7,7 @@ from kiln_ai.adapters.ml_model_list import ModelProviderName
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
 from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
-from kiln_ai.datamodel.task import (
-    RunConfig,
-    RunConfigProperties,
-    TaskOutputRatingType,
-    TaskRun,
-)
+from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 
 
@@ -124,7 +119,9 @@ class BaseEval:
                 property["minimum"] = 1
                 property["maximum"] = 5
             else:
-                property["enum"] = [1, 2, 3, 4, 5]
+                property["type"] = "integer"
+                property["minimum"] = 1
+                property["maximum"] = 5
 
             property["description"] = (
                 f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
@@ -139,6 +136,7 @@ class BaseEval:
                 )
             else:
                 property["enum"] = ["pass", "fail"]
+                property["type"] = "string"
                 property["description"] = (
                     f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
                 )
@@ -152,6 +150,7 @@ class BaseEval:
                 )
             else:
                 property["enum"] = ["pass", "fail", "critical"]
+                property["type"] = "string"
                 property["description"] = (
                     f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure."
                 )
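For reference, the base_eval.py change above makes the generated five_star score property a bounded integer instead of an integer enum, and adds an explicit "type" next to the pass/fail enums. A rough sketch of the resulting property shapes, illustrative only (the real schema also carries a per-property title, per the test updates later in this diff, and <instruction> stands for output_score.instruction):

# Illustrative only: approximate shape of the generated score properties.
five_star_property = {
    "type": "integer",
    "minimum": 1,
    "maximum": 5,
    "description": "<instruction>\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.",
}
pass_fail_property = {
    "type": "string",
    "enum": ["pass", "fail"],
    "description": "<instruction>\n\nThe rating should be either 'pass' or 'fail'.",
}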
@@ -8,7 +8,7 @@ from kiln_ai.datamodel.basemodel import ID_TYPE
 from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
 from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
 from kiln_ai.datamodel.task import TaskRunConfig
-from kiln_ai.datamodel.task_run import TaskRun
+from kiln_ai.datamodel.task_run import TaskRun, Usage
 from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress
 
 logger = logging.getLogger(__name__)
@@ -177,10 +177,12 @@ class EvalRunner:
         task_output: str | None = None
         scores: EvalScores | None = None
         intermediate_outputs: Dict[str, str] | None = None
+        task_run_usage: Usage | None = None
         if job.type == "eval_config_eval":
             # Eval config eval, we use the saved input from the task run, not invoking the task again
             scores, intermediate_outputs = await evaluator.run_eval(job.item)
             task_output = job.item.output.output
+            task_run_usage = job.item.usage
         else:
             # Task run eval, we invoke the task again to get a fresh output
             (
@@ -189,6 +191,7 @@ class EvalRunner:
                 intermediate_outputs,
             ) = await evaluator.run_task_and_eval(job.item.input)
             task_output = result_task_run.output.output
+            task_run_usage = result_task_run.usage
 
         # Save the job result
         eval_run = EvalRun(
@@ -202,6 +205,7 @@ class EvalRunner:
             input=job.item.input,
             output=task_output,
             intermediate_outputs=intermediate_outputs,
+            task_run_usage=task_run_usage,
         )
         eval_run.save_to_file()
 
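The eval_runner change threads usage data from the underlying task run onto each saved EvalRun: reused runs take job.item.usage, while fresh runs take result_task_run.usage. A minimal downstream sketch; only the task_run_usage field is taken from this diff, and the attributes on Usage itself are not shown here, so nothing beyond a None check is assumed:

# Sketch: count how many saved eval runs carry usage data.
# Only EvalRun.task_run_usage (added in this diff) is relied on.
def runs_with_usage(eval_runs) -> int:
    return sum(1 for run in eval_runs if run.task_run_usage is not None)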
@@ -102,6 +102,18 @@ class GEval(BaseEval):
 
         self.geval_task = GEvalTask(eval_config)
 
+    def generate_run_description(self, eval_input: str, eval_output: str) -> str:
+        return f"""The model was given the following input for the task:
+<eval_data>
+{eval_input}
+</eval_data>
+
+The model produced the following output for the task:
+<eval_data>
+{eval_output}
+</eval_data>
+"""
+
     async def run_eval(
         self, task_run: TaskRun
     ) -> tuple[EvalScores, Dict[str, str] | None]:
@@ -145,19 +157,12 @@ class GEval(BaseEval):
             ),
         )
 
-        input = f"""The model was given the following input for the task:
-<eval_data>
-{task_run.input}
-</eval_data>
-
-The model produced the following output for the task:
-<eval_data>
-{task_run.output}
-</eval_data>
-"""
+        run_description = self.generate_run_description(
+            task_run.input, task_run.output.output
+        )
 
         # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
-        _, run_output = await adapter.invoke_returning_run_output(input)
+        _, run_output = await adapter.invoke_returning_run_output(run_description)
 
         if self.eval_config.config_type == EvalConfigType.llm_as_judge:
             return self.build_llm_as_judge_score(
@@ -310,7 +315,7 @@ The model produced the following output for the task:
         """
         primary_token_score = self.score_from_token_string(token_logprob.token)
         # check this is a real rating token, it could just be the ": ", "," or whitespace
-        if not primary_token_score:
+        if primary_token_score is None:
             return None
 
         total_score = 0.0
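Two changes in g_eval.py above: the judge input is factored into generate_run_description() (which now passes task_run.output.output rather than the TaskOutput object), and the logprob check uses an explicit is None test instead of truthiness. The latter matters whenever a legitimate score value could be falsy; a standalone illustration of that pitfall in plain Python (not kiln code):

# Plain-Python illustration of the truthiness pitfall the `is None` check avoids.
def handle(score: float | None) -> str:
    if not score:  # wrong: also rejects a real score of 0.0
        return "skipped"
    return "scored"

def handle_fixed(score: float | None) -> str:
    if score is None:  # right: only rejects the missing-score case
        return "skipped"
    return "scored"

assert handle(0.0) == "skipped"       # buggy behavior: 0.0 is falsy
assert handle_fixed(0.0) == "scored"  # 0.0 is treated as a valid score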
@@ -43,7 +43,9 @@ def test_score_schema_five_star():
 
     # Check score property, and that it's an enum of 1-5
     score_prop = schema["properties"]["quality_score"]
-    assert score_prop["enum"] == [1, 2, 3, 4, 5]
+    assert score_prop["type"] == "integer"
+    assert score_prop["minimum"] == 1
+    assert score_prop["maximum"] == 5
     assert "Quality Score" in score_prop["title"]
     assert "Rate the quality" in score_prop["description"]
     assert "between 1 and 5" in score_prop["description"]
@@ -51,7 +53,9 @@ def test_score_schema_five_star():
     # Check overall rating property, and that it's an enum of 1-5
     assert "overall_rating" in schema["properties"]
     overall = schema["properties"]["overall_rating"]
-    assert overall["enum"] == [1, 2, 3, 4, 5]
+    assert overall["type"] == "integer"
+    assert overall["minimum"] == 1
+    assert overall["maximum"] == 5
     assert "Overall Rating" in overall["title"]
     assert "The overall rating for the task output" in overall["description"]
     assert "between 1 and 5" in overall["description"]
@@ -127,6 +131,7 @@ def test_score_schema_pass_fail():
     schema = json.loads(schema_str)
 
     score_prop = schema["properties"]["pass_fail_test"]
+    assert score_prop["type"] == "string"
     assert score_prop["enum"] == ["pass", "fail"]
     assert "Pass Fail Test" in score_prop["title"]
     assert "Check if it passes" in score_prop["description"]
@@ -173,6 +178,7 @@ def test_score_schema_pass_fail_critical():
     score_prop = schema["properties"]["critical_test"]
     assert "enum" in score_prop
     assert score_prop["enum"] == ["pass", "fail", "critical"]
+    assert score_prop["type"] == "string"
     assert "'pass', 'fail', or 'critical'" in score_prop["description"]
 
     assert schema["properties"]["overall_rating"] is not None
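These test updates match the base_eval.py schema change earlier in the diff: five-star scores now validate as bounded integers, and pass/fail style scores declare "type": "string" alongside their enum. A quick sanity check of the new shapes using the third-party jsonschema package (kiln has its own validate_schema_with_value_error helper, imported in base_eval.py above, whose signature is not shown in this diff):

# Sketch using the generic jsonschema package; kiln may validate differently.
import jsonschema

five_star = {"type": "integer", "minimum": 1, "maximum": 5}
pass_fail = {"type": "string", "enum": ["pass", "fail"]}

jsonschema.validate(instance=4, schema=five_star)       # accepted
jsonschema.validate(instance="pass", schema=pass_fail)  # accepted
try:
    jsonschema.validate(instance=6, schema=five_star)   # out of range
except jsonschema.ValidationError:
    pass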