kiln-ai 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. kiln_ai/adapters/__init__.py +2 -0
  2. kiln_ai/adapters/adapter_registry.py +22 -44
  3. kiln_ai/adapters/chat/__init__.py +8 -0
  4. kiln_ai/adapters/chat/chat_formatter.py +233 -0
  5. kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
  6. kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
  7. kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
  8. kiln_ai/adapters/data_gen/test_data_gen_task.py +330 -40
  9. kiln_ai/adapters/eval/base_eval.py +7 -6
  10. kiln_ai/adapters/eval/eval_runner.py +9 -2
  11. kiln_ai/adapters/eval/g_eval.py +40 -17
  12. kiln_ai/adapters/eval/test_base_eval.py +174 -17
  13. kiln_ai/adapters/eval/test_eval_runner.py +3 -0
  14. kiln_ai/adapters/eval/test_g_eval.py +116 -5
  15. kiln_ai/adapters/fine_tune/base_finetune.py +3 -8
  16. kiln_ai/adapters/fine_tune/dataset_formatter.py +135 -273
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +287 -353
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
  21. kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
  22. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +6 -11
  23. kiln_ai/adapters/fine_tune/together_finetune.py +13 -2
  24. kiln_ai/adapters/ml_model_list.py +370 -84
  25. kiln_ai/adapters/model_adapters/base_adapter.py +73 -26
  26. kiln_ai/adapters/model_adapters/litellm_adapter.py +88 -97
  27. kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
  28. kiln_ai/adapters/model_adapters/test_base_adapter.py +235 -61
  29. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +104 -21
  30. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -0
  31. kiln_ai/adapters/model_adapters/test_structured_output.py +44 -12
  32. kiln_ai/adapters/parsers/parser_registry.py +0 -2
  33. kiln_ai/adapters/parsers/r1_parser.py +0 -1
  34. kiln_ai/adapters/prompt_builders.py +0 -16
  35. kiln_ai/adapters/provider_tools.py +27 -9
  36. kiln_ai/adapters/remote_config.py +66 -0
  37. kiln_ai/adapters/repair/repair_task.py +1 -6
  38. kiln_ai/adapters/repair/test_repair_task.py +24 -3
  39. kiln_ai/adapters/test_adapter_registry.py +88 -28
  40. kiln_ai/adapters/test_ml_model_list.py +176 -0
  41. kiln_ai/adapters/test_prompt_adaptors.py +17 -7
  42. kiln_ai/adapters/test_prompt_builders.py +3 -16
  43. kiln_ai/adapters/test_provider_tools.py +69 -20
  44. kiln_ai/adapters/test_remote_config.py +100 -0
  45. kiln_ai/datamodel/__init__.py +0 -2
  46. kiln_ai/datamodel/datamodel_enums.py +38 -13
  47. kiln_ai/datamodel/eval.py +32 -0
  48. kiln_ai/datamodel/finetune.py +12 -8
  49. kiln_ai/datamodel/task.py +68 -7
  50. kiln_ai/datamodel/task_output.py +0 -2
  51. kiln_ai/datamodel/task_run.py +0 -2
  52. kiln_ai/datamodel/test_basemodel.py +2 -1
  53. kiln_ai/datamodel/test_dataset_split.py +0 -8
  54. kiln_ai/datamodel/test_eval_model.py +146 -4
  55. kiln_ai/datamodel/test_models.py +33 -10
  56. kiln_ai/datamodel/test_task.py +168 -2
  57. kiln_ai/utils/config.py +3 -2
  58. kiln_ai/utils/dataset_import.py +1 -1
  59. kiln_ai/utils/logging.py +166 -0
  60. kiln_ai/utils/test_config.py +23 -0
  61. kiln_ai/utils/test_dataset_import.py +30 -0
  62. {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
  63. kiln_ai-0.18.0.dist-info/RECORD +115 -0
  64. kiln_ai-0.16.0.dist-info/RECORD +0 -108
  65. {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
  66. {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -5,11 +5,14 @@ from litellm.types.utils import ChatCompletionTokenLogprob
 
 from kiln_ai.adapters.adapter_registry import adapter_for_task
 from kiln_ai.adapters.eval.base_eval import BaseEval
+from kiln_ai.adapters.ml_model_list import (
+    default_structured_output_mode_for_model_provider,
+)
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
 from kiln_ai.adapters.prompt_builders import PromptGenerators
 from kiln_ai.datamodel import Project, Task, TaskRun
 from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
-from kiln_ai.datamodel.task import RunConfig
+from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, StructuredOutputMode
 
 # all the tokens we score for, and their float scores.
 TOKEN_TO_SCORE_MAP: Dict[str, float] = {
@@ -99,6 +102,18 @@ class GEval(BaseEval):
 
         self.geval_task = GEvalTask(eval_config)
 
+    def generate_run_description(self, eval_input: str, eval_output: str) -> str:
+        return f"""The model was given the following input for the task:
+<eval_data>
+{eval_input}
+</eval_data>
+
+The model produced the following output for the task:
+<eval_data>
+{eval_output}
+</eval_data>
+"""
+
     async def run_eval(
         self, task_run: TaskRun
     ) -> tuple[EvalScores, Dict[str, str] | None]:
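Note: generate_run_description takes the task's input and output as plain strings. A minimal usage sketch (illustrative only; the g_eval instance and task_run are assumed to exist, as in the run_eval call further down):

    # illustrative sketch: pass the output string, not the TaskOutput object
    description = g_eval.generate_run_description(
        task_run.input,
        task_run.output.output,
    )
    # description wraps both values in <eval_data> tags for the judge prompt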
@@ -114,12 +129,27 @@ class GEval(BaseEval):
             10 if self.eval_config.config_type == EvalConfigType.g_eval else None
         )
 
-        adapter = adapter_for_task(
-            self.geval_task,
+        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
+        structured_output_mode = default_structured_output_mode_for_model_provider(
             model_name,
             provider,
-            # We always use Simple COT for G-Eval and LLM as Judge
-            prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+            default=StructuredOutputMode.json_schema,
+            # G-eval expects JSON, so don't allow function calling modes
+            disallowed_modes=[
+                StructuredOutputMode.function_calling,
+                StructuredOutputMode.function_calling_weak,
+            ],
+        )
+
+        adapter = adapter_for_task(
+            self.geval_task,
+            run_config_properties=RunConfigProperties(
+                model_name=model_name,
+                model_provider_name=provider,
+                # We always use Simple COT for G-Eval and LLM as Judge
+                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+                structured_output_mode=structured_output_mode,
+            ),
             base_adapter_config=AdapterConfig(
                 # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                 allow_saving=False,
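Note: the structured_output_mode lookup asks ml_model_list for a recommended mode, with a fallback default and a deny-list of modes the judge cannot use. A rough sketch of that selection logic (illustrative only, not the actual kiln_ai implementation; the helper name here is made up):

    # illustrative: approximates the default/disallowed handling relied on above
    def pick_structured_output_mode(recommended, default, disallowed_modes):
        # fall back to the default when the provider has no recommendation,
        # or when its recommendation is one of the disallowed modes
        if recommended is None or recommended in disallowed_modes:
            return default
        return recommended

For G-eval the function-calling modes are disallowed because the judge must return plain JSON scores rather than a tool call.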
@@ -127,19 +157,12 @@ class GEval(BaseEval):
             ),
         )
 
-        input = f"""The model was given the following input for the task:
-<eval_data>
-{task_run.input}
-</eval_data>
-
-The model produced the following output for the task:
-<eval_data>
-{task_run.output}
-</eval_data>
-"""
+        run_description = self.generate_run_description(
+            task_run.input, task_run.output.output
+        )
 
         # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
-        _, run_output = await adapter.invoke_returning_run_output(input)
+        _, run_output = await adapter.invoke_returning_run_output(run_description)
 
         if self.eval_config.config_type == EvalConfigType.llm_as_judge:
             return self.build_llm_as_judge_score(
@@ -292,7 +315,7 @@ The model produced the following output for the task:
         """
         primary_token_score = self.score_from_token_string(token_logprob.token)
         # check this is a real rating token, it could just be the ": ", "," or whitespace
-        if not primary_token_score:
+        if primary_token_score is None:
             return None
 
         total_score = 0.0
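Note: the `is None` check matters because pass/fail scoring maps the "fail" token to a score of 0.0, which is falsy in Python. A quick illustration:

    # 0.0 is falsy, so the old truthiness check discarded legitimate "fail" scores
    primary_token_score = 0.0
    print(not primary_token_score)        # True  -> old code bailed out with None
    print(primary_token_score is None)    # False -> new code keeps the 0.0 score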
@@ -1,9 +1,9 @@
 import json
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 
 from kiln_ai.adapters.eval.base_eval import BaseEval
-from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore
 from kiln_ai.datamodel.task import (
     RunConfigProperties,
@@ -43,7 +43,9 @@ def test_score_schema_five_star():
 
     # Check score property, and that it's an enum of 1-5
     score_prop = schema["properties"]["quality_score"]
-    assert score_prop["enum"] == [1, 2, 3, 4, 5]
+    assert score_prop["type"] == "integer"
+    assert score_prop["minimum"] == 1
+    assert score_prop["maximum"] == 5
     assert "Quality Score" in score_prop["title"]
     assert "Rate the quality" in score_prop["description"]
     assert "between 1 and 5" in score_prop["description"]
@@ -51,7 +53,9 @@ def test_score_schema_five_star():
     # Check overall rating property, and that it's an enum of 1-5
     assert "overall_rating" in schema["properties"]
     overall = schema["properties"]["overall_rating"]
-    assert overall["enum"] == [1, 2, 3, 4, 5]
+    assert overall["type"] == "integer"
+    assert overall["minimum"] == 1
+    assert overall["maximum"] == 5
     assert "Overall Rating" in overall["title"]
     assert "The overall rating for the task output" in overall["description"]
     assert "between 1 and 5" in overall["description"]
@@ -127,6 +131,7 @@ def test_score_schema_pass_fail():
     schema = json.loads(schema_str)
 
     score_prop = schema["properties"]["pass_fail_test"]
+    assert score_prop["type"] == "string"
     assert score_prop["enum"] == ["pass", "fail"]
     assert "Pass Fail Test" in score_prop["title"]
     assert "Check if it passes" in score_prop["description"]
@@ -173,6 +178,7 @@ def test_score_schema_pass_fail_critical():
     score_prop = schema["properties"]["critical_test"]
     assert "enum" in score_prop
     assert score_prop["enum"] == ["pass", "fail", "critical"]
+    assert score_prop["type"] == "string"
     assert "'pass', 'fail', or 'critical'" in score_prop["description"]
 
     assert schema["properties"]["overall_rating"] is not None
@@ -245,7 +251,7 @@ class EvalTester(BaseEval):
     """Test implementation of BaseEval"""
 
     async def run_eval(self, task_run):
-        return {"overall_rating": 5, "quality": 4}
+        return {"overall_rating": 5, "quality": 4}, None
 
 
 @pytest.mark.paid
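Note: the added `, None` matches run_eval's updated return type, tuple[EvalScores, Dict[str, str] | None] (see the g_eval.py hunk above): scores plus optional intermediate outputs. A minimal sketch of a custom eval honoring that contract (class name and values are illustrative):

    # illustrative subclass returning scores plus optional intermediate outputs
    class MyEval(BaseEval):
        async def run_eval(self, task_run):
            scores = {"overall_rating": 5, "quality": 4}
            intermediate = {"thinking": "judge reasoning here"}  # or None
            return scores, intermediate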
@@ -265,14 +271,8 @@ async def test_run_method():
 
     eval_config = EvalConfig(
         name="Test Eval Config",
-        model=DataSource(
-            type=DataSourceType.synthetic,
-            properties={
-                "model_name": "gpt-4o",
-                "model_provider": "openai",
-                "adapter_name": "test",
-            },
-        ),
+        model_name="gpt-4o",
+        model_provider="openai",
         parent=Eval(
             name="Test Eval",
             parent=task,
@@ -291,10 +291,6 @@ async def test_run_method():
                 ),
             ],
         ),
-        prompt=BasePrompt(
-            name="Test Prompt",
-            prompt="Test prompt",
-        ),
         properties={"eval_steps": ["test_step"]},
     )
 
@@ -311,7 +307,9 @@ async def test_run_method():
     evaluator = EvalTester(eval_config, run_config.run_config())
 
     # Run the evaluation
-    task_run, eval_scores = await evaluator.run("test input")
+    task_run, eval_scores, intermediate_outputs = await evaluator.run_task_and_eval(
+        "test input"
+    )
 
     # Verify task run was created
     assert task_run.input == "test input"
@@ -323,3 +321,162 @@ async def test_run_method():
 
     # Verify schema validation worked (these keys should exist per schema)
     assert set(eval_scores.keys()) == {"overall_rating", "quality"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval():
+    """Test run_task_and_eval method with mocked dependencies"""
+    # Create test data
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+                EvalOutputScore(
+                    name="Overall Rating",
+                    instruction="The overall rating for the task output",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    run_config = TaskRunConfig(
+        name="Test Run Config",
+        run_config_properties=RunConfigProperties(
+            model_name="llama_3_1_8b",
+            model_provider_name="groq",
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        ),
+        parent=task,
+    )
+
+    # Create evaluator instance
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"overall_rating": 5, "quality": 4}, {"thinking": "test thinking"}
+
+    evaluator = MockEval(eval_config, run_config.run_config())
+
+    # Mock dependencies
+    mock_adapter = AsyncMock()
+    mock_task_run = MagicMock()
+    mock_task_run.input = "test input"
+    mock_task_run.output.output = "test output"
+    mock_adapter.invoke.return_value = mock_task_run
+
+    with (
+        patch(
+            "kiln_ai.adapters.eval.base_eval.adapter_for_task"
+        ) as mock_adapter_for_task,
+        patch(
+            "kiln_ai.adapters.eval.base_eval.validate_schema_with_value_error"
+        ) as mock_validate,
+    ):
+        mock_adapter_for_task.return_value = mock_adapter
+
+        # Test with string input
+        result = await evaluator.run_task_and_eval("test input")
+
+        # Verify adapter_for_task was called with correct parameters
+        mock_adapter_for_task.assert_called_once()
+        assert mock_adapter_for_task.call_args[0][0] == evaluator.target_task
+        props = mock_adapter_for_task.call_args[0][1]
+        assert props.model_name == "llama_3_1_8b"
+        assert props.model_provider_name == "groq"
+        assert props.prompt_id == "simple_prompt_builder"
+        bac = mock_adapter_for_task.call_args[1]
+        assert bac["base_adapter_config"].allow_saving is False
+
+        # Verify the base_adapter_config has allow_saving=False
+        adapter_config = mock_adapter_for_task.call_args[1]["base_adapter_config"]
+        assert adapter_config.allow_saving is False
+
+        # Verify adapter.invoke was called with correct input
+        mock_adapter.invoke.assert_called_once_with("test input")
+
+        # Verify validate_schema_with_value_error was called
+        mock_validate.assert_called_once_with(
+            {"overall_rating": 5, "quality": 4},
+            evaluator.score_schema,
+            "Eval output does not match score schema.",
+        )
+
+        # Verify return values
+        task_run, eval_scores, intermediate_outputs = result
+        assert task_run == mock_task_run
+        assert eval_scores == {"overall_rating": 5, "quality": 4}
+        assert intermediate_outputs == {"thinking": "test thinking"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval_no_run_config():
+    """Test run_task_and_eval raises error when run_config is None"""
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    # Create evaluator instance with no run_config
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"quality": 4}, None
+
+    evaluator = MockEval(eval_config, None)
+
+    # Test that it raises ValueError
+    with pytest.raises(
+        ValueError, match="Run config is required for run_task_and_eval"
+    ):
+        await evaluator.run_task_and_eval("test input")
@@ -94,6 +94,7 @@ def mock_run_config(
             model_name="gpt-4",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
@@ -209,6 +210,7 @@ def test_collect_tasks_filtering(
             model_name="gpt-4",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
@@ -416,6 +418,7 @@ def test_collect_tasks_multiple_run_configs(
             model_name="gpt-3.5",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
@@ -99,6 +99,7 @@ def test_run_config(test_task):
         model_provider_name="groq",
         task=test_task,
         prompt_id="simple_prompt_builder",
+        structured_output_mode="json_schema",
     )
 
 
@@ -273,6 +274,36 @@ def test_token_case():
     assert token.lower() == token
 
 
+def test_generate_run_description(test_eval_config, test_run_config, test_task_run):
+    """Test that generate_run_description correctly uses task_run.output.output (the string) rather than task_run.output (the object)."""
+    # Create G-Eval instance
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    # Call generate_run_description
+    description = g_eval.generate_run_description(
+        test_task_run.input, test_task_run.output.output
+    )
+
+    # Verify that the actual string output is in the description
+    expected_output = "Why did the chicken cross the road? To get to the other side!"
+    assert expected_output in description
+
+    # Verify that the input is also in the description
+    assert "Tell me a chicken joke" in description
+
+    # Verify the description has the expected structure
+    assert "<eval_data>" in description
+    assert description.count("<eval_data>") == 2  # 2 opening tags
+    assert description.count("</eval_data>") == 2  # 2 closing tags
+    assert "The model was given the following input for the task:" in description
+    assert "The model produced the following output for the task:" in description
+
+    # Verify that we're getting the actual string value, not a Python object representation
+    # The string should not contain 'TaskOutput' or other object indicators
+    assert "TaskOutput" not in description
+    assert "output=" not in description  # Would appear if object __repr__ was used
+
+
 def test_metric_offsets_and_search_ranges(
     test_eval_config, test_run_config, test_task_run
 ):
@@ -400,7 +431,7 @@ def test_rating_token_to_score(test_eval_config, test_run_config):
 
     # Test single token case
     token_logprob = MockTokenLogprob("5", [("5", 0.0)], logprob=1e-8)  # log(1) = 0
-    score = g_eval.rating_token_to_score(token_logprob)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
     assert score == 5.0
 
     # Test weighted average case
@@ -412,20 +443,62 @@ def test_rating_token_to_score(test_eval_config, test_run_config):
         ],
         logprob=math.log(0.6),
     )
-    score = g_eval.rating_token_to_score(token_logprob)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
     assert pytest.approx(score) == 4.4  # (4 * 0.6 + 5 * 0.4)
 
     # Test invalid token
     token_logprob = MockTokenLogprob(":", [(":", 0.0)], logprob=1e-8)
-    assert g_eval.rating_token_to_score(token_logprob) is None
+    assert g_eval.rating_token_to_score(token_logprob) is None  # type: ignore
 
     # Test missing from top logprobs
     token_logprob = MockTokenLogprob("5", [], logprob=1e-8)
-    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0
+    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0  # type: ignore
 
     # Test missing from top logprobs, with special case logprob
     token_logprob = MockTokenLogprob("5", [], logprob=-9999)
-    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0
+    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0  # type: ignore
+
+
+def test_rating_token_to_score_zero_score_bug_fix(test_eval_config, test_run_config):
+    """Test that rating_token_to_score correctly handles 0.0 scores (like 'fail') and doesn't return None.
+
+    This test verifies the fix for the bug where 'if not primary_token_score:' would incorrectly
+    treat 0.0 as falsy and return None, when it should only return None for actual None values.
+    """
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    class MockTopLogprob:
+        def __init__(self, token, logprob):
+            self.token = token
+            self.logprob = logprob
+
+    class MockTokenLogprob:
+        def __init__(self, token, top_logprobs, logprob):
+            self.token = token
+            self.top_logprobs = [MockTopLogprob(t, lp) for t, lp in top_logprobs]
+            self.logprob = logprob
+
+    # Test that "fail" token (which maps to 0.0) is handled correctly
+    token_logprob = MockTokenLogprob("fail", [("fail", 0.0)], logprob=1e-8)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert score == 0.0, f"Expected 0.0 for 'fail' token, got {score}"
+
+    # Test that "0" token (which maps to None) still returns None
+    token_logprob = MockTokenLogprob("0", [("0", 0.0)], logprob=1e-8)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert score is None, f"Expected None for '0' token, got {score}"
+
+    # Test weighted average case with fail token
+    token_logprob = MockTokenLogprob(
+        "fail",
+        [
+            ("fail", math.log(0.7)),  # 70% probability for fail (0.0)
+            ("pass", math.log(0.3)),  # 30% probability for pass (1.0)
+        ],
+        logprob=math.log(0.7),
+    )
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert pytest.approx(score) == 0.3  # (0.0 * 0.7 + 1.0 * 0.3)
 
 
 def test_g_eval_system_instruction():
@@ -501,3 +574,41 @@ async def test_all_built_in_models_logprobs_geval(
         model_name,
         provider_name.value,
     )
+
+
+def check_supports_llm_as_judge(model_name: str, provider_name: str):
+    for model in built_in_models:
+        if model.name != model_name:
+            continue
+        for provider in model.providers:
+            if provider.name != provider_name:
+                continue
+            if not provider.supports_structured_output:
+                pytest.skip(
+                    f"Skipping {model.name} {provider.name} because it does not support llm_as_judge (structured_output_mode)"
+                )
+            return
+    raise RuntimeError(f"No model {model_name} {provider_name} found")
+
+
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_all_built_in_models_llm_as_judge(
+    model_name,
+    provider_name,
+    test_task,
+    test_eval_config,
+    test_task_run,
+    test_run_config,
+):
+    check_supports_llm_as_judge(model_name, provider_name)
+    await run_g_eval_test(
+        test_task,
+        test_eval_config,
+        test_task_run,
+        EvalConfigType.llm_as_judge,
+        test_run_config,
+        model_name,
+        provider_name.value,
+    )
@@ -3,14 +3,9 @@ from typing import Literal
 
 from pydantic import BaseModel
 
-from kiln_ai.adapters.ml_model_list import built_in_models
-from kiln_ai.datamodel import (
-    DatasetSplit,
-    FinetuneDataStrategy,
-    FineTuneStatusType,
-    Task,
-)
+from kiln_ai.datamodel import DatasetSplit, FineTuneStatusType, Task
 from kiln_ai.datamodel import Finetune as FinetuneModel
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.utils.name_generator import generate_memorable_name
 
 
@@ -62,7 +57,7 @@ class BaseFinetuneAdapter(ABC):
         train_split_name: str,
         system_message: str,
         thinking_instructions: str | None,
-        data_strategy: FinetuneDataStrategy,
+        data_strategy: ChatStrategy,
         parameters: dict[str, str | int | float | bool] = {},
         name: str | None = None,
         description: str | None = None,