kiln-ai 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.

This version of kiln-ai has been flagged as potentially problematic.

Files changed (54)
  1. kiln_ai/adapters/__init__.py +2 -0
  2. kiln_ai/adapters/adapter_registry.py +22 -44
  3. kiln_ai/adapters/chat/__init__.py +8 -0
  4. kiln_ai/adapters/chat/chat_formatter.py +234 -0
  5. kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
  6. kiln_ai/adapters/data_gen/test_data_gen_task.py +19 -6
  7. kiln_ai/adapters/eval/base_eval.py +8 -6
  8. kiln_ai/adapters/eval/eval_runner.py +4 -1
  9. kiln_ai/adapters/eval/g_eval.py +23 -5
  10. kiln_ai/adapters/eval/test_base_eval.py +166 -15
  11. kiln_ai/adapters/eval/test_eval_runner.py +3 -0
  12. kiln_ai/adapters/eval/test_g_eval.py +1 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +2 -2
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +138 -272
  15. kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
  16. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +287 -353
  17. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
  18. kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
  19. kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
  20. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +4 -4
  21. kiln_ai/adapters/fine_tune/together_finetune.py +12 -1
  22. kiln_ai/adapters/ml_model_list.py +80 -43
  23. kiln_ai/adapters/model_adapters/base_adapter.py +73 -26
  24. kiln_ai/adapters/model_adapters/litellm_adapter.py +79 -97
  25. kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
  26. kiln_ai/adapters/model_adapters/test_base_adapter.py +235 -60
  27. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +56 -21
  28. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -0
  29. kiln_ai/adapters/model_adapters/test_structured_output.py +44 -12
  30. kiln_ai/adapters/prompt_builders.py +0 -16
  31. kiln_ai/adapters/provider_tools.py +27 -9
  32. kiln_ai/adapters/repair/test_repair_task.py +24 -3
  33. kiln_ai/adapters/test_adapter_registry.py +88 -28
  34. kiln_ai/adapters/test_ml_model_list.py +158 -0
  35. kiln_ai/adapters/test_prompt_adaptors.py +17 -3
  36. kiln_ai/adapters/test_prompt_builders.py +3 -16
  37. kiln_ai/adapters/test_provider_tools.py +69 -20
  38. kiln_ai/datamodel/__init__.py +0 -2
  39. kiln_ai/datamodel/datamodel_enums.py +38 -13
  40. kiln_ai/datamodel/finetune.py +12 -7
  41. kiln_ai/datamodel/task.py +68 -7
  42. kiln_ai/datamodel/test_basemodel.py +2 -1
  43. kiln_ai/datamodel/test_dataset_split.py +0 -8
  44. kiln_ai/datamodel/test_models.py +33 -10
  45. kiln_ai/datamodel/test_task.py +168 -2
  46. kiln_ai/utils/config.py +3 -2
  47. kiln_ai/utils/dataset_import.py +1 -1
  48. kiln_ai/utils/logging.py +165 -0
  49. kiln_ai/utils/test_config.py +23 -0
  50. kiln_ai/utils/test_dataset_import.py +30 -0
  51. {kiln_ai-0.16.0.dist-info → kiln_ai-0.17.0.dist-info}/METADATA +1 -1
  52. {kiln_ai-0.16.0.dist-info → kiln_ai-0.17.0.dist-info}/RECORD +54 -49
  53. {kiln_ai-0.16.0.dist-info → kiln_ai-0.17.0.dist-info}/WHEEL +0 -0
  54. {kiln_ai-0.16.0.dist-info → kiln_ai-0.17.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -207,5 +207,8 @@ class EvalRunner:
 
             return True
         except Exception as e:
-            logger.error(f"Error running eval job for dataset item {job.item.id}: {e}")
+            logger.error(
+                f"Error running eval job for dataset item {job.item.id}: {e}",
+                exc_info=True,
+            )
             return False
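The hunk above is from kiln_ai/adapters/eval/eval_runner.py (+4 -1 in the file list): the eval-job error log now passes exc_info=True, so the full traceback is recorded instead of only the one-line message. A minimal sketch of the effect, using a stand-in job value rather than the real Kiln type:

import logging

logger = logging.getLogger(__name__)

def run_job(job) -> bool:
    try:
        raise RuntimeError("provider timeout")  # placeholder failure
    except Exception as e:
        # exc_info=True attaches the active exception's traceback to the log
        # record, which the f-string message alone would not capture.
        logger.error(f"Error running eval job for dataset item {job}: {e}", exc_info=True)
        return False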
@@ -5,11 +5,14 @@ from litellm.types.utils import ChatCompletionTokenLogprob
 
 from kiln_ai.adapters.adapter_registry import adapter_for_task
 from kiln_ai.adapters.eval.base_eval import BaseEval
+from kiln_ai.adapters.ml_model_list import (
+    default_structured_output_mode_for_model_provider,
+)
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
 from kiln_ai.adapters.prompt_builders import PromptGenerators
 from kiln_ai.datamodel import Project, Task, TaskRun
 from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
-from kiln_ai.datamodel.task import RunConfig
+from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, StructuredOutputMode
 
 # all the tokens we score for, and their float scores.
 TOKEN_TO_SCORE_MAP: Dict[str, float] = {
@@ -114,12 +117,27 @@ class GEval(BaseEval):
             10 if self.eval_config.config_type == EvalConfigType.g_eval else None
         )
 
-        adapter = adapter_for_task(
-            self.geval_task,
+        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
+        structured_output_mode = default_structured_output_mode_for_model_provider(
             model_name,
             provider,
-            # We always use Simple COT for G-Eval and LLM as Judge
-            prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+            default=StructuredOutputMode.json_schema,
+            # G-eval expects JSON, so don't allow function calling modes
+            disallowed_modes=[
+                StructuredOutputMode.function_calling,
+                StructuredOutputMode.function_calling_weak,
+            ],
+        )
+
+        adapter = adapter_for_task(
+            self.geval_task,
+            run_config_properties=RunConfigProperties(
+                model_name=model_name,
+                model_provider_name=provider,
+                # We always use Simple COT for G-Eval and LLM as Judge
+                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+                structured_output_mode=structured_output_mode,
+            ),
             base_adapter_config=AdapterConfig(
                 # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                 allow_saving=False,
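These two hunks are from kiln_ai/adapters/eval/g_eval.py (+23 -5 in the file list). The judge adapter is now built from a RunConfigProperties bundle rather than loose model/provider/prompt arguments, and the structured output mode is picked via default_structured_output_mode_for_model_provider, excluding function-calling modes because G-Eval expects JSON. A sketch of the new call shape from the caller's side, assuming my_task is an existing kiln_ai.datamodel.Task; the field values are illustrative only:

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.datamodel.task import RunConfigProperties, StructuredOutputMode

adapter = adapter_for_task(
    my_task,  # an existing kiln_ai.datamodel.Task (assumed)
    run_config_properties=RunConfigProperties(
        model_name="gpt-4o",
        model_provider_name="openai",
        prompt_id="simple_prompt_builder",
        structured_output_mode=StructuredOutputMode.json_schema,
    ),
)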
@@ -1,9 +1,9 @@
 import json
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 
 from kiln_ai.adapters.eval.base_eval import BaseEval
-from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore
 from kiln_ai.datamodel.task import (
     RunConfigProperties,
@@ -245,7 +245,7 @@ class EvalTester(BaseEval):
     """Test implementation of BaseEval"""
 
     async def run_eval(self, task_run):
-        return {"overall_rating": 5, "quality": 4}
+        return {"overall_rating": 5, "quality": 4}, None
 
 
 @pytest.mark.paid
@@ -265,14 +265,8 @@ async def test_run_method():
 
     eval_config = EvalConfig(
         name="Test Eval Config",
-        model=DataSource(
-            type=DataSourceType.synthetic,
-            properties={
-                "model_name": "gpt-4o",
-                "model_provider": "openai",
-                "adapter_name": "test",
-            },
-        ),
+        model_name="gpt-4o",
+        model_provider="openai",
         parent=Eval(
             name="Test Eval",
             parent=task,
@@ -291,10 +285,6 @@
                 ),
             ],
         ),
-        prompt=BasePrompt(
-            name="Test Prompt",
-            prompt="Test prompt",
-        ),
         properties={"eval_steps": ["test_step"]},
     )
 
@@ -311,7 +301,9 @@
     evaluator = EvalTester(eval_config, run_config.run_config())
 
     # Run the evaluation
-    task_run, eval_scores = await evaluator.run("test input")
+    task_run, eval_scores, intermediate_outputs = await evaluator.run_task_and_eval(
+        "test input"
+    )
 
     # Verify task run was created
     assert task_run.input == "test input"
@@ -323,3 +315,162 @@
 
     # Verify schema validation worked (these keys should exist per schema)
     assert set(eval_scores.keys()) == {"overall_rating", "quality"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval():
+    """Test run_task_and_eval method with mocked dependencies"""
+    # Create test data
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+                EvalOutputScore(
+                    name="Overall Rating",
+                    instruction="The overall rating for the task output",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    run_config = TaskRunConfig(
+        name="Test Run Config",
+        run_config_properties=RunConfigProperties(
+            model_name="llama_3_1_8b",
+            model_provider_name="groq",
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        ),
+        parent=task,
+    )
+
+    # Create evaluator instance
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"overall_rating": 5, "quality": 4}, {"thinking": "test thinking"}
+
+    evaluator = MockEval(eval_config, run_config.run_config())
+
+    # Mock dependencies
+    mock_adapter = AsyncMock()
+    mock_task_run = MagicMock()
+    mock_task_run.input = "test input"
+    mock_task_run.output.output = "test output"
+    mock_adapter.invoke.return_value = mock_task_run
+
+    with (
+        patch(
+            "kiln_ai.adapters.eval.base_eval.adapter_for_task"
+        ) as mock_adapter_for_task,
+        patch(
+            "kiln_ai.adapters.eval.base_eval.validate_schema_with_value_error"
+        ) as mock_validate,
+    ):
+        mock_adapter_for_task.return_value = mock_adapter
+
+        # Test with string input
+        result = await evaluator.run_task_and_eval("test input")
+
+        # Verify adapter_for_task was called with correct parameters
+        mock_adapter_for_task.assert_called_once()
+        assert mock_adapter_for_task.call_args[0][0] == evaluator.target_task
+        props = mock_adapter_for_task.call_args[0][1]
+        assert props.model_name == "llama_3_1_8b"
+        assert props.model_provider_name == "groq"
+        assert props.prompt_id == "simple_prompt_builder"
+        bac = mock_adapter_for_task.call_args[1]
+        assert bac["base_adapter_config"].allow_saving is False
+
+        # Verify the base_adapter_config has allow_saving=False
+        adapter_config = mock_adapter_for_task.call_args[1]["base_adapter_config"]
+        assert adapter_config.allow_saving is False
+
+        # Verify adapter.invoke was called with correct input
+        mock_adapter.invoke.assert_called_once_with("test input")
+
+        # Verify validate_schema_with_value_error was called
+        mock_validate.assert_called_once_with(
+            {"overall_rating": 5, "quality": 4},
+            evaluator.score_schema,
+            "Eval output does not match score schema.",
+        )
+
+        # Verify return values
+        task_run, eval_scores, intermediate_outputs = result
+        assert task_run == mock_task_run
+        assert eval_scores == {"overall_rating": 5, "quality": 4}
+        assert intermediate_outputs == {"thinking": "test thinking"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval_no_run_config():
+    """Test run_task_and_eval raises error when run_config is None"""
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    # Create evaluator instance with no run_config
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"quality": 4}, None
+
+    evaluator = MockEval(eval_config, None)
+
+    # Test that it raises ValueError
+    with pytest.raises(
+        ValueError, match="Run config is required for run_task_and_eval"
+    ):
+        await evaluator.run_task_and_eval("test input")
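The test changes above are for kiln_ai/adapters/eval/test_base_eval.py (+166 -15 in the file list) and reflect a new BaseEval contract: run_eval now returns a (scores, intermediate_outputs) pair, and the old run entry point appears as run_task_and_eval, which returns the task run, the scores, and the intermediate outputs, and requires a run config. A minimal sketch of a subclass under that contract, assuming only what the tests above show; the class name and values are illustrative:

from kiln_ai.adapters.eval.base_eval import BaseEval


class ExampleEval(BaseEval):
    async def run_eval(self, task_run):
        scores = {"overall_rating": 5, "quality": 4}
        # Return None as the second element when there is nothing intermediate to report.
        intermediate_outputs = {"thinking": "reasoning behind the scores"}
        return scores, intermediate_outputs


# Callers unpack three values from run_task_and_eval:
#   task_run, eval_scores, intermediate_outputs = await evaluator.run_task_and_eval("test input")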
@@ -94,6 +94,7 @@ def mock_run_config(
             model_name="gpt-4",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
@@ -209,6 +210,7 @@ def test_collect_tasks_filtering(
             model_name="gpt-4",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
@@ -416,6 +418,7 @@ def test_collect_tasks_multiple_run_configs(
416
418
  model_name="gpt-3.5",
417
419
  model_provider_name="openai",
418
420
  prompt_id="simple_prompt_builder",
421
+ structured_output_mode="json_schema",
419
422
  ),
420
423
  parent=mock_task,
421
424
  )
@@ -99,6 +99,7 @@ def test_run_config(test_task):
         model_provider_name="groq",
         task=test_task,
         prompt_id="simple_prompt_builder",
+        structured_output_mode="json_schema",
     )
 
 
@@ -6,11 +6,11 @@ from pydantic import BaseModel
 from kiln_ai.adapters.ml_model_list import built_in_models
 from kiln_ai.datamodel import (
     DatasetSplit,
-    FinetuneDataStrategy,
     FineTuneStatusType,
     Task,
 )
 from kiln_ai.datamodel import Finetune as FinetuneModel
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.utils.name_generator import generate_memorable_name
 
 
@@ -62,7 +62,7 @@ class BaseFinetuneAdapter(ABC):
         train_split_name: str,
         system_message: str,
         thinking_instructions: str | None,
-        data_strategy: FinetuneDataStrategy,
+        data_strategy: ChatStrategy,
         parameters: dict[str, str | int | float | bool] = {},
         name: str | None = None,
         description: str | None = None,
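These last two hunks are from kiln_ai/adapters/fine_tune/base_finetune.py (+2 -2 in the file list): the data_strategy parameter is now typed as ChatStrategy, imported from kiln_ai.datamodel.datamodel_enums, replacing the FinetuneDataStrategy import from kiln_ai.datamodel. A hedged sketch of the import-side change for downstream code; the call site is left as a comment because the enclosing method's name is not visible in this hunk:

# Before (0.16.0), per the removed import:
# from kiln_ai.datamodel import FinetuneDataStrategy

# After (0.17.0), per the added import:
from kiln_ai.datamodel.datamodel_enums import ChatStrategy

# Pass a ChatStrategy member wherever data_strategy is expected, e.g.:
# some_finetune_entry_point(..., data_strategy=<ChatStrategy member>, ...)  # hypothetical call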