kiln-ai 0.15.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This version of kiln-ai has been flagged as potentially problematic.

Files changed (72)
  1. kiln_ai/adapters/__init__.py +2 -0
  2. kiln_ai/adapters/adapter_registry.py +22 -44
  3. kiln_ai/adapters/chat/__init__.py +8 -0
  4. kiln_ai/adapters/chat/chat_formatter.py +234 -0
  5. kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
  6. kiln_ai/adapters/data_gen/test_data_gen_task.py +19 -6
  7. kiln_ai/adapters/eval/base_eval.py +8 -6
  8. kiln_ai/adapters/eval/eval_runner.py +9 -65
  9. kiln_ai/adapters/eval/g_eval.py +26 -8
  10. kiln_ai/adapters/eval/test_base_eval.py +166 -15
  11. kiln_ai/adapters/eval/test_eval_runner.py +3 -0
  12. kiln_ai/adapters/eval/test_g_eval.py +1 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +2 -2
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +153 -197
  15. kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
  16. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +402 -211
  17. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
  18. kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
  19. kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
  20. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +4 -4
  21. kiln_ai/adapters/fine_tune/together_finetune.py +12 -1
  22. kiln_ai/adapters/ml_model_list.py +556 -45
  23. kiln_ai/adapters/model_adapters/base_adapter.py +100 -35
  24. kiln_ai/adapters/model_adapters/litellm_adapter.py +116 -100
  25. kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
  26. kiln_ai/adapters/model_adapters/test_base_adapter.py +299 -52
  27. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +121 -22
  28. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +44 -2
  29. kiln_ai/adapters/model_adapters/test_structured_output.py +48 -18
  30. kiln_ai/adapters/parsers/base_parser.py +0 -3
  31. kiln_ai/adapters/parsers/parser_registry.py +5 -3
  32. kiln_ai/adapters/parsers/r1_parser.py +17 -2
  33. kiln_ai/adapters/parsers/request_formatters.py +40 -0
  34. kiln_ai/adapters/parsers/test_parser_registry.py +2 -2
  35. kiln_ai/adapters/parsers/test_r1_parser.py +44 -1
  36. kiln_ai/adapters/parsers/test_request_formatters.py +76 -0
  37. kiln_ai/adapters/prompt_builders.py +14 -17
  38. kiln_ai/adapters/provider_tools.py +39 -4
  39. kiln_ai/adapters/repair/test_repair_task.py +27 -5
  40. kiln_ai/adapters/test_adapter_registry.py +88 -28
  41. kiln_ai/adapters/test_ml_model_list.py +158 -0
  42. kiln_ai/adapters/test_prompt_adaptors.py +17 -3
  43. kiln_ai/adapters/test_prompt_builders.py +27 -19
  44. kiln_ai/adapters/test_provider_tools.py +130 -12
  45. kiln_ai/datamodel/__init__.py +2 -2
  46. kiln_ai/datamodel/datamodel_enums.py +43 -4
  47. kiln_ai/datamodel/dataset_filters.py +69 -1
  48. kiln_ai/datamodel/dataset_split.py +4 -0
  49. kiln_ai/datamodel/eval.py +8 -0
  50. kiln_ai/datamodel/finetune.py +13 -7
  51. kiln_ai/datamodel/prompt_id.py +1 -0
  52. kiln_ai/datamodel/task.py +68 -7
  53. kiln_ai/datamodel/task_output.py +1 -1
  54. kiln_ai/datamodel/task_run.py +39 -7
  55. kiln_ai/datamodel/test_basemodel.py +5 -8
  56. kiln_ai/datamodel/test_dataset_filters.py +82 -0
  57. kiln_ai/datamodel/test_dataset_split.py +2 -8
  58. kiln_ai/datamodel/test_example_models.py +54 -0
  59. kiln_ai/datamodel/test_models.py +80 -9
  60. kiln_ai/datamodel/test_task.py +168 -2
  61. kiln_ai/utils/async_job_runner.py +106 -0
  62. kiln_ai/utils/config.py +3 -2
  63. kiln_ai/utils/dataset_import.py +81 -19
  64. kiln_ai/utils/logging.py +165 -0
  65. kiln_ai/utils/test_async_job_runner.py +199 -0
  66. kiln_ai/utils/test_config.py +23 -0
  67. kiln_ai/utils/test_dataset_import.py +272 -10
  68. {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/METADATA +1 -1
  69. kiln_ai-0.17.0.dist-info/RECORD +113 -0
  70. kiln_ai-0.15.0.dist-info/RECORD +0 -104
  71. {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/WHEEL +0 -0
  72. {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/licenses/LICENSE.txt +0 -0

--- a/kiln_ai/adapters/eval/eval_runner.py
+++ b/kiln_ai/adapters/eval/eval_runner.py
@@ -1,4 +1,3 @@
-import asyncio
 import logging
 from dataclasses import dataclass
 from typing import AsyncGenerator, Dict, List, Literal, Set
@@ -10,6 +9,7 @@ from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
 from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
 from kiln_ai.datamodel.task import TaskRunConfig
 from kiln_ai.datamodel.task_run import TaskRun
+from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress
 
 logger = logging.getLogger(__name__)
 
@@ -23,13 +23,6 @@ class EvalJob:
     task_run_config: TaskRunConfig | None = None
 
 
-@dataclass
-class EvalProgress:
-    complete: int | None = None
-    total: int | None = None
-    errors: int | None = None
-
-
 class EvalRunner:
     """
     Runs an eval. Async execution is supported to make it faster when using remote/fast model providers.
@@ -161,67 +154,15 @@ class EvalRunner:
             if task_run.id not in already_run[eval_config.id][run_config.id]
         ]
 
-    async def run(self, concurrency: int = 25) -> AsyncGenerator[EvalProgress, None]:
+    async def run(self, concurrency: int = 25) -> AsyncGenerator[Progress, None]:
         """
         Runs the configured eval run with parallel workers and yields progress updates.
         """
         jobs = self.collect_tasks()
 
-        complete = 0
-        errors = 0
-        total = len(jobs)
-
-        # Send initial status
-        yield EvalProgress(complete=complete, total=total, errors=errors)
-
-        worker_queue: asyncio.Queue[EvalJob] = asyncio.Queue()
-        for job in jobs:
-            worker_queue.put_nowait(job)
-
-        # simple status queue to return progress. True=success, False=error
-        status_queue: asyncio.Queue[bool] = asyncio.Queue()
-
-        workers = []
-        for i in range(concurrency):
-            task = asyncio.create_task(self.run_worker(worker_queue, status_queue))
-            workers.append(task)
-
-        # Send status updates until workers are done, and they are all sent
-        while not status_queue.empty() or not all(worker.done() for worker in workers):
-            try:
-                # Use timeout to prevent hanging if all workers complete
-                # between our while condition check and get()
-                success = await asyncio.wait_for(status_queue.get(), timeout=0.1)
-                if success:
-                    complete += 1
-                else:
-                    errors += 1
-
-                yield EvalProgress(complete=complete, total=total, errors=errors)
-            except asyncio.TimeoutError:
-                # Timeout is expected, just continue to recheck worker status
-                # Don't love this but beats sentinels for reliability
-                continue
-
-        # These are redundant, but keeping them will catch async errors
-        await asyncio.gather(*workers)
-        await worker_queue.join()
-
-    async def run_worker(
-        self, worker_queue: asyncio.Queue[EvalJob], status_queue: asyncio.Queue[bool]
-    ):
-        while True:
-            try:
-                job = worker_queue.get_nowait()
-            except asyncio.QueueEmpty:
-                # worker can end when the queue is empty
-                break
-            try:
-                success = await self.run_job(job)
-                await status_queue.put(success)
-            finally:
-                # Always mark the dequeued task as done, even on exceptions
-                worker_queue.task_done()
+        runner = AsyncJobRunner(concurrency=concurrency)
+        async for progress in runner.run(jobs, self.run_job):
+            yield progress
 
     async def run_job(self, job: EvalJob) -> bool:
         try:
@@ -266,5 +207,8 @@ class EvalRunner:
 
             return True
         except Exception as e:
-            logger.error(f"Error running eval job for dataset item {job.item.id}: {e}")
+            logger.error(
+                f"Error running eval job for dataset item {job.item.id}: {e}",
+                exc_info=True,
+            )
             return False
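
The hand-rolled worker-queue loop removed above now lives behind the new AsyncJobRunner utility (kiln_ai/utils/async_job_runner.py, added in this release along with its own tests). The sketch below illustrates the pattern as inferred from the call sites in this diff; the Progress fields mirror the removed EvalProgress dataclass, and the shipped implementation may differ in detail.

# Minimal sketch only: interface inferred from the call sites above, not the shipped code.
import asyncio
from dataclasses import dataclass
from typing import AsyncGenerator, Awaitable, Callable, List, TypeVar

T = TypeVar("T")


@dataclass
class Progress:
    complete: int = 0
    total: int = 0
    errors: int = 0


class AsyncJobRunner:
    def __init__(self, concurrency: int = 25):
        self.concurrency = concurrency

    async def run(
        self, jobs: List[T], run_job: Callable[[T], Awaitable[bool]]
    ) -> AsyncGenerator[Progress, None]:
        progress = Progress(total=len(jobs))
        yield progress  # initial status before any work starts

        queue: asyncio.Queue[T] = asyncio.Queue()
        for job in jobs:
            queue.put_nowait(job)
        results: asyncio.Queue[bool] = asyncio.Queue()

        async def worker() -> None:
            # Each worker drains the shared queue until it is empty.
            while True:
                try:
                    job = queue.get_nowait()
                except asyncio.QueueEmpty:
                    return
                try:
                    await results.put(await run_job(job))
                finally:
                    queue.task_done()

        workers = [asyncio.create_task(worker()) for _ in range(self.concurrency)]

        # Yield a progress update per finished job; poll with a timeout so we
        # notice when all workers have exited.
        while not results.empty() or not all(w.done() for w in workers):
            try:
                success = await asyncio.wait_for(results.get(), timeout=0.1)
            except asyncio.TimeoutError:
                continue
            if success:
                progress.complete += 1
            else:
                progress.errors += 1
            yield progress

        # Surface any exceptions raised inside workers.
        await asyncio.gather(*workers)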

--- a/kiln_ai/adapters/eval/g_eval.py
+++ b/kiln_ai/adapters/eval/g_eval.py
@@ -5,11 +5,14 @@ from litellm.types.utils import ChatCompletionTokenLogprob
 
 from kiln_ai.adapters.adapter_registry import adapter_for_task
 from kiln_ai.adapters.eval.base_eval import BaseEval
+from kiln_ai.adapters.ml_model_list import (
+    default_structured_output_mode_for_model_provider,
+)
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
 from kiln_ai.adapters.prompt_builders import PromptGenerators
 from kiln_ai.datamodel import Project, Task, TaskRun
 from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
-from kiln_ai.datamodel.task import RunConfig
+from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, StructuredOutputMode
 
 # all the tokens we score for, and their float scores.
 TOKEN_TO_SCORE_MAP: Dict[str, float] = {
@@ -43,9 +46,9 @@ class GEvalTask(Task, parent_of={}):
 
         # Build the COT eval instructions
         cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
-        steps = eval_config.properties.get("eval_steps", None)
-        if not steps or not isinstance(steps, list):
-            raise ValueError("eval_steps must be a list")
+        steps = eval_config.properties.get("eval_steps", [])
+        if not isinstance(steps, list):
+            raise ValueError("eval_steps must be a list.")
         for i, step in enumerate(steps):
            cot_instructions += f"{i + 1}) {step}\n"
 
@@ -114,12 +117,27 @@ class GEval(BaseEval):
             10 if self.eval_config.config_type == EvalConfigType.g_eval else None
         )
 
-        adapter = adapter_for_task(
-            self.geval_task,
+        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
+        structured_output_mode = default_structured_output_mode_for_model_provider(
             model_name,
             provider,
-            # We always use Simple COT for G-Eval and LLM as Judge
-            prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+            default=StructuredOutputMode.json_schema,
+            # G-eval expects JSON, so don't allow function calling modes
+            disallowed_modes=[
+                StructuredOutputMode.function_calling,
+                StructuredOutputMode.function_calling_weak,
+            ],
+        )
+
+        adapter = adapter_for_task(
+            self.geval_task,
+            run_config_properties=RunConfigProperties(
+                model_name=model_name,
+                model_provider_name=provider,
+                # We always use Simple COT for G-Eval and LLM as Judge
+                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+                structured_output_mode=structured_output_mode,
+            ),
             base_adapter_config=AdapterConfig(
                 # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                 allow_saving=False,
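
G-Eval needs plain JSON output, so the judge's structured output mode is now resolved per model/provider while excluding function-calling modes. A usage sketch of the new helper follows; the signature is taken from the call site above, while the model and provider values are placeholders, not from this diff.

# Illustrative call only: model/provider names are placeholders and the helper's
# internals are not shown in this diff.
from kiln_ai.adapters.ml_model_list import (
    default_structured_output_mode_for_model_provider,
)
from kiln_ai.datamodel.task import StructuredOutputMode

mode = default_structured_output_mode_for_model_provider(
    "gpt_4o",  # placeholder model name
    "openai",  # placeholder provider name
    default=StructuredOutputMode.json_schema,
    disallowed_modes=[
        StructuredOutputMode.function_calling,
        StructuredOutputMode.function_calling_weak,
    ],
)
# G-Eval then passes the resolved mode into RunConfigProperties(structured_output_mode=mode).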

--- a/kiln_ai/adapters/eval/test_base_eval.py
+++ b/kiln_ai/adapters/eval/test_base_eval.py
@@ -1,9 +1,9 @@
 import json
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 
 from kiln_ai.adapters.eval.base_eval import BaseEval
-from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore
 from kiln_ai.datamodel.task import (
     RunConfigProperties,
@@ -245,7 +245,7 @@ class EvalTester(BaseEval):
     """Test implementation of BaseEval"""
 
     async def run_eval(self, task_run):
-        return {"overall_rating": 5, "quality": 4}
+        return {"overall_rating": 5, "quality": 4}, None
 
 
 @pytest.mark.paid
@@ -265,14 +265,8 @@ async def test_run_method():
 
     eval_config = EvalConfig(
         name="Test Eval Config",
-        model=DataSource(
-            type=DataSourceType.synthetic,
-            properties={
-                "model_name": "gpt-4o",
-                "model_provider": "openai",
-                "adapter_name": "test",
-            },
-        ),
+        model_name="gpt-4o",
+        model_provider="openai",
         parent=Eval(
             name="Test Eval",
             parent=task,
@@ -291,10 +285,6 @@ async def test_run_method():
                 ),
             ],
         ),
-        prompt=BasePrompt(
-            name="Test Prompt",
-            prompt="Test prompt",
-        ),
         properties={"eval_steps": ["test_step"]},
     )
 
@@ -311,7 +301,9 @@ async def test_run_method():
     evaluator = EvalTester(eval_config, run_config.run_config())
 
     # Run the evaluation
-    task_run, eval_scores = await evaluator.run("test input")
+    task_run, eval_scores, intermediate_outputs = await evaluator.run_task_and_eval(
+        "test input"
+    )
 
     # Verify task run was created
     assert task_run.input == "test input"
@@ -323,3 +315,162 @@ async def test_run_method():
 
     # Verify schema validation worked (these keys should exist per schema)
     assert set(eval_scores.keys()) == {"overall_rating", "quality"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval():
+    """Test run_task_and_eval method with mocked dependencies"""
+    # Create test data
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+                EvalOutputScore(
+                    name="Overall Rating",
+                    instruction="The overall rating for the task output",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    run_config = TaskRunConfig(
+        name="Test Run Config",
+        run_config_properties=RunConfigProperties(
+            model_name="llama_3_1_8b",
+            model_provider_name="groq",
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        ),
+        parent=task,
+    )
+
+    # Create evaluator instance
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"overall_rating": 5, "quality": 4}, {"thinking": "test thinking"}
+
+    evaluator = MockEval(eval_config, run_config.run_config())
+
+    # Mock dependencies
+    mock_adapter = AsyncMock()
+    mock_task_run = MagicMock()
+    mock_task_run.input = "test input"
+    mock_task_run.output.output = "test output"
+    mock_adapter.invoke.return_value = mock_task_run
+
+    with (
+        patch(
+            "kiln_ai.adapters.eval.base_eval.adapter_for_task"
+        ) as mock_adapter_for_task,
+        patch(
+            "kiln_ai.adapters.eval.base_eval.validate_schema_with_value_error"
+        ) as mock_validate,
+    ):
+        mock_adapter_for_task.return_value = mock_adapter
+
+        # Test with string input
+        result = await evaluator.run_task_and_eval("test input")
+
+        # Verify adapter_for_task was called with correct parameters
+        mock_adapter_for_task.assert_called_once()
+        assert mock_adapter_for_task.call_args[0][0] == evaluator.target_task
+        props = mock_adapter_for_task.call_args[0][1]
+        assert props.model_name == "llama_3_1_8b"
+        assert props.model_provider_name == "groq"
+        assert props.prompt_id == "simple_prompt_builder"
+        bac = mock_adapter_for_task.call_args[1]
+        assert bac["base_adapter_config"].allow_saving is False
+
+        # Verify the base_adapter_config has allow_saving=False
+        adapter_config = mock_adapter_for_task.call_args[1]["base_adapter_config"]
+        assert adapter_config.allow_saving is False
+
+        # Verify adapter.invoke was called with correct input
+        mock_adapter.invoke.assert_called_once_with("test input")
+
+        # Verify validate_schema_with_value_error was called
+        mock_validate.assert_called_once_with(
+            {"overall_rating": 5, "quality": 4},
+            evaluator.score_schema,
+            "Eval output does not match score schema.",
+        )
+
+        # Verify return values
+        task_run, eval_scores, intermediate_outputs = result
+        assert task_run == mock_task_run
+        assert eval_scores == {"overall_rating": 5, "quality": 4}
+        assert intermediate_outputs == {"thinking": "test thinking"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval_no_run_config():
+    """Test run_task_and_eval raises error when run_config is None"""
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    # Create evaluator instance with no run_config
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"quality": 4}, None
+
+    evaluator = MockEval(eval_config, None)
+
+    # Test that it raises ValueError
+    with pytest.raises(
+        ValueError, match="Run config is required for run_task_and_eval"
+    ):
+        await evaluator.run_task_and_eval("test input")
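
These tests imply a changed evaluator contract: run_eval now returns a (scores, intermediate_outputs) tuple, and the combined entry point is run_task_and_eval, which returns (task_run, eval_scores, intermediate_outputs). A minimal subclass under that contract might look like the sketch below; the evaluator itself is hypothetical, with the constructor and return shapes taken from the tests above.

# Hypothetical evaluator, sketched against the contract shown in the tests above.
from kiln_ai.adapters.eval.base_eval import BaseEval


class LengthEval(BaseEval):
    """Toy example: scores an output purely on its length."""

    async def run_eval(self, task_run):
        # Keys must match the eval's output_scores schema.
        scores = {"overall_rating": 5.0 if len(task_run.output.output) < 500 else 3.0}
        intermediate_outputs = {"thinking": "Scored on output length only."}
        return scores, intermediate_outputs


# Usage shape, as exercised by the tests above (assumed, not shipped code):
#   task_run, scores, thinking = await LengthEval(eval_config, run_config).run_task_and_eval("input")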

--- a/kiln_ai/adapters/eval/test_eval_runner.py
+++ b/kiln_ai/adapters/eval/test_eval_runner.py
@@ -94,6 +94,7 @@ def mock_run_config(
             model_name="gpt-4",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
@@ -209,6 +210,7 @@ def test_collect_tasks_filtering(
             model_name="gpt-4",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
        ),
         parent=mock_task,
     )
@@ -416,6 +418,7 @@ def test_collect_tasks_multiple_run_configs(
             model_name="gpt-3.5",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )

--- a/kiln_ai/adapters/eval/test_g_eval.py
+++ b/kiln_ai/adapters/eval/test_g_eval.py
@@ -99,6 +99,7 @@ def test_run_config(test_task):
         model_provider_name="groq",
         task=test_task,
         prompt_id="simple_prompt_builder",
+        structured_output_mode="json_schema",
     )
 
 
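
Across the test updates above, RunConfigProperties now carries an explicit structured_output_mode. Downstream code that builds run configs would need the same addition; a hedged before/after sketch, with values taken from these tests:

from kiln_ai.datamodel.task import RunConfigProperties

# 0.15.x style (structured_output_mode not set explicitly):
# props = RunConfigProperties(
#     model_name="llama_3_1_8b",
#     model_provider_name="groq",
#     prompt_id="simple_prompt_builder",
# )

# 0.17.0, as constructed throughout these tests:
props = RunConfigProperties(
    model_name="llama_3_1_8b",
    model_provider_name="groq",
    prompt_id="simple_prompt_builder",
    structured_output_mode="json_schema",
)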

--- a/kiln_ai/adapters/fine_tune/base_finetune.py
+++ b/kiln_ai/adapters/fine_tune/base_finetune.py
@@ -6,11 +6,11 @@ from pydantic import BaseModel
 from kiln_ai.adapters.ml_model_list import built_in_models
 from kiln_ai.datamodel import (
     DatasetSplit,
-    FinetuneDataStrategy,
     FineTuneStatusType,
     Task,
 )
 from kiln_ai.datamodel import Finetune as FinetuneModel
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.utils.name_generator import generate_memorable_name
 
 
@@ -62,7 +62,7 @@ class BaseFinetuneAdapter(ABC):
         train_split_name: str,
         system_message: str,
         thinking_instructions: str | None,
-        data_strategy: FinetuneDataStrategy,
+        data_strategy: ChatStrategy,
         parameters: dict[str, str | int | float | bool] = {},
         name: str | None = None,
         description: str | None = None,
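
The fine-tune API now types data_strategy as ChatStrategy, imported from kiln_ai.datamodel.datamodel_enums, rather than FinetuneDataStrategy. A hedged sketch of the import change for callers; the specific enum members shown are assumptions, not taken from this diff.

# Before (0.15.x); member name from older releases, verify against your version:
# from kiln_ai.datamodel import FinetuneDataStrategy
# strategy = FinetuneDataStrategy.final_only

# After (0.17.0):
from kiln_ai.datamodel.datamodel_enums import ChatStrategy

strategy = ChatStrategy.single_turn  # member name assumed; check datamodel_enums for actual values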