kiln-ai 0.14.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiln_ai/adapters/eval/base_eval.py +7 -2
- kiln_ai/adapters/eval/eval_runner.py +5 -64
- kiln_ai/adapters/eval/g_eval.py +3 -3
- kiln_ai/adapters/fine_tune/base_finetune.py +6 -3
- kiln_ai/adapters/fine_tune/dataset_formatter.py +128 -38
- kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +2 -1
- kiln_ai/adapters/fine_tune/test_base_finetune.py +7 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +267 -10
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +586 -0
- kiln_ai/adapters/fine_tune/vertex_finetune.py +217 -0
- kiln_ai/adapters/ml_model_list.py +817 -62
- kiln_ai/adapters/model_adapters/base_adapter.py +33 -10
- kiln_ai/adapters/model_adapters/litellm_adapter.py +51 -12
- kiln_ai/adapters/model_adapters/test_base_adapter.py +74 -2
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +65 -1
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +3 -2
- kiln_ai/adapters/model_adapters/test_structured_output.py +4 -6
- kiln_ai/adapters/parsers/base_parser.py +0 -3
- kiln_ai/adapters/parsers/parser_registry.py +5 -3
- kiln_ai/adapters/parsers/r1_parser.py +17 -2
- kiln_ai/adapters/parsers/request_formatters.py +40 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +2 -2
- kiln_ai/adapters/parsers/test_r1_parser.py +44 -1
- kiln_ai/adapters/parsers/test_request_formatters.py +76 -0
- kiln_ai/adapters/prompt_builders.py +14 -1
- kiln_ai/adapters/provider_tools.py +25 -1
- kiln_ai/adapters/repair/test_repair_task.py +3 -2
- kiln_ai/adapters/test_prompt_builders.py +24 -3
- kiln_ai/adapters/test_provider_tools.py +86 -1
- kiln_ai/datamodel/__init__.py +2 -0
- kiln_ai/datamodel/datamodel_enums.py +14 -0
- kiln_ai/datamodel/dataset_filters.py +69 -1
- kiln_ai/datamodel/dataset_split.py +4 -0
- kiln_ai/datamodel/eval.py +8 -0
- kiln_ai/datamodel/finetune.py +1 -0
- kiln_ai/datamodel/json_schema.py +24 -7
- kiln_ai/datamodel/prompt_id.py +1 -0
- kiln_ai/datamodel/task_output.py +10 -6
- kiln_ai/datamodel/task_run.py +68 -12
- kiln_ai/datamodel/test_basemodel.py +3 -7
- kiln_ai/datamodel/test_dataset_filters.py +82 -0
- kiln_ai/datamodel/test_dataset_split.py +2 -0
- kiln_ai/datamodel/test_example_models.py +158 -3
- kiln_ai/datamodel/test_json_schema.py +22 -3
- kiln_ai/datamodel/test_model_perf.py +3 -2
- kiln_ai/datamodel/test_models.py +50 -2
- kiln_ai/utils/async_job_runner.py +106 -0
- kiln_ai/utils/dataset_import.py +80 -18
- kiln_ai/utils/test_async_job_runner.py +199 -0
- kiln_ai/utils/test_dataset_import.py +242 -10
- {kiln_ai-0.14.0.dist-info → kiln_ai-0.16.0.dist-info}/METADATA +3 -2
- kiln_ai-0.16.0.dist-info/RECORD +108 -0
- kiln_ai/adapters/test_generate_docs.py +0 -69
- kiln_ai-0.14.0.dist-info/RECORD +0 -103
- {kiln_ai-0.14.0.dist-info → kiln_ai-0.16.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.14.0.dist-info → kiln_ai-0.16.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/base_eval.py
CHANGED

@@ -2,11 +2,13 @@ import json
 from abc import abstractmethod
 from typing import Dict
 
+import jsonschema
+
 from kiln_ai.adapters.adapter_registry import adapter_for_task
 from kiln_ai.adapters.ml_model_list import ModelProviderName
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
-from kiln_ai.datamodel.json_schema import validate_schema
+from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
 from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 
@@ -72,7 +74,10 @@ class BaseEval:
         run_output = await run_adapter.invoke(parsed_input)
 
         eval_output, intermediate_outputs = await self.run_eval(run_output)
-        validate_schema(eval_output, self.score_schema)
+
+        validate_schema_with_value_error(
+            eval_output, self.score_schema, "Eval output does not match score schema."
+        )
 
         return run_output, eval_output, intermediate_outputs
 
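Note: validate_schema_with_value_error is defined in kiln_ai/datamodel/json_schema.py (+24 -7), which this excerpt does not show. A minimal sketch of what such a wrapper plausibly looks like, assuming it delegates to the jsonschema package that base_eval.py now also imports; the parameter names are assumptions:

import jsonschema


def validate_schema_with_value_error(data: dict, schema: dict, error_prefix: str) -> None:
    # Hypothetical sketch: surface schema violations as ValueError so callers
    # can handle eval-output mismatches uniformly.
    try:
        jsonschema.validate(instance=data, schema=schema)
    except jsonschema.ValidationError as e:
        raise ValueError(f"{error_prefix} {e.message}") from e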
kiln_ai/adapters/eval/eval_runner.py
CHANGED

@@ -1,4 +1,3 @@
-import asyncio
 import logging
 from dataclasses import dataclass
 from typing import AsyncGenerator, Dict, List, Literal, Set
@@ -10,6 +9,7 @@ from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
 from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
 from kiln_ai.datamodel.task import TaskRunConfig
 from kiln_ai.datamodel.task_run import TaskRun
+from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress
 
 logger = logging.getLogger(__name__)
 
@@ -23,13 +23,6 @@ class EvalJob:
     task_run_config: TaskRunConfig | None = None
 
 
-@dataclass
-class EvalProgress:
-    complete: int | None = None
-    total: int | None = None
-    errors: int | None = None
-
-
 class EvalRunner:
     """
     Runs an eval. Async execution is supported to make it faster when using remote/fast model providers.
@@ -161,67 +154,15 @@ class EvalRunner:
             if task_run.id not in already_run[eval_config.id][run_config.id]
         ]
 
-    async def run(self, concurrency: int = 25) -> AsyncGenerator[EvalProgress, None]:
+    async def run(self, concurrency: int = 25) -> AsyncGenerator[Progress, None]:
         """
        Runs the configured eval run with parallel workers and yields progress updates.
         """
         jobs = self.collect_tasks()
 
-        complete = 0
-        total = len(jobs)
-        errors = 0
-
-        # Send initial status
-        yield EvalProgress(complete=complete, total=total, errors=errors)
-
-        worker_queue: asyncio.Queue[EvalJob] = asyncio.Queue()
-        for job in jobs:
-            worker_queue.put_nowait(job)
-
-        # simple status queue to return progress. True=success, False=error
-        status_queue: asyncio.Queue[bool] = asyncio.Queue()
-
-        workers = []
-        for i in range(concurrency):
-            task = asyncio.create_task(self.run_worker(worker_queue, status_queue))
-            workers.append(task)
-
-        # Send status updates until workers are done, and they are all sent
-        while not status_queue.empty() or not all(worker.done() for worker in workers):
-            try:
-                # Use timeout to prevent hanging if all workers complete
-                # between our while condition check and get()
-                success = await asyncio.wait_for(status_queue.get(), timeout=0.1)
-                if success:
-                    complete += 1
-                else:
-                    errors += 1
-
-                yield EvalProgress(complete=complete, total=total, errors=errors)
-            except asyncio.TimeoutError:
-                # Timeout is expected, just continue to recheck worker status
-                # Don't love this but beats sentinels for reliability
-                continue
-
-        # These are redundant, but keeping them will catch async errors
-        await asyncio.gather(*workers)
-        await worker_queue.join()
-
-    async def run_worker(
-        self, worker_queue: asyncio.Queue[EvalJob], status_queue: asyncio.Queue[bool]
-    ):
-        while True:
-            try:
-                job = worker_queue.get_nowait()
-            except asyncio.QueueEmpty:
-                # worker can end when the queue is empty
-                break
-            try:
-                success = await self.run_job(job)
-                await status_queue.put(success)
-            finally:
-                # Always mark the dequeued task as done, even on exceptions
-                worker_queue.task_done()
+        runner = AsyncJobRunner(concurrency=concurrency)
+        async for progress in runner.run(jobs, self.run_job):
+            yield progress
 
     async def run_job(self, job: EvalJob) -> bool:
         try:
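Note: the worker-pool plumbing deleted above now lives in the new kiln_ai/utils/async_job_runner.py (+106 lines), which this excerpt does not show. Reassembling the deleted code around the new call site gives a sketch of what the extracted runner plausibly looks like; only the AsyncJobRunner and Progress names and the run(jobs, run_job) call are confirmed by the diff, the rest is an assumption:

import asyncio
from dataclasses import dataclass
from typing import AsyncGenerator, Awaitable, Callable, List, TypeVar

T = TypeVar("T")


@dataclass
class Progress:
    complete: int | None = None
    total: int | None = None
    errors: int | None = None


class AsyncJobRunner:
    def __init__(self, concurrency: int = 25):
        self.concurrency = concurrency

    async def run(
        self, jobs: List[T], run_job: Callable[[T], Awaitable[bool]]
    ) -> AsyncGenerator[Progress, None]:
        complete, errors, total = 0, 0, len(jobs)
        # Send initial status before any work starts
        yield Progress(complete=complete, total=total, errors=errors)

        worker_queue: asyncio.Queue[T] = asyncio.Queue()
        for job in jobs:
            worker_queue.put_nowait(job)
        # True=success, False=error, mirroring the removed EvalRunner code
        status_queue: asyncio.Queue[bool] = asyncio.Queue()

        async def worker() -> None:
            while True:
                try:
                    job = worker_queue.get_nowait()
                except asyncio.QueueEmpty:
                    break  # queue drained, this worker can exit
                try:
                    await status_queue.put(await run_job(job))
                finally:
                    worker_queue.task_done()

        workers = [asyncio.create_task(worker()) for _ in range(self.concurrency)]
        # Drain status updates until all workers finish
        while not status_queue.empty() or not all(w.done() for w in workers):
            try:
                success = await asyncio.wait_for(status_queue.get(), timeout=0.1)
            except asyncio.TimeoutError:
                continue  # timeout is expected; recheck worker status
            if success:
                complete += 1
            else:
                errors += 1
            yield Progress(complete=complete, total=total, errors=errors)
        await asyncio.gather(*workers)

Centralizing this removes the asyncio plumbing from eval_runner.py, and the new module ships with its own tests (test_async_job_runner.py, +199 lines).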
kiln_ai/adapters/eval/g_eval.py
CHANGED

@@ -43,9 +43,9 @@ class GEvalTask(Task, parent_of={}):
 
         # Build the COT eval instructions
         cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
-        steps = eval_config.properties.get("eval_steps", None)
-        if not steps:
-            raise ValueError("eval_steps must be a list")
+        steps = eval_config.properties.get("eval_steps", [])
+        if not isinstance(steps, list):
+            raise ValueError("eval_steps must be a list.")
         for i, step in enumerate(steps):
             cot_instructions += f"{i + 1}) {step}\n"
 
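For reference, the loop above renders the steps into numbered chain-of-thought instructions:

steps = ["Assess factual accuracy", "Assess tone"]
cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
for i, step in enumerate(steps):
    cot_instructions += f"{i + 1}) {step}\n"
# cot_instructions now ends with:
# 1) Assess factual accuracy
# 2) Assess tone

With the new empty-list default, a config without eval_steps yields an empty step list rather than a non-list value.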
kiln_ai/adapters/fine_tune/base_finetune.py
CHANGED

@@ -166,9 +166,12 @@ class BaseFinetuneAdapter(ABC):
 
             # Strict type checking for numeric types
             if expected_type is float and not isinstance(value, float):
-                raise ValueError(
-                    f"Parameter {parameter.name} must be a float, got {type(value)}"
-                )
+                if isinstance(value, int):
+                    value = float(value)
+                else:
+                    raise ValueError(
+                        f"Parameter {parameter.name} must be a float, got {type(value)}"
+                    )
             elif expected_type is int and not isinstance(value, int):
                 raise ValueError(
                     f"Parameter {parameter.name} must be an integer, got {type(value)}"
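The change above loosens strict float checking in validate_parameters: an int supplied for a float-typed parameter is now widened instead of rejected. In isolation:

# The new coercion rule, reduced to its core:
value = 1  # int supplied where a float is expected
if not isinstance(value, float):
    if isinstance(value, int):
        value = float(value)  # accepted and widened to 1.0
    else:
        raise ValueError("must be a float")
assert value == 1.0

The matching test appears at the end of this diff (test_base_finetune.py).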
kiln_ai/adapters/fine_tune/dataset_formatter.py
CHANGED

@@ -8,6 +8,7 @@ from uuid import uuid4
 
 from kiln_ai.adapters.model_adapters.base_adapter import COT_FINAL_ANSWER_PROMPT
 from kiln_ai.datamodel import DatasetSplit, FinetuneDataStrategy, TaskRun
+from kiln_ai.datamodel.datamodel_enums import THINKING_DATA_STRATEGIES
 
 
 class DatasetFormat(str, Enum):
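THINKING_DATA_STRATEGIES is defined in the new kiln_ai/datamodel/datamodel_enums.py (+14 lines), which this excerpt does not show. Judging from its use below, it presumably groups the strategies that carry reasoning traces, along the lines of:

# Assumed definition; only the two member names are confirmed by this diff.
from kiln_ai.datamodel import FinetuneDataStrategy

THINKING_DATA_STRATEGIES = [
    FinetuneDataStrategy.final_and_intermediate,
    FinetuneDataStrategy.final_and_intermediate_r1_compatible,
]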
@@ -30,8 +31,8 @@ class DatasetFormat(str, Enum):
         "huggingface_chat_template_toolcall_jsonl"
     )
 
-    """Vertex Gemini 1.5 format (flash and pro)"""
-    VERTEX_GEMINI_1_5 = "vertex_gemini_1_5"
+    """Vertex Gemini format"""
+    VERTEX_GEMINI = "vertex_gemini"
 
 
 @dataclass
@@ -43,8 +44,12 @@ class ModelTrainingData:
     thinking_instructions: str | None = None
     thinking: str | None = None
     thinking_final_answer_prompt: str | None = None
+    thinking_r1_style: bool = False
 
     def supports_cot(self) -> bool:
+        if self.thinking_r1_style:
+            raise ValueError("R1 style does not support COT")
+
         return (
             self.thinking_instructions is not None
             and self.thinking is not None
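The guard makes misuse loud: R1-style training data must never take the chain-of-thought branch, so supports_cot() refuses to answer for it rather than quietly returning False. Illustration (constructor arguments beyond the fields visible in this diff are assumed):

data = ModelTrainingData(
    input="What is 2+2?",
    system_message="You are a calculator.",
    final_output="4",
    thinking="The user wants simple addition.",
    thinking_r1_style=True,
)
data.supports_cot()  # raises ValueError: R1 style does not support COT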
@@ -64,7 +69,7 @@ class FormatGenerator(Protocol):
 def build_training_data(
     task_run: TaskRun,
     system_message: str,
-    include_cot: bool,
+    data_strategy: FinetuneDataStrategy,
     thinking_instructions: str | None = None,
 ) -> ModelTrainingData:
     """
@@ -80,27 +85,41 @@ def build_training_data(
 
     thinking = None
     thinking_final_answer_prompt = None
+    thinking_r1_style = False
     parent_task = task_run.parent_task()
 
-    if include_cot:
-        if not parent_task:
-            raise ValueError(
-                "TaskRuns for training required a parent Task for building a chain of thought prompts. Train without COT, or save this TaskRun to a parent Task."
-            )
-
+    if data_strategy in THINKING_DATA_STRATEGIES:
         # Prefer reasoning to cot if both are present
-
-
-
-
+        thinking = task_run.thinking_training_data()
+
+        if data_strategy == FinetuneDataStrategy.final_and_intermediate_r1_compatible:
+            if not task_run.has_thinking_training_data() or not thinking:
+                raise ValueError(
+                    "Thinking data is required when fine-tuning thinking models (R1, QwQ, etc). Please ensure your fine-tuning dataset contains reasoning or chain of thought output for every entry."
+                )
+            if thinking_instructions:
+                raise ValueError(
+                    "Thinking instructions are not supported when fine-tuning thinking models (R1, QwQ, etc). Please remove the thinking instructions."
+                )
+            thinking_r1_style = True
+        elif (
+            data_strategy == FinetuneDataStrategy.final_and_intermediate
+            and task_run.has_thinking_training_data()
+        ):
+            if not parent_task:
+                raise ValueError(
+                    "TaskRuns for training required a parent Task for building a chain of thought prompts. Train without COT, or save this TaskRun to a parent Task."
+                )
 
-        thinking_final_answer_prompt = COT_FINAL_ANSWER_PROMPT
+            thinking_final_answer_prompt = COT_FINAL_ANSWER_PROMPT
 
-        # Always use the passed thinking instructions, but check they are present for COT
-        if not thinking_instructions:
-            raise ValueError(
-                "Thinking instructions are required when include_cot is True"
-            )
+            # Always use the passed thinking instructions, but check they are present for COT
+            if not thinking_instructions:
+                raise ValueError(
+                    "Thinking instructions are required when data_strategy is final_and_intermediate"
+                )
+        else:
+            raise ValueError(f"Unsupported data strategy: {data_strategy}")
 
     return ModelTrainingData(
         input=task_run.input,
@@ -109,9 +128,19 @@ def build_training_data(
         thinking=thinking,
         thinking_instructions=thinking_instructions,
         thinking_final_answer_prompt=thinking_final_answer_prompt,
+        thinking_r1_style=thinking_r1_style,
     )
 
 
+def serialize_r1_style_message(thinking: str | None, final_output: str):
+    if thinking is None or len(thinking.strip()) == 0:
+        raise ValueError(
+            "Thinking data is required when fine-tuning thinking models (R1, QwQ, etc). Please ensure your fine-tuning dataset contains reasoning or chain of thought output for every entry."
+        )
+
+    return f"<think>\n{thinking}\n</think>\n\n{final_output}"
+
+
 def generate_chat_message_response(
     training_data: ModelTrainingData,
 ) -> Dict[str, Any]:
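The new serializer inlines the reasoning into a single assistant message using R1-style think tags, and refuses entries without reasoning:

serialize_r1_style_message(thinking="Check the math.", final_output="4")
# -> '<think>\nCheck the math.\n</think>\n\n4'

serialize_r1_style_message(thinking="  ", final_output="4")
# -> raises ValueError (thinking data is required for every entry)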
@@ -122,7 +151,21 @@ def generate_chat_message_response(
         {"role": "user", "content": training_data.input},
     ]
 
-    if training_data.supports_cot():
+    if training_data.thinking_r1_style:
+        messages.extend(
+            [
+                {
+                    "role": "assistant",
+                    "content": serialize_r1_style_message(
+                        thinking=training_data.thinking,
+                        final_output=training_data.final_output,
+                    ),
+                }
+            ]
+        )
+
+        return {"messages": messages}
+    elif training_data.supports_cot():
         messages.extend(
             [
                 {"role": "user", "content": training_data.thinking_instructions},
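For an R1-style record the generator now emits the reasoning and the final answer as one assistant turn. Assuming the messages list opens with the system message, as the other generators in this file do, a serialized record looks like:

{
    "messages": [
        {"role": "system", "content": "You are a calculator."},
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "content": "<think>\nThe user wants simple addition.\n</think>\n\n4",
        },
    ]
}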
@@ -157,7 +200,21 @@ def generate_json_schema_message(
         {"role": "user", "content": training_data.input},
     ]
 
-    if training_data.supports_cot():
+    if training_data.thinking_r1_style:
+        messages.extend(
+            [
+                {
+                    "role": "assistant",
+                    "content": serialize_r1_style_message(
+                        thinking=training_data.thinking,
+                        final_output=training_data.final_output,
+                    ),
+                }
+            ]
+        )
+
+        return {"messages": messages}
+    elif training_data.supports_cot():
         messages.extend(
             [
                 {"role": "user", "content": training_data.thinking_instructions},
@@ -188,7 +245,11 @@ def generate_chat_message_toolcall(
         {"role": "user", "content": training_data.input},
     ]
 
-    if training_data.supports_cot():
+    if training_data.thinking_r1_style:
+        raise ValueError(
+            "R1 style thinking is not supported for tool call downloads. Please use a different training strategy."
+        )
+    elif training_data.supports_cot():
         messages.extend(
             [
                 {"role": "user", "content": training_data.thinking_instructions},
@@ -231,12 +292,29 @@ def generate_huggingface_chat_template(
         {"role": "user", "content": training_data.input},
     ]
 
+    if training_data.thinking_r1_style:
+        conversations.extend(
+            [
+                {
+                    "role": "assistant",
+                    "content": serialize_r1_style_message(
+                        thinking=training_data.thinking,
+                        final_output=training_data.final_output,
+                    ),
+                }
+            ]
+        )
+        return {"conversations": conversations}
+
     if training_data.supports_cot():
         conversations.extend(
             [
                 {"role": "user", "content": training_data.thinking_instructions},
                 {"role": "assistant", "content": training_data.thinking},
-                {"role": "user", "content": training_data.thinking_final_answer_prompt},
+                {
+                    "role": "user",
+                    "content": training_data.thinking_final_answer_prompt,
+                },
             ]
         )
 
@@ -260,12 +338,19 @@ def generate_huggingface_chat_template_toolcall(
         {"role": "user", "content": training_data.input},
     ]
 
-    if training_data.supports_cot():
+    if training_data.thinking_r1_style:
+        raise ValueError(
+            "R1 style thinking is not supported for tool call downloads. Please use a different training strategy."
+        )
+    elif training_data.supports_cot():
         conversations.extend(
             [
                 {"role": "user", "content": training_data.thinking_instructions},
                 {"role": "assistant", "content": training_data.thinking},
-                {"role": "user", "content": training_data.thinking_final_answer_prompt},
+                {
+                    "role": "user",
+                    "content": training_data.thinking_final_answer_prompt,
+                },
             ]
         )
 
@@ -288,12 +373,20 @@ def generate_huggingface_chat_template_toolcall(
     return {"conversations": conversations}
 
 
-def generate_vertex_gemini_1_5(
+def generate_vertex_gemini(
     training_data: ModelTrainingData,
 ) -> Dict[str, Any]:
     """Generate Vertex Gemini 1.5 format (flash and pro)"""
     # See https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-prepare
 
+    system_instruction = {
+        "role": "system",
+        "parts": [
+            {
+                "text": training_data.system_message,
+            }
+        ],
+    }
     contents = [
         {
             "role": "user",
@@ -305,7 +398,11 @@ def generate_vertex_gemini_1_5(
         }
     ]
 
-    if training_data.supports_cot():
+    if training_data.thinking_r1_style:
+        raise ValueError(
+            "R1 style thinking is not supported for Vertex Gemini. Please use a different training strategy."
+        )
+    elif training_data.supports_cot():
         contents.extend(
             [
                 {
@@ -328,14 +425,7 @@ def generate_vertex_gemini_1_5(
     )
 
     return {
-        "systemInstruction": {
-            "role": "system",
-            "parts": [
-                {
-                    "text": training_data.system_message,
-                }
-            ],
-        },
+        "systemInstruction": system_instruction,
         "contents": contents,
     }
 
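Taken together, the two Vertex hunks hoist the system instruction into a shared variable and emit one JSON record per example. An illustrative record; the user/model parts structure follows the Vertex supervised-tuning documentation linked in the code and is not fully visible in this diff:

{
    "systemInstruction": {
        "role": "system",
        "parts": [{"text": "You are a calculator."}],
    },
    "contents": [
        {"role": "user", "parts": [{"text": "What is 2+2?"}]},
        {"role": "model", "parts": [{"text": "4"}]},
    ],
}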
@@ -346,7 +436,7 @@ FORMAT_GENERATORS: Dict[DatasetFormat, FormatGenerator] = {
     DatasetFormat.OPENAI_CHAT_TOOLCALL_JSONL: generate_chat_message_toolcall,
     DatasetFormat.HUGGINGFACE_CHAT_TEMPLATE_JSONL: generate_huggingface_chat_template,
     DatasetFormat.HUGGINGFACE_CHAT_TEMPLATE_TOOLCALL_JSONL: generate_huggingface_chat_template_toolcall,
-    DatasetFormat.VERTEX_GEMINI_1_5: generate_vertex_gemini_1_5,
+    DatasetFormat.VERTEX_GEMINI: generate_vertex_gemini,
 }
 
 
@@ -397,7 +487,7 @@ class DatasetFormatter:
 
         generator = FORMAT_GENERATORS[format_type]
 
-        include_cot = data_strategy == FinetuneDataStrategy.final_and_intermediate
+        include_cot = data_strategy in THINKING_DATA_STRATEGIES
 
         # Write to a temp file if no path is provided
         output_path = (
@@ -421,7 +511,7 @@ class DatasetFormatter:
             training_data = build_training_data(
                 task_run=task_run,
                 system_message=self.system_message,
-                include_cot=include_cot,
+                data_strategy=data_strategy,
                 thinking_instructions=self.thinking_instructions,
             )
             example = generator(training_data)
kiln_ai/adapters/fine_tune/finetune_registry.py
CHANGED

@@ -4,10 +4,12 @@ from kiln_ai.adapters.fine_tune.base_finetune import BaseFinetuneAdapter
 from kiln_ai.adapters.fine_tune.fireworks_finetune import FireworksFinetune
 from kiln_ai.adapters.fine_tune.openai_finetune import OpenAIFinetune
 from kiln_ai.adapters.fine_tune.together_finetune import TogetherFinetune
+from kiln_ai.adapters.fine_tune.vertex_finetune import VertexFinetune
 from kiln_ai.adapters.ml_model_list import ModelProviderName
 
 finetune_registry: dict[ModelProviderName, Type[BaseFinetuneAdapter]] = {
     ModelProviderName.openai: OpenAIFinetune,
     ModelProviderName.fireworks_ai: FireworksFinetune,
     ModelProviderName.together_ai: TogetherFinetune,
+    ModelProviderName.vertex: VertexFinetune,
 }
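With the Vertex adapter registered (vertex_finetune.py is new in this release, +217 lines), provider lookup stays a plain dict access:

from kiln_ai.adapters.fine_tune.finetune_registry import finetune_registry
from kiln_ai.adapters.ml_model_list import ModelProviderName

adapter_class = finetune_registry[ModelProviderName.vertex]  # VertexFinetune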
kiln_ai/adapters/fine_tune/fireworks_finetune.py
CHANGED

@@ -198,7 +198,8 @@ class FireworksFinetune(BaseFinetuneAdapter):
         if not api_key or not account_id:
             raise ValueError("Fireworks API key or account ID not set")
         url = f"https://api.fireworks.ai/v1/accounts/{account_id}/datasets"
-        dataset_id = str(uuid4())
+        # First char can't be a digit: https://discord.com/channels/1137072072808472616/1363214412395184350/1363214412395184350
+        dataset_id = "kiln-" + str(uuid4())
         payload = {
             "datasetId": dataset_id,
             "dataset": {
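The prefix guarantees the generated dataset ID never starts with a digit, per the Fireworks constraint referenced in the comment:

from uuid import uuid4

dataset_id = "kiln-" + str(uuid4())
# e.g. "kiln-550e8400-e29b-41d4-a716-446655440000"
assert not dataset_id[0].isdigit()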
kiln_ai/adapters/fine_tune/test_base_finetune.py
CHANGED

@@ -98,6 +98,13 @@ def test_validate_parameters_valid():
     }
     MockFinetune.validate_parameters(valid_params)  # Should not raise
 
+    # Test valid parameters (float as int)
+    valid_params = {
+        "learning_rate": 1,
+        "epochs": 10,
+    }
+    MockFinetune.validate_parameters(valid_params)  # Should not raise
+
 
 def test_validate_parameters_missing_required():
     # Test missing required parameter