kiln-ai 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiln_ai/adapters/__init__.py +2 -0
- kiln_ai/adapters/adapter_registry.py +22 -44
- kiln_ai/adapters/chat/__init__.py +8 -0
- kiln_ai/adapters/chat/chat_formatter.py +233 -0
- kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
- kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
- kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
- kiln_ai/adapters/data_gen/test_data_gen_task.py +330 -40
- kiln_ai/adapters/eval/base_eval.py +7 -6
- kiln_ai/adapters/eval/eval_runner.py +9 -2
- kiln_ai/adapters/eval/g_eval.py +40 -17
- kiln_ai/adapters/eval/test_base_eval.py +174 -17
- kiln_ai/adapters/eval/test_eval_runner.py +3 -0
- kiln_ai/adapters/eval/test_g_eval.py +116 -5
- kiln_ai/adapters/fine_tune/base_finetune.py +3 -8
- kiln_ai/adapters/fine_tune/dataset_formatter.py +135 -273
- kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +287 -353
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +6 -11
- kiln_ai/adapters/fine_tune/together_finetune.py +13 -2
- kiln_ai/adapters/ml_model_list.py +370 -84
- kiln_ai/adapters/model_adapters/base_adapter.py +73 -26
- kiln_ai/adapters/model_adapters/litellm_adapter.py +88 -97
- kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
- kiln_ai/adapters/model_adapters/test_base_adapter.py +235 -61
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +104 -21
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -0
- kiln_ai/adapters/model_adapters/test_structured_output.py +44 -12
- kiln_ai/adapters/parsers/parser_registry.py +0 -2
- kiln_ai/adapters/parsers/r1_parser.py +0 -1
- kiln_ai/adapters/prompt_builders.py +0 -16
- kiln_ai/adapters/provider_tools.py +27 -9
- kiln_ai/adapters/remote_config.py +66 -0
- kiln_ai/adapters/repair/repair_task.py +1 -6
- kiln_ai/adapters/repair/test_repair_task.py +24 -3
- kiln_ai/adapters/test_adapter_registry.py +88 -28
- kiln_ai/adapters/test_ml_model_list.py +176 -0
- kiln_ai/adapters/test_prompt_adaptors.py +17 -7
- kiln_ai/adapters/test_prompt_builders.py +3 -16
- kiln_ai/adapters/test_provider_tools.py +69 -20
- kiln_ai/adapters/test_remote_config.py +100 -0
- kiln_ai/datamodel/__init__.py +0 -2
- kiln_ai/datamodel/datamodel_enums.py +38 -13
- kiln_ai/datamodel/eval.py +32 -0
- kiln_ai/datamodel/finetune.py +12 -8
- kiln_ai/datamodel/task.py +68 -7
- kiln_ai/datamodel/task_output.py +0 -2
- kiln_ai/datamodel/task_run.py +0 -2
- kiln_ai/datamodel/test_basemodel.py +2 -1
- kiln_ai/datamodel/test_dataset_split.py +0 -8
- kiln_ai/datamodel/test_eval_model.py +146 -4
- kiln_ai/datamodel/test_models.py +33 -10
- kiln_ai/datamodel/test_task.py +168 -2
- kiln_ai/utils/config.py +3 -2
- kiln_ai/utils/dataset_import.py +1 -1
- kiln_ai/utils/logging.py +166 -0
- kiln_ai/utils/test_config.py +23 -0
- kiln_ai/utils/test_dataset_import.py +30 -0
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
- kiln_ai-0.18.0.dist-info/RECORD +115 -0
- kiln_ai-0.16.0.dist-info/RECORD +0 -108
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/model_adapters/base_adapter.py

@@ -1,11 +1,17 @@
 import json
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
-from typing import Dict,
+from typing import Dict, Tuple
 
-import
-
-
+from kiln_ai.adapters.chat.chat_formatter import (
+    ChatFormatter,
+    get_chat_formatter,
+)
+from kiln_ai.adapters.ml_model_list import (
+    KilnModelProvider,
+    StructuredOutputMode,
+    default_structured_output_mode_for_model_provider,
+)
 from kiln_ai.adapters.parsers.json_parser import parse_json_string
 from kiln_ai.adapters.parsers.parser_registry import model_parser_from_id
 from kiln_ai.adapters.parsers.request_formatters import request_formatter_from_id
@@ -20,6 +26,7 @@ from kiln_ai.datamodel import (
     TaskRun,
     Usage,
 )
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
 from kiln_ai.datamodel.task import RunConfig
 from kiln_ai.utils.config import Config
@@ -38,9 +45,6 @@ class AdapterConfig:
     default_tags: list[str] | None = None
 
 
-COT_FINAL_ANSWER_PROMPT = "Considering the above, return a final result."
-
-
 class BaseAdapter(metaclass=ABCMeta):
     """Base class for AI model adapters that handle task execution.
 
@@ -61,6 +65,7 @@ class BaseAdapter(metaclass=ABCMeta):
         config: AdapterConfig | None = None,
     ):
         self.run_config = run_config
+        self.update_run_config_unknown_structured_output_mode()
         self.prompt_builder = prompt_builder_from_id(
             run_config.prompt_id, run_config.task
         )
@@ -188,10 +193,10 @@ class BaseAdapter(metaclass=ABCMeta):
 
     def build_prompt(self) -> str:
         # The prompt builder needs to know if we want to inject formatting instructions
-
+        structured_output_mode = self.run_config.structured_output_mode
         add_json_instructions = self.has_structured_output() and (
-
-            or
+            structured_output_mode == StructuredOutputMode.json_instructions
+            or structured_output_mode
             == StructuredOutputMode.json_instruction_and_object
         )
 
@@ -199,26 +204,51 @@ class BaseAdapter(metaclass=ABCMeta):
             include_json_instructions=add_json_instructions
         )
 
-    def
-
-
-        # Determine the run strategy for COT prompting. 3 options:
-        # 1. "Thinking" LLM designed to output thinking in a structured format plus a COT prompt: we make 1 call to the LLM, which outputs thinking in a structured format. We include the thinking instuctions as a message.
-        # 2. Normal LLM with COT prompt: we make 2 calls to the LLM - one for thinking and one for the final response. This helps us use the LLM's structured output modes (json_schema, tools, etc), which can't be used in a single call. It also separates the thinking from the final response.
-        # 3. Non chain of thought: we make 1 call to the LLM, with no COT prompt.
+    def build_chat_formatter(self, input: Dict | str) -> ChatFormatter:
+        # Determine the chat strategy to use based on the prompt the user selected, the model's capabilities, and if the model was finetuned with a specific chat strategy.
+
         cot_prompt = self.prompt_builder.chain_of_thought_prompt()
-
+        system_message = self.build_prompt()
+
+        # If no COT prompt, use the single turn strategy. Even when a tuned strategy is set, as the tuned strategy is either already single turn, or won't work without a COT prompt.
+        if not cot_prompt:
+            return get_chat_formatter(
+                strategy=ChatStrategy.single_turn,
+                system_message=system_message,
+                user_input=input,
+            )
 
-
-
+        # Some models like finetunes are trained with a specific chat strategy. Use that.
+        # However, don't use that if it is single turn. The user selected a COT prompt, and we give explicit prompt selection priority over the tuned strategy.
+        tuned_chat_strategy = self.model_provider().tuned_chat_strategy
+        if tuned_chat_strategy and tuned_chat_strategy != ChatStrategy.single_turn:
+            return get_chat_formatter(
+                strategy=tuned_chat_strategy,
+                system_message=system_message,
+                user_input=input,
+                thinking_instructions=cot_prompt,
+            )
+
+        # Pick the best chat strategy for the model given it has a cot prompt.
+        reasoning_capable = self.model_provider().reasoning_capable
+        if reasoning_capable:
+            # "Thinking" LLM designed to output thinking in a structured format. We'll use it's native format.
             # A simple message with the COT prompt appended to the message list is sufficient
-            return
-
-
-
-
+            return get_chat_formatter(
+                strategy=ChatStrategy.single_turn_r1_thinking,
+                system_message=system_message,
+                user_input=input,
+                thinking_instructions=cot_prompt,
+            )
         else:
-
+            # Unstructured output with COT
+            # Two calls to separate the thinking from the final response
+            return get_chat_formatter(
+                strategy=ChatStrategy.two_message_cot,
+                system_message=system_message,
+                user_input=input,
+                thinking_instructions=cot_prompt,
+            )
 
     # create a run and task output
     def generate_run(
@@ -272,5 +302,22 @@ class BaseAdapter(metaclass=ABCMeta):
         props["model_name"] = self.run_config.model_name
         props["model_provider"] = self.run_config.model_provider_name
         props["prompt_id"] = self.run_config.prompt_id
+        props["structured_output_mode"] = self.run_config.structured_output_mode
+        props["temperature"] = self.run_config.temperature
+        props["top_p"] = self.run_config.top_p
 
         return props
+
+    def update_run_config_unknown_structured_output_mode(self) -> None:
+        structured_output_mode = self.run_config.structured_output_mode
+
+        # Old datamodels didn't save the structured output mode. Some clients (tests, end users) might not set it.
+        # Look up our recommended mode from ml_model_list if we have one
+        if structured_output_mode == StructuredOutputMode.unknown:
+            new_run_config = self.run_config.model_copy(deep=True)
+            structured_output_mode = default_structured_output_mode_for_model_provider(
+                self.run_config.model_name,
+                self.run_config.model_provider_name,
+            )
+            new_run_config.structured_output_mode = structured_output_mode
+            self.run_config = new_run_config
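The `build_chat_formatter` method above replaces the old chain-of-thought run-strategy comments with an explicit `ChatStrategy` selection. As a reading aid, here is a condensed sketch of that decision, assuming only the enum values and provider flags shown in the diff; `pick_chat_strategy` is an illustrative name, not part of the package:

```python
from kiln_ai.datamodel.datamodel_enums import ChatStrategy


def pick_chat_strategy(
    cot_prompt: str | None,
    tuned_chat_strategy: ChatStrategy | None,
    reasoning_capable: bool,
) -> ChatStrategy:
    # No chain-of-thought prompt: plain single-turn, even if a tuned strategy exists.
    if not cot_prompt:
        return ChatStrategy.single_turn
    # A finetune's tuned strategy wins, unless it is single turn (the explicit COT prompt takes priority).
    if tuned_chat_strategy and tuned_chat_strategy != ChatStrategy.single_turn:
        return tuned_chat_strategy
    # Reasoning-capable models emit thinking natively, so one R1-style call is enough.
    if reasoning_capable:
        return ChatStrategy.single_turn_r1_thinking
    # Otherwise split thinking and the final answer across two calls.
    return ChatStrategy.two_message_cot
```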
kiln_ai/adapters/model_adapters/litellm_adapter.py

@@ -12,15 +12,13 @@ from kiln_ai.adapters.ml_model_list import (
     StructuredOutputMode,
 )
 from kiln_ai.adapters.model_adapters.base_adapter import (
-    COT_FINAL_ANSWER_PROMPT,
     AdapterConfig,
     BaseAdapter,
     RunOutput,
     Usage,
 )
 from kiln_ai.adapters.model_adapters.litellm_config import LiteLlmConfig
-from kiln_ai.datamodel import
-from kiln_ai.datamodel.task import RunConfig
+from kiln_ai.datamodel.task import run_config_from_run_config_properties
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 
 logger = logging.getLogger(__name__)
@@ -31,7 +29,6 @@ class LiteLlmAdapter(BaseAdapter):
         self,
         config: LiteLlmConfig,
         kiln_task: datamodel.Task,
-        prompt_id: PromptId | None = None,
         base_adapter_config: AdapterConfig | None = None,
     ):
         self.config = config
@@ -40,11 +37,10 @@
         self._headers = config.default_headers
         self._litellm_model_id: str | None = None
 
-
+        # Create a RunConfig, adding the task to the RunConfigProperties
+        run_config = run_config_from_run_config_properties(
             task=kiln_task,
-
-            model_provider_name=config.provider_name,
-            prompt_id=prompt_id or PromptGenerators.SIMPLE,
+            run_config_properties=config.run_config_properties,
         )
 
         super().__init__(
@@ -57,79 +53,69 @@
         if not provider.model_id:
             raise ValueError("Model ID is required for OpenAI compatible models")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # First call for chain of thought
-        # No response format as this request is for "thinking" in plain text
-        # No logprobs as only needed for final answer
+        chat_formatter = self.build_chat_formatter(input)
+
+        prior_output = None
+        prior_message = None
+        response = None
+        turns = 0
+        while True:
+            turns += 1
+            if turns > 10:
+                raise RuntimeError(
+                    "Too many turns. Stopping iteration to avoid using too many tokens."
+                )
+
+            turn = chat_formatter.next_turn(prior_output)
+            if turn is None:
+                break
+
+            skip_response_format = not turn.final_call
+            all_messages = chat_formatter.message_dicts()
             completion_kwargs = await self.build_completion_kwargs(
-                provider,
+                provider,
+                all_messages,
+                self.base_adapter_config.top_logprobs if turn.final_call else None,
+                skip_response_format,
             )
-
+            response = await litellm.acompletion(**completion_kwargs)
             if (
-                not isinstance(
-                or not
-                or len(
-                or not isinstance(
+                not isinstance(response, ModelResponse)
+                or not response.choices
+                or len(response.choices) == 0
+                or not isinstance(response.choices[0], Choices)
             ):
                 raise RuntimeError(
-                    f"Expected ModelResponse with Choices, got {type(
+                    f"Expected ModelResponse with Choices, got {type(response)}."
                 )
-
-
-            intermediate_outputs["chain_of_thought"] = cot_content
-
-            messages.extend(
-                [
-                    {"role": "assistant", "content": cot_content or ""},
-                    {"role": "user", "content": COT_FINAL_ANSWER_PROMPT},
-                ]
-            )
+            prior_message = response.choices[0].message
+            prior_output = prior_message.content
 
-
-
-
-
-
+            # Fallback: Use args of first tool call to task_response if it exists
+            if (
+                not prior_output
+                and hasattr(prior_message, "tool_calls")
+                and prior_message.tool_calls
+            ):
+                tool_call = next(
+                    (
+                        tool_call
+                        for tool_call in prior_message.tool_calls
+                        if tool_call.function.name == "task_response"
+                    ),
+                    None,
+                )
+                if tool_call:
+                    prior_output = tool_call.function.arguments
 
-
-
+            if not prior_output:
+                raise RuntimeError("No output returned from model")
 
-
-
-        if hasattr(response, "error") and response.__getattribute__("error"):
-            raise RuntimeError(
-                f"LLM API returned an error: {response.__getattribute__('error')}"
-            )
+        if response is None or prior_message is None:
+            raise RuntimeError("No response returned from model")
 
-
-            not response.choices
-            or len(response.choices) == 0
-            or not isinstance(response.choices[0], Choices)
-        ):
-            raise RuntimeError(
-                "No message content returned in the response from LLM API"
-            )
+        intermediate_outputs = chat_formatter.intermediate_outputs()
 
-        message = response.choices[0].message
         logprobs = (
             response.choices[0].logprobs
             if hasattr(response.choices[0], "logprobs")
@@ -143,31 +129,15 @@
 
         # Save reasoning if it exists and was parsed by LiteLLM (or openrouter, or anyone upstream)
         if (
-
-            and
-            and
+            prior_message is not None
+            and hasattr(prior_message, "reasoning_content")
+            and prior_message.reasoning_content
+            and len(prior_message.reasoning_content.strip()) > 0
         ):
-            intermediate_outputs["reasoning"] =
+            intermediate_outputs["reasoning"] = prior_message.reasoning_content.strip()
 
         # the string content of the response
-        response_content =
-
-        # Fallback: Use args of first tool call to task_response if it exists
-        if (
-            not response_content
-            and hasattr(message, "tool_calls")
-            and message.tool_calls
-        ):
-            tool_call = next(
-                (
-                    tool_call
-                    for tool_call in message.tool_calls
-                    if tool_call.function.name == "task_response"
-                ),
-                None,
-            )
-            if tool_call:
-                response_content = tool_call.function.arguments
+        response_content = prior_output
 
         if not isinstance(response_content, str):
             raise RuntimeError(f"response is not a string: {response_content}")
@@ -186,8 +156,9 @@
         if not self.has_structured_output():
             return {}
 
-
-
+        structured_output_mode = self.run_config.structured_output_mode
+
+        match structured_output_mode:
             case StructuredOutputMode.json_mode:
                 return {"response_format": {"type": "json_object"}}
             case StructuredOutputMode.json_schema:
@@ -206,16 +177,20 @@
                 # We set response_format to json_object and also set json instructions in the prompt
                 return {"response_format": {"type": "json_object"}}
             case StructuredOutputMode.default:
-
+                provider_name = self.run_config.model_provider_name
+                if provider_name == ModelProviderName.ollama:
                     # Ollama added json_schema to all models: https://ollama.com/blog/structured-outputs
                     return self.json_schema_response_format()
                 else:
                     # Default to function calling -- it's older than the other modes. Higher compatibility.
                     # Strict isn't widely supported yet, so we don't use it by default unless it's OpenAI.
-                    strict =
+                    strict = provider_name == ModelProviderName.openai
                     return self.tool_call_params(strict=strict)
+            case StructuredOutputMode.unknown:
+                # See above, but this case should never happen.
+                raise ValueError("Structured output mode is unknown.")
             case _:
-                raise_exhaustive_enum_error(
+                raise_exhaustive_enum_error(structured_output_mode)
 
     def json_schema_response_format(self) -> dict[str, Any]:
         output_schema = self.task().output_schema()
@@ -276,6 +251,10 @@
             "exclude": False,
         }
 
+        if provider.name == ModelProviderName.openrouter:
+            # Ask OpenRouter to include usage in the response (cost)
+            extra_body["usage"] = {"include": True}
+
         if provider.anthropic_extended_thinking:
             extra_body["thinking"] = {"type": "enabled", "budget_tokens": 4000}
 
@@ -387,6 +366,13 @@
             "messages": messages,
             "api_base": self._api_base,
             "headers": self._headers,
+            "temperature": self.run_config.temperature,
+            "top_p": self.run_config.top_p,
+            # This drops params that are not supported by the model. Only openai params like top_p, temperature -- not litellm params like model, etc.
+            # Not all models and providers support all openai params (for example, o3 doesn't support top_p)
+            # Better to ignore them than to fail the model call.
+            # https://docs.litellm.ai/docs/completion/input
+            "drop_params": True,
             **extra_body,
             **self._additional_body_options,
         }
@@ -404,7 +390,12 @@
 
     def usage_from_response(self, response: ModelResponse) -> Usage | None:
         litellm_usage = response.get("usage", None)
+
+        # LiteLLM isn't consistent in how it returns the cost.
         cost = response._hidden_params.get("response_cost", None)
+        if cost is None and litellm_usage:
+            cost = litellm_usage.get("cost", None)
+
         if not litellm_usage and not cost:
             return None
 
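With the chat formatter in place, the adapter no longer hard-codes a one- or two-call chain-of-thought flow; it loops over the turns the formatter yields, only requesting structured output and logprobs on the final call. Below is a minimal sketch of that loop, assuming the formatter API shown above (`next_turn`, `final_call`, `message_dicts`, `intermediate_outputs`); `call_model` and `run_formatter` are hypothetical stand-ins, with `call_model` playing the role of the `litellm.acompletion` call:

```python
async def run_formatter(chat_formatter, call_model, max_turns: int = 10):
    prior_output = None
    for _ in range(max_turns):
        turn = chat_formatter.next_turn(prior_output)
        if turn is None:
            break  # the formatter is done; prior_output holds the final answer
        # Only the final call needs structured output / logprobs.
        prior_output = await call_model(
            messages=chat_formatter.message_dicts(),
            final_call=turn.final_call,
        )
    else:
        raise RuntimeError("Too many turns. Stopping iteration to avoid using too many tokens.")
    if not prior_output:
        raise RuntimeError("No output returned from model")
    return prior_output, chat_formatter.intermediate_outputs()
```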
kiln_ai/adapters/model_adapters/litellm_config.py

@@ -1,10 +1,11 @@
 from dataclasses import dataclass, field
 
+from kiln_ai.datamodel.task import RunConfigProperties
+
 
 @dataclass
 class LiteLlmConfig:
-
-    provider_name: str
+    run_config_properties: RunConfigProperties
     # If set, over rides the provider-name based URL from litellm
     base_url: str | None = None
     # Headers to send with every request