kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kiln-ai might be problematic.

Files changed (88)
  1. kiln_ai/adapters/__init__.py +7 -7
  2. kiln_ai/adapters/adapter_registry.py +81 -10
  3. kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +267 -0
  7. kiln_ai/adapters/eval/g_eval.py +367 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
  16. kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
  21. kiln_ai/adapters/ml_model_list.py +434 -93
  22. kiln_ai/adapters/model_adapters/__init__.py +18 -0
  23. kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
  24. kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
  25. kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
  26. kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
  27. kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
  28. kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
  29. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
  30. kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
  31. kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
  32. kiln_ai/adapters/ollama_tools.py +0 -1
  33. kiln_ai/adapters/parsers/__init__.py +10 -0
  34. kiln_ai/adapters/parsers/base_parser.py +12 -0
  35. kiln_ai/adapters/parsers/json_parser.py +37 -0
  36. kiln_ai/adapters/parsers/parser_registry.py +19 -0
  37. kiln_ai/adapters/parsers/r1_parser.py +69 -0
  38. kiln_ai/adapters/parsers/test_json_parser.py +81 -0
  39. kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
  40. kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
  41. kiln_ai/adapters/prompt_builders.py +193 -49
  42. kiln_ai/adapters/provider_tools.py +91 -36
  43. kiln_ai/adapters/repair/repair_task.py +18 -19
  44. kiln_ai/adapters/repair/test_repair_task.py +7 -7
  45. kiln_ai/adapters/run_output.py +11 -0
  46. kiln_ai/adapters/test_adapter_registry.py +177 -0
  47. kiln_ai/adapters/test_generate_docs.py +69 -0
  48. kiln_ai/adapters/test_ollama_tools.py +0 -1
  49. kiln_ai/adapters/test_prompt_adaptors.py +25 -18
  50. kiln_ai/adapters/test_prompt_builders.py +265 -44
  51. kiln_ai/adapters/test_provider_tools.py +268 -46
  52. kiln_ai/datamodel/__init__.py +51 -772
  53. kiln_ai/datamodel/basemodel.py +31 -11
  54. kiln_ai/datamodel/datamodel_enums.py +58 -0
  55. kiln_ai/datamodel/dataset_filters.py +114 -0
  56. kiln_ai/datamodel/dataset_split.py +170 -0
  57. kiln_ai/datamodel/eval.py +298 -0
  58. kiln_ai/datamodel/finetune.py +105 -0
  59. kiln_ai/datamodel/json_schema.py +14 -3
  60. kiln_ai/datamodel/model_cache.py +8 -3
  61. kiln_ai/datamodel/project.py +23 -0
  62. kiln_ai/datamodel/prompt.py +37 -0
  63. kiln_ai/datamodel/prompt_id.py +83 -0
  64. kiln_ai/datamodel/strict_mode.py +24 -0
  65. kiln_ai/datamodel/task.py +181 -0
  66. kiln_ai/datamodel/task_output.py +321 -0
  67. kiln_ai/datamodel/task_run.py +164 -0
  68. kiln_ai/datamodel/test_basemodel.py +80 -2
  69. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  70. kiln_ai/datamodel/test_dataset_split.py +127 -6
  71. kiln_ai/datamodel/test_datasource.py +3 -2
  72. kiln_ai/datamodel/test_eval_model.py +635 -0
  73. kiln_ai/datamodel/test_example_models.py +34 -17
  74. kiln_ai/datamodel/test_json_schema.py +23 -0
  75. kiln_ai/datamodel/test_model_cache.py +24 -0
  76. kiln_ai/datamodel/test_model_perf.py +125 -0
  77. kiln_ai/datamodel/test_models.py +131 -2
  78. kiln_ai/datamodel/test_prompt_id.py +129 -0
  79. kiln_ai/datamodel/test_task.py +159 -0
  80. kiln_ai/utils/config.py +6 -1
  81. kiln_ai/utils/exhaustive_error.py +6 -0
  82. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
  83. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  84. kiln_ai/adapters/base_adapter.py +0 -191
  85. kiln_ai/adapters/langchain_adapters.py +0 -256
  86. kiln_ai-0.8.1.dist-info/RECORD +0 -58
  87. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  88. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/g_eval.py
@@ -0,0 +1,367 @@
+ import math
+ from typing import Dict, List, Tuple
+
+ from kiln_ai.adapters.adapter_registry import adapter_for_task
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
+ from kiln_ai.adapters.prompt_builders import PromptGenerators
+ from kiln_ai.datamodel import Project, Task, TaskRun
+ from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
+ from kiln_ai.datamodel.task import RunConfig
+ from openai.types.chat import ChatCompletionTokenLogprob
+
+ # all the tokens we score for, and their float scores.
+ TOKEN_TO_SCORE_MAP: Dict[str, float] = {
+     "1": 1.0,
+     "2": 2.0,
+     "3": 3.0,
+     "4": 4.0,
+     "5": 5.0,
+     "pass": 1.0,
+     "fail": 0.0,
+     "critical": -1.0,
+ }
+
+
+ class GEvalTask(Task, parent_of={}):
+     """
+     Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
+
+     Note: G-Eval implements both G-Eval and LLM as Judge, as they are very similar.
+     """
+
+     def __init__(self, eval_config: EvalConfig):
+         tmp_project = Project(name="GEval")
+
+         # Build a simple LLM as Judge system instruction
+         system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
+         # Optionally add a short task description
+         task_description = eval_config.properties.get("task_description", None)
+         if task_description:
+             system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"
+
+         # Build the COT eval instructions
+         cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
+         steps = eval_config.properties.get("eval_steps", None)
+         if not steps or not isinstance(steps, list):
+             raise ValueError("eval_steps must be a list")
+         for i, step in enumerate(steps):
+             cot_instructions += f"{i + 1}) {step}\n"
+
+         eval = eval_config.parent_eval()
+         if not eval:
+             raise ValueError("Eval config must have a parent eval")
+
+         # Build the output schema from the eval's target output scores.
+         # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False.
+         # However, the final scores from the evaluator can be floats (see the logprob calculation later, which requires discrete token outputs).
+         output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
+
+         super().__init__(
+             name="GEval Task",
+             parent=tmp_project,
+             instruction=system_instruction,
+             thinking_instruction=cot_instructions,
+             output_json_schema=output_schema,
+         )
+
+
+ class GEval(BaseEval):
+     """
+     An evaluator which implements G-Eval and LLM as Judge.
+
+     G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634
+
+     LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
+
+     @misc{liu2023gevalnlgevaluationusing,
+         title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
+         author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
+         year={2023},
+         eprint={2303.16634},
+         archivePrefix={arXiv},
+         primaryClass={cs.CL},
+         url={https://arxiv.org/abs/2303.16634},
+     }
+     """
+
+     def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
+         if (
+             eval_config.config_type != EvalConfigType.g_eval
+             and eval_config.config_type != EvalConfigType.llm_as_judge
+         ):
+             raise ValueError(
+                 f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
+             )
+
+         super().__init__(eval_config, run_config)
+
+         self.geval_task = GEvalTask(eval_config)
+
+     async def run_eval(
+         self, task_run: TaskRun
+     ) -> tuple[EvalScores, Dict[str, str] | None]:
+         """
+         Run this eval on the given task run.
+         """
+
+         model_name, provider = self.model_and_provider()
+
+         # Only fetch logprobs for G-Eval.
+         # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 is more than enough to reach even the very unlikely ones.
+         top_logprobs = (
+             10 if self.eval_config.config_type == EvalConfigType.g_eval else None
+         )
+
+         adapter = adapter_for_task(
+             self.geval_task,
+             model_name,
+             provider,
+             # We always use Simple COT for G-Eval and LLM as Judge
+             prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+             base_adapter_config=AdapterConfig(
+                 # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs.
+                 allow_saving=False,
+                 top_logprobs=top_logprobs,
+             ),
+         )
+
+         input = f"""The model was given the following input for the task:
+ <eval_data>
+ {task_run.input}
+ </eval_data>
+
+ The model produced the following output for the task:
+ <eval_data>
+ {task_run.output}
+ </eval_data>
+ """
+
+         # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
+         _, run_output = await adapter.invoke_returning_run_output(input)
+
+         if self.eval_config.config_type == EvalConfigType.llm_as_judge:
+             return self.build_llm_as_judge_score(
+                 run_output
+             ), run_output.intermediate_outputs
+         else:
+             return self.build_g_eval_score(run_output), run_output.intermediate_outputs
+
+     def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
+         """
+         Build the LLM as Judge score for the given run and run output.
+         """
+         # Convert the output format we asked for (discrete values) to our float scores
+         scores: EvalScores = {}
+         if not isinstance(run_output.output, dict):
+             raise ValueError("LLM as Judge output must be a dictionary")
+
+         for metric, score in run_output.output.items():
+             token_score = self.score_from_token_string(f"{score}")
+             if token_score is None:
+                 raise ValueError(
+                     f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
+                 )
+             scores[metric] = token_score
+         return scores
+
+     def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
+         """
+         Build the G-Eval score for the given run and run output.
+
+         We create a weighted average of each rating using the logprobs.
+
+         @misc{liu2023gevalnlgevaluationusing,
+             title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
+             author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
+             year={2023},
+             eprint={2303.16634},
+             archivePrefix={arXiv},
+             primaryClass={cs.CL},
+             url={https://arxiv.org/abs/2303.16634},
+         }
+         """
+         # We use structured output
+         outputs = run_output.output
+         assert isinstance(outputs, dict)
+
+         # Build a raw string output from the logprobs, which is easier to work with than a Dict for the next bit
+         raw_output = self.raw_output_from_logprobs(run_output)
+
+         # Find the offset of the start of each metric in the raw output json
+         metrics: List[str] = list(outputs.keys())
+         metric_offsets = self.metric_offsets(raw_output, metrics)
+
+         final_scores: EvalScores = {}
+         for metric in metrics:
+             score = self.g_eval_single_metric(
+                 run_output, metric, metric_offsets, raw_output
+             )
+             if score is None:
+                 raise ValueError(
+                     f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
+                 )
+             final_scores[metric] = score
+
+         return final_scores
+
+     def g_eval_single_metric(
+         self,
+         run_output: RunOutput,
+         metric: str,
+         metric_offsets: Dict[str, int],
+         raw_output: str,
+     ) -> float | None:
+         """
+         Run the G-Eval for a single metric.
+
+         Scan the logprobs for the metric and return the weighted score of the rating token.
+         """
+
+         start_offset, end_offset = self.token_search_range(
+             raw_output, metric, metric_offsets
+         )
+
+         offset = 0
+
+         if (
+             run_output.output_logprobs is None
+             or run_output.output_logprobs.content is None
+         ):
+             raise RuntimeError(
+                 "No logprobs found for output - can not calculate g-eval"
+             )
+
+         # scan the tokens in the range, looking for the rating token
+         for _, chat_logprob in enumerate(run_output.output_logprobs.content):
+             if offset >= end_offset:
+                 break
+             if offset >= start_offset:
+                 score = self.rating_token_to_score(chat_logprob)
+                 if score is not None:
+                     return score
+             offset += len(chat_logprob.token)
+
+         return None
+
+     def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
+         """
+         Build the raw output string from the logprobs. Generating it from the logprobs guarantees it matches the logprob offsets.
+         """
+         if (
+             run_output.output_logprobs is None
+             or run_output.output_logprobs.content is None
+         ):
+             raise RuntimeError(
+                 "No logprobs found for output - can not calculate g-eval"
+             )
+
+         raw = ""
+         for chat_logprob in run_output.output_logprobs.content:
+             raw += chat_logprob.token
+         return raw
+
+     def token_search_range(
+         self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
+     ) -> Tuple[int, int]:
+         """
+         Find the start and end offsets of the metric in the raw output.
+
+         Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
+         """
+         start_offset = metric_offsets[metric] + len(metric)
+
+         # Find the lowest end offset that is greater than the start offset
+         end_offset = len(raw_output)
+         for v in list(metric_offsets.values()):
+             if v < end_offset and v > start_offset:
+                 end_offset = v
+
+         return start_offset, end_offset
+
+     def rating_token_to_score(
+         self, token_logprob: ChatCompletionTokenLogprob
+     ) -> float | None:
+         """
+         Convert a rating token to a score using weighted average of top logprobs.
+
+         Only includes tokens that have valid scores.
+
+         Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
+         """
+         primary_token_score = self.score_from_token_string(token_logprob.token)
+         # check this is a real rating token, it could just be the ": ", "," or whitespace
+         if not primary_token_score:
+             return None
+
+         total_score = 0.0
+         total_probability = 0.0
+
+         # Process all valid scoring tokens
+         for top_logprob in token_logprob.top_logprobs:
+             token_score = self.score_from_token_string(top_logprob.token)
+             if token_score is not None:
+                 # Convert logprob to probability
+                 probability = math.exp(top_logprob.logprob)
+                 total_score += token_score * probability
+                 total_probability += probability
+
+         if total_probability <= 0.0:
+             raise RuntimeError(
+                 f"No valid scoring tokens found for {token_logprob.token}. This should never happen. Please file a bug if you see this."
+             )
+
+         # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
+         weighted_score = total_score / total_probability
+
+         return weighted_score
+
+     def score_from_token_string(self, token: str) -> float | None:
+         if token in TOKEN_TO_SCORE_MAP:
+             return TOKEN_TO_SCORE_MAP[token]
+
+         # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
+         unquoted_token = token.strip().strip('"').lower()
+         if unquoted_token in TOKEN_TO_SCORE_MAP:
+             return TOKEN_TO_SCORE_MAP[unquoted_token]
+
+         # handle numeric tokens like "1.0"
+         try:
+             float_value = float(token)
+             if float_value.is_integer():
+                 str_token = str(int(float_value))
+                 if str_token in TOKEN_TO_SCORE_MAP:
+                     return TOKEN_TO_SCORE_MAP[str_token]
+         except ValueError:
+             pass
+
+         return None
+
+     def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
+         """
+         Find the offset to the start of each metric in the raw output json
+
+         For the example json: `{"overall_rating": 1}` == 1
+
+         should return:
+         {
+             "overall_rating": 1  # it's 1 character into the json string
+         }
+         """
+         metric_offsets: Dict[str, int] = {}
+         for metric in metrics:
+             # the quoted metric name is expected in the json: `{"overall_rating": 1}`
+             metric_name = f'"{metric}"'
+
+             # we expect it exactly once
+             count = raw_output.count(metric_name)
+             if count != 1:
+                 raise ValueError(
+                     f"Metric {metric} should appear exactly once in the output. Found {count} times"
+                 )
+
+             offset = raw_output.find(metric_name)
+             if offset == -1:
+                 raise ValueError(f"Metric {metric} not found in raw output")
+             metric_offsets[metric] = offset
+         return metric_offsets
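
For readers skimming the diff: the core of the new G-Eval scoring above is a probability-weighted average over the rating token's top logprob candidates, normalized by the probability mass of valid rating tokens only. A minimal, self-contained sketch of that calculation — the logprob values below are invented for illustration; only math and the TOKEN_TO_SCORE_MAP values mirror the code in the hunk:

import math

# Same token-to-score mapping as the new g_eval.py above.
TOKEN_TO_SCORE_MAP = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
                      "pass": 1.0, "fail": 0.0, "critical": -1.0}

# Hypothetical top_logprobs for the rating-token position, as (token, logprob)
# pairs; in the package these come from the adapter's logprob output.
top_logprobs = [("4", math.log(0.6)), ("5", math.log(0.3)), (",", math.log(0.1))]

total_score = 0.0
total_probability = 0.0
for token, logprob in top_logprobs:
    score = TOKEN_TO_SCORE_MAP.get(token.strip().strip('"').lower())
    if score is not None:  # ignore non-rating tokens like ","
        probability = math.exp(logprob)  # convert logprob to probability
        total_score += score * probability
        total_probability += probability

# Normalize by the probability mass of valid rating tokens only:
# (4*0.6 + 5*0.3) / (0.6 + 0.3) = 3.9 / 0.9 ≈ 4.33
weighted_score = total_score / total_probability
print(round(weighted_score, 2))  # 4.33

The LLM-as-Judge path skips the logprob weighting and maps the model's returned value straight through score_from_token_string.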
kiln_ai/adapters/eval/registry.py
@@ -0,0 +1,16 @@
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.eval.g_eval import GEval
+ from kiln_ai.datamodel.eval import EvalConfigType
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+
+ def eval_adapter_from_type(eval_config_type: EvalConfigType) -> type[BaseEval]:
+     match eval_config_type:
+         case EvalConfigType.g_eval:
+             return GEval
+         case EvalConfigType.llm_as_judge:
+             # Also implemented by GEval
+             return GEval
+         case _:
+             # type checking will catch missing cases
+             raise_exhaustive_enum_error(eval_config_type)
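
A hedged usage sketch for the new registry: eval_adapter_from_type returns the evaluator class for a config type, and both current enum values resolve to GEval. The eval_config and task_run objects referenced in the comments are assumed to already exist and are not constructed here:

from kiln_ai.adapters.eval.g_eval import GEval
from kiln_ai.adapters.eval.registry import eval_adapter_from_type
from kiln_ai.datamodel.eval import EvalConfigType

# Both config types currently resolve to the GEval evaluator class.
assert eval_adapter_from_type(EvalConfigType.g_eval) is GEval
assert eval_adapter_from_type(EvalConfigType.llm_as_judge) is GEval

# Instantiation and use (eval_config is an existing EvalConfig; run_config may be None):
# evaluator = eval_adapter_from_type(eval_config.config_type)(eval_config, None)
# scores, intermediate_outputs = await evaluator.run_eval(task_run)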