kiln-ai 0.11.1__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of kiln-ai has been flagged as potentially problematic.

Files changed (80)
  1. kiln_ai/adapters/__init__.py +4 -0
  2. kiln_ai/adapters/adapter_registry.py +163 -39
  3. kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
  4. kiln_ai/adapters/eval/__init__.py +28 -0
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +270 -0
  7. kiln_ai/adapters/eval/g_eval.py +368 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +325 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +641 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +498 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +16 -2
  14. kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
  15. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
  16. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
  17. kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
  18. kiln_ai/adapters/fine_tune/test_together_finetune.py +531 -0
  19. kiln_ai/adapters/fine_tune/together_finetune.py +325 -0
  20. kiln_ai/adapters/ml_model_list.py +758 -163
  21. kiln_ai/adapters/model_adapters/__init__.py +2 -4
  22. kiln_ai/adapters/model_adapters/base_adapter.py +61 -43
  23. kiln_ai/adapters/model_adapters/litellm_adapter.py +391 -0
  24. kiln_ai/adapters/model_adapters/litellm_config.py +13 -0
  25. kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
  26. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -0
  27. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
  28. kiln_ai/adapters/model_adapters/test_structured_output.py +59 -35
  29. kiln_ai/adapters/ollama_tools.py +3 -3
  30. kiln_ai/adapters/parsers/r1_parser.py +19 -14
  31. kiln_ai/adapters/parsers/test_r1_parser.py +17 -5
  32. kiln_ai/adapters/prompt_builders.py +80 -42
  33. kiln_ai/adapters/provider_tools.py +50 -58
  34. kiln_ai/adapters/repair/repair_task.py +9 -21
  35. kiln_ai/adapters/repair/test_repair_task.py +6 -6
  36. kiln_ai/adapters/run_output.py +3 -0
  37. kiln_ai/adapters/test_adapter_registry.py +26 -29
  38. kiln_ai/adapters/test_generate_docs.py +4 -4
  39. kiln_ai/adapters/test_ollama_tools.py +0 -1
  40. kiln_ai/adapters/test_prompt_adaptors.py +47 -33
  41. kiln_ai/adapters/test_prompt_builders.py +91 -31
  42. kiln_ai/adapters/test_provider_tools.py +26 -81
  43. kiln_ai/datamodel/__init__.py +50 -952
  44. kiln_ai/datamodel/basemodel.py +2 -0
  45. kiln_ai/datamodel/datamodel_enums.py +60 -0
  46. kiln_ai/datamodel/dataset_filters.py +114 -0
  47. kiln_ai/datamodel/dataset_split.py +170 -0
  48. kiln_ai/datamodel/eval.py +298 -0
  49. kiln_ai/datamodel/finetune.py +105 -0
  50. kiln_ai/datamodel/json_schema.py +7 -1
  51. kiln_ai/datamodel/project.py +23 -0
  52. kiln_ai/datamodel/prompt.py +37 -0
  53. kiln_ai/datamodel/prompt_id.py +83 -0
  54. kiln_ai/datamodel/strict_mode.py +24 -0
  55. kiln_ai/datamodel/task.py +181 -0
  56. kiln_ai/datamodel/task_output.py +328 -0
  57. kiln_ai/datamodel/task_run.py +164 -0
  58. kiln_ai/datamodel/test_basemodel.py +19 -11
  59. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  60. kiln_ai/datamodel/test_dataset_split.py +32 -8
  61. kiln_ai/datamodel/test_datasource.py +22 -2
  62. kiln_ai/datamodel/test_eval_model.py +635 -0
  63. kiln_ai/datamodel/test_example_models.py +9 -13
  64. kiln_ai/datamodel/test_json_schema.py +23 -0
  65. kiln_ai/datamodel/test_models.py +2 -2
  66. kiln_ai/datamodel/test_prompt_id.py +129 -0
  67. kiln_ai/datamodel/test_task.py +159 -0
  68. kiln_ai/utils/config.py +43 -1
  69. kiln_ai/utils/dataset_import.py +232 -0
  70. kiln_ai/utils/test_dataset_import.py +596 -0
  71. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/METADATA +86 -6
  72. kiln_ai-0.13.0.dist-info/RECORD +103 -0
  73. kiln_ai/adapters/model_adapters/langchain_adapters.py +0 -302
  74. kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -11
  75. kiln_ai/adapters/model_adapters/openai_model_adapter.py +0 -246
  76. kiln_ai/adapters/model_adapters/test_langchain_adapter.py +0 -350
  77. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +0 -225
  78. kiln_ai-0.11.1.dist-info/RECORD +0 -76
  79. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/WHEEL +0 -0
  80. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/eval_runner.py
@@ -0,0 +1,270 @@
+ import asyncio
+ import logging
+ from dataclasses import dataclass
+ from typing import AsyncGenerator, Dict, List, Literal, Set
+
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.eval.registry import eval_adapter_from_type
+ from kiln_ai.datamodel.basemodel import ID_TYPE
+ from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
+ from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
+ from kiln_ai.datamodel.task import TaskRunConfig
+ from kiln_ai.datamodel.task_run import TaskRun
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class EvalJob:
+     item: TaskRun
+     type: Literal["task_run_eval", "eval_config_eval"]
+     # If type == "task_run_eval", both of these should be set. If type == "eval_config_eval", only eval_config should be set.
+     eval_config: EvalConfig
+     task_run_config: TaskRunConfig | None = None
+
+
+ @dataclass
+ class EvalProgress:
+     complete: int | None = None
+     total: int | None = None
+     errors: int | None = None
+
+
+ class EvalRunner:
+     """
+     Runs an eval. Async execution is supported to make it faster when using remote/fast model providers.
+
+     Can run an eval in 2 modes:
+     1) eval_config_eval: evaluate an eval config using existing dataset items.
+     2) task_run_eval: evaluate a range of task run configs, generating new run output using existing dataset item input.
+     """
+
+     def __init__(
+         self,
+         eval_configs: List[EvalConfig],
+         run_configs: List[TaskRunConfig] | None,
+         eval_run_type: Literal["eval_config_eval", "task_run_eval"],
+     ):
+         if len(eval_configs) == 0:
+             raise ValueError("Eval runner requires at least one eval config")
+         target_eval = eval_configs[0].parent_eval()
+         if target_eval is None:
+             raise ValueError("Eval config requires a parent eval")
+         for eval_config in eval_configs:
+             parent_eval = eval_config.parent_eval()
+             if parent_eval is None:
+                 raise ValueError("Eval config requires a parent eval")
+             if parent_eval.id != target_eval.id:
+                 raise ValueError("All eval configs must have the same parent eval")
+
+         target_task = target_eval.parent_task()
+         if target_task is None:
+             raise ValueError("Eval config requires a (grand)parent task")
+
+         # Check that run_configs is compatible
+         if eval_run_type == "task_run_eval":
+             if run_configs is None or len(run_configs) == 0:
+                 raise ValueError("Task run eval requires run configs")
+             for run_config in run_configs:
+                 parent_task = run_config.parent_task()
+                 if parent_task is None:
+                     raise ValueError("All run configs must have a parent task")
+                 if parent_task.id != target_task.id:
+                     raise ValueError(
+                         "Run config is not for the same task as the eval configs"
+                     )
+         else:
+             if run_configs is not None:
+                 raise ValueError("Mode 'eval_config_eval' does not support run configs")
+
+         self.eval_run_type = eval_run_type
+         self.eval_configs = eval_configs
+         self.run_configs = run_configs
+         self.task = target_task
+         self.eval = target_eval
+
+     def collect_tasks(self) -> List[EvalJob]:
+         if self.eval_run_type == "eval_config_eval":
+             return self.collect_tasks_for_eval_config_eval()
+         else:
+             return self.collect_tasks_for_task_run_eval()
+
+     def collect_tasks_for_eval_config_eval(self) -> List[EvalJob]:
+         """
+         Collect all jobs for this run, excluding any that have already been run.
+
+         This variant is used for mode "eval_config_eval", using existing dataset run data (input/output).
+
+         The tasks:
+         - should be in the eval config set filter
+         - should not have already been run for this eval config + dataset item pair
+         """
+         filter = dataset_filter_from_id(self.eval.eval_configs_filter_id)
+
+         # already_run[eval_config_id][dataset_id]
+         already_run: Dict[ID_TYPE, Set[ID_TYPE]] = {}
+         for eval_config in self.eval_configs:
+             already_run[eval_config.id] = set()
+             for run in eval_config.runs(readonly=True):
+                 already_run[eval_config.id].add(run.dataset_id)
+
+         return [
+             EvalJob(
+                 item=task_run,
+                 eval_config=eval_config,
+                 type="eval_config_eval",
+             )
+             for task_run in self.task.runs(readonly=True)
+             if filter(task_run)
+             for eval_config in self.eval_configs
+             if task_run.id not in already_run[eval_config.id]
+         ]
+
+     def collect_tasks_for_task_run_eval(self) -> List[EvalJob]:
+         """
+         Collect all jobs for this run, excluding any that have already been run.
+
+         This variant is used for mode "task_run_eval", generating new run output using existing dataset item input.
+
+         The tasks:
+         - should be in the eval set filter
+         - should not have already been run for this eval config + run config + dataset item
+         """
+         filter = dataset_filter_from_id(self.eval.eval_set_filter_id)
+
+         # already_run[eval_config_id][run_config_id][dataset_id]
+         already_run: Dict[ID_TYPE, Dict[ID_TYPE, Set[ID_TYPE]]] = {}
+         for eval_config in self.eval_configs:
+             already_run[eval_config.id] = {}
+             for run_config in self.run_configs or []:
+                 already_run[eval_config.id][run_config.id] = set()
+             for run in eval_config.runs(readonly=True):
+                 if (
+                     run.task_run_config_id is not None
+                     and run.task_run_config_id in already_run[eval_config.id]
+                 ):
+                     already_run[eval_config.id][run.task_run_config_id].add(
+                         run.dataset_id
+                     )
+
+         return [
+             EvalJob(
+                 item=task_run,
+                 task_run_config=run_config,
+                 type="task_run_eval",
+                 eval_config=eval_config,
+             )
+             for task_run in self.task.runs(readonly=True)
+             if filter(task_run)
+             for eval_config in self.eval_configs
+             for run_config in self.run_configs or []
+             if task_run.id not in already_run[eval_config.id][run_config.id]
+         ]
+
+     async def run(self, concurrency: int = 25) -> AsyncGenerator[EvalProgress, None]:
+         """
+         Runs the configured eval run with parallel workers and yields progress updates.
+         """
+         jobs = self.collect_tasks()
+
+         complete = 0
+         errors = 0
+         total = len(jobs)
+
+         # Send initial status
+         yield EvalProgress(complete=complete, total=total, errors=errors)
+
+         worker_queue: asyncio.Queue[EvalJob] = asyncio.Queue()
+         for job in jobs:
+             worker_queue.put_nowait(job)
+
+         # simple status queue to return progress. True=success, False=error
+         status_queue: asyncio.Queue[bool] = asyncio.Queue()
+
+         workers = []
+         for i in range(concurrency):
+             task = asyncio.create_task(self.run_worker(worker_queue, status_queue))
+             workers.append(task)
+
+         # Send status updates until workers are done, and they are all sent
+         while not status_queue.empty() or not all(worker.done() for worker in workers):
+             try:
+                 # Use timeout to prevent hanging if all workers complete
+                 # between our while condition check and get()
+                 success = await asyncio.wait_for(status_queue.get(), timeout=0.1)
+                 if success:
+                     complete += 1
+                 else:
+                     errors += 1
+
+                 yield EvalProgress(complete=complete, total=total, errors=errors)
+             except asyncio.TimeoutError:
+                 # Timeout is expected, just continue to recheck worker status
+                 # Don't love this but beats sentinels for reliability
+                 continue
+
+         # These are redundant, but keeping them will catch async errors
+         await asyncio.gather(*workers)
+         await worker_queue.join()
+
+     async def run_worker(
+         self, worker_queue: asyncio.Queue[EvalJob], status_queue: asyncio.Queue[bool]
+     ):
+         while True:
+             try:
+                 job = worker_queue.get_nowait()
+             except asyncio.QueueEmpty:
+                 # worker can end when the queue is empty
+                 break
+             try:
+                 success = await self.run_job(job)
+                 await status_queue.put(success)
+             finally:
+                 # Always mark the dequeued task as done, even on exceptions
+                 worker_queue.task_done()
+
+     async def run_job(self, job: EvalJob) -> bool:
+         try:
+             # Create the evaluator for this eval config/run config pair
+             evaluator = eval_adapter_from_type(job.eval_config.config_type)(
+                 job.eval_config,
+                 job.task_run_config.run_config() if job.task_run_config else None,
+             )
+             if not isinstance(evaluator, BaseEval):
+                 raise ValueError("Not able to create evaluator from eval config")
+
+             task_output: str | None = None
+             scores: EvalScores | None = None
+             intermediate_outputs: Dict[str, str] | None = None
+             if job.type == "eval_config_eval":
+                 # Eval config eval, we use the saved input from the task run, not invoking the task again
+                 scores, intermediate_outputs = await evaluator.run_eval(job.item)
+                 task_output = job.item.output.output
+             else:
+                 # Task run eval, we invoke the task again to get a fresh output
+                 (
+                     result_task_run,
+                     scores,
+                     intermediate_outputs,
+                 ) = await evaluator.run_task_and_eval(job.item.input)
+                 task_output = result_task_run.output.output
+
+             # Save the job result
+             eval_run = EvalRun(
+                 parent=job.eval_config,
+                 task_run_config_id=job.task_run_config.id
+                 if job.task_run_config
+                 else None,
+                 dataset_id=job.item.id,
+                 eval_config_eval=job.type == "eval_config_eval",
+                 scores=scores,
+                 input=job.item.input,
+                 output=task_output,
+                 intermediate_outputs=intermediate_outputs,
+             )
+             eval_run.save_to_file()
+
+             return True
+         except Exception as e:
+             logger.error(f"Error running eval job for dataset item {job.item.id}: {e}")
+             return False
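
For orientation, the following is a minimal sketch of how this new runner could be driven from application code. It only relies on the EvalRunner constructor and run() signatures shown in the diff above; the wrapper function name, its arguments, and the commented driver line are hypothetical placeholders, not part of the package.

import asyncio

from kiln_ai.adapters.eval.eval_runner import EvalProgress, EvalRunner
from kiln_ai.datamodel.eval import EvalConfig
from kiln_ai.datamodel.task import TaskRunConfig


async def report_eval_progress(
    eval_configs: list[EvalConfig], run_configs: list[TaskRunConfig]
) -> EvalProgress:
    # "task_run_eval" mode: re-run each dataset item's input against every run
    # config, then score each fresh output with every eval config.
    runner = EvalRunner(
        eval_configs=eval_configs,
        run_configs=run_configs,
        eval_run_type="task_run_eval",
    )
    last = EvalProgress()
    # run() is an async generator that yields EvalProgress snapshots as workers finish jobs.
    async for progress in runner.run(concurrency=10):
        print(f"{progress.complete}/{progress.total} done, {progress.errors} errors")
        last = progress
    return last


# asyncio.run(report_eval_progress(my_eval_configs, my_run_configs))  # hypothetical inputs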
kiln_ai/adapters/eval/g_eval.py
@@ -0,0 +1,368 @@
+ import math
+ from typing import Dict, List, Tuple
+
+ from litellm.types.utils import ChatCompletionTokenLogprob
+
+ from kiln_ai.adapters.adapter_registry import adapter_for_task
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
+ from kiln_ai.adapters.prompt_builders import PromptGenerators
+ from kiln_ai.datamodel import Project, Task, TaskRun
+ from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
+ from kiln_ai.datamodel.task import RunConfig
+
+ # all the tokens we score for, and their float scores.
+ TOKEN_TO_SCORE_MAP: Dict[str, float] = {
+     "1": 1.0,
+     "2": 2.0,
+     "3": 3.0,
+     "4": 4.0,
+     "5": 5.0,
+     "pass": 1.0,
+     "fail": 0.0,
+     "critical": -1.0,
+ }
+
+
+ class GEvalTask(Task, parent_of={}):
+     """
+     Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
+
+     Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
+     """
+
+     def __init__(self, eval_config: EvalConfig):
+         tmp_project = Project(name="GEval")
+
+         # Build a simple LLM as Judge system instruction
+         system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
+         # Optionally add a short task description
+         task_description = eval_config.properties.get("task_description", None)
+         if task_description:
+             system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"
+
+         # Build the COT eval instructions
+         cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
+         steps = eval_config.properties.get("eval_steps", None)
+         if not steps or not isinstance(steps, list):
+             raise ValueError("eval_steps must be a list")
+         for i, step in enumerate(steps):
+             cot_instructions += f"{i + 1}) {step}\n"
+
+         eval = eval_config.parent_eval()
+         if not eval:
+             raise ValueError("Eval config must have a parent eval")
+
+         # Build the output schema from the eval's target output scores.
+         # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
+         # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
+         output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
+
+         super().__init__(
+             name="GEval Task",
+             parent=tmp_project,
+             instruction=system_instruction,
+             thinking_instruction=cot_instructions,
+             output_json_schema=output_schema,
+         )
+
+
+ class GEval(BaseEval):
+     """
+     A evaluator which implements G-Eval and LLM as Judge.
+
+     G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634
+
+     LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
+
+     @misc{liu2023gevalnlgevaluationusing,
+         title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
+         author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
+         year={2023},
+         eprint={2303.16634},
+         archivePrefix={arXiv},
+         primaryClass={cs.CL},
+         url={https://arxiv.org/abs/2303.16634},
+     }
+     """
+
+     def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
+         if (
+             eval_config.config_type != EvalConfigType.g_eval
+             and eval_config.config_type != EvalConfigType.llm_as_judge
+         ):
+             raise ValueError(
+                 f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
+             )
+
+         super().__init__(eval_config, run_config)
+
+         self.geval_task = GEvalTask(eval_config)
+
+     async def run_eval(
+         self, task_run: TaskRun
+     ) -> tuple[EvalScores, Dict[str, str] | None]:
+         """
+         Run this eval on the given task run.
+         """
+
+         model_name, provider = self.model_and_provider()
+
+         # Only fetch logprobs for G-Eval
+         # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely
+         top_logprobs = (
+             10 if self.eval_config.config_type == EvalConfigType.g_eval else None
+         )
+
+         adapter = adapter_for_task(
+             self.geval_task,
+             model_name,
+             provider,
+             # We always use Simple COT for G-Eval and LLM as Judge
+             prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+             base_adapter_config=AdapterConfig(
+                 # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
+                 allow_saving=False,
+                 top_logprobs=top_logprobs,
+             ),
+         )
+
+         input = f"""The model was given the following input for the task:
+ <eval_data>
+ {task_run.input}
+ </eval_data>
+
+ The model produced the following output for the task:
+ <eval_data>
+ {task_run.output}
+ </eval_data>
+ """
+
+         # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
+         _, run_output = await adapter.invoke_returning_run_output(input)
+
+         if self.eval_config.config_type == EvalConfigType.llm_as_judge:
+             return self.build_llm_as_judge_score(
+                 run_output
+             ), run_output.intermediate_outputs
+         else:
+             return self.build_g_eval_score(run_output), run_output.intermediate_outputs
+
+     def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
+         """
+         Build the LLM as Judge score for the given run and run output.
+         """
+         # Convert the output format we asked for (discreet values) to our float scores
+         scores: EvalScores = {}
+         if not isinstance(run_output.output, dict):
+             raise ValueError("LLM as Judge output must be a dictionary")
+
+         for metric, score in run_output.output.items():
+             token_score = self.score_from_token_string(f"{score}")
+             if token_score is None:
+                 raise ValueError(
+                     f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
+                 )
+             scores[metric] = token_score
+         return scores
+
+     def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
+         """
+         Build the G-Eval score for the given run and run output.
+
+         We create a weighted average of each rating using the logprobs.
+
+         @misc{liu2023gevalnlgevaluationusing,
+             title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
+             author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
+             year={2023},
+             eprint={2303.16634},
+             archivePrefix={arXiv},
+             primaryClass={cs.CL},
+             url={https://arxiv.org/abs/2303.16634},
+         }
+         """
+         # We use structured output
+         outputs = run_output.output
+         assert isinstance(outputs, dict)
+
+         # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
+         raw_output = self.raw_output_from_logprobs(run_output)
+
+         # find the offset the start of each metric in the raw output json
+         metrics: List[str] = list(outputs.keys())
+         metric_offsets = self.metric_offsets(raw_output, metrics)
+
+         final_scores: EvalScores = {}
+         for metric in metrics:
+             score = self.g_eval_single_metric(
+                 run_output, metric, metric_offsets, raw_output
+             )
+             if score is None:
+                 raise ValueError(
+                     f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
+                 )
+             final_scores[metric] = score
+
+         return final_scores
+
+     def g_eval_single_metric(
+         self,
+         run_output: RunOutput,
+         metric: str,
+         metric_offsets: Dict[str, int],
+         raw_output: str,
+     ) -> float | None:
+         """
+         Run the G-Eval for a single metric.
+
+         Scan the logprobs for the metric and return the weighted score of the rating token.
+         """
+
+         start_offset, end_offset = self.token_search_range(
+             raw_output, metric, metric_offsets
+         )
+
+         offset = 0
+
+         if (
+             run_output.output_logprobs is None
+             or run_output.output_logprobs.content is None
+         ):
+             raise RuntimeError(
+                 "No logprobs found for output - can not calculate g-eval"
+             )
+
+         # scan the tokens in the range, looking for the rating token
+         for _, chat_logprob in enumerate(run_output.output_logprobs.content):
+             if offset >= end_offset:
+                 break
+             if offset >= start_offset:
+                 score = self.rating_token_to_score(chat_logprob)
+                 if score is not None:
+                     return score
+             offset += len(chat_logprob.token)
+
+         return None
+
+     def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
+         """
+         Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
+         """
+         if (
+             run_output.output_logprobs is None
+             or run_output.output_logprobs.content is None
+         ):
+             raise RuntimeError(
+                 "No logprobs found for output - can not calculate g-eval"
+             )
+
+         raw = ""
+         for chat_logprob in run_output.output_logprobs.content:
+             raw += chat_logprob.token
+         return raw
+
+     def token_search_range(
+         self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
+     ) -> Tuple[int, int]:
+         """
+         Find the start and end offsets of the metric in the raw output.
+
+         Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
+         """
+         start_offset = metric_offsets[metric] + len(metric)
+
+         # Find the lowest end offset that is greater than the start offset
+         end_offset = len(raw_output)
+         for v in list(metric_offsets.values()):
+             if v < end_offset and v > start_offset:
+                 end_offset = v
+
+         return start_offset, end_offset
+
+     def rating_token_to_score(
+         self, token_logprob: ChatCompletionTokenLogprob
+     ) -> float | None:
+         """
+         Convert a rating token to a score using weighted average of top logprobs.
+
+         Only includes tokens that have valid scores.
+
+         Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
+         """
+         primary_token_score = self.score_from_token_string(token_logprob.token)
+         # check this is a real rating token, it could just be the ": ", "," or whitespace
+         if not primary_token_score:
+             return None
+
+         total_score = 0.0
+         total_probability = 0.0
+
+         # Process all valid scoring tokens
+         for top_logprob in token_logprob.top_logprobs:
+             token_score = self.score_from_token_string(top_logprob.token)
+             if token_score is not None:
+                 # Convert logprob to probability
+                 probability = math.exp(top_logprob.logprob)
+                 total_score += token_score * probability
+                 total_probability += probability
+
+         if total_probability <= 0.0:
+             raise RuntimeError(
+                 f"No valid scoring tokens found for {token_logprob.token}. This should never happen. Please file a bug if you see this."
+             )
+
+         # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
+         weighted_score = total_score / total_probability
+
+         return weighted_score
+
+     def score_from_token_string(self, token: str) -> float | None:
+         if token in TOKEN_TO_SCORE_MAP:
+             return TOKEN_TO_SCORE_MAP[token]
+
+         # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
+         unquoted_token = token.strip().strip('"').lower()
+         if unquoted_token in TOKEN_TO_SCORE_MAP:
+             return TOKEN_TO_SCORE_MAP[unquoted_token]
+
+         # handle numeric tokens like "1.0"
+         try:
+             float_value = float(token)
+             if float_value.is_integer():
+                 str_token = str(int(float_value))
+                 if str_token in TOKEN_TO_SCORE_MAP:
+                     return TOKEN_TO_SCORE_MAP[str_token]
+         except ValueError:
+             pass
+
+         return None
+
+     def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
+         """
+         Find the offset to the start of each metric in the raw output json
+
+         For the example json: `{"overall_rating": 1}` == 1
+
+         should return:
+         {
+             "overall_rating": 1 # it's 1 character into the json string
+         }
+         """
+         metric_offsets: Dict[str, int] = {}
+         for metric in metrics:
+             # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1
+             metric_name = f'"{metric}"'
+
+             # we expect it exactly once
+             count = raw_output.count(metric_name)
+             if count != 1:
+                 raise ValueError(
+                     f"Metric {metric} should appear exactly once in the output. Found {count} times"
+                 )
+
+             offset = raw_output.find(metric_name)
+             if offset == -1:
+                 raise ValueError(f"Metric {metric} not found in raw output")
+             metric_offsets[metric] = offset
+         return metric_offsets
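
To make the G-Eval scoring step concrete, here is a self-contained sketch of the weighted-average calculation that rating_token_to_score performs, using invented token/logprob pairs rather than real ChatCompletionTokenLogprob objects; the helper name and example values are illustrative only.

import math

# Same token-to-score mapping the evaluator uses.
TOKEN_TO_SCORE_MAP = {
    "1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
    "pass": 1.0, "fail": 0.0, "critical": -1.0,
}


def weighted_rating(top_logprobs: list[tuple[str, float]]) -> float:
    """Weighted average of the valid rating tokens among the top logprobs."""
    total_score = 0.0
    total_probability = 0.0
    for token, logprob in top_logprobs:
        # Mirror score_from_token_string's cleanup: strip quotes/whitespace, lowercase.
        score = TOKEN_TO_SCORE_MAP.get(token.strip().strip('"').lower())
        if score is None:
            continue  # skip punctuation, whitespace and other non-rating tokens
        probability = math.exp(logprob)  # convert logprob to probability
        total_score += score * probability
        total_probability += probability
    if total_probability <= 0.0:
        raise RuntimeError("No valid rating tokens found")
    # Normalize so probability mass on non-rating tokens doesn't drag the score down.
    return total_score / total_probability


# The model mostly wanted to emit "4", with some mass on "5" and "3":
print(weighted_rating([("4", -0.2), ("5", -1.8), ("3", -2.5)]))  # ≈ 4.08

Normalizing by the probability mass of valid rating tokens, as the evaluator does, means stray formatting tokens in the top logprobs do not dilute the final score.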
kiln_ai/adapters/eval/registry.py
@@ -0,0 +1,16 @@
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.eval.g_eval import GEval
+ from kiln_ai.datamodel.eval import EvalConfigType
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+
+ def eval_adapter_from_type(eval_config_type: EvalConfigType) -> type[BaseEval]:
+     match eval_config_type:
+         case EvalConfigType.g_eval:
+             return GEval
+         case EvalConfigType.llm_as_judge:
+             # Also implemented by GEval
+             return GEval
+         case _:
+             # type checking will catch missing cases
+             raise_exhaustive_enum_error(eval_config_type)
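
Finally, a brief usage sketch for the factory above, mirroring how EvalRunner.run_job uses it; the wrapper function and its arguments are placeholders, not part of the package.

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.eval.registry import eval_adapter_from_type


async def score_existing_run(eval_config, run_config, task_run):
    # Look up the evaluator class for this config type (GEval handles both
    # g_eval and llm_as_judge), instantiate it, then score a saved task run.
    evaluator = eval_adapter_from_type(eval_config.config_type)(eval_config, run_config)
    if not isinstance(evaluator, BaseEval):
        raise ValueError("Not able to create evaluator from eval config")
    scores, intermediate_outputs = await evaluator.run_eval(task_run)
    return scores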