kiln-ai 0.11.1__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kiln-ai might be problematic.
- kiln_ai/adapters/__init__.py +4 -0
- kiln_ai/adapters/adapter_registry.py +163 -39
- kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
- kiln_ai/adapters/eval/__init__.py +28 -0
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +270 -0
- kiln_ai/adapters/eval/g_eval.py +368 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +325 -0
- kiln_ai/adapters/eval/test_eval_runner.py +641 -0
- kiln_ai/adapters/eval/test_g_eval.py +498 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +16 -2
- kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_together_finetune.py +531 -0
- kiln_ai/adapters/fine_tune/together_finetune.py +325 -0
- kiln_ai/adapters/ml_model_list.py +758 -163
- kiln_ai/adapters/model_adapters/__init__.py +2 -4
- kiln_ai/adapters/model_adapters/base_adapter.py +61 -43
- kiln_ai/adapters/model_adapters/litellm_adapter.py +391 -0
- kiln_ai/adapters/model_adapters/litellm_config.py +13 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -0
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
- kiln_ai/adapters/model_adapters/test_structured_output.py +59 -35
- kiln_ai/adapters/ollama_tools.py +3 -3
- kiln_ai/adapters/parsers/r1_parser.py +19 -14
- kiln_ai/adapters/parsers/test_r1_parser.py +17 -5
- kiln_ai/adapters/prompt_builders.py +80 -42
- kiln_ai/adapters/provider_tools.py +50 -58
- kiln_ai/adapters/repair/repair_task.py +9 -21
- kiln_ai/adapters/repair/test_repair_task.py +6 -6
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +26 -29
- kiln_ai/adapters/test_generate_docs.py +4 -4
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +47 -33
- kiln_ai/adapters/test_prompt_builders.py +91 -31
- kiln_ai/adapters/test_provider_tools.py +26 -81
- kiln_ai/datamodel/__init__.py +50 -952
- kiln_ai/datamodel/basemodel.py +2 -0
- kiln_ai/datamodel/datamodel_enums.py +60 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +7 -1
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +328 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +19 -11
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +32 -8
- kiln_ai/datamodel/test_datasource.py +22 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +9 -13
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_models.py +2 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +43 -1
- kiln_ai/utils/dataset_import.py +232 -0
- kiln_ai/utils/test_dataset_import.py +596 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/METADATA +86 -6
- kiln_ai-0.13.0.dist-info/RECORD +103 -0
- kiln_ai/adapters/model_adapters/langchain_adapters.py +0 -302
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -11
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +0 -246
- kiln_ai/adapters/model_adapters/test_langchain_adapter.py +0 -350
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +0 -225
- kiln_ai-0.11.1.dist-info/RECORD +0 -76
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/eval_runner.py
@@ -0,0 +1,270 @@
import asyncio
import logging
from dataclasses import dataclass
from typing import AsyncGenerator, Dict, List, Literal, Set

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.eval.registry import eval_adapter_from_type
from kiln_ai.datamodel.basemodel import ID_TYPE
from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
from kiln_ai.datamodel.task import TaskRunConfig
from kiln_ai.datamodel.task_run import TaskRun

logger = logging.getLogger(__name__)


@dataclass
class EvalJob:
    item: TaskRun
    type: Literal["task_run_eval", "eval_config_eval"]
    # If type == "task_run_eval", both of these should be set. If type == "eval_config_eval", only eval_config should be set.
    eval_config: EvalConfig
    task_run_config: TaskRunConfig | None = None


@dataclass
class EvalProgress:
    complete: int | None = None
    total: int | None = None
    errors: int | None = None


class EvalRunner:
    """
    Runs an eval. Async execution is supported to make it faster when using remote/fast model providers.

    Can run an eval in 2 modes:
    1) eval_config_eval: evaluate an eval config using existing dataset items.
    2) task_run_eval: evaluate a range of task run configs, generating new run output using existing dataset item input.
    """

    def __init__(
        self,
        eval_configs: List[EvalConfig],
        run_configs: List[TaskRunConfig] | None,
        eval_run_type: Literal["eval_config_eval", "task_run_eval"],
    ):
        if len(eval_configs) == 0:
            raise ValueError("Eval runner requires at least one eval config")
        target_eval = eval_configs[0].parent_eval()
        if target_eval is None:
            raise ValueError("Eval config requires a parent eval")
        for eval_config in eval_configs:
            parent_eval = eval_config.parent_eval()
            if parent_eval is None:
                raise ValueError("Eval config requires a parent eval")
            if parent_eval.id != target_eval.id:
                raise ValueError("All eval configs must have the same parent eval")

        target_task = target_eval.parent_task()
        if target_task is None:
            raise ValueError("Eval config requires a (grand)parent task")

        # Check that run_configs is compatible
        if eval_run_type == "task_run_eval":
            if run_configs is None or len(run_configs) == 0:
                raise ValueError("Task run eval requires run configs")
            for run_config in run_configs:
                parent_task = run_config.parent_task()
                if parent_task is None:
                    raise ValueError("All run configs must have a parent task")
                if parent_task.id != target_task.id:
                    raise ValueError(
                        "Run config is not for the same task as the eval configs"
                    )
        else:
            if run_configs is not None:
                raise ValueError("Mode 'eval_config_eval' does not support run configs")

        self.eval_run_type = eval_run_type
        self.eval_configs = eval_configs
        self.run_configs = run_configs
        self.task = target_task
        self.eval = target_eval

    def collect_tasks(self) -> List[EvalJob]:
        if self.eval_run_type == "eval_config_eval":
            return self.collect_tasks_for_eval_config_eval()
        else:
            return self.collect_tasks_for_task_run_eval()

    def collect_tasks_for_eval_config_eval(self) -> List[EvalJob]:
        """
        Collect all jobs for this run, excluding any that have already been run.

        This variant is used for mode "eval_config_eval", using existing dataset run data (input/output).

        The tasks:
        - should be in the eval config set filter
        - should not have already been run for this eval config + dataset item pair
        """
        filter = dataset_filter_from_id(self.eval.eval_configs_filter_id)

        # already_run[eval_config_id][dataset_id]
        already_run: Dict[ID_TYPE, Set[ID_TYPE]] = {}
        for eval_config in self.eval_configs:
            already_run[eval_config.id] = set()
            for run in eval_config.runs(readonly=True):
                already_run[eval_config.id].add(run.dataset_id)

        return [
            EvalJob(
                item=task_run,
                eval_config=eval_config,
                type="eval_config_eval",
            )
            for task_run in self.task.runs(readonly=True)
            if filter(task_run)
            for eval_config in self.eval_configs
            if task_run.id not in already_run[eval_config.id]
        ]

    def collect_tasks_for_task_run_eval(self) -> List[EvalJob]:
        """
        Collect all jobs for this run, excluding any that have already been run.

        This variant is used for mode "task_run_eval", generating new run output using existing dataset item input.

        The tasks:
        - should be in the eval set filter
        - should not have already been run for this eval config + run config + dataset item
        """
        filter = dataset_filter_from_id(self.eval.eval_set_filter_id)

        # already_run[eval_config_id][run_config_id][dataset_id]
        already_run: Dict[ID_TYPE, Dict[ID_TYPE, Set[ID_TYPE]]] = {}
        for eval_config in self.eval_configs:
            already_run[eval_config.id] = {}
            for run_config in self.run_configs or []:
                already_run[eval_config.id][run_config.id] = set()
            for run in eval_config.runs(readonly=True):
                if (
                    run.task_run_config_id is not None
                    and run.task_run_config_id in already_run[eval_config.id]
                ):
                    already_run[eval_config.id][run.task_run_config_id].add(
                        run.dataset_id
                    )

        return [
            EvalJob(
                item=task_run,
                task_run_config=run_config,
                type="task_run_eval",
                eval_config=eval_config,
            )
            for task_run in self.task.runs(readonly=True)
            if filter(task_run)
            for eval_config in self.eval_configs
            for run_config in self.run_configs or []
            if task_run.id not in already_run[eval_config.id][run_config.id]
        ]

    async def run(self, concurrency: int = 25) -> AsyncGenerator[EvalProgress, None]:
        """
        Runs the configured eval run with parallel workers and yields progress updates.
        """
        jobs = self.collect_tasks()

        complete = 0
        errors = 0
        total = len(jobs)

        # Send initial status
        yield EvalProgress(complete=complete, total=total, errors=errors)

        worker_queue: asyncio.Queue[EvalJob] = asyncio.Queue()
        for job in jobs:
            worker_queue.put_nowait(job)

        # simple status queue to return progress. True=success, False=error
        status_queue: asyncio.Queue[bool] = asyncio.Queue()

        workers = []
        for i in range(concurrency):
            task = asyncio.create_task(self.run_worker(worker_queue, status_queue))
            workers.append(task)

        # Send status updates until workers are done, and they are all sent
        while not status_queue.empty() or not all(worker.done() for worker in workers):
            try:
                # Use timeout to prevent hanging if all workers complete
                # between our while condition check and get()
                success = await asyncio.wait_for(status_queue.get(), timeout=0.1)
                if success:
                    complete += 1
                else:
                    errors += 1

                yield EvalProgress(complete=complete, total=total, errors=errors)
            except asyncio.TimeoutError:
                # Timeout is expected, just continue to recheck worker status
                # Don't love this but beats sentinels for reliability
                continue

        # These are redundant, but keeping them will catch async errors
        await asyncio.gather(*workers)
        await worker_queue.join()

    async def run_worker(
        self, worker_queue: asyncio.Queue[EvalJob], status_queue: asyncio.Queue[bool]
    ):
        while True:
            try:
                job = worker_queue.get_nowait()
            except asyncio.QueueEmpty:
                # worker can end when the queue is empty
                break
            try:
                success = await self.run_job(job)
                await status_queue.put(success)
            finally:
                # Always mark the dequeued task as done, even on exceptions
                worker_queue.task_done()

    async def run_job(self, job: EvalJob) -> bool:
        try:
            # Create the evaluator for this eval config/run config pair
            evaluator = eval_adapter_from_type(job.eval_config.config_type)(
                job.eval_config,
                job.task_run_config.run_config() if job.task_run_config else None,
            )
            if not isinstance(evaluator, BaseEval):
                raise ValueError("Not able to create evaluator from eval config")

            task_output: str | None = None
            scores: EvalScores | None = None
            intermediate_outputs: Dict[str, str] | None = None
            if job.type == "eval_config_eval":
                # Eval config eval, we use the saved input from the task run, not invoking the task again
                scores, intermediate_outputs = await evaluator.run_eval(job.item)
                task_output = job.item.output.output
            else:
                # Task run eval, we invoke the task again to get a fresh output
                (
                    result_task_run,
                    scores,
                    intermediate_outputs,
                ) = await evaluator.run_task_and_eval(job.item.input)
                task_output = result_task_run.output.output

            # Save the job result
            eval_run = EvalRun(
                parent=job.eval_config,
                task_run_config_id=job.task_run_config.id
                if job.task_run_config
                else None,
                dataset_id=job.item.id,
                eval_config_eval=job.type == "eval_config_eval",
                scores=scores,
                input=job.item.input,
                output=task_output,
                intermediate_outputs=intermediate_outputs,
            )
            eval_run.save_to_file()

            return True
        except Exception as e:
            logger.error(f"Error running eval job for dataset item {job.item.id}: {e}")
            return False
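
The runner above exposes run() as an async generator so callers can stream progress while jobs execute concurrently. A minimal consumption sketch, assuming the eval configs and run configs have already been loaded from a Kiln project (the function and variable names below are illustrative, not part of the package):

import asyncio

from kiln_ai.adapters.eval.eval_runner import EvalRunner


async def run_task_run_eval(eval_configs, run_configs):
    # "task_run_eval" re-runs each matching dataset item against every run config,
    # then scores the fresh output with every eval config.
    runner = EvalRunner(
        eval_configs=eval_configs,
        run_configs=run_configs,
        eval_run_type="task_run_eval",
    )
    async for progress in runner.run(concurrency=10):
        print(f"{progress.complete}/{progress.total} done, {progress.errors} errors")


# asyncio.run(run_task_run_eval(my_eval_configs, my_run_configs))  # hypothetical inputs
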
kiln_ai/adapters/eval/g_eval.py
@@ -0,0 +1,368 @@
import math
from typing import Dict, List, Tuple

from litellm.types.utils import ChatCompletionTokenLogprob

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
from kiln_ai.adapters.prompt_builders import PromptGenerators
from kiln_ai.datamodel import Project, Task, TaskRun
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
from kiln_ai.datamodel.task import RunConfig

# all the tokens we score for, and their float scores.
TOKEN_TO_SCORE_MAP: Dict[str, float] = {
    "1": 1.0,
    "2": 2.0,
    "3": 3.0,
    "4": 4.0,
    "5": 5.0,
    "pass": 1.0,
    "fail": 0.0,
    "critical": -1.0,
}


class GEvalTask(Task, parent_of={}):
    """
    Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.

    Note G-Eval implements both G-Eval and LLM as Judge as they are very similar.
    """

    def __init__(self, eval_config: EvalConfig):
        tmp_project = Project(name="GEval")

        # Build a simple LLM as Judge system instruction
        system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
        # Optionally add a short task description
        task_description = eval_config.properties.get("task_description", None)
        if task_description:
            system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"

        # Build the COT eval instructions
        cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
        steps = eval_config.properties.get("eval_steps", None)
        if not steps or not isinstance(steps, list):
            raise ValueError("eval_steps must be a list")
        for i, step in enumerate(steps):
            cot_instructions += f"{i + 1}) {step}\n"

        eval = eval_config.parent_eval()
        if not eval:
            raise ValueError("Eval config must have a parent eval")

        # Build the output schema from the eval's target output scores.
        # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False
        # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs)
        output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)

        super().__init__(
            name="GEval Task",
            parent=tmp_project,
            instruction=system_instruction,
            thinking_instruction=cot_instructions,
            output_json_schema=output_schema,
        )


class GEval(BaseEval):
    """
    A evaluator which implements G-Eval and LLM as Judge.

    G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634

    LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.

    @misc{liu2023gevalnlgevaluationusing,
        title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
        author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
        year={2023},
        eprint={2303.16634},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2303.16634},
    }
    """

    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
        if (
            eval_config.config_type != EvalConfigType.g_eval
            and eval_config.config_type != EvalConfigType.llm_as_judge
        ):
            raise ValueError(
                f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
            )

        super().__init__(eval_config, run_config)

        self.geval_task = GEvalTask(eval_config)

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        """
        Run this eval on the given task run.
        """

        model_name, provider = self.model_and_provider()

        # Only fetch logprobs for G-Eval
        # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely
        top_logprobs = (
            10 if self.eval_config.config_type == EvalConfigType.g_eval else None
        )

        adapter = adapter_for_task(
            self.geval_task,
            model_name,
            provider,
            # We always use Simple COT for G-Eval and LLM as Judge
            prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
            base_adapter_config=AdapterConfig(
                # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                allow_saving=False,
                top_logprobs=top_logprobs,
            ),
        )

        input = f"""The model was given the following input for the task:
<eval_data>
{task_run.input}
</eval_data>

The model produced the following output for the task:
<eval_data>
{task_run.output}
</eval_data>
"""

        # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
        _, run_output = await adapter.invoke_returning_run_output(input)

        if self.eval_config.config_type == EvalConfigType.llm_as_judge:
            return self.build_llm_as_judge_score(
                run_output
            ), run_output.intermediate_outputs
        else:
            return self.build_g_eval_score(run_output), run_output.intermediate_outputs

    def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the LLM as Judge score for the given run and run output.
        """
        # Convert the output format we asked for (discreet values) to our float scores
        scores: EvalScores = {}
        if not isinstance(run_output.output, dict):
            raise ValueError("LLM as Judge output must be a dictionary")

        for metric, score in run_output.output.items():
            token_score = self.score_from_token_string(f"{score}")
            if token_score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            scores[metric] = token_score
        return scores

    def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
        """
        Build the G-Eval score for the given run and run output.

        We create a weighted average of each rating using the logprobs.

        @misc{liu2023gevalnlgevaluationusing,
            title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
            author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
            year={2023},
            eprint={2303.16634},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2303.16634},
        }
        """
        # We use structured output
        outputs = run_output.output
        assert isinstance(outputs, dict)

        # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit
        raw_output = self.raw_output_from_logprobs(run_output)

        # find the offset the start of each metric in the raw output json
        metrics: List[str] = list(outputs.keys())
        metric_offsets = self.metric_offsets(raw_output, metrics)

        final_scores: EvalScores = {}
        for metric in metrics:
            score = self.g_eval_single_metric(
                run_output, metric, metric_offsets, raw_output
            )
            if score is None:
                raise ValueError(
                    f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
                )
            final_scores[metric] = score

        return final_scores

    def g_eval_single_metric(
        self,
        run_output: RunOutput,
        metric: str,
        metric_offsets: Dict[str, int],
        raw_output: str,
    ) -> float | None:
        """
        Run the G-Eval for a single metric.

        Scan the logprobs for the metric and return the weighted score of the rating token.
        """

        start_offset, end_offset = self.token_search_range(
            raw_output, metric, metric_offsets
        )

        offset = 0

        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        # scan the tokens in the range, looking for the rating token
        for _, chat_logprob in enumerate(run_output.output_logprobs.content):
            if offset >= end_offset:
                break
            if offset >= start_offset:
                score = self.rating_token_to_score(chat_logprob)
                if score is not None:
                    return score
            offset += len(chat_logprob.token)

        return None

    def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
        """
        Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets
        """
        if (
            run_output.output_logprobs is None
            or run_output.output_logprobs.content is None
        ):
            raise RuntimeError(
                "No logprobs found for output - can not calculate g-eval"
            )

        raw = ""
        for chat_logprob in run_output.output_logprobs.content:
            raw += chat_logprob.token
        return raw

    def token_search_range(
        self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
    ) -> Tuple[int, int]:
        """
        Find the start and end offsets of the metric in the raw output.

        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
        """
        start_offset = metric_offsets[metric] + len(metric)

        # Find the lowest end offset that is greater than the start offset
        end_offset = len(raw_output)
        for v in list(metric_offsets.values()):
            if v < end_offset and v > start_offset:
                end_offset = v

        return start_offset, end_offset

    def rating_token_to_score(
        self, token_logprob: ChatCompletionTokenLogprob
    ) -> float | None:
        """
        Convert a rating token to a score using weighted average of top logprobs.

        Only includes tokens that have valid scores.

        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
        """
        primary_token_score = self.score_from_token_string(token_logprob.token)
        # check this is a real rating token, it could just be the ": ", "," or whitespace
        if not primary_token_score:
            return None

        total_score = 0.0
        total_probability = 0.0

        # Process all valid scoring tokens
        for top_logprob in token_logprob.top_logprobs:
            token_score = self.score_from_token_string(top_logprob.token)
            if token_score is not None:
                # Convert logprob to probability
                probability = math.exp(top_logprob.logprob)
                total_score += token_score * probability
                total_probability += probability

        if total_probability <= 0.0:
            raise RuntimeError(
                f"No valid scoring tokens found for {token_logprob.token}. This should never happen. Please file a bug if you see this."
            )

        # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
        weighted_score = total_score / total_probability

        return weighted_score

    def score_from_token_string(self, token: str) -> float | None:
        if token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[token]

        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
        unquoted_token = token.strip().strip('"').lower()
        if unquoted_token in TOKEN_TO_SCORE_MAP:
            return TOKEN_TO_SCORE_MAP[unquoted_token]

        # handle numeric tokens like "1.0"
        try:
            float_value = float(token)
            if float_value.is_integer():
                str_token = str(int(float_value))
                if str_token in TOKEN_TO_SCORE_MAP:
                    return TOKEN_TO_SCORE_MAP[str_token]
        except ValueError:
            pass

        return None

    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
        """
        Find the offset to the start of each metric in the raw output json

        For the example json: `{"overall_rating": 1}` == 1

        should return:
        {
            "overall_rating": 1 # it's 1 character into the json string
        }
        """
        metric_offsets: Dict[str, int] = {}
        for metric in metrics:
            # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1
            metric_name = f'"{metric}"'

            # we expect it exactly once
            count = raw_output.count(metric_name)
            if count != 1:
                raise ValueError(
                    f"Metric {metric} should appear exactly once in the output. Found {count} times"
                )

            offset = raw_output.find(metric_name)
            if offset == -1:
                raise ValueError(f"Metric {metric} not found in raw output")
            metric_offsets[metric] = offset
        return metric_offsets
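
rating_token_to_score above is where the G-Eval weighting happens: each candidate rating token in top_logprobs contributes its mapped score weighted by exp(logprob), normalized over rating tokens only. A self-contained sketch of that arithmetic with made-up logprob values (the real method reads litellm ChatCompletionTokenLogprob objects):

import math

# Hypothetical top_logprobs at the position where the judge emitted "4".
top_logprobs = [("4", -0.2), ("5", -1.8), (",", -4.0)]
token_to_score = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0}

total_score = 0.0
total_probability = 0.0
for token, logprob in top_logprobs:
    score = token_to_score.get(token.strip().strip('"').lower())
    if score is None:
        continue  # non-rating tokens (punctuation, whitespace) are ignored
    probability = math.exp(logprob)  # convert the logprob back to a probability
    total_score += score * probability
    total_probability += probability

# Normalize over the rating tokens only, as the method above does.
print(total_score / total_probability)  # ~4.17: mostly "4", nudged upward by "5"
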
kiln_ai/adapters/eval/registry.py
@@ -0,0 +1,16 @@
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.eval.g_eval import GEval
from kiln_ai.datamodel.eval import EvalConfigType
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error


def eval_adapter_from_type(eval_config_type: EvalConfigType) -> type[BaseEval]:
    match eval_config_type:
        case EvalConfigType.g_eval:
            return GEval
        case EvalConfigType.llm_as_judge:
            # Also implemented by GEval
            return GEval
        case _:
            # type checking will catch missing cases
            raise_exhaustive_enum_error(eval_config_type)
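
A small usage sketch of the factory above: both config types currently resolve to GEval, and EvalRunner.run_job instantiates the returned class with an EvalConfig and an optional RunConfig (the eval_config and run_config variables below are placeholders):

from kiln_ai.adapters.eval.registry import eval_adapter_from_type
from kiln_ai.datamodel.eval import EvalConfigType

adapter_cls = eval_adapter_from_type(EvalConfigType.llm_as_judge)
print(adapter_cls.__name__)  # GEval
# evaluator = adapter_cls(eval_config, run_config)  # as done in EvalRunner.run_job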