kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kiln-ai might be problematic.

Files changed (88)
  1. kiln_ai/adapters/__init__.py +7 -7
  2. kiln_ai/adapters/adapter_registry.py +81 -10
  3. kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +267 -0
  7. kiln_ai/adapters/eval/g_eval.py +367 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
  16. kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
  21. kiln_ai/adapters/ml_model_list.py +434 -93
  22. kiln_ai/adapters/model_adapters/__init__.py +18 -0
  23. kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
  24. kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
  25. kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
  26. kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
  27. kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
  28. kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
  29. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
  30. kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
  31. kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
  32. kiln_ai/adapters/ollama_tools.py +0 -1
  33. kiln_ai/adapters/parsers/__init__.py +10 -0
  34. kiln_ai/adapters/parsers/base_parser.py +12 -0
  35. kiln_ai/adapters/parsers/json_parser.py +37 -0
  36. kiln_ai/adapters/parsers/parser_registry.py +19 -0
  37. kiln_ai/adapters/parsers/r1_parser.py +69 -0
  38. kiln_ai/adapters/parsers/test_json_parser.py +81 -0
  39. kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
  40. kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
  41. kiln_ai/adapters/prompt_builders.py +193 -49
  42. kiln_ai/adapters/provider_tools.py +91 -36
  43. kiln_ai/adapters/repair/repair_task.py +18 -19
  44. kiln_ai/adapters/repair/test_repair_task.py +7 -7
  45. kiln_ai/adapters/run_output.py +11 -0
  46. kiln_ai/adapters/test_adapter_registry.py +177 -0
  47. kiln_ai/adapters/test_generate_docs.py +69 -0
  48. kiln_ai/adapters/test_ollama_tools.py +0 -1
  49. kiln_ai/adapters/test_prompt_adaptors.py +25 -18
  50. kiln_ai/adapters/test_prompt_builders.py +265 -44
  51. kiln_ai/adapters/test_provider_tools.py +268 -46
  52. kiln_ai/datamodel/__init__.py +51 -772
  53. kiln_ai/datamodel/basemodel.py +31 -11
  54. kiln_ai/datamodel/datamodel_enums.py +58 -0
  55. kiln_ai/datamodel/dataset_filters.py +114 -0
  56. kiln_ai/datamodel/dataset_split.py +170 -0
  57. kiln_ai/datamodel/eval.py +298 -0
  58. kiln_ai/datamodel/finetune.py +105 -0
  59. kiln_ai/datamodel/json_schema.py +14 -3
  60. kiln_ai/datamodel/model_cache.py +8 -3
  61. kiln_ai/datamodel/project.py +23 -0
  62. kiln_ai/datamodel/prompt.py +37 -0
  63. kiln_ai/datamodel/prompt_id.py +83 -0
  64. kiln_ai/datamodel/strict_mode.py +24 -0
  65. kiln_ai/datamodel/task.py +181 -0
  66. kiln_ai/datamodel/task_output.py +321 -0
  67. kiln_ai/datamodel/task_run.py +164 -0
  68. kiln_ai/datamodel/test_basemodel.py +80 -2
  69. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  70. kiln_ai/datamodel/test_dataset_split.py +127 -6
  71. kiln_ai/datamodel/test_datasource.py +3 -2
  72. kiln_ai/datamodel/test_eval_model.py +635 -0
  73. kiln_ai/datamodel/test_example_models.py +34 -17
  74. kiln_ai/datamodel/test_json_schema.py +23 -0
  75. kiln_ai/datamodel/test_model_cache.py +24 -0
  76. kiln_ai/datamodel/test_model_perf.py +125 -0
  77. kiln_ai/datamodel/test_models.py +131 -2
  78. kiln_ai/datamodel/test_prompt_id.py +129 -0
  79. kiln_ai/datamodel/test_task.py +159 -0
  80. kiln_ai/utils/config.py +6 -1
  81. kiln_ai/utils/exhaustive_error.py +6 -0
  82. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
  83. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  84. kiln_ai/adapters/base_adapter.py +0 -191
  85. kiln_ai/adapters/langchain_adapters.py +0 -256
  86. kiln_ai-0.8.1.dist-info/RECORD +0 -58
  87. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  88. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/g_eval.py
@@ -0,0 +1,367 @@
+ import math
+ from typing import Dict, List, Tuple
+
+ from kiln_ai.adapters.adapter_registry import adapter_for_task
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
+ from kiln_ai.adapters.prompt_builders import PromptGenerators
+ from kiln_ai.datamodel import Project, Task, TaskRun
+ from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
+ from kiln_ai.datamodel.task import RunConfig
+ from openai.types.chat import ChatCompletionTokenLogprob
+
+ # all the tokens we score for, and their float scores.
+ TOKEN_TO_SCORE_MAP: Dict[str, float] = {
+     "1": 1.0,
+     "2": 2.0,
+     "3": 3.0,
+     "4": 4.0,
+     "5": 5.0,
+     "pass": 1.0,
+     "fail": 0.0,
+     "critical": -1.0,
+ }
+
+
+ class GEvalTask(Task, parent_of={}):
+     """
+     Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs.
+
+     Note: G-Eval implements both G-Eval and LLM as Judge, as they are very similar.
+     """
+
+     def __init__(self, eval_config: EvalConfig):
+         tmp_project = Project(name="GEval")
+
+         # Build a simple LLM as Judge system instruction
+         system_instruction = "Your job is to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
+         # Optionally add a short task description
+         task_description = eval_config.properties.get("task_description", None)
+         if task_description:
+             system_instruction += f"\nThe task the model was given is as follows:\n<eval_data>\n{task_description}\n</eval_data>\n"
+
+         # Build the COT eval instructions
+         cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
+         steps = eval_config.properties.get("eval_steps", None)
+         if not steps or not isinstance(steps, list):
+             raise ValueError("eval_steps must be a list")
+         for i, step in enumerate(steps):
+             cot_instructions += f"{i + 1}) {step}\n"
+
+         eval = eval_config.parent_eval()
+         if not eval:
+             raise ValueError("Eval config must have a parent eval")
+
+         # Build the output schema from the eval's target output scores.
+         # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False.
+         # However, the final scores from the evaluator can be floats (see the logprob calculation later, which requires discrete token outputs).
+         output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False)
+
+         super().__init__(
+             name="GEval Task",
+             parent=tmp_project,
+             instruction=system_instruction,
+             thinking_instruction=cot_instructions,
+             output_json_schema=output_schema,
+         )
+
+
+ class GEval(BaseEval):
+     """
+     An evaluator which implements G-Eval and LLM as Judge.
+
+     G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634
+
+     LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation.
+
+     @misc{liu2023gevalnlgevaluationusing,
+         title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
+         author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
+         year={2023},
+         eprint={2303.16634},
+         archivePrefix={arXiv},
+         primaryClass={cs.CL},
+         url={https://arxiv.org/abs/2303.16634},
+     }
+     """
+
+     def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
+         if (
+             eval_config.config_type != EvalConfigType.g_eval
+             and eval_config.config_type != EvalConfigType.llm_as_judge
+         ):
+             raise ValueError(
+                 f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}"
+             )
+
+         super().__init__(eval_config, run_config)
+
+         self.geval_task = GEvalTask(eval_config)
+
+     async def run_eval(
+         self, task_run: TaskRun
+     ) -> tuple[EvalScores, Dict[str, str] | None]:
+         """
+         Run this eval on the given task run.
+         """
+
+         model_name, provider = self.model_and_provider()
+
+         # Only fetch logprobs for G-Eval.
+         # There are at most 5 valid rating tokens per rating type (five_star being the largest), so 10 is more than enough to reach even the very unlikely ones.
+         top_logprobs = (
+             10 if self.eval_config.config_type == EvalConfigType.g_eval else None
+         )
+
+         adapter = adapter_for_task(
+             self.geval_task,
+             model_name,
+             provider,
+             # We always use Simple COT for G-Eval and LLM as Judge
+             prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+             base_adapter_config=AdapterConfig(
+                 # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs.
+                 allow_saving=False,
+                 top_logprobs=top_logprobs,
+             ),
+         )
+
+         input = f"""The model was given the following input for the task:
+ <eval_data>
+ {task_run.input}
+ </eval_data>
+
+ The model produced the following output for the task:
+ <eval_data>
+ {task_run.output}
+ </eval_data>
+ """
+
+         # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
+         _, run_output = await adapter.invoke_returning_run_output(input)
+
+         if self.eval_config.config_type == EvalConfigType.llm_as_judge:
+             return self.build_llm_as_judge_score(
+                 run_output
+             ), run_output.intermediate_outputs
+         else:
+             return self.build_g_eval_score(run_output), run_output.intermediate_outputs
+
+     def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
+         """
+         Build the LLM as Judge score for the given run and run output.
+         """
+         # Convert the output format we asked for (discrete values) to our float scores
+         scores: EvalScores = {}
+         if not isinstance(run_output.output, dict):
+             raise ValueError("LLM as Judge output must be a dictionary")
+
+         for metric, score in run_output.output.items():
+             token_score = self.score_from_token_string(f"{score}")
+             if token_score is None:
+                 raise ValueError(
+                     f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
+                 )
+             scores[metric] = token_score
+         return scores
+
+     def build_g_eval_score(self, run_output: RunOutput) -> EvalScores:
+         """
+         Build the G-Eval score for the given run and run output.
+
+         We create a weighted average of each rating using the logprobs.
+
+         @misc{liu2023gevalnlgevaluationusing,
+             title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
+             author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu},
+             year={2023},
+             eprint={2303.16634},
+             archivePrefix={arXiv},
+             primaryClass={cs.CL},
+             url={https://arxiv.org/abs/2303.16634},
+         }
+         """
+         # We use structured output
+         outputs = run_output.output
+         assert isinstance(outputs, dict)
+
+         # Build a raw string output from the logprobs, which is easier to work with than a Dict for the next bit
+         raw_output = self.raw_output_from_logprobs(run_output)
+
+         # Find the offset of the start of each metric in the raw output json
+         metrics: List[str] = list(outputs.keys())
+         metric_offsets = self.metric_offsets(raw_output, metrics)
+
+         final_scores: EvalScores = {}
+         for metric in metrics:
+             score = self.g_eval_single_metric(
+                 run_output, metric, metric_offsets, raw_output
+             )
+             if score is None:
+                 raise ValueError(
+                     f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema."
+                 )
+             final_scores[metric] = score
+
+         return final_scores
+
+     def g_eval_single_metric(
+         self,
+         run_output: RunOutput,
+         metric: str,
+         metric_offsets: Dict[str, int],
+         raw_output: str,
+     ) -> float | None:
+         """
+         Run the G-Eval for a single metric.
+
+         Scan the logprobs for the metric and return the weighted score of the rating token.
+         """
+
+         start_offset, end_offset = self.token_search_range(
+             raw_output, metric, metric_offsets
+         )
+
+         offset = 0
+
+         if (
+             run_output.output_logprobs is None
+             or run_output.output_logprobs.content is None
+         ):
+             raise RuntimeError(
+                 "No logprobs found for output - can not calculate g-eval"
+             )
+
+         # scan the tokens in the range, looking for the rating token
+         for _, chat_logprob in enumerate(run_output.output_logprobs.content):
+             if offset >= end_offset:
+                 break
+             if offset >= start_offset:
+                 score = self.rating_token_to_score(chat_logprob)
+                 if score is not None:
+                     return score
+             offset += len(chat_logprob.token)
+
+         return None
+
+     def raw_output_from_logprobs(self, run_output: RunOutput) -> str:
+         """
+         Build the raw output string from the logprobs. Generating it from the logprobs guarantees it matches the logprob offsets.
+         """
+         if (
+             run_output.output_logprobs is None
+             or run_output.output_logprobs.content is None
+         ):
+             raise RuntimeError(
+                 "No logprobs found for output - can not calculate g-eval"
+             )
+
+         raw = ""
+         for chat_logprob in run_output.output_logprobs.content:
+             raw += chat_logprob.token
+         return raw
+
+     def token_search_range(
+         self, raw_output: str, metric: str, metric_offsets: Dict[str, int]
+     ) -> Tuple[int, int]:
+         """
+         Find the start and end offsets of the metric in the raw output.
+
+         Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
+         """
+         start_offset = metric_offsets[metric] + len(metric)
+
+         # Find the lowest end offset that is greater than the start offset
+         end_offset = len(raw_output)
+         for v in list(metric_offsets.values()):
+             if v < end_offset and v > start_offset:
+                 end_offset = v
+
+         return start_offset, end_offset
+
+     def rating_token_to_score(
+         self, token_logprob: ChatCompletionTokenLogprob
+     ) -> float | None:
+         """
+         Convert a rating token to a score using weighted average of top logprobs.
+
+         Only includes tokens that have valid scores.
+
+         Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
+         """
+         primary_token_score = self.score_from_token_string(token_logprob.token)
+         # check this is a real rating token, it could just be the ": ", "," or whitespace
+         if not primary_token_score:
+             return None
+
+         total_score = 0.0
+         total_probability = 0.0
+
+         # Process all valid scoring tokens
+         for top_logprob in token_logprob.top_logprobs:
+             token_score = self.score_from_token_string(top_logprob.token)
+             if token_score is not None:
+                 # Convert logprob to probability
+                 probability = math.exp(top_logprob.logprob)
+                 total_score += token_score * probability
+                 total_probability += probability
+
+         if total_probability <= 0.0:
+             raise RuntimeError(
+                 f"No valid scoring tokens found for {token_logprob.token}. This should never happen. Please file a bug if you see this."
+             )
+
+         # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens)
+         weighted_score = total_score / total_probability
+
+         return weighted_score
+
+     def score_from_token_string(self, token: str) -> float | None:
+         if token in TOKEN_TO_SCORE_MAP:
+             return TOKEN_TO_SCORE_MAP[token]
+
+         # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
+         unquoted_token = token.strip().strip('"').lower()
+         if unquoted_token in TOKEN_TO_SCORE_MAP:
+             return TOKEN_TO_SCORE_MAP[unquoted_token]
+
+         # handle numeric tokens like "1.0"
+         try:
+             float_value = float(token)
+             if float_value.is_integer():
+                 str_token = str(int(float_value))
+                 if str_token in TOKEN_TO_SCORE_MAP:
+                     return TOKEN_TO_SCORE_MAP[str_token]
+         except ValueError:
+             pass
+
+         return None
+
+     def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
+         """
+         Find the offset to the start of each metric in the raw output json
+
+         For the example json: `{"overall_rating": 1}` == 1
+
+         should return:
+         {
+             "overall_rating": 1  # it's 1 character into the json string
+         }
+         """
+         metric_offsets: Dict[str, int] = {}
+         for metric in metrics:
+             # the quoted metric name is expected in the json: `{"overall_rating": 1}`
+             metric_name = f'"{metric}"'
+
+             # we expect it exactly once
+             count = raw_output.count(metric_name)
+             if count != 1:
+                 raise ValueError(
+                     f"Metric {metric} should appear exactly once in the output. Found {count} times"
+                 )
+
+             offset = raw_output.find(metric_name)
+             if offset == -1:
+                 raise ValueError(f"Metric {metric} not found in raw output")
+             metric_offsets[metric] = offset
+         return metric_offsets
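
For readers skimming the diff: the core of the new G-Eval scoring above is a probability-weighted average over the rating token's top logprob candidates, normalized by the probability mass of valid rating tokens only. A minimal, self-contained sketch of that calculation — the logprob values below are invented for illustration; only math and the TOKEN_TO_SCORE_MAP values mirror the code in the hunk:

import math

# Same token-to-score mapping as the new g_eval.py above.
TOKEN_TO_SCORE_MAP = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
                      "pass": 1.0, "fail": 0.0, "critical": -1.0}

# Hypothetical top_logprobs for the rating-token position, as (token, logprob)
# pairs; in the package these come from the adapter's logprob output.
top_logprobs = [("4", math.log(0.6)), ("5", math.log(0.3)), (",", math.log(0.1))]

total_score = 0.0
total_probability = 0.0
for token, logprob in top_logprobs:
    score = TOKEN_TO_SCORE_MAP.get(token.strip().strip('"').lower())
    if score is not None:  # ignore non-rating tokens like ","
        probability = math.exp(logprob)  # convert logprob to probability
        total_score += score * probability
        total_probability += probability

# Normalize by the probability mass of valid rating tokens only:
# (4*0.6 + 5*0.3) / (0.6 + 0.3) = 3.9 / 0.9 ≈ 4.33
weighted_score = total_score / total_probability
print(round(weighted_score, 2))  # 4.33

The LLM-as-Judge path skips the logprob weighting and maps the model's returned value straight through score_from_token_string.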
kiln_ai/adapters/eval/registry.py
@@ -0,0 +1,16 @@
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.eval.g_eval import GEval
+ from kiln_ai.datamodel.eval import EvalConfigType
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+
+ def eval_adapter_from_type(eval_config_type: EvalConfigType) -> type[BaseEval]:
+     match eval_config_type:
+         case EvalConfigType.g_eval:
+             return GEval
+         case EvalConfigType.llm_as_judge:
+             # Also implemented by GEval
+             return GEval
+         case _:
+             # type checking will catch missing cases
+             raise_exhaustive_enum_error(eval_config_type)
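
A hedged usage sketch for the new registry: eval_adapter_from_type returns the evaluator class for a config type, and both current enum values resolve to GEval. The eval_config and task_run objects referenced in the comments are assumed to already exist and are not constructed here:

from kiln_ai.adapters.eval.g_eval import GEval
from kiln_ai.adapters.eval.registry import eval_adapter_from_type
from kiln_ai.datamodel.eval import EvalConfigType

# Both config types currently resolve to the GEval evaluator class.
assert eval_adapter_from_type(EvalConfigType.g_eval) is GEval
assert eval_adapter_from_type(EvalConfigType.llm_as_judge) is GEval

# Instantiation and use (eval_config is an existing EvalConfig; run_config may be None):
# evaluator = eval_adapter_from_type(eval_config.config_type)(eval_config, None)
# scores, intermediate_outputs = await evaluator.run_eval(task_run)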