kiln-ai 0.11.1__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of kiln-ai might be problematic; see the registry listing for details.
- kiln_ai/adapters/__init__.py +4 -0
- kiln_ai/adapters/adapter_registry.py +163 -39
- kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
- kiln_ai/adapters/eval/__init__.py +28 -0
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +270 -0
- kiln_ai/adapters/eval/g_eval.py +368 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +325 -0
- kiln_ai/adapters/eval/test_eval_runner.py +641 -0
- kiln_ai/adapters/eval/test_g_eval.py +498 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +16 -2
- kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_together_finetune.py +531 -0
- kiln_ai/adapters/fine_tune/together_finetune.py +325 -0
- kiln_ai/adapters/ml_model_list.py +758 -163
- kiln_ai/adapters/model_adapters/__init__.py +2 -4
- kiln_ai/adapters/model_adapters/base_adapter.py +61 -43
- kiln_ai/adapters/model_adapters/litellm_adapter.py +391 -0
- kiln_ai/adapters/model_adapters/litellm_config.py +13 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -0
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
- kiln_ai/adapters/model_adapters/test_structured_output.py +59 -35
- kiln_ai/adapters/ollama_tools.py +3 -3
- kiln_ai/adapters/parsers/r1_parser.py +19 -14
- kiln_ai/adapters/parsers/test_r1_parser.py +17 -5
- kiln_ai/adapters/prompt_builders.py +80 -42
- kiln_ai/adapters/provider_tools.py +50 -58
- kiln_ai/adapters/repair/repair_task.py +9 -21
- kiln_ai/adapters/repair/test_repair_task.py +6 -6
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +26 -29
- kiln_ai/adapters/test_generate_docs.py +4 -4
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +47 -33
- kiln_ai/adapters/test_prompt_builders.py +91 -31
- kiln_ai/adapters/test_provider_tools.py +26 -81
- kiln_ai/datamodel/__init__.py +50 -952
- kiln_ai/datamodel/basemodel.py +2 -0
- kiln_ai/datamodel/datamodel_enums.py +60 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +7 -1
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +328 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +19 -11
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +32 -8
- kiln_ai/datamodel/test_datasource.py +22 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +9 -13
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_models.py +2 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +43 -1
- kiln_ai/utils/dataset_import.py +232 -0
- kiln_ai/utils/test_dataset_import.py +596 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/METADATA +86 -6
- kiln_ai-0.13.0.dist-info/RECORD +103 -0
- kiln_ai/adapters/model_adapters/langchain_adapters.py +0 -302
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -11
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +0 -246
- kiln_ai/adapters/model_adapters/test_langchain_adapter.py +0 -350
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +0 -225
- kiln_ai-0.11.1.dist-info/RECORD +0 -76
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/__init__.py
CHANGED
@@ -12,10 +12,13 @@ The prompt_builders submodule contains classes that build prompts for use with t
 The repair submodule contains an adapter for the repair task.
 
 The parser submodule contains parsers for the output of the AI models.
+
+The eval submodule contains the code for evaluating the performance of a model.
 """
 
 from . import (
     data_gen,
+    eval,
     fine_tune,
     ml_model_list,
     model_adapters,
@@ -30,4 +33,5 @@ __all__ = [
     "ml_model_list",
     "prompt_builders",
     "repair",
+    "eval",
 ]
kiln_ai/adapters/adapter_registry.py
CHANGED
@@ -2,14 +2,13 @@ from os import getenv
 
 from kiln_ai import datamodel
 from kiln_ai.adapters.ml_model_list import ModelProviderName
-from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter
-from kiln_ai.adapters.model_adapters.
-
-
-    OpenAICompatibleConfig,
+from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, BaseAdapter
+from kiln_ai.adapters.model_adapters.litellm_adapter import (
+    LiteLlmAdapter,
+    LiteLlmConfig,
 )
-from kiln_ai.adapters.
-from kiln_ai.
+from kiln_ai.adapters.provider_tools import core_provider, lite_llm_config
+from kiln_ai.datamodel import PromptId
 from kiln_ai.utils.config import Config
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 
@@ -18,59 +17,193 @@ def adapter_for_task(
     kiln_task: datamodel.Task,
     model_name: str,
     provider: ModelProviderName,
-
-
+    prompt_id: PromptId | None = None,
+    base_adapter_config: AdapterConfig | None = None,
 ) -> BaseAdapter:
     # Get the provider to run. For things like the fine-tune provider, we want to run the underlying provider
     core_provider_name = core_provider(model_name, provider)
 
     match core_provider_name:
         case ModelProviderName.openrouter:
-            return
+            return LiteLlmAdapter(
                 kiln_task=kiln_task,
-                config=
+                config=LiteLlmConfig(
+                    model_name=model_name,
                     base_url=getenv("OPENROUTER_BASE_URL")
                     or "https://openrouter.ai/api/v1",
-                    api_key=Config.shared().open_router_api_key,
-                    model_name=model_name,
                     provider_name=provider,
-                    openrouter_style_reasoning=True,
                     default_headers={
                         "HTTP-Referer": "https://getkiln.ai/openrouter",
                         "X-Title": "KilnAI",
                     },
+                    additional_body_options={
+                        "api_key": Config.shared().open_router_api_key,
+                    },
                 ),
-
-
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
             )
         case ModelProviderName.openai:
-            return
+            return LiteLlmAdapter(
                 kiln_task=kiln_task,
-                config=
-                    api_key=Config.shared().open_ai_api_key,
+                config=LiteLlmConfig(
                     model_name=model_name,
                     provider_name=provider,
+                    additional_body_options={
+                        "api_key": Config.shared().open_ai_api_key,
+                    },
                 ),
-
-
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
            )
         case ModelProviderName.openai_compatible:
-            config =
-            return
+            config = lite_llm_config(model_name)
+            return LiteLlmAdapter(
                 kiln_task=kiln_task,
                 config=config,
-
-
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
             )
-        # Use LangchainAdapter for the rest
         case ModelProviderName.groq:
-
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "api_key": Config.shared().groq_api_key,
+                    },
+                ),
+            )
         case ModelProviderName.amazon_bedrock:
-
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "aws_access_key_id": Config.shared().bedrock_access_key,
+                        "aws_secret_access_key": Config.shared().bedrock_secret_key,
+                        # The only region that's widely supported for bedrock
+                        "aws_region_name": "us-west-2",
+                    },
+                ),
+            )
         case ModelProviderName.ollama:
-
+            ollama_base_url = (
+                Config.shared().ollama_base_url or "http://localhost:11434"
+            )
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    # Set the Ollama base URL for 2 reasons:
+                    # 1. To use the correct base URL
+                    # 2. We use Ollama's OpenAI compatible API (/v1), and don't just let litellm use the Ollama API. We use more advanced features like json_schema.
+                    base_url=ollama_base_url + "/v1",
+                ),
+            )
         case ModelProviderName.fireworks_ai:
-
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "api_key": Config.shared().fireworks_api_key,
+                    },
+                ),
+            )
+        case ModelProviderName.anthropic:
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "api_key": Config.shared().anthropic_api_key,
+                    },
+                ),
+            )
+        case ModelProviderName.gemini_api:
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "api_key": Config.shared().gemini_api_key,
+                    },
+                ),
+            )
+        case ModelProviderName.vertex:
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "vertex_project": Config.shared().vertex_project_id,
+                        "vertex_location": Config.shared().vertex_location,
+                    },
+                ),
+            )
+        case ModelProviderName.together_ai:
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "api_key": Config.shared().together_api_key,
+                    },
+                ),
+            )
+        case ModelProviderName.azure_openai:
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    base_url=Config.shared().azure_openai_endpoint,
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "api_key": Config.shared().azure_openai_api_key,
+                        "api_version": "2025-02-01-preview",
+                    },
+                ),
+            )
+        case ModelProviderName.huggingface:
+            return LiteLlmAdapter(
+                kiln_task=kiln_task,
+                prompt_id=prompt_id,
+                base_adapter_config=base_adapter_config,
+                config=LiteLlmConfig(
+                    model_name=model_name,
+                    provider_name=provider,
+                    additional_body_options={
+                        "api_key": Config.shared().huggingface_api_key,
+                    },
+                ),
+            )
         # These are virtual providers that should have mapped to an actual provider in core_provider
         case ModelProviderName.kiln_fine_tune:
             raise ValueError(
@@ -82,12 +215,3 @@ def adapter_for_task(
             )
         case _:
             raise_exhaustive_enum_error(core_provider_name)
-
-    # We use langchain for all others right now, but moving off it as we touch anything.
-    return LangchainAdapter(
-        kiln_task,
-        model_name=model_name,
-        provider=provider,
-        prompt_builder=prompt_builder,
-        tags=tags,
-    )
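For orientation, here is a minimal usage sketch of the new adapter_for_task signature. The wrapper function name and the model value are hypothetical; only the parameter names, ModelProviderName, and AdapterConfig(allow_saving=False) are taken from the diffs in this release.

# Hypothetical sketch: obtaining an adapter with the 0.13.0 signature.
# `task` is an existing kiln_ai.datamodel.Task supplied by the caller;
# "gpt_4o_mini" is an example model name, not checked against ml_model_list.
from kiln_ai import datamodel
from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig


def build_example_adapter(task: datamodel.Task):
    return adapter_for_task(
        kiln_task=task,
        model_name="gpt_4o_mini",  # example value only
        provider=ModelProviderName.openai,
        prompt_id=None,  # optionally a PromptId
        base_adapter_config=AdapterConfig(allow_saving=False),  # same pattern BaseEval uses below
    )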
kiln_ai/adapters/data_gen/data_gen_task.py
CHANGED
@@ -183,3 +183,21 @@ class DataGenSampleTask(Task, parent_of={}):
             input_json_schema=json.dumps(DataGenSampleTaskInput.model_json_schema()),
             output_json_schema=list_json_schema_for_task(target_task),
         )
+
+
+def wrap_task_with_guidance(original_instruction: str, guidance: str) -> str:
+    """Wrap the original instruction with human guidance.
+
+    Args:
+        original_instruction: The original instruction to wrap
+        guidance: The human guidance to wrap the instruction with
+    """
+    return f"""{original_instruction}
+
+# Special Instructions
+
+The above instructions are the original instructions for this task. For this execution, we've been given additional instructions. Follow both, but prioritize the additional instructions when they conflict. The additional instructions are:
+<additional_instructions>
+{guidance}
+</additional_instructions>
+"""
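A small usage sketch of the new wrap_task_with_guidance helper follows; the instruction and guidance strings are invented examples, not values from the package.

# Hypothetical usage of wrap_task_with_guidance (added in this release).
from kiln_ai.adapters.data_gen.data_gen_task import wrap_task_with_guidance

prompt = wrap_task_with_guidance(
    original_instruction="Generate realistic customer support questions.",
    guidance="Only generate questions about billing and refunds.",
)
# `prompt` is the original instruction followed by a "# Special Instructions"
# section that embeds the guidance inside <additional_instructions> tags,
# exactly as the f-string in the diff above constructs it.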
kiln_ai/adapters/eval/__init__.py
ADDED
@@ -0,0 +1,28 @@
+"""
+# Evals
+
+This module contains the code for evaluating the performance of a model.
+
+The submodules contain:
+
+- BaseEval: each eval technique implements this interface.
+- G-Eval: an eval implementation, that implements G-Eval and LLM as Judge.
+- EvalRunner: a class that runs an full evaluation (many smaller evals jobs). Includes async parallel processing, and the ability to restart where it left off.
+- EvalRegistry: a registry for all eval implementations.
+
+The datamodel for Evals is in the `kiln_ai.datamodel.eval` module.
+"""
+
+from . import (
+    base_eval,
+    eval_runner,
+    g_eval,
+    registry,
+)
+
+__all__ = [
+    "base_eval",
+    "eval_runner",
+    "g_eval",
+    "registry",
+]
kiln_ai/adapters/eval/base_eval.py
ADDED
@@ -0,0 +1,164 @@
+import json
+from abc import abstractmethod
+from typing import Dict
+
+from kiln_ai.adapters.adapter_registry import adapter_for_task
+from kiln_ai.adapters.ml_model_list import ModelProviderName
+from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
+from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores
+from kiln_ai.datamodel.json_schema import validate_schema
+from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun
+from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+
+class BaseEval:
+    """
+    Base class for all evals/evaluators.
+
+    Should be subclassed, and the run_eval method implemented.
+    """
+
+    def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):
+        self.eval_config = eval_config
+        eval = eval_config.parent_eval()
+        if not eval:
+            raise ValueError("Eval config must have a parent eval")
+        self.eval = eval
+        task = self.eval.parent_task()
+        if not task:
+            raise ValueError("Eval must have a parent task")
+        self.target_task = task
+        self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True)
+        self.run_config = run_config
+
+    def model_and_provider(self) -> tuple[str, ModelProviderName]:
+        model_name = self.eval_config.model_name
+        provider = self.eval_config.model_provider
+        if (
+            not model_name
+            or not provider
+            or not isinstance(model_name, str)
+            or not isinstance(provider, str)
+            or provider not in ModelProviderName.__members__
+        ):
+            raise ValueError(
+                "Model name and provider must be set in the eval config model properties"
+            )
+
+        return model_name, ModelProviderName(provider)
+
+    async def run_task_and_eval(
+        self, input: str
+    ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
+        """
+        Runs the task on the provided run_config to generate fresh output, then runs the eval on that output.
+        """
+        if self.run_config is None:
+            raise ValueError("Run config is required for run_task_and_eval")
+
+        run_adapter = adapter_for_task(
+            self.target_task,
+            self.run_config.model_name,
+            ModelProviderName(self.run_config.model_provider_name),
+            base_adapter_config=AdapterConfig(allow_saving=False),
+        )
+
+        # Parse structured input if needed
+        parsed_input = input
+        if self.target_task.output_json_schema is not None:
+            parsed_input = json.loads(input)
+
+        # we don't save by default here. We'll save manually after validating the output
+        run_output = await run_adapter.invoke(parsed_input)
+
+        eval_output, intermediate_outputs = await self.run_eval(run_output)
+        validate_schema(eval_output, self.score_schema)
+
+        return run_output, eval_output, intermediate_outputs
+
+    @abstractmethod
+    async def run_eval(
+        self, task_run: TaskRun
+    ) -> tuple[EvalScores, Dict[str, str] | None]:
+        """
+        Runs the eval on the given task run.
+
+        Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking).
+        """
+        pass
+
+    @classmethod
+    def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str:
+        """
+        Build a JSON schema for the scoring output of the task requirements
+
+        We allow 2 modes: allow_float_scores=True and allow_float_scores=False.
+
+        allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc).
+        allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75.
+        """
+
+        # Note: python maintains order, which is good as we want the user defined order, and overall last
+        properties = {}
+        for output_score in eval.output_scores:
+            output_score_json_key = output_score.json_key()
+
+            if len(output_score_json_key) == 0:
+                raise ValueError(
+                    f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key."
+                )
+            property: dict[str, str | int | float | list[str] | list[int]] = {
+                "title": output_score.name,
+            }
+            match output_score.type:
+                case TaskOutputRatingType.five_star:
+                    if allow_float_scores:
+                        property["type"] = "number"
+                        property["minimum"] = 1
+                        property["maximum"] = 5
+                    else:
+                        property["enum"] = [1, 2, 3, 4, 5]
+
+                    property["description"] = (
+                        f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best."
+                    )
+                case TaskOutputRatingType.pass_fail:
+                    if allow_float_scores:
+                        property["type"] = "number"
+                        property["minimum"] = 0
+                        property["maximum"] = 1
+                        property["description"] = (
+                            f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass."
+                        )
+                    else:
+                        property["enum"] = ["pass", "fail"]
+                        property["description"] = (
+                            f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'."
+                        )
+                case TaskOutputRatingType.pass_fail_critical:
+                    if allow_float_scores:
+                        property["type"] = "number"
+                        property["minimum"] = -1
+                        property["maximum"] = 1
+                        property["description"] = (
+                            f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)."
+                        )
+                    else:
+                        property["enum"] = ["pass", "fail", "critical"]
+                        property["description"] = (
+                            f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure."
+                        )
+                case TaskOutputRatingType.custom:
+                    # Skip custom rating types in evals
+                    continue
+                case _:
+                    raise_exhaustive_enum_error(output_score.type)
+
+            properties[output_score_json_key] = property
+
+        schema = {
+            "type": "object",
+            "properties": properties,
+            "required": list(properties.keys()),
+        }
+        return json.dumps(schema, ensure_ascii=False)
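To make the new interface concrete, here is a minimal sketch of what a BaseEval subclass could look like, based only on the abstract method shown above. The class name, the score key, and the hard-coded return values are invented for illustration; real implementations (such as the G-Eval adapter in g_eval.py) compute scores from model output.

# Hypothetical BaseEval subclass; the "overall_rating" key and stubbed values
# are assumptions for illustration, not part of the package.
from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import EvalScores
from kiln_ai.datamodel.task import TaskRun


class AlwaysPassEval(BaseEval):
    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[EvalScores, Dict[str, str] | None]:
        # Must return one score per entry in the parent Eval's output_scores,
        # keyed by each score's json_key(), plus optional intermediate outputs.
        return {"overall_rating": 5.0}, {"thinking": "stubbed reasoning"}

For reference, per build_score_schema above, an Eval with a single five_star score named "Overall Rating" and allow_float_scores=True would produce a schema with one "overall_rating" property of type number, minimum 1, maximum 5, listed in "required"; with allow_float_scores=False the property instead becomes an enum of the integers 1 through 5.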