kiln-ai 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiln_ai/adapters/adapter_registry.py +12 -13
- kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
- kiln_ai/adapters/ml_model_list.py +141 -29
- kiln_ai/adapters/model_adapters/base_adapter.py +50 -35
- kiln_ai/adapters/model_adapters/langchain_adapters.py +27 -20
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -1
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +93 -50
- kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
- kiln_ai/adapters/model_adapters/test_langchain_adapter.py +7 -14
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +55 -64
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
- kiln_ai/adapters/model_adapters/test_structured_output.py +36 -30
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/prompt_builders.py +80 -42
- kiln_ai/adapters/repair/repair_task.py +9 -21
- kiln_ai/adapters/repair/test_repair_task.py +3 -3
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +10 -10
- kiln_ai/adapters/test_generate_docs.py +6 -6
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +17 -14
- kiln_ai/adapters/test_prompt_builders.py +91 -31
- kiln_ai/datamodel/__init__.py +50 -952
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +6 -0
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +10 -11
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +32 -8
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +9 -13
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_models.py +2 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +37 -1
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai-0.11.1.dist-info/RECORD +0 -76
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/test_g_eval.py
@@ -0,0 +1,497 @@
+import math
+import pickle
+
+import pytest
+from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP, GEval, GEvalTask
+from kiln_ai.adapters.eval.test_g_eval_data import serialized_run_output
+from kiln_ai.adapters.ml_model_list import built_in_models
+from kiln_ai.adapters.model_adapters.base_adapter import RunOutput
+from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers
+from kiln_ai.datamodel import (
+    DataSource,
+    DataSourceType,
+    Project,
+    Task,
+    TaskOutput,
+    TaskOutputRatingType,
+    TaskRequirement,
+    TaskRun,
+)
+from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalOutputScore
+from kiln_ai.datamodel.task import RunConfig
+
+
+@pytest.fixture
+def test_task(tmp_path):
+    project = Project(name="Test Project", path=tmp_path / "project.kiln")
+    project.save_to_file()
+
+    task = Task(
+        name="Joke Generator",
+        instruction="Generate a joke, given a topic",
+        parent=project,
+        requirements=[
+            TaskRequirement(
+                name="Topic alignment",
+                instruction="Rate how aligned the joke is to the provided topic",
+                type=TaskOutputRatingType.five_star,
+            ),
+            TaskRequirement(
+                name="Appropriateness",
+                instruction="Check if the content is appropriate for all audiences",
+                type=TaskOutputRatingType.pass_fail,
+            ),
+        ],
+    )
+    task.save_to_file()
+    return task
+
+
+@pytest.fixture
+def test_eval_config(test_task):
+    eval = Eval(
+        name="Joke Quality Eval",
+        parent=test_task,
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="appropriateness",
+                type=TaskOutputRatingType.pass_fail,
+            ),
+            EvalOutputScore(
+                name="topic_alignment",
+                type=TaskOutputRatingType.five_star,
+            ),
+            EvalOutputScore(
+                name="overall_rating",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+    eval.save_to_file()
+
+    config = EvalConfig(
+        name="Llama 8b Joke Generator Eval",
+        parent=eval,
+        config_type=EvalConfigType.g_eval,
+        model_name="gpt_4o_mini",
+        model_provider="openai",
+        properties={
+            "eval_steps": [
+                "Is the joke funny?",
+                "Is the content appropriate for all audiences?",
+                "Is the joke culturally sensitive?",
+                "Is the joke politically correct?",
+                "Is the joke aligned with the provided topic?",
+            ]
+        },
+    )
+    config.save_to_file()
+    return config
+
+
+@pytest.fixture
+def test_run_config(test_task):
+    return RunConfig(
+        model_name="llama_3_1_8b",
+        model_provider_name="groq",
+        task=test_task,
+        prompt_id="simple_prompt_builder",
+    )
+
+
+@pytest.fixture
+def test_task_run(test_task):
+    task_run = TaskRun(
+        parent=test_task,
+        input="Tell me a chicken joke",
+        input_source=DataSource(
+            type=DataSourceType.human, properties={"created_by": "test_user"}
+        ),
+        output=TaskOutput(
+            output="Why did the chicken cross the road? To get to the other side!",
+            source=DataSource(
+                type=DataSourceType.synthetic,
+                properties={
+                    "model_name": "llama_3_1_8b",
+                    "model_provider": "groq",
+                    "adapter_name": "langchain",
+                },
+            ),
+        ),
+    )
+    task_run.save_to_file()
+    return task_run
+
+
+async def run_g_eval_test(
+    test_task,
+    test_eval_config,
+    test_task_run,
+    config_type,
+    test_run_config,
+    model_name: str | None = None,
+    provider_name: str | None = None,
+):
+    # Create G-Eval instance
+    test_eval_config.config_type = config_type
+    if model_name is not None and provider_name is not None:
+        test_eval_config.model_name = model_name
+        test_eval_config.model_provider = provider_name
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    # Run the evaluation
+    eval_result, intermediate_outputs = await g_eval.run_eval(test_task_run)
+
+    # Should have 1 intermediate output (thinking or chain of thought)
+    assert len(intermediate_outputs) == 1
+
+    assert "topic_alignment" in eval_result
+    topic_alignment = eval_result["topic_alignment"]
+    assert isinstance(topic_alignment, float)
+    assert 1 <= topic_alignment <= 5
+
+    assert "appropriateness" in eval_result
+    appropriateness = eval_result["appropriateness"]
+    assert isinstance(appropriateness, float)
+    assert appropriateness >= 0.0 and appropriateness <= 1.0
+
+    assert "overall_rating" in eval_result
+    overall = eval_result["overall_rating"]
+    assert isinstance(overall, float)
+    assert 1.0 <= overall <= 5.0
+
+
+@pytest.mark.parametrize(
+    "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge]
+)
+@pytest.mark.paid
+async def test_run_g_eval_paid(
+    test_task, test_eval_config, test_task_run, config_type, test_run_config
+):
+    await run_g_eval_test(
+        test_task, test_eval_config, test_task_run, config_type, test_run_config
+    )
+
+
+@pytest.mark.parametrize(
+    "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge]
+)
+@pytest.mark.paid
+async def test_run_g_eval_e2e(
+    test_task, test_eval_config, test_task_run, config_type, test_run_config
+):
+    # Create G-Eval instance
+    test_eval_config.config_type = config_type
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    # Run the evaluation
+    task_run, scores, intermediate_outputs = await g_eval.run_task_and_eval("chickens")
+
+    # Verify the evaluation results
+    assert isinstance(scores, dict)
+
+    # Should have 1 intermediate output (thinking or chain of thought)
+    assert len(intermediate_outputs) == 1
+
+    assert "topic_alignment" in scores
+    topic_alignment = scores["topic_alignment"]
+    assert isinstance(topic_alignment, float)
+    assert 1 <= topic_alignment <= 5
+
+    assert "appropriateness" in scores
+    appropriateness = scores["appropriateness"]
+    assert isinstance(appropriateness, float)
+    assert appropriateness >= 0.0 and appropriateness <= 1.0
+
+    assert "overall_rating" in scores
+    overall = scores["overall_rating"]
+    assert isinstance(overall, float)
+    assert 1.0 <= overall <= 5.0
+
+
+async def test_g_eval_logprobs(
+    test_task, test_eval_config, test_task_run, test_run_config
+):
+    # Create G-Eval instance
+    run_output = pickle.loads(serialized_run_output)
+    assert isinstance(run_output, RunOutput)
+    assert run_output.output_logprobs is not None
+    g_eval = GEval(test_eval_config, test_run_config)
+    result = g_eval.build_g_eval_score(run_output)
+
+    assert "overall_rating" in result
+    overall = result["overall_rating"]
+    assert isinstance(overall, float)
+    assert overall >= 1.0 and overall <= 5.0
+    # Confirm weighted value, and confirm the approx isn't why it's passing
+    assert pytest.approx(overall) == 3.99752802363598
+    assert pytest.approx(overall) != 4.0
+
+    # Check topic_alignment
+    assert "topic_alignment" in result
+    topic_alignment = result["topic_alignment"]
+    assert isinstance(topic_alignment, float)
+    assert topic_alignment >= 1.0 and topic_alignment <= 5.0
+    # Confirm weighted value, and confirm the approx isn't why it's passing
+    assert pytest.approx(topic_alignment) == 4.999983298485167
+    assert pytest.approx(topic_alignment) != 5.0
+
+    # Check appropriateness
+    assert "appropriateness" in result
+    appropriateness = result["appropriateness"]
+    assert isinstance(appropriateness, float)
+    assert appropriateness >= 0.0 and appropriateness <= 1.0
+    # Fail chance so low, we need to specify the precision
+    assert pytest.approx(appropriateness, 1e-12) == 0.9999999999572222
+    assert pytest.approx(appropriateness, 1e-12) != 1.0
+
+
+async def test_llm_as_judge(
+    test_task, test_eval_config, test_task_run, test_run_config
+):
+    # Create G-Eval instance, set to LLM as Judge
+    run_output = pickle.loads(serialized_run_output)
+    test_eval_config.config_type = EvalConfigType.llm_as_judge
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    assert isinstance(run_output, RunOutput)
+    assert run_output.output_logprobs is not None
+    result = g_eval.build_llm_as_judge_score(run_output)
+
+    # unlike g_eval, llm_as_judge returns the main token converted to our float scores
+    assert result["overall_rating"] == 4.0
+    assert result["topic_alignment"] == 5.0
+    assert result["appropriateness"] == 1.0
+
+
+def test_token_case():
+    # we assume the token is lower case in the logprobs token fuzzy matching code. This will catch if we ever add a token that's not.
+    for token in TOKEN_TO_SCORE_MAP.keys():
+        assert token.lower() == token
+
+
+def test_metric_offsets_and_search_ranges(
+    test_eval_config, test_run_config, test_task_run
+):
+    g_eval = GEval(test_eval_config, test_run_config)
+    raw_output = (
+        '{"topic_alignment": 4, "appropriateness": "pass", "overall_rating": 5}'
+    )
+    metrics = ["topic_alignment", "appropriateness", "overall_rating"]
+
+    offsets = g_eval.metric_offsets(raw_output, metrics)
+
+    assert len(offsets) == 3
+    assert offsets["topic_alignment"] == 1  # Position after opening {
+    assert offsets["appropriateness"] == 23  # Position after "appropriateness":
+    assert offsets["overall_rating"] == 50  # Position after "overall_rating":
+
+    # Test search ranges
+
+    # Test first metric
+    start, end = g_eval.token_search_range(raw_output, "topic_alignment", offsets)
+    assert start == 16  # Position after "topic_alignment"
+    assert end == 23  # Position after "appropriateness"
+
+    # Test middle metric
+    start, end = g_eval.token_search_range(raw_output, "appropriateness", offsets)
+    assert start == 38  # Position after "appropriateness"
+    assert end == 50  # Position after "overall_rating"
+
+    # Test last metric
+    start, end = g_eval.token_search_range(raw_output, "overall_rating", offsets)
+    assert start == 64  # Position after "overall_rating"
+    assert end == len(raw_output)  # end of string
+
+
+def test_metric_offsets_invalid(test_eval_config, test_run_config):
+    g_eval = GEval(test_eval_config, test_run_config)
+    raw_output = '{"topic_alignment": 4, "topic_alignment": 5}'
+    metrics = ["topic_alignment"]
+
+    with pytest.raises(ValueError, match="should appear exactly once"):
+        g_eval.metric_offsets(raw_output, metrics)
+
+    raw_output = '{"something_else": 4}'
+    with pytest.raises(ValueError, match="should appear exactly once"):
+        g_eval.metric_offsets(raw_output, metrics)
+
+
+@pytest.mark.parametrize(
+    "token_string,expected_score",
+    [
+        # Direct matches
+        ("1", 1.0),
+        ("5", 5.0),
+        ("pass", 1.0),
+        ("fail", 0.0),
+        ("critical", -1.0),
+        # Variations with quotes and spacing
+        ('"1"', 1.0),
+        (" pass ", 1.0),
+        ("PASS", 1.0),
+        ('"FAIL"', 0.0),
+        ('"pAss"', 1.0),
+        ("1.0", 1.0),
+        ("2.0", 2.0),
+        ("3.0", 3.0),
+        ("4.0", 4.0),
+        ("5.0", 5.0),
+        ("5.0000", 5.0),
+        # Invalid tokens
+        ("invalid", None),
+        ("6", None),
+        ("0", None),
+        ("", None),
+        ("4.9999999", None),
+    ],
+)
+def test_score_from_token_string(
+    test_eval_config, token_string, expected_score, test_run_config
+):
+    g_eval = GEval(test_eval_config, test_run_config)
+    assert g_eval.score_from_token_string(token_string) == expected_score
+
+
+def test_raw_output_from_logprobs(test_eval_config, test_run_config):
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    # Create a minimal RunOutput with some logprobs
+    class MockLogprob:
+        def __init__(self, token):
+            self.token = token
+
+    class MockLogprobs:
+        def __init__(self):
+            self.content = [
+                MockLogprob('{"'),
+                MockLogprob("score"),
+                MockLogprob('": '),
+                MockLogprob("5"),
+                MockLogprob("}"),
+            ]
+
+    run_output = RunOutput(
+        output={"score": 5},
+        output_logprobs=MockLogprobs(),
+        intermediate_outputs={},
+    )
+
+    raw = g_eval.raw_output_from_logprobs(run_output)
+    assert raw == '{"score": 5}'
+
+
+def test_rating_token_to_score(test_eval_config, test_run_config):
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    class MockTopLogprob:
+        def __init__(self, token, logprob):
+            self.token = token
+            self.logprob = logprob
+
+    class MockTokenLogprob:
+        def __init__(self, token, top_logprobs):
+            self.token = token
+            self.top_logprobs = [MockTopLogprob(t, lp) for t, lp in top_logprobs]
+
+    # Test single token case
+    token_logprob = MockTokenLogprob("5", [("5", 0.0)])  # log(1) = 0
+    score = g_eval.rating_token_to_score(token_logprob)
+    assert score == 5.0
+
+    # Test weighted average case
+    token_logprob = MockTokenLogprob(
+        "4",
+        [
+            ("4", math.log(0.6)),  # 60% probability
+            ("5", math.log(0.4)),  # 40% probability
+        ],
+    )
+    score = g_eval.rating_token_to_score(token_logprob)
+    assert pytest.approx(score) == 4.4  # (4 * 0.6 + 5 * 0.4)
+
+    # Test invalid token
+    token_logprob = MockTokenLogprob(":", [(":", 0.0)])
+    assert g_eval.rating_token_to_score(token_logprob) is None
+
+    # Test no valid scoring tokens
+    token_logprob = MockTokenLogprob("5", [])
+    with pytest.raises(RuntimeError, match="No valid scoring tokens found"):
+        g_eval.rating_token_to_score(token_logprob)
+
+
+def test_g_eval_system_instruction():
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(name="overall_rating", type=TaskOutputRatingType.five_star),
+        ],
+    )
+    eval_config = EvalConfig(
+        parent=eval,
+        name="Test Eval",
+        model_name="gpt_4o_mini",
+        model_provider="openai",
+        config_type=EvalConfigType.g_eval,
+        properties={
+            "task_description": "Test task description",
+            "eval_steps": ["Step 1", "Step 2"],
+        },
+    )
+    g_eval_task = GEvalTask(eval_config)
+    assert g_eval_task.instruction == (
+        "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n\n"
+        "The task the model was given is as follows:\n<eval_data>\n"
+        "Test task description\n"
+        "</eval_data>\n"
+    )
+
+    # Test without task description
+    eval_config.properties = {"eval_steps": ["Step 1", "Step 2"]}
+    g_eval_task = GEvalTask(eval_config)
+    assert (
+        g_eval_task.instruction
+        == "Your job to evaluate a model's performance on a task. Blocks will be marked with <eval_data> tags.\n"
+    )
+
+
+def check_supports_logprobs(model_name: str, provider_name: str):
+    for model in built_in_models:
+        if model.name != model_name:
+            continue
+        for provider in model.providers:
+            if provider.name != provider_name:
+                continue
+            if not provider.supports_logprobs:
+                pytest.skip(
+                    f"Skipping {model.name} {provider.name} because it does not support logprobs"
+                )
+            return
+    raise RuntimeError(f"No model {model_name} {provider_name} found")
+
+
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_all_built_in_models_logprobs_geval(
+    model_name,
+    provider_name,
+    test_task,
+    test_eval_config,
+    test_task_run,
+    test_run_config,
+):
+    check_supports_logprobs(model_name, provider_name)
+    await run_g_eval_test(
+        test_task,
+        test_eval_config,
+        test_task_run,
+        EvalConfigType.g_eval,
+        test_run_config,
+        model_name,
+        provider_name.value,
+    )
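The logprob tests above pin down the difference between the two scoring modes: build_llm_as_judge_score converts the single emitted token straight to a float (4.0, 5.0, 1.0), while build_g_eval_score weights every candidate in a token's top_logprobs by its probability, which is why the expected values land just off the integers (3.99752..., 4.99998...). Below is a minimal sketch of that weighting; the token_to_score map and weighted_score helper are illustrative assumptions, not the library's code, whose real logic is GEval.rating_token_to_score and TOKEN_TO_SCORE_MAP in g_eval.py.

import math

# Illustrative subset of TOKEN_TO_SCORE_MAP (assumption: the real map also
# normalizes numeric variants like "5.0000"; this sketch does not).
token_to_score = {
    "1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
    "pass": 1.0, "fail": 0.0, "critical": -1.0,
}

def weighted_score(top_logprobs: list[tuple[str, float]]) -> float | None:
    """Hypothetical helper: probability-weighted average over rating candidates."""
    total_prob = 0.0
    total_score = 0.0
    for token, logprob in top_logprobs:
        score = token_to_score.get(token.strip().strip('"').lower())
        if score is None:
            continue  # ignore punctuation and other non-rating tokens
        prob = math.exp(logprob)  # logprob -> probability
        total_prob += prob
        total_score += prob * score
    if total_prob == 0.0:
        return None  # no valid scoring tokens found
    return total_score / total_prob

# Mirrors the weighted-average case in test_rating_token_to_score above:
assert round(weighted_score([("4", math.log(0.6)), ("5", math.log(0.4))]), 6) == 4.4

# The offset assertions are consistent with simple substring positions in the
# raw JSON: each metric's offset is where '"metric"' begins, and its token
# search range starts just past the metric name.
raw = '{"topic_alignment": 4, "appropriateness": "pass", "overall_rating": 5}'
assert raw.find('"topic_alignment"') == 1
assert raw.find('"appropriateness"') == 23
assert raw.find('"overall_rating"') == 50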
kiln_ai/adapters/eval/test_g_eval_data.py
@@ -0,0 +1,4 @@
+# Saved a real RunOutput, with real logprobs via:
+# po = pickle.dumps(result)
+# print(f"\n\nPickled result: \n{po}\n\n")
+serialized_run_output = b"\x80\x04\x95\xe8:\x00\x00\x00\x00\x00\x00\x8c\x1bkiln_ai.adapters.run_output\x94\x8c\tRunOutput\x94\x93\x94)\x81\x94}\x94(\x8c\x06output\x94}\x94(\x8c\x0ftopic_alignment\x94K\x05\x8c\x0fappropriateness\x94\x8c\x04pass\x94\x8c\x0eoverall_rating\x94K\x04u\x8c\x14intermediate_outputs\x94}\x94\x8c\x10chain_of_thought\x94X\x08\x06\x00\x001) **Is the joke funny?**\n The joke \"Why did the chicken cross the road? To get to the other side!\" is a classic joke that many consider to be humorous due to its simplicity and unexpected nature. However, as it's a very well-known punchline, some may find it less amusing for being overly familiar. Overall, it can elicit a chuckle, but it may not be considered original or particularly funny by everyone.\n\n2) **Is the content appropriate for all audiences?**\n Yes, the joke is appropriate for all audiences. It does not contain any offensive language or themes, making it suitable for children and adults alike.\n\n3) **Is the joke culturally sensitive?**\n Yes, the joke is culturally sensitive. It does not touch on any potentially sensitive topics or stereotypes. It\xe2\x80\x99s a universal humor that transcends cultural boundaries.\n\n4) **Is the joke politically correct?**\n Yes, the joke is politically correct. It does not make any political statements or discriminatory remarks. It simply presents a light-hearted situation involving a chicken, which is neutral and inoffensive.\n\n5) **Is the joke aligned with the provided topic?**\n Yes, the joke is aligned with the provided topic of a \"chicken joke.\" It directly references a chicken and is structured as a joke, fulfilling the prompt's requirements.\n\nIn summary, while the joke may lack originality, it is appropriate, sensitive, politically correct, and aligns well with the topic. The humor level can vary depending on personal taste, but overall, it meets the evaluation criteria.\x94s\x8c\x0foutput_logprobs\x94\x8c!openai.types.chat.chat_completion\x94\x8c\x0eChoiceLogprobs\x94\x93\x94)\x81\x94}\x94(\x8c\x08__dict__\x94}\x94(\x8c\x07content\x94]\x94(\x8c/openai.types.chat.chat_completion_token_logprob\x94\x8c\x1aChatCompletionTokenLogprob\x94\x93\x94)\x81\x94}\x94(h\x15}\x94(\x8c\x05token\x94\x8c\x02{\"\x94\x8c\x05bytes\x94]\x94(K{K\"e\x8c\x07logprob\x94G\xbf5\xfe.\xba\x97\xb1\xde\x8c\x0ctop_logprobs\x94]\x94(h\x19\x8c\nTopLogprob\x94\x93\x94)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{\"\x94h!]\x94(K{K\"eh#G\xbf5\xfe.\xba\x97\xb1\xdeu\x8c\x12__pydantic_extra__\x94}\x94\x8c\x17__pydantic_fields_set__\x94\x8f\x94(h\x1fh#h!\x90\x8c\x14__pydantic_private__\x94Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{\n\x94h!]\x94(K{K\neh#G\xc0 \x00,\nJ\x05\xdeuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01{\x94h!]\x94K{ah#G\xc0/\x80,\nJ\x05\xdeuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03{\r\n\x94h!]\x94(K{K\rK\neh#G\xc01@\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03{\n\n\x94h!]\x94(K{K\nK\neh#G\xc03\xc0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 {\"\x94h!]\x94(K K{K\"eh#G\xc05\x00\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 {\n\x94h!]\x94(K K{K\neh#G\xc06\xe0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\n\x94h!]\x94K\nah#G\xc07\xe0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{}\x94h!]\x94(K{K}eh#G\xc08 \x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05topic\x94h!]\x94(KtKoKpKiKceh#G\xbfS\x8a+<\x99\xb9Oh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05topic\x94h!]\x94(KtKoKpKiKceh#G\xbfS\x8a+<\x99\xb9Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xc0\x1b\x818\xa2\x07\xfd%uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04type\x94h!]\x94(KtKyKpKeeh#G\xc0!\x80\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03top\x94h!]\x94(KtKoKpeh#G\xc0-\x00\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05theme\x94h!]\x94(KtKhKeKmKeeh#G\xc0.\x00\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05total\x94h!]\x94(KtKoKtKaKleh#G\xc00\x00N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06 topic\x94h!]\x94(K KtKoKpKiKceh#G\xc00@N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05Topic\x94h!]\x94(KTKoKpKiKceh#G\xc00\xa0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0bappropriate\x94h!]\x94(KaKpKpKrKoKpKrKiKaKtKeeh#G\xc00\xa0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05title\x94h!]\x94(KtKiKtKlKeeh#G\xc00\xc0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_alignment\x94h!]\x94(K_KaKlKiKgKnKmKeKnKteh#G\xbe\xc1\x9f\x96D1\x8b\xf2h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_alignment\x94h!]\x94(K_KaKlKiKgKnKmKeKnKteh#G\xbe\xc1\x9f\x96D1\x8b\xf2uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n alignment\x94h!]\x94(K KaKlKiKgKnKmKeKnKteh#G\xc0+\x00\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06_align\x94h!]\x94(K_KaKlKiKgKneh#G\xc0.@\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_ALIGNMENT\x94h!]\x94(K_KAKLKIKGKNKMKEKNKTeh#G\xc0.\x80\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\tAlignment\x94h!]\x94(KAKlKiKgKnKmKeKnKteh#G\xc00\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0b_assignment\x94h!]\x94(K_KaKsKsKiKgKnKmKeKnKteh#G\xc01@\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n Alignment\x94h!]\x94(K KAKlKiKgKnKmKeKnKteh#G\xc01@\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03_al\x94h!]\x94(K_KaKleh#G\xc01\xa0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0b_similarity\x94h!]\x94(K_KsKiKmKiKlKaKrKiKtKyeh#G\xc01\xe0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xc02 \x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\xe2\x80\x9d:\x94h!]\x94(K\xe2K\x80K\x9dK:eh#G\xc02@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\\\":\x94h!]\x94(K\\K\"K:eh#G\xc03\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02':\x94h!]\x94(K'K:eh#G\xc04 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\xc04\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02`:\x94h!]\x94(K`K:eh#G\xc05\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe2\x80\x9d\xef\xbc\x9a\x94h!]\x94(K\xe2K\x80K\x9dK\xefK\xbcK\x9aeh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\xc2\xbb:\x94h!]\x94(K\xc2K\xbbK:eh#G\xc07 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03+\":\x94h!]\x94(K+K\"K:eh#G\xc07@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":[\x94h!]\x94(K\"K:K[eh#G\xc07\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x015\x94h!]\x94K5ah#G\xbe\xf1\x93\xc3:x\xd77h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fjY\x01\x00\x00h!]\x94K5ah#G\xbe\xf1\x93\xc3:x\xd77uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x014\x94h!]\x94K4ah#G\xc0&\x00\x02:l\xe3Xuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01 \x94h!]\x94K ah#G\xc01\xc0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x013\x94h!]\x94K3ah#G\xc07\xc0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02  \x94h!]\x94(K K eh#G\xc08\xa0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01-\x94h!]\x94K-ah#G\xc0; \x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01f\x94h!]\x94Kfah#G\xc0;0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\t\x94h!]\x94K\tah#G\xc0;0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03   \x94h!]\x94(K K K eh#G\xc0;@\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\"\x94h!]\x94K\"ah#G\xc0;p\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01,\x94h!]\x94K,ah#G\xc05\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 ,\"\x94h!]\x94(K K,K\"eh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\"\\\x94h!]\x94(K,K\"K\\eh#G\xc07`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\"%\x94h!]\x94(K,K\"K%eh#G\xc07\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\",\x94h!]\x94(K,K\"K,eh#G\xc0:\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\n\x94h!]\x94(K,K\neh#G\xc0:\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\r\n\x94h!]\x94(K,K\rK\neh#G\xc0< \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x8f\x01\x00\x00h!]\x94K\tah#G\xc0=p\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01.\x94h!]\x94K.ah#G\xc0>@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07appropr\x94h!]\x94(KaKpKpKrKoKpKreh#G\xbf\x1d\x1c\xa4[(\x97\x91h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07appropr\x94h!]\x94(KaKpKpKrKoKpKreh#G\xbf\x1d\x1c\xa4[(\x97\x91uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05appro\x94h!]\x94(KaKpKpKrKoeh#G\xc0\"\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0bappropriate\x94h!]\x94(KaKpKpKrKoKpKrKiKaKtKeeh#G\xc0&\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t appropri\x94h!]\x94(K KaKpKpKrKoKpKrKieh#G\xc0*\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02in\x94h!]\x94(KiKneh#G\xc00\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05Appro\x94h!]\x94(KAKpKpKrKoeh#G\xc02\x80\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06 Appro\x94h!]\x94(K KAKpKpKrKoeh#G\xc02\xa0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xc02\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04apro\x94h!]\x94(KaKpKrKoeh#G\xc03\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\rapproximately\x94h!]\x94(KaKpKpKrKoKxKiKmKaKtKeKlKyeh#G\xc04@\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01i\x94h!]\x94Kiah#G\xbe\xaa~\xe0\xee\xab\x86\xb2h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fjA\x02\x00\x00h!]\x94Kiah#G\xbe\xaa~\xe0\xee\xab\x86\xb2uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06iation\x94h!]\x94(KiKaKtKiKoKneh#G\xc0.\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03iat\x94h!]\x94(KiKaKteh#G\xc0.\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xc00 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04iten\x94h!]\x94(KiKtKeKneh#G\xc00`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04iann\x94h!]\x94(KiKaKnKneh#G\xc01\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t appropri\x94h!]\x94(K KaKpKpKrKoKpKrKieh#G\xc01\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02ri\x94h!]\x94(KrKieh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06iately\x94h!]\x94(KiKaKtKeKlKyeh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05laten\x94h!]\x94(KlKaKtKeKneh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xbe\x89\xfcz\xe12u\x9dh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xbe\x89\xfcz\xe12u\x9duh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04aten\x94h!]\x94(KaKtKeKneh#G\xc0/@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05ensen\x94h!]\x94(KeKnKsKeKneh#G\xc05@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04ated\x94h!]\x94(KaKtKeKdeh#G\xc06 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06teness\x94h!]\x94(KtKeKnKeKsKseh#G\xc06@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04ates\x94h!]\x94(KaKtKeKseh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05eness\x94h!]\x94(KeKnKeKsKseh#G\xc06\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04onen\x94h!]\x94(KoKnKeKneh#G\xc06\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04uten\x94h!]\x94(KuKtKeKneh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06enness\x94h!]\x94(KeKnKnKeKsKseh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":\"'\x94h!]\x94(K\"K:K\"K'eh#G\xc02\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04 \":\"\x94h!]\x94(K K\"K:K\"eh#G\xc04 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\":\"\",\"\x94h!]\x94(K\"K:K\"K\"K,K\"eh#G\xc04\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":[\"\x94h!]\x94(K\"K:K[K\"eh#G\xc05\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc05\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":\"+\x94h!]\x94(K\"K:K\"K+eh#G\xc05\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":{\"\x94h!]\x94(K\"K:K{K\"eh#G\xc06@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03':'\x94h!]\x94(K'K:K'eh#G\xc06\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\xc07\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04pass\x94h!]\x94(KpKaKsKseh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04pass\x94h!]\x94(KpKaKsKseh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05 pass\x94h!]\x94(K KpKaKsKseh#G\xc03 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04fail\x94h!]\x94(KfKaKiKleh#G\xc07\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03pas\x94h!]\x94(KpKaKseh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05.pass\x94h!]\x94(K.KpKaKsKseh#G\xc08\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04Pass\x94h!]\x94(KPKaKsKseh#G\xc09\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04PASS\x94h!]\x94(KPKAKSKSeh#G\xc09 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06passed\x94h!]\x94(KpKaKsKsKeKdeh#G\xc09\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05-pass\x94h!]\x94(K-KpKaKsKseh#G\xc09\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06passes\x94h!]\x94(KpKaKsKsKeKseh#G\xc0: \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\",\"\x94h!]\x94(K\"K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\",\"\x94h!]\x94(K\"K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04 \",\"\x94h!]\x94(K K\"K,K\"eh#G\xc02\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\xc04\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04.\",\"\x94h!]\x94(K.K\"K,K\"eh#G\xc04@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc05\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03','\x94h!]\x94(K'K,K'eh#G\xc06 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"#\x94h!]\x94(K\"K,K\"K#eh#G\xc07 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"+\x94h!]\x94(K\"K,K\"K+eh#G\xc07\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05\\\",\\\"\x94h!]\x94(K\\K\"K,K\\K\"eh#G\xc08@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"\\\x94h!]\x94(K\"K,K\"K\\eh#G\xc08\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xbe\x89\xfcz\xe12u\x9dh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xbe\x89\xfcz\xe12u\x9duh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07Overall\x94h!]\x94(KOKvKeKrKaKlKleh#G\xc00\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08 overall\x94h!]\x94(K KoKvKeKrKaKlKleh#G\xc02@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01c\x94h!]\x94Kcah#G\xc06\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08overview\x94h!]\x94(KoKvKeKrKvKiKeKweh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05total\x94h!]\x94(KtKoKtKaKleh#G\xc08@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04over\x94h!]\x94(KoKvKeKreh#G\xc08\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08 Overall\x94h!]\x94(K KOKvKeKrKaKlKleh#G\xc09 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe6\x95\xb4\xe4\xbd\x93\x94h!]\x94(K\xe6K\x95K\xb4K\xe4K\xbdK\x93eh#G\xc09`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05polit\x94h!]\x94(KpKoKlKiKteh#G\xc0:\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xbe\x94\xfe$\xc4\xceLIh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xbe\x94\xfe$\xc4\xceLIuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07 rating\x94h!]\x94(K KrKaKtKiKnKgeh#G\xc0/@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06rating\x94h!]\x94(KrKaKtKiKnKgeh#G\xc01\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07 Rating\x94h!]\x94(K KRKaKtKiKnKgeh#G\xc01\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06Rating\x94h!]\x94(KRKaKtKiKnKgeh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07-rating\x94h!]\x94(K-KrKaKtKiKnKgeh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07.rating\x94h!]\x94(K.KrKaKtKiKnKgeh#G\xc02\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05_rate\x94h!]\x94(K_KrKaKtKeeh#G\xc03\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t_rotation\x94h!]\x94(K_KrKoKtKaKtKiKoKneh#G\xc04 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02_r\x94h!]\x94(K_Kreh#G\xc04 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\xe2\x80\x9d:\x94h!]\x94(K\xe2K\x80K\x9dK:eh#G\xc04\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\\\":\x94h!]\x94(K\\K\"K:eh#G\xc04\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02':\x94h!]\x94(K'K:eh#G\xc05@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\xc06\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc06\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe2\x80\x9d\xef\xbc\x9a\x94h!]\x94(K\xe2K\x80K\x9dK\xefK\xbcK\x9aeh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02`:\x94h!]\x94(K`K:eh#G\xc07\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":[\x94h!]\x94(K\"K:K[eh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 \":\x94h!]\x94(K K\"K:eh#G\xc08 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1fje\x01\x00\x00h!]\x94K4ah#G\xbfdI\x15\x1e\x7f\x84\xe1h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fje\x01\x00\x00h!]\x94K4ah#G\xbfdI\x15\x1e\x7f\x84\xe1uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjs\x01\x00\x00h!]\x94K3ah#G\xc0\x18\x02\x89\x11\x8c\x19~uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjY\x01\x00\x00h!]\x94K5ah#G\xc0,\x81D\xaaS\xfc\x01uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjl\x01\x00\x00h!]\x94K ah#G\xc05\x10\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x012\x94h!]\x94K2ah#G\xc070\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x81\x01\x00\x00h!]\x94K-ah#G\xc08\xd0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\n\n\x94h!]\x94(K\nK\neh#G\xc09\x80\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fh_h!]\x94K\nah#G\xc09\xc0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02  \x94h!]\x94(K K eh#G\xc09\xf0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x88\x01\x00\x00h!]\x94Kfah#G\xc0:0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01}\x94h!]\x94K}ah#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fj\xf3\x04\x00\x00h!]\x94K}ah#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02 }\x94h!]\x94(K K}eh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\xc05`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02}\n\x94h!]\x94(K}K\neh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03}\n\n\x94h!]\x94(K}K\nK\neh#G\xc08\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\xea\x01\x00\x00h!]\x94K.ah#G\xc0:\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03}\r\n\x94h!]\x94(K}K\rK\neh#G\xc0; \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05}\r\n\r\n\x94h!]\x94(K}K\rK\nK\rK\neh#G\xc0=\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04}\n\n\n\x94h!]\x94(K}K\nK\nK\neh#G\xc0=\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07}\n\n\n\n\n\n\x94h!]\x94(K}K\nK\nK\nK\nK\nK\neh#G\xc0>\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nube\x8c\x07refusal\x94Nuh-}\x94h/\x8f\x94(h\x17j<\x05\x00\x00\x90h1Nubub."
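Per its own comments, this fixture is a real RunOutput (including OpenAI ChoiceLogprobs) captured once via pickle.dumps and checked in as a bytes literal. Loading it back, as the logprob tests above do, is a one-line round-trip; this sketch assumes only the import paths visible in the diff:

import pickle

from kiln_ai.adapters.eval.test_g_eval_data import serialized_run_output
from kiln_ai.adapters.run_output import RunOutput

# Rebuild the captured RunOutput (safe here because the bytes are checked in;
# never unpickle untrusted data).
run_output = pickle.loads(serialized_run_output)
assert isinstance(run_output, RunOutput)
assert run_output.output_logprobs is not None  # real logprobs, captured once

# Regenerating the fixture, per the comments above:
# print(pickle.dumps(run_output))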
kiln_ai/adapters/fine_tune/test_dataset_formatter.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import tempfile
 from pathlib import Path
 from unittest.mock import Mock
@@ -27,6 +28,8 @@ from kiln_ai.datamodel import (
     TaskRun,
 )
 
+logger = logging.getLogger(__name__)
+
 
 @pytest.fixture
 def mock_task():
@@ -474,7 +477,7 @@ def test_generate_vertex_template_thinking():
 
     result = generate_vertex_gemini_1_5(training_data)
 
-
+    logger.info(result)
 
     assert result == {
         "systemInstruction": {
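These three hunks are one change: test_dataset_formatter.py gains a module-level logger and routes its debug output through it instead of writing directly to the console. A minimal sketch of the same stdlib pattern; the basicConfig call and payload are illustrative assumptions, not part of the diff:

import logging

logger = logging.getLogger(__name__)  # module-level logger, as added in the diff

# Output is only visible when the logging config allows it, e.g. via an
# explicit basicConfig (or pytest's log_cli setting during a test run):
logging.basicConfig(level=logging.INFO)
logger.info({"systemInstruction": "..."})  # hypothetical payload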
kiln_ai/adapters/fine_tune/test_fireworks_finetune.py
@@ -16,9 +16,9 @@ from kiln_ai.datamodel import (
     FinetuneDataStrategy,
     StructuredOutputMode,
     Task,
-    Train80Test20SplitDefinition,
 )
 from kiln_ai.datamodel import Finetune as FinetuneModel
+from kiln_ai.datamodel.dataset_split import Train80Test20SplitDefinition
 from kiln_ai.utils.config import Config
 
 
kiln_ai/adapters/fine_tune/test_openai_finetune.py
@@ -15,9 +15,9 @@ from kiln_ai.datamodel import (
     FinetuneDataStrategy,
     StructuredOutputMode,
     Task,
-    Train80Test20SplitDefinition,
 )
 from kiln_ai.datamodel import Finetune as FinetuneModel
+from kiln_ai.datamodel.dataset_split import Train80Test20SplitDefinition
 from kiln_ai.utils.config import Config
 
 
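The final two hunks make the same one-line migration in both fine-tune test modules: Train80Test20SplitDefinition moves from the kiln_ai.datamodel package root into the new dataset_split module, part of the datamodel split visible in the file list above (where __init__.py shrinks by roughly 950 lines). Judging by this diff, code upgrading to 0.12.0 adjusts the import the same way:

# kiln-ai 0.11.1: exported from the datamodel package root.
# from kiln_ai.datamodel import Train80Test20SplitDefinition

# kiln-ai 0.12.0: lives in the new dataset_split module.
from kiln_ai.datamodel.dataset_split import Train80Test20SplitDefinition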