adaptive_harmony-0.1.23-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adaptive_harmony/__init__.py +162 -0
- adaptive_harmony/common/__init__.py +40 -0
- adaptive_harmony/common/callbacks.py +219 -0
- adaptive_harmony/common/checkpointing.py +163 -0
- adaptive_harmony/common/dpo.py +92 -0
- adaptive_harmony/common/env_grpo.py +361 -0
- adaptive_harmony/common/grpo.py +260 -0
- adaptive_harmony/common/gspo.py +70 -0
- adaptive_harmony/common/ppo.py +303 -0
- adaptive_harmony/common/rm.py +79 -0
- adaptive_harmony/common/sft.py +121 -0
- adaptive_harmony/core/__init__.py +0 -0
- adaptive_harmony/core/dataset.py +72 -0
- adaptive_harmony/core/display.py +93 -0
- adaptive_harmony/core/image_utils.py +110 -0
- adaptive_harmony/core/reasoning.py +12 -0
- adaptive_harmony/core/reward_client/__init__.py +19 -0
- adaptive_harmony/core/reward_client/client.py +160 -0
- adaptive_harmony/core/reward_client/reward_types.py +49 -0
- adaptive_harmony/core/reward_client/websocket_utils.py +18 -0
- adaptive_harmony/core/rich_counter.py +351 -0
- adaptive_harmony/core/rl_utils.py +38 -0
- adaptive_harmony/core/schedulers.py +38 -0
- adaptive_harmony/core/structured_output.py +385 -0
- adaptive_harmony/core/utils.py +365 -0
- adaptive_harmony/environment/__init__.py +8 -0
- adaptive_harmony/environment/environment.py +121 -0
- adaptive_harmony/evaluation/__init__.py +1 -0
- adaptive_harmony/evaluation/evaluation_artifact.py +67 -0
- adaptive_harmony/graders/__init__.py +20 -0
- adaptive_harmony/graders/answer_relevancy_judge/__init__.py +3 -0
- adaptive_harmony/graders/answer_relevancy_judge/answer_relevancy_judge.py +102 -0
- adaptive_harmony/graders/answer_relevancy_judge/prompts.py +58 -0
- adaptive_harmony/graders/base_grader.py +265 -0
- adaptive_harmony/graders/binary_judge/__init__.py +8 -0
- adaptive_harmony/graders/binary_judge/binary_judge.py +202 -0
- adaptive_harmony/graders/binary_judge/prompts.py +125 -0
- adaptive_harmony/graders/combined_grader.py +118 -0
- adaptive_harmony/graders/context_relevancy_judge/__init__.py +3 -0
- adaptive_harmony/graders/context_relevancy_judge/context_relevancy_judge.py +128 -0
- adaptive_harmony/graders/context_relevancy_judge/prompts.py +84 -0
- adaptive_harmony/graders/exceptions.py +9 -0
- adaptive_harmony/graders/faithfulness_judge/__init__.py +3 -0
- adaptive_harmony/graders/faithfulness_judge/faithfulness_judge.py +159 -0
- adaptive_harmony/graders/faithfulness_judge/prompts.py +22 -0
- adaptive_harmony/graders/range_judge/__init__.py +7 -0
- adaptive_harmony/graders/range_judge/prompts.py +232 -0
- adaptive_harmony/graders/range_judge/range_judge.py +188 -0
- adaptive_harmony/graders/range_judge/types.py +12 -0
- adaptive_harmony/graders/reward_server_grader.py +36 -0
- adaptive_harmony/graders/templated_prompt_judge.py +237 -0
- adaptive_harmony/graders/utils.py +79 -0
- adaptive_harmony/logging_table.py +1 -0
- adaptive_harmony/metric_logger.py +452 -0
- adaptive_harmony/parameters/__init__.py +2 -0
- adaptive_harmony/py.typed +0 -0
- adaptive_harmony/runtime/__init__.py +2 -0
- adaptive_harmony/runtime/context.py +2 -0
- adaptive_harmony/runtime/data.py +2 -0
- adaptive_harmony/runtime/decorators.py +2 -0
- adaptive_harmony/runtime/model_artifact_save.py +2 -0
- adaptive_harmony/runtime/runner.py +27 -0
- adaptive_harmony/runtime/simple_notifier.py +2 -0
- adaptive_harmony-0.1.23.dist-info/METADATA +37 -0
- adaptive_harmony-0.1.23.dist-info/RECORD +67 -0
- adaptive_harmony-0.1.23.dist-info/WHEEL +5 -0
- adaptive_harmony-0.1.23.dist-info/top_level.txt +1 -0

--- /dev/null
+++ adaptive_harmony/graders/answer_relevancy_judge/prompts.py
@@ -0,0 +1,58 @@
+SYSTEM = """You are an expert data reviewer.
+You will be given a list of statements.
+You task is to determine whether each statement is relevant to addressing the user's input.
+
+Since you are going to generate a verdict for each statement, the number of `verdicts` SHOULD BE STRICTLY EQUAL to the number of `statements`, and verdicts should be in the same order as the original statements.
+
+You always output a JSON object with the following schema, and nothing else before or after:
+{json_schema}
+
+Examples:
+{shots}
+"""
+
+USER = """Your real task:
+INPUT
+{user_question}
+
+STATEMENTS
+{statements}
+
+```json"""
+
+
+DEFAULT_SHOTS = """INPUT
+What percentage is considered a good rental yield?
+
+STATEMENTS
+0: How are you doing today?
+1: Rental yield is how much you could expect to receive in rent each year from your buy to let investment.
+2: Rental yield is expressed as a percentage - reflecting your rental income against the property's market value.
+3: Anything around the 5-6% mark could be considered a good rental yield.
+4: Anything above 6% could be considered a very good rental yield.
+
+```json
+{
+    "verdicts": [
+        {
+            "reason": "The statement is unrelated to the input.",
+            "score": 0
+        },
+        {
+            "reason": "While the statement discusses rental yields, it does not indicate what constitutes a good rental yield.",
+            "score": 0
+        },
+        {
+            "reason": "While the statement mentions that yield is expressed as a percentage, it does not address the user question.",
+            "score": 0
+        },
+        {
+            "reason": "The statement addresses the user input, specifying what a good rental yield is.",
+            "score": 1
+        },
+        {
+            "reason": "The statement addresses the user input, specifying what a very good rental yield is.",
+            "score": 1
+        },
+    ]
+}```"""
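For orientation, a minimal sketch of filling these templates in by hand. It assumes the module is importable from the path shown in the file list, that statements are numbered `0: ...`, `1: ...` as in `DEFAULT_SHOTS`, and uses a placeholder `json_schema` string (the package renders the real schema from a pydantic model elsewhere); the statements and question below are invented for illustration.

```python
# Illustrative sketch only; the grader in answer_relevancy_judge.py presumably
# wires these templates up itself.
from adaptive_harmony.graders.answer_relevancy_judge.prompts import DEFAULT_SHOTS, SYSTEM, USER

statements = [
    "Rental yield is expressed as a percentage.",
    "Anything around the 5-6% mark could be considered a good rental yield.",
]

system_prompt = SYSTEM.format(
    json_schema='{"verdicts": [{"reason": "<str>", "score": "<0 or 1>"}]}',  # placeholder schema
    shots=DEFAULT_SHOTS,
)
user_prompt = USER.format(
    user_question="What percentage is considered a good rental yield?",
    statements="\n".join(f"{i}: {s}" for i, s in enumerate(statements)),
)
print(system_prompt, user_prompt, sep="\n\n")
```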

--- /dev/null
+++ adaptive_harmony/graders/base_grader.py
@@ -0,0 +1,265 @@
+import statistics
+from abc import ABC, abstractmethod
+from typing import Any, Awaitable, Callable, cast
+
+from harmony_client import InferenceModel
+from harmony_client.runtime.data import (
+    AdaptiveGrader,
+    CustomJudge,
+    PrebuiltConfigKey,
+    PrebuiltJudge,
+    RemoteRewardEndpoint,
+)
+
+from adaptive_harmony import Grade, StringThread
+from adaptive_harmony.graders.utils import FailedJudgeLog, SuccessJudgeLog
+from adaptive_harmony.logging_table import Table
+from adaptive_harmony.parameters import Model
+from adaptive_harmony.runtime import RecipeContext
+
+
+class BaseGrader[LogType](ABC):
+    """
+    Base Grader to inherit from when building a scoring function.
+    """
+
+    def __init__(self, grader_key: str):
+        self._logs: list[LogType] = []
+        self.grader_key = grader_key
+
+    @abstractmethod
+    async def grade(self, sample: StringThread) -> Grade:
+        """
+        Grade a single sample.
+        Returns a single float score, with optional metadata.
+        Metadata can be useful for evals when LLM reasoning regarding the score is available.
+        """
+        pass
+
+    async def score_float_value(self, sample: StringThread) -> float:
+        """Returns only the float score from .score"""
+        return (await self.grade(sample)).value
+
+    def add_log(self, log_data: LogType) -> None:
+        """Add a log entry to the scorer's log collection."""
+        self._logs.append(log_data)
+
+    def get_logs(self, clear: bool = False, log_all_samples: bool = False) -> dict[str, float | Table]:
+        """
+        Get aggregated logs from all score calls.
+        Base implementation computes statistics for "score" keys in individual logs.
+        If there are none, returns empty dict.
+        """
+        if not self._logs:
+            return {}
+
+        scores = [s for s in [cast(dict[str, Any], log).get("score") for log in self._logs] if s is not None]
+        logs = {}
+        if scores:
+            logs.update(
+                dict(
+                    **{
+                        f"score/{key}": value
+                        for key, value in dict(
+                            mean=statistics.mean(scores),
+                            std=statistics.stdev(scores) if len(scores) > 1 else 0.0,
+                            min=min(scores),
+                            max=max(scores),
+                            count=len(scores),
+                        ).items()
+                    },
+                )
+            )
+        if clear:
+            self.clear_logs()
+        return logs
+
+    def clear_logs(self) -> None:
+        """
+        Clear all accumulated logs.
+        """
+        self._logs.clear()
+
+    def get_sample_tables(
+        self, successful_samples: list[SuccessJudgeLog], failed_samples: list[FailedJudgeLog] | None = None
+    ):
+        table_logs = {}
+        scored_samples = (
+            Table()
+            .add_column("Prompt", [log["prompt"] for log in successful_samples])
+            .add_column("Reasoning", [log.get("reasoning") for log in successful_samples])
+            .add_column("Score", [float(log["score"]) for log in successful_samples])
+        )
+        if failed_samples:
+            unscored_samples = (
+                Table()
+                .add_column("Prompt", [log.get("prompt") for log in failed_samples])
+                .add_column("Error", [str(log["error"]) for log in failed_samples])
+            )
+            table_logs["score/unscored_samples"] = unscored_samples
+        table_logs["score/scored_samples"] = scored_samples
+        table_logs["score/unscored_samples_count"] = len(failed_samples) if failed_samples else 0
+        table_logs["score/scored_samples_count"] = len(successful_samples)
+        return table_logs
+
+    @classmethod
+    def from_function(
+        cls, grader_key: str, async_fn: Callable[[StringThread], Awaitable[float]]
+    ) -> "BaseGrader[dict[str, Any]]":
+        class FunctionScorer(BaseGrader[dict[str, float]]):
+            def __init__(self):
+                super().__init__(grader_key)
+
+            async def grade(self, sample: StringThread) -> Grade:
+                result = await async_fn(sample)
+                grade = Grade(value=result, grader_key=self.grader_key)
+                self.add_log({"score": result})
+                return grade
+
+        return FunctionScorer()
+
+    @classmethod
+    async def from_config(
+        cls,
+        grader_config: AdaptiveGrader,
+        ctx: RecipeContext,
+        tp: int | None = None,
+        kv_cache_len: int | None = None,
+        max_tokens: int | None = None,
+    ) -> "BaseGrader[dict[str, Any]]":
+        match grader_config.config.type:
+            case "Judge":
+                config = cast(CustomJudge, grader_config.config)
+                return await cls.from_templated_judge(
+                    grader_config.key, str(grader_config.grader_id), config, ctx, tp, kv_cache_len, max_tokens
+                )
+            case "Prebuilt":
+                config = cast(PrebuiltJudge, grader_config.config)
+                return await cls.from_prebuilt_judge(
+                    grader_config.key, str(grader_config.grader_id), config, ctx, tp, kv_cache_len
+                )
+            case "Remote":
+                config = cast(RemoteRewardEndpoint, grader_config.config)
+                return cls.from_remote_reward_endpoint(grader_config.key, str(grader_config.grader_id), config)
+            case _:
+                raise ValueError(f"Invalid grader type: {grader_config.config.type}")
+
+    @classmethod
+    async def from_templated_judge(
+        cls,
+        grader_key: str,
+        grader_id: str,
+        config: CustomJudge,
+        ctx: RecipeContext,
+        tp: int | None = None,
+        kv_cache_len: int | None = None,
+        max_tokens: int | None = None,
+    ) -> "BaseGrader[dict[str, Any]]":
+        # Import here to avoid circular dependency
+        from adaptive_harmony.graders.templated_prompt_judge import (
+            BinaryJudgeOutput,
+            TemplatedPromptJudgeGrader,
+        )
+
+        # Convert examples to template variables
+        examples = []
+        for example in config.examples:
+            examples.append(
+                {
+                    "context_str": (
+                        "\n".join(f"{msg.role}:\n{msg.content}" for msg in example.input[:-1])
+                        if len(example.input) > 1
+                        else ""
+                    ),
+                    "user_question": example.input[-1].content if example.input else "",
+                    "completion": example.output,
+                    "output_json": f'{{"reasoning": "{example.reasoning or ""}", "score": "{"PASS" if example.pass_ else "FAIL"}"}}',
+                }
+            )
+
+        template_vars = {
+            "criteria": config.criteria,
+            "examples": examples,
+        }
+
+        model = await get_model(ctx, grader_key, config.model_key, kv_cache_len, max_tokens, tp)
+
+        return TemplatedPromptJudgeGrader(
+            grader_key=grader_key,
+            model=model,
+            grader_id=grader_id,
+            system_template=config.system_template,
+            user_template=config.user_template,
+            output_model=BinaryJudgeOutput,
+            template_variables=template_vars,
+        )  # type: ignore[return-value]
+
+    @classmethod
+    async def from_prebuilt_judge(
+        cls,
+        grader_key: str,
+        grader_id: str,
+        config: PrebuiltJudge,
+        ctx: RecipeContext,
+        tp: int | None = None,
+        kv_cache_len: int | None = None,
+        max_tokens: int | None = None,
+    ) -> "BaseGrader[dict[str, Any]]":
+        model = await get_model(ctx, grader_key, config.model_key, kv_cache_len, max_tokens, tp)
+
+        match config.prebuilt_config_key:
+            case PrebuiltConfigKey.Faithfulness:
+                # Import here to avoid circular dependency
+                from adaptive_harmony.graders.faithfulness_judge.faithfulness_judge import FaithfulnessGrader
+
+                return FaithfulnessGrader(
+                    grader_key=grader_key,
+                    grader_id=grader_id,
+                    model=model,
+                )
+            case PrebuiltConfigKey.AnswerRelevancy:
+                # Import here to avoid circular dependency
+                from adaptive_harmony.graders.answer_relevancy_judge.answer_relevancy_judge import AnswerRelevancyGrader
+
+                return AnswerRelevancyGrader(
+                    grader_key=grader_key,
+                    grader_id=grader_id,
+                    model=model,
+                )
+            case PrebuiltConfigKey.ContextRelevancy:
+                # Import here to avoid circular dependency
+                from adaptive_harmony.graders.context_relevancy_judge.context_relevancy_judge import (
+                    ContextRelevancyGrader,
+                )
+
+                return ContextRelevancyGrader(
+                    grader_key=grader_key,
+                    grader_id=grader_id,
+                    model=model,
+                )
+            case _:
+                raise ValueError(f"Invalid prebuilt judge type: {config.prebuilt_config_key}")
+
+    @classmethod
+    def from_remote_reward_endpoint(
+        cls, grader_key: str, grader_id: str, config: RemoteRewardEndpoint
+    ) -> "BaseGrader[dict[str, Any]]":
+        # Import here to avoid circular dependency
+        from adaptive_harmony.graders.reward_server_grader import RewardServerGrader
+
+        return RewardServerGrader(grader_key=grader_key, grader_id=grader_id, reward_server_ip=config.url)
+
+
+async def get_model(
+    ctx: RecipeContext,
+    grader_key: str,
+    model_key: str,
+    kv_cache_len: int | None,
+    max_tokens: int | None,
+    tp: int | None,
+) -> InferenceModel:
+    model_builder = await Model(model_key=model_key).to_builder(
+        ctx, kv_cache_len=kv_cache_len, tokens_to_generate=max_tokens, tp=tp
+    )
+    model = await model_builder.spawn_inference(grader_key)
+    return model
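The quickest way to get a concrete grader out of `BaseGrader` is the `from_function` classmethod above, which wraps an async scoring function and records a `{"score": ...}` log per call. A minimal sketch, assuming the import paths used elsewhere in this diff and a thread whose last turn is the assistant completion; the `response_length_score` heuristic is invented purely for illustration.

```python
import asyncio

from adaptive_harmony import StringThread
from adaptive_harmony.graders import BaseGrader


async def response_length_score(sample: StringThread) -> float:
    # Toy heuristic for illustration: prefer short final assistant turns.
    return 1.0 if len(sample.last_content()) < 500 else 0.0


async def main() -> None:
    grader = BaseGrader.from_function("response_length", response_length_score)
    sample = StringThread().user("Summarise the refund policy.").assistant("Refunds within 30 days.")
    grade = await grader.grade(sample)   # Grade(value=1.0, grader_key="response_length")
    print(grade.value)
    print(grader.get_logs(clear=True))   # aggregates into score/mean, score/std, score/min, ...


asyncio.run(main())
```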

--- /dev/null
+++ adaptive_harmony/graders/binary_judge/binary_judge.py
@@ -0,0 +1,202 @@
+from random import shuffle
+from typing import Literal, TypedDict
+
+from pydantic import BaseModel, Field
+
+from adaptive_harmony import InferenceModel, StringThread
+from adaptive_harmony.core.reasoning import remove_reasoning
+from adaptive_harmony.core.structured_output import JsonParseError, render_pydantic_model, render_schema
+from adaptive_harmony.core.utils import SingleTurnShot, stringify_thread
+from adaptive_harmony.graders import BaseGrader, Grade
+from adaptive_harmony.graders.binary_judge.prompts import DEFAULT_SHOTS, SYSTEM, USER, BinaryJudgeShot
+from adaptive_harmony.graders.exceptions import IgnoreScoreException
+from adaptive_harmony.graders.utils import (
+    FailedJudgeLog,
+    SuccessJudgeLog,
+    separate_context_from_last_user_turn,
+    validate_thread_last_assistant,
+)
+from adaptive_harmony.logging_table import Table
+
+
+class BinaryJudgeOutput(BaseModel):
+    reasoning: str = Field(description="Reasoning to support the rationale behind the score")
+    score: Literal["PASS", "FAIL", "NA"] = Field(description="The score for the sample")
+
+
+class ScoresMap(TypedDict):
+    PASS: float
+    FAIL: float
+
+
+OPENAI_MODEL_FAMILIES_TEMPERATURE_1_ONLY = ["gpt-5", "o1", "o3", "o4"]
+
+
+class BinaryJudgeGrader(BaseGrader):
+    """
+    Binary judge for scoring samples as PASS, FAIL or NA using few-shot prompting.
+    If custom shots are provided, they are used instead of the default shots.
+    """
+
+    def __init__(
+        self,
+        grader_key: str,
+        model: InferenceModel,
+        criteria: str,
+        shots: list[BinaryJudgeShot] | None = None,
+        temperature: float = 0.0,
+        grader_id: str | None = None,
+    ):
+        super().__init__(grader_key)
+        self._logs: list[SuccessJudgeLog | FailedJudgeLog] = []  # type: ignore[assignment]
+        self.criteria = criteria
+        self.model = model
+        self.temperature = temperature
+        # Set temperature to 1.0 if model_key is an OpenAI model in the temperature-1-only list
+        model_path: str = model.get_builder_args().get("path")  # type: ignore[assignment]
+        if model_path.startswith("openai://"):
+            model_name = model_path.removeprefix("openai://").split("?")[0]
+            if any(model_name.startswith(model) for model in OPENAI_MODEL_FAMILIES_TEMPERATURE_1_ONLY):
+                temperature = 1.0
+        self.model = model.temperature(temperature)
+        # Score mapping
+        self.scores_map: ScoresMap = {"PASS": 1.0, "FAIL": 0.0}
+        self.grader_id_or_key = grader_id or grader_key
+
+        self._original_shots = shots or DEFAULT_SHOTS
+        self._shots = self._format_user_shots(shots or DEFAULT_SHOTS)
+
+    @property
+    def shots(self) -> list[BinaryJudgeShot]:
+        return self._original_shots
+
+    @shots.setter
+    def shots(self, shots: list[BinaryJudgeShot]):
+        self._original_shots = shots
+        self._shots = self._format_user_shots(shots)
+
+    @staticmethod
+    def _extract_user_template_kwargs(thread: StringThread) -> dict[str, str]:
+        validate_thread_last_assistant(thread)
+        # Separate conversation context from last user turn
+        context_turns, user_question = separate_context_from_last_user_turn(thread)
+        context_str = stringify_thread(StringThread(context_turns))
+        completion = remove_reasoning(thread.last_content())
+
+        assert user_question, "There must be at least one user turn"
+        return dict(
+            context=context_str,
+            user_question=user_question,
+            completion=completion,
+        )
+
+    def _get_placeholder_reasoning(self, score: Literal["PASS", "FAIL", "NA"]) -> str:
+        if score == "PASS":
+            return "The completions complies with the criteria"
+        elif score == "FAIL":
+            return "The completion does not comply with the criteria"
+        else:
+            return "The criteria is not applicable to the completion"
+
+    def _format_user_shots(self, shots: list[BinaryJudgeShot]) -> list[SingleTurnShot]:
+        """
+        Turn a possibly multi turn example into a single turn one,
+        with appropriate kwargs to format the task's prompt templates
+        """
+        new_shots: list[SingleTurnShot] = []
+        for shot in shots:
+            placeholder_reasoning = self._get_placeholder_reasoning(shot.score)
+
+            user_template_kwargs = self._extract_user_template_kwargs(shot.thread)
+            user_template_kwargs["criteria"] = shot.criteria or self.criteria
+            single_turn_shot = SingleTurnShot(
+                user=user_template_kwargs,
+                assistant={
+                    "json_answer": render_pydantic_model(
+                        BinaryJudgeOutput(
+                            reasoning=shot.reasoning or placeholder_reasoning,
+                            score=shot.score,
+                        )
+                    )
+                },
+            )
+            new_shots.append(single_turn_shot)
+
+        return new_shots
+
+    def _get_judge_prompt(self, thread: StringThread) -> StringThread:
+        """Build the judging prompt for a given sample."""
+        # build the real user template kwargs
+        user_template_kwargs = self._extract_user_template_kwargs(thread)
+        user_template_kwargs["criteria"] = self.criteria
+        # system kwarg
+        output_json_schema = render_schema(BinaryJudgeOutput)
+
+        # system
+        prompt = StringThread().system(SYSTEM.format(json_schema=output_json_schema))
+        # shots
+        for shot in self._shots:
+            prompt = prompt.user(USER.format(**shot["user"]))
+            prompt = prompt.assistant(shot["assistant"]["json_answer"])
+        # real input
+        prompt = prompt.user(USER.format(**user_template_kwargs))
+
+        return prompt
+
+    async def grade(self, sample: StringThread) -> Grade:
+        judging_prompt = self._get_judge_prompt(sample)
+        str_prompt = stringify_thread(judging_prompt, sep=f"\n\n{'-' * 10}\n\n")
+
+        try:
+            _, parsed_output = await self.model.generate_and_validate(judging_prompt, BinaryJudgeOutput)
+        except JsonParseError as e:
+            self.add_log({"prompt": str_prompt, "error": f"{str(e)}\n\nCOMPLETION:\n{e.completion}"})
+            raise
+        except Exception as e:
+            self.add_log({"prompt": str_prompt, "error": str(e)})
+            raise
+
+        float_score = self.scores_map.get(parsed_output.score)
+
+        # NA case, ignore score
+        if float_score is None:
+            self.add_log({"prompt": str_prompt, "error": f"Non applicable score: {parsed_output.reasoning}"})
+            raise IgnoreScoreException(f"Non applicable score: {parsed_output.reasoning}")
+
+        else:
+            grade = Grade(value=float_score, grader_key=self.grader_id_or_key, reasoning=parsed_output.reasoning)
+            self.add_log({"score": float_score, "prompt": str_prompt, "reasoning": parsed_output.reasoning})
+
+        return grade
+
+    def add_log(self, log: SuccessJudgeLog | FailedJudgeLog) -> None:  # type: ignore[override]
+        self._logs.append(log)
+
+    def get_logs(self, clear: bool = False, log_all_samples: bool = False) -> dict[str, float | Table]:
+        # Only clear logs at the end if clear is True
+        logs = super().get_logs(clear=False)
+
+        # get sample of PASS and FAIL samples to log in table
+        successfully_scored_samples = [log for log in self._logs if "score" in log]
+        if not log_all_samples:
+            shuffle(successfully_scored_samples)
+            samples_score_0 = [log for log in successfully_scored_samples if log["score"] == self.scores_map["FAIL"]][
+                :5
+            ]
+            samples_score_1 = [log for log in successfully_scored_samples if log["score"] == self.scores_map["PASS"]][
+                :5
+            ]
+            subset_successfully_scored_samples = samples_score_0 + samples_score_1
+        else:
+            subset_successfully_scored_samples = successfully_scored_samples
+
+        # get failed samples to log in table
+        failed_scored_samples = [log for log in self._logs if "error" in log]
+
+        sample_logs = self.get_sample_tables(subset_successfully_scored_samples, failed_scored_samples)
+        logs.update(sample_logs)
+
+        if clear:
+            self.clear_logs()
+
+        return logs
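Using the judge end to end requires an `InferenceModel`. A hedged sketch, assuming `model` has already been spawned (for example via the `get_model` helper in `base_grader.py` above) and importing `BinaryJudgeGrader` from the module path listed in the file index; the sample conversation and criteria are invented for illustration.

```python
from adaptive_harmony import InferenceModel, StringThread
from adaptive_harmony.graders.binary_judge.binary_judge import BinaryJudgeGrader


async def grade_politeness(model: InferenceModel) -> None:
    # `model` is assumed to be an already-spawned InferenceModel.
    judge = BinaryJudgeGrader(
        grader_key="politeness",
        model=model,
        criteria="Completion must be polite, with use of proper language and no slang.",
    )
    sample = (
        StringThread()
        .system("You are a customer service assistant.")
        .user("My parcel never arrived. What now?")
        .assistant("Sorry to hear that! I have opened a claim and a replacement ships today.")
    )
    # PASS maps to 1.0 and FAIL to 0.0 via scores_map; an NA verdict raises IgnoreScoreException.
    grade = await judge.grade(sample)
    print(grade.value, grade.reasoning)
```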

--- /dev/null
+++ adaptive_harmony/graders/binary_judge/prompts.py
@@ -0,0 +1,125 @@
+from typing import Literal, NamedTuple
+
+from harmony_client.runtime.data import CustomJudgeExample
+
+from adaptive_harmony import StringThread
+
+SYSTEM = """You are an expert evaluator that evaluates completions generated by an AI model on a fixed criterion.
+You will be given all elements of an interaction between human an AI model:
+The full context of the conversation so far leading up to the last user turn/question is under the CONTEXT header. It may contain extra contextual information.
+The last user turn/question is under the USER QUESTION header. It may contain extra contextual information.
+The model's completion is under the COMPLETION TO EVALUATE header.
+The evaluation criterion is under the EVALUATION CRITERION section.
+
+In order to to analyze and score a completion, you always run the following steps without exception:
+First, you read the CONTEXT, USER QUESTION and COMPLETION TO EVALUATE.
+Then, you analyze the COMPLETION TO EVALUATE, and assign it a PASS, FAIL or NA score according to the criterion: FAIL if the completion does not meet the criterion, PASS if it does, and NA if the criterion is not applicable to the example. These are the rules to follow:
+- You must always evaluate the COMPLETION TO EVALUATE based solely on the USER QUESTION, and never on an intermediary question that might have been asked in the CONTEXT. The CONTEXT is there for context only.
+- Do not make any judgement on text that is in the CONTEXT or USER QUESTION; you are evaluating the COMPLETION TO EVALUATE text only.
+- You must not use the original instructions given to the model in the CONTEXT for your judgement. Focus only on the EVALUATION CRITERION, without any other influencing factors.
+- You are forbidden to return a score other than PASS, FAIL or NA for each criterion.
+- If the criterion is conditional, and is not applicable to the specific USER QUESTION + COMPLETION TO EVALUATE pair, you must score it as NA.
+- Return a single score, no matter how many things are evaluated or contemplated in the criterion. A PASS means the completion complied with everything.
+
+Finally, output an explanation for your judgement and the score for the criterion, as exemplified below. The output should be a well-formatted JSON string that conforms to the JSON schema below. Do not output anything else other than the JSON string.
+
+Here is the output JSON schema you must strictly follow, with field descriptions and value types. All fields are required.
+{json_schema}
+
+Evaluate only the final COMPLETION TO EVALUATE with regard to the USER QUESTION shown. Do not return any preamble or explanations, Return exactly one JSON string"""
+
+
+USER = """CONTEXT
+{context}
+
+USER QUESTION
+{user_question}
+
+COMPLETION TO EVALUATE
+{completion}
+
+EVALUATION CRITERION
+{criteria}
+
+OUTPUT
+"""
+
+
+class BinaryJudgeShot(NamedTuple):
+    """
+    Example shot for BinaryJudgeScorer.
+    If no criteria is specified, we assume the rest
+    of the shot relates to the criteria passed to the BinaryJudgeScorer.
+    """
+
+    thread: StringThread
+    reasoning: str | None
+    score: Literal["PASS", "FAIL", "NA"]
+    criteria: str | None = None
+
+
+def to_binary_judge_shot(example: CustomJudgeExample) -> BinaryJudgeShot:
+    return BinaryJudgeShot(
+        thread=StringThread([(turn.role, turn.content) for turn in example.input]).assistant(example.output),
+        reasoning=example.reasoning,
+        score="PASS" if example.pass_ else "FAIL",
+    )
+
+
+DEFAULT_SHOTS = [
+    BinaryJudgeShot(
+        thread=StringThread(
+            [
+                (
+                    "system",
+                    "You are a savy boating conosseur. You know everything there is to know about boats. Your responses to questions must be quick and short, almost dismissive.",
+                ),
+                ("user", " Hey, how are you doing?"),
+                ("assistant", "Doing great!"),
+                ("user", "Perfect. Tell me, how many masts does a ketch have?"),
+                ("assistant", "A ketch has two masts"),
+            ]
+        ),
+        reasoning="The completion is in English instead of french.",
+        score="FAIL",
+        criteria="The completion must be in french",
+    ),
+    BinaryJudgeShot(
+        thread=StringThread(
+            [
+                (
+                    "system",
+                    "You are a customer service assistant. You can use informal tone if the user is informal with you, but always make sure the customer's problem is fixed.",
+                ),
+                (
+                    "user",
+                    "Write a reply to a customer called John Smith that has requested for a refund of our product. Our company policy is to provide refunds up to 30 days after the purchase.",
+                ),
+                (
+                    "assistant",
+                    "Dear John Smith,\nThank you for reaching out regarding your request for a refund. We appreciate your business and understand that sometimes products may not meet expectations.\nAs per our company policy, we are happy to provide refunds within 30 days of your purchase. If your request falls within this timeframe and the product is in its original condition, we will process your refund promptly. Please ensure that you return the item to us along with proof of purchase.\nIf you have any questions or need assistance with the return process, feel free to contact us.",
+                ),
+            ]
+        ),
+        reasoning="The completion is in general polite to the customer.",
+        score="PASS",
+        criteria="Completion must be polite, with use of proper language and no slang.",
+    ),
+    BinaryJudgeShot(
+        thread=StringThread(
+            [
+                (
+                    "system",
+                    "You are a savy boating conosseur. You know everything there is to know about boats. Your responses to questions must be quick and short, almost dismissive.",
+                ),
+                ("user", " Hey, how are you doing?"),
+                ("assistant", "Doing great!"),
+                ("user", "Perfect. Tell me, how many masts does a ketch have?"),
+                ("assistant", "A ketch has two masts"),
+            ]
+        ),
+        reasoning="The user did not ask about hunting, criterion is not applicable",
+        score="NA",
+        criteria="If the user is asking about hunting topics, state that you cannot answer.",
+    ),
+]