adaptive-harmony 0.1.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. adaptive_harmony/__init__.py +162 -0
  2. adaptive_harmony/common/__init__.py +40 -0
  3. adaptive_harmony/common/callbacks.py +219 -0
  4. adaptive_harmony/common/checkpointing.py +163 -0
  5. adaptive_harmony/common/dpo.py +92 -0
  6. adaptive_harmony/common/env_grpo.py +361 -0
  7. adaptive_harmony/common/grpo.py +260 -0
  8. adaptive_harmony/common/gspo.py +70 -0
  9. adaptive_harmony/common/ppo.py +303 -0
  10. adaptive_harmony/common/rm.py +79 -0
  11. adaptive_harmony/common/sft.py +121 -0
  12. adaptive_harmony/core/__init__.py +0 -0
  13. adaptive_harmony/core/dataset.py +72 -0
  14. adaptive_harmony/core/display.py +93 -0
  15. adaptive_harmony/core/image_utils.py +110 -0
  16. adaptive_harmony/core/reasoning.py +12 -0
  17. adaptive_harmony/core/reward_client/__init__.py +19 -0
  18. adaptive_harmony/core/reward_client/client.py +160 -0
  19. adaptive_harmony/core/reward_client/reward_types.py +49 -0
  20. adaptive_harmony/core/reward_client/websocket_utils.py +18 -0
  21. adaptive_harmony/core/rich_counter.py +351 -0
  22. adaptive_harmony/core/rl_utils.py +38 -0
  23. adaptive_harmony/core/schedulers.py +38 -0
  24. adaptive_harmony/core/structured_output.py +385 -0
  25. adaptive_harmony/core/utils.py +365 -0
  26. adaptive_harmony/environment/__init__.py +8 -0
  27. adaptive_harmony/environment/environment.py +121 -0
  28. adaptive_harmony/evaluation/__init__.py +1 -0
  29. adaptive_harmony/evaluation/evaluation_artifact.py +67 -0
  30. adaptive_harmony/graders/__init__.py +20 -0
  31. adaptive_harmony/graders/answer_relevancy_judge/__init__.py +3 -0
  32. adaptive_harmony/graders/answer_relevancy_judge/answer_relevancy_judge.py +102 -0
  33. adaptive_harmony/graders/answer_relevancy_judge/prompts.py +58 -0
  34. adaptive_harmony/graders/base_grader.py +265 -0
  35. adaptive_harmony/graders/binary_judge/__init__.py +8 -0
  36. adaptive_harmony/graders/binary_judge/binary_judge.py +202 -0
  37. adaptive_harmony/graders/binary_judge/prompts.py +125 -0
  38. adaptive_harmony/graders/combined_grader.py +118 -0
  39. adaptive_harmony/graders/context_relevancy_judge/__init__.py +3 -0
  40. adaptive_harmony/graders/context_relevancy_judge/context_relevancy_judge.py +128 -0
  41. adaptive_harmony/graders/context_relevancy_judge/prompts.py +84 -0
  42. adaptive_harmony/graders/exceptions.py +9 -0
  43. adaptive_harmony/graders/faithfulness_judge/__init__.py +3 -0
  44. adaptive_harmony/graders/faithfulness_judge/faithfulness_judge.py +159 -0
  45. adaptive_harmony/graders/faithfulness_judge/prompts.py +22 -0
  46. adaptive_harmony/graders/range_judge/__init__.py +7 -0
  47. adaptive_harmony/graders/range_judge/prompts.py +232 -0
  48. adaptive_harmony/graders/range_judge/range_judge.py +188 -0
  49. adaptive_harmony/graders/range_judge/types.py +12 -0
  50. adaptive_harmony/graders/reward_server_grader.py +36 -0
  51. adaptive_harmony/graders/templated_prompt_judge.py +237 -0
  52. adaptive_harmony/graders/utils.py +79 -0
  53. adaptive_harmony/logging_table.py +1 -0
  54. adaptive_harmony/metric_logger.py +452 -0
  55. adaptive_harmony/parameters/__init__.py +2 -0
  56. adaptive_harmony/py.typed +0 -0
  57. adaptive_harmony/runtime/__init__.py +2 -0
  58. adaptive_harmony/runtime/context.py +2 -0
  59. adaptive_harmony/runtime/data.py +2 -0
  60. adaptive_harmony/runtime/decorators.py +2 -0
  61. adaptive_harmony/runtime/model_artifact_save.py +2 -0
  62. adaptive_harmony/runtime/runner.py +27 -0
  63. adaptive_harmony/runtime/simple_notifier.py +2 -0
  64. adaptive_harmony-0.1.23.dist-info/METADATA +37 -0
  65. adaptive_harmony-0.1.23.dist-info/RECORD +67 -0
  66. adaptive_harmony-0.1.23.dist-info/WHEEL +5 -0
  67. adaptive_harmony-0.1.23.dist-info/top_level.txt +1 -0
adaptive_harmony/graders/answer_relevancy_judge/prompts.py
@@ -0,0 +1,58 @@
+ SYSTEM = """You are an expert data reviewer.
+ You will be given a list of statements.
+ Your task is to determine whether each statement is relevant to addressing the user's input.
+
+ Since you are going to generate a verdict for each statement, the number of `verdicts` SHOULD BE STRICTLY EQUAL to the number of `statements`, and verdicts should be in the same order as the original statements.
+
+ You always output a JSON object with the following schema, and nothing else before or after:
+ {json_schema}
+
+ Examples:
+ {shots}
+ """
+
+ USER = """Your real task:
+ INPUT
+ {user_question}
+
+ STATEMENTS
+ {statements}
+
+ ```json"""
+
+
+ DEFAULT_SHOTS = """INPUT
+ What percentage is considered a good rental yield?
+
+ STATEMENTS
+ 0: How are you doing today?
+ 1: Rental yield is how much you could expect to receive in rent each year from your buy to let investment.
+ 2: Rental yield is expressed as a percentage - reflecting your rental income against the property's market value.
+ 3: Anything around the 5-6% mark could be considered a good rental yield.
+ 4: Anything above 6% could be considered a very good rental yield.
+
+ ```json
+ {
+ "verdicts": [
+ {
+ "reason": "The statement is unrelated to the input.",
+ "score": 0
+ },
+ {
+ "reason": "While the statement discusses rental yields, it does not indicate what constitutes a good rental yield.",
+ "score": 0
+ },
+ {
+ "reason": "While the statement mentions that yield is expressed as a percentage, it does not address the user question.",
+ "score": 0
+ },
+ {
+ "reason": "The statement addresses the user input, specifying what a good rental yield is.",
+ "score": 1
+ },
+ {
+ "reason": "The statement addresses the user input, specifying what a very good rental yield is.",
+ "score": 1
+ }
+ ]
+ }```"""
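For orientation, a rough sketch of how these `str.format` templates appear to be consumed; the judge that fills them (answer_relevancy_judge.py, listed above) is not part of this diff view, so the schema and statement strings below are purely illustrative.

from adaptive_harmony.graders.answer_relevancy_judge.prompts import DEFAULT_SHOTS, SYSTEM, USER

# Illustrative placeholder values; the real json_schema is rendered from a pydantic model elsewhere.
system_msg = SYSTEM.format(
    json_schema='{"verdicts": [{"reason": "<str>", "score": "<0 or 1>"}]}',
    shots=DEFAULT_SHOTS,
)
user_msg = USER.format(
    user_question="What percentage is considered a good rental yield?",
    statements="0: Anything around the 5-6% mark could be considered a good rental yield.",
)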
adaptive_harmony/graders/base_grader.py
@@ -0,0 +1,265 @@
+ import statistics
+ from abc import ABC, abstractmethod
+ from typing import Any, Awaitable, Callable, cast
+
+ from harmony_client import InferenceModel
+ from harmony_client.runtime.data import (
+     AdaptiveGrader,
+     CustomJudge,
+     PrebuiltConfigKey,
+     PrebuiltJudge,
+     RemoteRewardEndpoint,
+ )
+
+ from adaptive_harmony import Grade, StringThread
+ from adaptive_harmony.graders.utils import FailedJudgeLog, SuccessJudgeLog
+ from adaptive_harmony.logging_table import Table
+ from adaptive_harmony.parameters import Model
+ from adaptive_harmony.runtime import RecipeContext
+
+
+ class BaseGrader[LogType](ABC):
+     """
+     Base Grader to inherit from when building a scoring function.
+     """
+
+     def __init__(self, grader_key: str):
+         self._logs: list[LogType] = []
+         self.grader_key = grader_key
+
+     @abstractmethod
+     async def grade(self, sample: StringThread) -> Grade:
+         """
+         Grade a single sample.
+         Returns a single float score, with optional metadata.
+         Metadata can be useful for evals when LLM reasoning regarding the score is available.
+         """
+         pass
+
+     async def score_float_value(self, sample: StringThread) -> float:
+         """Returns only the float score from .grade"""
+         return (await self.grade(sample)).value
+
+     def add_log(self, log_data: LogType) -> None:
+         """Add a log entry to the scorer's log collection."""
+         self._logs.append(log_data)
+
+     def get_logs(self, clear: bool = False, log_all_samples: bool = False) -> dict[str, float | Table]:
+         """
+         Get aggregated logs from all score calls.
+         Base implementation computes statistics for "score" keys in individual logs.
+         If there are none, returns an empty dict.
+         """
+         if not self._logs:
+             return {}
+
+         scores = [s for s in [cast(dict[str, Any], log).get("score") for log in self._logs] if s is not None]
+         logs = {}
+         if scores:
+             logs.update(
+                 dict(
+                     **{
+                         f"score/{key}": value
+                         for key, value in dict(
+                             mean=statistics.mean(scores),
+                             std=statistics.stdev(scores) if len(scores) > 1 else 0.0,
+                             min=min(scores),
+                             max=max(scores),
+                             count=len(scores),
+                         ).items()
+                     },
+                 )
+             )
+         if clear:
+             self.clear_logs()
+         return logs
+
+     def clear_logs(self) -> None:
+         """
+         Clear all accumulated logs.
+         """
+         self._logs.clear()
+
+     def get_sample_tables(
+         self, successful_samples: list[SuccessJudgeLog], failed_samples: list[FailedJudgeLog] | None = None
+     ):
+         table_logs = {}
+         scored_samples = (
+             Table()
+             .add_column("Prompt", [log["prompt"] for log in successful_samples])
+             .add_column("Reasoning", [log.get("reasoning") for log in successful_samples])
+             .add_column("Score", [float(log["score"]) for log in successful_samples])
+         )
+         if failed_samples:
+             unscored_samples = (
+                 Table()
+                 .add_column("Prompt", [log.get("prompt") for log in failed_samples])
+                 .add_column("Error", [str(log["error"]) for log in failed_samples])
+             )
+             table_logs["score/unscored_samples"] = unscored_samples
+         table_logs["score/scored_samples"] = scored_samples
+         table_logs["score/unscored_samples_count"] = len(failed_samples) if failed_samples else 0
+         table_logs["score/scored_samples_count"] = len(successful_samples)
+         return table_logs
+
+     @classmethod
+     def from_function(
+         cls, grader_key: str, async_fn: Callable[[StringThread], Awaitable[float]]
+     ) -> "BaseGrader[dict[str, Any]]":
+         class FunctionScorer(BaseGrader[dict[str, float]]):
+             def __init__(self):
+                 super().__init__(grader_key)
+
+             async def grade(self, sample: StringThread) -> Grade:
+                 result = await async_fn(sample)
+                 grade = Grade(value=result, grader_key=self.grader_key)
+                 self.add_log({"score": result})
+                 return grade
+
+         return FunctionScorer()
+
+     @classmethod
+     async def from_config(
+         cls,
+         grader_config: AdaptiveGrader,
+         ctx: RecipeContext,
+         tp: int | None = None,
+         kv_cache_len: int | None = None,
+         max_tokens: int | None = None,
+     ) -> "BaseGrader[dict[str, Any]]":
+         match grader_config.config.type:
+             case "Judge":
+                 config = cast(CustomJudge, grader_config.config)
+                 return await cls.from_templated_judge(
+                     grader_config.key, str(grader_config.grader_id), config, ctx, tp, kv_cache_len, max_tokens
+                 )
+             case "Prebuilt":
+                 config = cast(PrebuiltJudge, grader_config.config)
+                 return await cls.from_prebuilt_judge(
+                     grader_config.key, str(grader_config.grader_id), config, ctx, tp, kv_cache_len
+                 )
+             case "Remote":
+                 config = cast(RemoteRewardEndpoint, grader_config.config)
+                 return cls.from_remote_reward_endpoint(grader_config.key, str(grader_config.grader_id), config)
+             case _:
+                 raise ValueError(f"Invalid grader type: {grader_config.config.type}")
+
+     @classmethod
+     async def from_templated_judge(
+         cls,
+         grader_key: str,
+         grader_id: str,
+         config: CustomJudge,
+         ctx: RecipeContext,
+         tp: int | None = None,
+         kv_cache_len: int | None = None,
+         max_tokens: int | None = None,
+     ) -> "BaseGrader[dict[str, Any]]":
+         # Import here to avoid circular dependency
+         from adaptive_harmony.graders.templated_prompt_judge import (
+             BinaryJudgeOutput,
+             TemplatedPromptJudgeGrader,
+         )
+
+         # Convert examples to template variables
+         examples = []
+         for example in config.examples:
+             examples.append(
+                 {
+                     "context_str": (
+                         "\n".join(f"{msg.role}:\n{msg.content}" for msg in example.input[:-1])
+                         if len(example.input) > 1
+                         else ""
+                     ),
+                     "user_question": example.input[-1].content if example.input else "",
+                     "completion": example.output,
+                     "output_json": f'{{"reasoning": "{example.reasoning or ""}", "score": "{"PASS" if example.pass_ else "FAIL"}"}}',
+                 }
+             )
+
+         template_vars = {
+             "criteria": config.criteria,
+             "examples": examples,
+         }
+
+         model = await get_model(ctx, grader_key, config.model_key, kv_cache_len, max_tokens, tp)
+
+         return TemplatedPromptJudgeGrader(
+             grader_key=grader_key,
+             model=model,
+             grader_id=grader_id,
+             system_template=config.system_template,
+             user_template=config.user_template,
+             output_model=BinaryJudgeOutput,
+             template_variables=template_vars,
+         )  # type: ignore[return-value]
+
+     @classmethod
+     async def from_prebuilt_judge(
+         cls,
+         grader_key: str,
+         grader_id: str,
+         config: PrebuiltJudge,
+         ctx: RecipeContext,
+         tp: int | None = None,
+         kv_cache_len: int | None = None,
+         max_tokens: int | None = None,
+     ) -> "BaseGrader[dict[str, Any]]":
+         model = await get_model(ctx, grader_key, config.model_key, kv_cache_len, max_tokens, tp)
+
+         match config.prebuilt_config_key:
+             case PrebuiltConfigKey.Faithfulness:
+                 # Import here to avoid circular dependency
+                 from adaptive_harmony.graders.faithfulness_judge.faithfulness_judge import FaithfulnessGrader
+
+                 return FaithfulnessGrader(
+                     grader_key=grader_key,
+                     grader_id=grader_id,
+                     model=model,
+                 )
+             case PrebuiltConfigKey.AnswerRelevancy:
+                 # Import here to avoid circular dependency
+                 from adaptive_harmony.graders.answer_relevancy_judge.answer_relevancy_judge import AnswerRelevancyGrader
+
+                 return AnswerRelevancyGrader(
+                     grader_key=grader_key,
+                     grader_id=grader_id,
+                     model=model,
+                 )
+             case PrebuiltConfigKey.ContextRelevancy:
+                 # Import here to avoid circular dependency
+                 from adaptive_harmony.graders.context_relevancy_judge.context_relevancy_judge import (
+                     ContextRelevancyGrader,
+                 )
+
+                 return ContextRelevancyGrader(
+                     grader_key=grader_key,
+                     grader_id=grader_id,
+                     model=model,
+                 )
+             case _:
+                 raise ValueError(f"Invalid prebuilt judge type: {config.prebuilt_config_key}")
+
+     @classmethod
+     def from_remote_reward_endpoint(
+         cls, grader_key: str, grader_id: str, config: RemoteRewardEndpoint
+     ) -> "BaseGrader[dict[str, Any]]":
+         # Import here to avoid circular dependency
+         from adaptive_harmony.graders.reward_server_grader import RewardServerGrader
+
+         return RewardServerGrader(grader_key=grader_key, grader_id=grader_id, reward_server_ip=config.url)
+
+
+ async def get_model(
+     ctx: RecipeContext,
+     grader_key: str,
+     model_key: str,
+     kv_cache_len: int | None,
+     max_tokens: int | None,
+     tp: int | None,
+ ) -> InferenceModel:
+     model_builder = await Model(model_key=model_key).to_builder(
+         ctx, kv_cache_len=kv_cache_len, tokens_to_generate=max_tokens, tp=tp
+     )
+     model = await model_builder.spawn_inference(grader_key)
+     return model
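For orientation, a minimal usage sketch of the `from_function` factory above, wrapping a plain async scoring function; the length-based reward, its 200-character threshold, and the grader key are invented for the example and are not part of the package.

from adaptive_harmony import StringThread
from adaptive_harmony.graders import BaseGrader

# Hypothetical reward: 1.0 when the last assistant turn stays under 200 characters, else 0.0.
async def short_answer_reward(sample: StringThread) -> float:
    return 1.0 if len(sample.last_content()) <= 200 else 0.0

# Wrap the coroutine in a grader; each call to grade() also records a {"score": ...} log entry.
length_grader = BaseGrader.from_function("short_answer", short_answer_reward)
# grade = await length_grader.grade(thread)   # Grade(value=..., grader_key="short_answer")
# stats = length_grader.get_logs(clear=True)  # e.g. {"score/mean": ..., "score/count": ...}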
adaptive_harmony/graders/binary_judge/__init__.py
@@ -0,0 +1,8 @@
+ from .binary_judge import BinaryJudgeGrader, BinaryJudgeOutput
+ from .prompts import BinaryJudgeShot
+
+ __all__ = [
+     "BinaryJudgeGrader",
+     "BinaryJudgeOutput",
+     "BinaryJudgeShot",
+ ]
adaptive_harmony/graders/binary_judge/binary_judge.py
@@ -0,0 +1,202 @@
+ from random import shuffle
+ from typing import Literal, TypedDict
+
+ from pydantic import BaseModel, Field
+
+ from adaptive_harmony import InferenceModel, StringThread
+ from adaptive_harmony.core.reasoning import remove_reasoning
+ from adaptive_harmony.core.structured_output import JsonParseError, render_pydantic_model, render_schema
+ from adaptive_harmony.core.utils import SingleTurnShot, stringify_thread
+ from adaptive_harmony.graders import BaseGrader, Grade
+ from adaptive_harmony.graders.binary_judge.prompts import DEFAULT_SHOTS, SYSTEM, USER, BinaryJudgeShot
+ from adaptive_harmony.graders.exceptions import IgnoreScoreException
+ from adaptive_harmony.graders.utils import (
+     FailedJudgeLog,
+     SuccessJudgeLog,
+     separate_context_from_last_user_turn,
+     validate_thread_last_assistant,
+ )
+ from adaptive_harmony.logging_table import Table
+
+
+ class BinaryJudgeOutput(BaseModel):
+     reasoning: str = Field(description="Reasoning to support the rationale behind the score")
+     score: Literal["PASS", "FAIL", "NA"] = Field(description="The score for the sample")
+
+
+ class ScoresMap(TypedDict):
+     PASS: float
+     FAIL: float
+
+
+ OPENAI_MODEL_FAMILIES_TEMPERATURE_1_ONLY = ["gpt-5", "o1", "o3", "o4"]
+
+
+ class BinaryJudgeGrader(BaseGrader):
+     """
+     Binary judge for scoring samples as PASS, FAIL or NA using few-shot prompting.
+     If custom shots are provided, they are used instead of the default shots.
+     """
+
+     def __init__(
+         self,
+         grader_key: str,
+         model: InferenceModel,
+         criteria: str,
+         shots: list[BinaryJudgeShot] | None = None,
+         temperature: float = 0.0,
+         grader_id: str | None = None,
+     ):
+         super().__init__(grader_key)
+         self._logs: list[SuccessJudgeLog | FailedJudgeLog] = []  # type: ignore[assignment]
+         self.criteria = criteria
+         self.model = model
+         self.temperature = temperature
+         # Set temperature to 1.0 if model_key is an OpenAI model in the temperature-1-only list
+         model_path: str = model.get_builder_args().get("path")  # type: ignore[assignment]
+         if model_path.startswith("openai://"):
+             model_name = model_path.removeprefix("openai://").split("?")[0]
+             if any(model_name.startswith(model) for model in OPENAI_MODEL_FAMILIES_TEMPERATURE_1_ONLY):
+                 temperature = 1.0
+         self.model = model.temperature(temperature)
+         # Score mapping
+         self.scores_map: ScoresMap = {"PASS": 1.0, "FAIL": 0.0}
+         self.grader_id_or_key = grader_id or grader_key
+
+         self._original_shots = shots or DEFAULT_SHOTS
+         self._shots = self._format_user_shots(shots or DEFAULT_SHOTS)
+
+     @property
+     def shots(self) -> list[BinaryJudgeShot]:
+         return self._original_shots
+
+     @shots.setter
+     def shots(self, shots: list[BinaryJudgeShot]):
+         self._original_shots = shots
+         self._shots = self._format_user_shots(shots)
+
+     @staticmethod
+     def _extract_user_template_kwargs(thread: StringThread) -> dict[str, str]:
+         validate_thread_last_assistant(thread)
+         # Separate conversation context from last user turn
+         context_turns, user_question = separate_context_from_last_user_turn(thread)
+         context_str = stringify_thread(StringThread(context_turns))
+         completion = remove_reasoning(thread.last_content())
+
+         assert user_question, "There must be at least one user turn"
+         return dict(
+             context=context_str,
+             user_question=user_question,
+             completion=completion,
+         )
+
+     def _get_placeholder_reasoning(self, score: Literal["PASS", "FAIL", "NA"]) -> str:
+         if score == "PASS":
+             return "The completion complies with the criteria"
+         elif score == "FAIL":
+             return "The completion does not comply with the criteria"
+         else:
+             return "The criteria is not applicable to the completion"
+
+     def _format_user_shots(self, shots: list[BinaryJudgeShot]) -> list[SingleTurnShot]:
+         """
+         Turn a possibly multi-turn example into a single-turn one,
+         with appropriate kwargs to format the task's prompt templates
+         """
+         new_shots: list[SingleTurnShot] = []
+         for shot in shots:
+             placeholder_reasoning = self._get_placeholder_reasoning(shot.score)
+
+             user_template_kwargs = self._extract_user_template_kwargs(shot.thread)
+             user_template_kwargs["criteria"] = shot.criteria or self.criteria
+             single_turn_shot = SingleTurnShot(
+                 user=user_template_kwargs,
+                 assistant={
+                     "json_answer": render_pydantic_model(
+                         BinaryJudgeOutput(
+                             reasoning=shot.reasoning or placeholder_reasoning,
+                             score=shot.score,
+                         )
+                     )
+                 },
+             )
+             new_shots.append(single_turn_shot)
+
+         return new_shots
+
+     def _get_judge_prompt(self, thread: StringThread) -> StringThread:
+         """Build the judging prompt for a given sample."""
+         # build the real user template kwargs
+         user_template_kwargs = self._extract_user_template_kwargs(thread)
+         user_template_kwargs["criteria"] = self.criteria
+         # system kwarg
+         output_json_schema = render_schema(BinaryJudgeOutput)
+
+         # system
+         prompt = StringThread().system(SYSTEM.format(json_schema=output_json_schema))
+         # shots
+         for shot in self._shots:
+             prompt = prompt.user(USER.format(**shot["user"]))
+             prompt = prompt.assistant(shot["assistant"]["json_answer"])
+         # real input
+         prompt = prompt.user(USER.format(**user_template_kwargs))
+
+         return prompt
+
+     async def grade(self, sample: StringThread) -> Grade:
+         judging_prompt = self._get_judge_prompt(sample)
+         str_prompt = stringify_thread(judging_prompt, sep=f"\n\n{'-' * 10}\n\n")
+
+         try:
+             _, parsed_output = await self.model.generate_and_validate(judging_prompt, BinaryJudgeOutput)
+         except JsonParseError as e:
+             self.add_log({"prompt": str_prompt, "error": f"{str(e)}\n\nCOMPLETION:\n{e.completion}"})
+             raise
+         except Exception as e:
+             self.add_log({"prompt": str_prompt, "error": str(e)})
+             raise
+
+         float_score = self.scores_map.get(parsed_output.score)
+
+         # NA case, ignore score
+         if float_score is None:
+             self.add_log({"prompt": str_prompt, "error": f"Non applicable score: {parsed_output.reasoning}"})
+             raise IgnoreScoreException(f"Non applicable score: {parsed_output.reasoning}")
+
+         else:
+             grade = Grade(value=float_score, grader_key=self.grader_id_or_key, reasoning=parsed_output.reasoning)
+             self.add_log({"score": float_score, "prompt": str_prompt, "reasoning": parsed_output.reasoning})
+
+         return grade
+
+     def add_log(self, log: SuccessJudgeLog | FailedJudgeLog) -> None:  # type: ignore[override]
+         self._logs.append(log)
+
+     def get_logs(self, clear: bool = False, log_all_samples: bool = False) -> dict[str, float | Table]:
+         # Only clear logs at the end if clear is True
+         logs = super().get_logs(clear=False)
+
+         # get sample of PASS and FAIL samples to log in table
+         successfully_scored_samples = [log for log in self._logs if "score" in log]
+         if not log_all_samples:
+             shuffle(successfully_scored_samples)
+             samples_score_0 = [log for log in successfully_scored_samples if log["score"] == self.scores_map["FAIL"]][
+                 :5
+             ]
+             samples_score_1 = [log for log in successfully_scored_samples if log["score"] == self.scores_map["PASS"]][
+                 :5
+             ]
+             subset_successfully_scored_samples = samples_score_0 + samples_score_1
+         else:
+             subset_successfully_scored_samples = successfully_scored_samples
+
+         # get failed samples to log in table
+         failed_scored_samples = [log for log in self._logs if "error" in log]
+
+         sample_logs = self.get_sample_tables(subset_successfully_scored_samples, failed_scored_samples)
+         logs.update(sample_logs)
+
+         if clear:
+             self.clear_logs()
+
+         return logs
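A hedged usage sketch for the class above: `judge_model` stands in for an already-spawned `InferenceModel` (for example one produced by the `get_model` helper in base_grader.py), and the criteria string is borrowed from the default shots; the grader key and sample thread are invented for the example.

from adaptive_harmony import StringThread
from adaptive_harmony.graders.binary_judge import BinaryJudgeGrader

# judge_model: an already-spawned InferenceModel (obtained elsewhere, e.g. via a model builder).
grader = BinaryJudgeGrader(
    grader_key="politeness",
    model=judge_model,
    criteria="Completion must be polite, with use of proper language and no slang.",
)

thread = StringThread().user("Can I get a refund for my order?").assistant("Sure, happy to help. Here is how to start the return.")
# grade = await grader.grade(thread)   # value 1.0 (PASS) or 0.0 (FAIL); an NA verdict raises IgnoreScoreException
# logs = grader.get_logs(clear=True)   # aggregate stats plus scored/unscored sample tables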
adaptive_harmony/graders/binary_judge/prompts.py
@@ -0,0 +1,125 @@
+ from typing import Literal, NamedTuple
+
+ from harmony_client.runtime.data import CustomJudgeExample
+
+ from adaptive_harmony import StringThread
+
+ SYSTEM = """You are an expert evaluator that evaluates completions generated by an AI model on a fixed criterion.
+ You will be given all elements of an interaction between a human and an AI model:
+ The full context of the conversation so far leading up to the last user turn/question is under the CONTEXT header. It may contain extra contextual information.
+ The last user turn/question is under the USER QUESTION header. It may contain extra contextual information.
+ The model's completion is under the COMPLETION TO EVALUATE header.
+ The evaluation criterion is under the EVALUATION CRITERION section.
+
+ In order to analyze and score a completion, you always run the following steps without exception:
+ First, you read the CONTEXT, USER QUESTION and COMPLETION TO EVALUATE.
+ Then, you analyze the COMPLETION TO EVALUATE, and assign it a PASS, FAIL or NA score according to the criterion: FAIL if the completion does not meet the criterion, PASS if it does, and NA if the criterion is not applicable to the example. These are the rules to follow:
+ - You must always evaluate the COMPLETION TO EVALUATE based solely on the USER QUESTION, and never on an intermediary question that might have been asked in the CONTEXT. The CONTEXT is there for context only.
+ - Do not make any judgement on text that is in the CONTEXT or USER QUESTION; you are evaluating the COMPLETION TO EVALUATE text only.
+ - You must not use the original instructions given to the model in the CONTEXT for your judgement. Focus only on the EVALUATION CRITERION, without any other influencing factors.
+ - You are forbidden to return a score other than PASS, FAIL or NA for each criterion.
+ - If the criterion is conditional, and is not applicable to the specific USER QUESTION + COMPLETION TO EVALUATE pair, you must score it as NA.
+ - Return a single score, no matter how many things are evaluated or contemplated in the criterion. A PASS means the completion complied with everything.
+
+ Finally, output an explanation for your judgement and the score for the criterion, as exemplified below. The output should be a well-formatted JSON string that conforms to the JSON schema below. Do not output anything else other than the JSON string.
+
+ Here is the output JSON schema you must strictly follow, with field descriptions and value types. All fields are required.
+ {json_schema}
+
+ Evaluate only the final COMPLETION TO EVALUATE with regard to the USER QUESTION shown. Do not return any preamble or explanations. Return exactly one JSON string."""
+
+
+ USER = """CONTEXT
+ {context}
+
+ USER QUESTION
+ {user_question}
+
+ COMPLETION TO EVALUATE
+ {completion}
+
+ EVALUATION CRITERION
+ {criteria}
+
+ OUTPUT
+ """
+
+
+ class BinaryJudgeShot(NamedTuple):
+     """
+     Example shot for BinaryJudgeGrader.
+     If no criteria is specified, we assume the rest
+     of the shot relates to the criteria passed to the BinaryJudgeGrader.
+     """
+
+     thread: StringThread
+     reasoning: str | None
+     score: Literal["PASS", "FAIL", "NA"]
+     criteria: str | None = None
+
+
+ def to_binary_judge_shot(example: CustomJudgeExample) -> BinaryJudgeShot:
+     return BinaryJudgeShot(
+         thread=StringThread([(turn.role, turn.content) for turn in example.input]).assistant(example.output),
+         reasoning=example.reasoning,
+         score="PASS" if example.pass_ else "FAIL",
+     )
+
+
+ DEFAULT_SHOTS = [
+     BinaryJudgeShot(
+         thread=StringThread(
+             [
+                 (
+                     "system",
+                     "You are a savvy boating connoisseur. You know everything there is to know about boats. Your responses to questions must be quick and short, almost dismissive.",
+                 ),
+                 ("user", " Hey, how are you doing?"),
+                 ("assistant", "Doing great!"),
+                 ("user", "Perfect. Tell me, how many masts does a ketch have?"),
+                 ("assistant", "A ketch has two masts"),
+             ]
+         ),
+         reasoning="The completion is in English instead of French.",
+         score="FAIL",
+         criteria="The completion must be in French",
+     ),
+     BinaryJudgeShot(
+         thread=StringThread(
+             [
+                 (
+                     "system",
+                     "You are a customer service assistant. You can use informal tone if the user is informal with you, but always make sure the customer's problem is fixed.",
+                 ),
+                 (
+                     "user",
+                     "Write a reply to a customer called John Smith that has requested for a refund of our product. Our company policy is to provide refunds up to 30 days after the purchase.",
+                 ),
+                 (
+                     "assistant",
+                     "Dear John Smith,\nThank you for reaching out regarding your request for a refund. We appreciate your business and understand that sometimes products may not meet expectations.\nAs per our company policy, we are happy to provide refunds within 30 days of your purchase. If your request falls within this timeframe and the product is in its original condition, we will process your refund promptly. Please ensure that you return the item to us along with proof of purchase.\nIf you have any questions or need assistance with the return process, feel free to contact us.",
+                 ),
+             ]
+         ),
+         reasoning="The completion is generally polite to the customer.",
+         score="PASS",
+         criteria="Completion must be polite, with use of proper language and no slang.",
+     ),
+     BinaryJudgeShot(
+         thread=StringThread(
+             [
+                 (
+                     "system",
+                     "You are a savvy boating connoisseur. You know everything there is to know about boats. Your responses to questions must be quick and short, almost dismissive.",
+                 ),
+                 ("user", " Hey, how are you doing?"),
+                 ("assistant", "Doing great!"),
+                 ("user", "Perfect. Tell me, how many masts does a ketch have?"),
+                 ("assistant", "A ketch has two masts"),
+             ]
+         ),
+         reasoning="The user did not ask about hunting, criterion is not applicable",
+         score="NA",
+         criteria="If the user is asking about hunting topics, state that you cannot answer.",
+     ),
+ ]
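To close, a small sketch of supplying custom shots in place of DEFAULT_SHOTS via the NamedTuple above; the conversation, reasoning, and criteria are invented, and `grader` stands in for a BinaryJudgeGrader instance like the one sketched earlier.

from adaptive_harmony import StringThread
from adaptive_harmony.graders.binary_judge import BinaryJudgeShot

custom_shot = BinaryJudgeShot(
    thread=StringThread(
        [
            ("user", "Summarise our refund policy in one sentence."),
            ("assistant", "Refunds are available within 30 days of purchase."),
        ]
    ),
    reasoning="The completion is a single sentence, as required.",
    score="PASS",
    criteria="The completion must be a single sentence.",
)

# grader.shots = [custom_shot]   # the property setter re-renders the few-shot prompt turns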