speedy-utils 1.1.23__py3-none-any.whl → 1.1.25__py3-none-any.whl
This diff shows the changes between two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- llm_utils/__init__.py +12 -8
- llm_utils/chat_format/__init__.py +2 -0
- llm_utils/chat_format/display.py +115 -44
- llm_utils/lm/__init__.py +14 -6
- llm_utils/lm/llm.py +413 -0
- llm_utils/lm/llm_signature.py +35 -0
- llm_utils/lm/mixins.py +379 -0
- llm_utils/lm/openai_memoize.py +18 -7
- llm_utils/lm/signature.py +26 -37
- llm_utils/lm/utils.py +61 -76
- speedy_utils/__init__.py +31 -2
- speedy_utils/all.py +30 -1
- speedy_utils/common/utils_cache.py +142 -1
- speedy_utils/common/utils_io.py +36 -26
- speedy_utils/common/utils_misc.py +25 -1
- speedy_utils/multi_worker/thread.py +145 -58
- {speedy_utils-1.1.23.dist-info → speedy_utils-1.1.25.dist-info}/METADATA +1 -1
- {speedy_utils-1.1.23.dist-info → speedy_utils-1.1.25.dist-info}/RECORD +20 -19
- llm_utils/lm/llm_as_a_judge.py +0 -390
- llm_utils/lm/llm_task.py +0 -614
- {speedy_utils-1.1.23.dist-info → speedy_utils-1.1.25.dist-info}/WHEEL +0 -0
- {speedy_utils-1.1.23.dist-info → speedy_utils-1.1.25.dist-info}/entry_points.txt +0 -0
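The notable API change in this release is that `llm_utils/lm/llm_task.py` and `llm_utils/lm/llm_as_a_judge.py` are removed, with new `llm.py`, `llm_signature.py`, and `mixins.py` modules appearing in their place. A minimal sketch of a compatibility probe for downstream code, assuming only the module paths visible in this diff (the class names inside the new modules are not shown here):

```python
# Hypothetical compatibility probe for code migrating from 1.1.23 to 1.1.25.
# Only the module paths come from this diff; everything else is illustrative.
import importlib.util

def module_available(name: str) -> bool:
    """Return True if `name` resolves to an importable module."""
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        return False

# Removed in 1.1.25 (present in 1.1.23):
for old in ("llm_utils.lm.llm_task", "llm_utils.lm.llm_as_a_judge"):
    print(old, "->", module_available(old))

# Added in 1.1.25:
for new in ("llm_utils.lm.llm", "llm_utils.lm.llm_signature", "llm_utils.lm.mixins"):
    print(new, "->", module_available(new))
```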
{speedy_utils-1.1.23.dist-info → speedy_utils-1.1.25.dist-info}/RECORD

```diff
@@ -1,17 +1,18 @@
-llm_utils/__init__.py,sha256=…
+llm_utils/__init__.py,sha256=pbnOQddU5KnhP8uqMqN9E87BeDeCxFrgta2m2P89LmM,1591
 llm_utils/group_messages.py,sha256=Oe2tlhg-zRodG1-hodYebddrR77j9UdE05LzJw0EvYI,3622
-llm_utils/chat_format/__init__.py,sha256=…
-llm_utils/chat_format/display.py,sha256=…
+llm_utils/chat_format/__init__.py,sha256=MCNT8o-BZWmoOFE5VLyhJJOqHg8lJGqHXEKSXU08fK0,775
+llm_utils/chat_format/display.py,sha256=HiAOAC8FY7956gNuwE7rxii1MCCebn0avbXi1iIcDSc,17178
 llm_utils/chat_format/transform.py,sha256=eU0c3PdAHCNLuGP1UqPwln0B34Lv3bt_uV9v9BrlCN4,5402
 llm_utils/chat_format/utils.py,sha256=xTxN4HrLHcRO2PfCTR43nH1M5zCa7v0kTTdzAcGkZg0,1229
-llm_utils/lm/__init__.py,sha256=…
+llm_utils/lm/__init__.py,sha256=FBe8wVNWDMpvJ2kQYedJ3HH5L2BCAZBQVE0zEjND0Vo,729
 llm_utils/lm/base_prompt_builder.py,sha256=OLqyxbA8QeYIVFzB9EqxUiE_P2p4_MD_Lq4WSwxFtKU,12136
-llm_utils/lm/…
-llm_utils/lm/…
+llm_utils/lm/llm.py,sha256=uk45JhVcWDMaqezn9Yn_K5hehFSmQ4txU901fn_PcQg,16262
+llm_utils/lm/llm_signature.py,sha256=SP72cWXaVGcZs3m2V361DcLk_St7aYJamNapUiFBB6Q,1242
 llm_utils/lm/lm_base.py,sha256=pqbHZOdR7yUMpvwt8uBG1dZnt76SY_Wk8BkXQQ-mpWs,9557
-llm_utils/lm/…
-llm_utils/lm/…
-llm_utils/lm/…
+llm_utils/lm/mixins.py,sha256=Sn5KyPKGCT_HVJmmosmy3XSlZ0_k5Kds0VvSJqeUDpI,13695
+llm_utils/lm/openai_memoize.py,sha256=PDs3YCXKgHXaHlegkhouzPtf2Gom_o7pvzChCT-NQyQ,3870
+llm_utils/lm/signature.py,sha256=16QOHnGc-p7H8rR3j1dPg8AokdV_rEGUYCGGkIHIghE,10240
+llm_utils/lm/utils.py,sha256=oiJ50b8WV6oktnW4BByr1gRaGc55VJeF3IyhHqoofp4,12193
 llm_utils/lm/async_lm/__init__.py,sha256=PUBbCuf5u6-0GBUu-2PI6YAguzsyXj-LPkU6vccqT6E,121
 llm_utils/lm/async_lm/_utils.py,sha256=P1-pUDf_0pDmo8WTIi43t5ARlyGA1RIJfpAhz-gfA5g,6105
 llm_utils/lm/async_lm/async_llm_task.py,sha256=-BVOk18ZD8eC2obTLgiPq39f2PP3cji17Ku-Gb7c7Xo,18683
@@ -26,8 +27,8 @@ llm_utils/vector_cache/cli.py,sha256=DMXTj8nZ2_LRjprbYPb4uzq04qZtOfBbmblmaqDcCuM
 llm_utils/vector_cache/core.py,sha256=J8ocRX9sBfzboQkf5vFF2cx0SK-nftmKWJUa91WUBy8,31134
 llm_utils/vector_cache/types.py,sha256=ru8qmUZ8_lNd3_oYpjCMtpXTsqmwsSBe56Z4hTWm3xI,435
 llm_utils/vector_cache/utils.py,sha256=dwbbXlRrARrpmS4YqSlYQqrTURg0UWe8XvaAWcX05MM,1458
-speedy_utils/__init__.py,sha256=…
-speedy_utils/all.py,sha256=…
+speedy_utils/__init__.py,sha256=wPz1MNAicV7skqqZloUFt5QrJcAhxtPQ4jFXk2lz6YA,6190
+speedy_utils/all.py,sha256=gXXRlBLvU8AON7XqO6iFQ8LCIQEIcP_2CDumd_U1ppI,5171
 speedy_utils/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 speedy_utils/common/clock.py,sha256=3n4FkCW0dz46O8By09V5Pve1DSMgpLDRbWEVRryryeQ,7423
 speedy_utils/common/function_decorator.py,sha256=BspJ0YuGL6elS7lWBAgELZ-sCfED_1N2P5fgH-fCRUQ,2132
@@ -35,17 +36,17 @@ speedy_utils/common/logger.py,sha256=a2iZx0eWyfi2-2X_H2QmfuA3tfR7_XSM7Nd0GdUnUOs
 speedy_utils/common/notebook_utils.py,sha256=-97kehJ_Gg3TzDLubsLIYJcykqX1NXhbvBO6nniZSYM,2063
 speedy_utils/common/patcher.py,sha256=VCmdxyTF87qroggQkQklRPhAOPJbeBqhcJoTsLcDxNw,2303
 speedy_utils/common/report_manager.py,sha256=eBiw5KY6bWUhwki3B4lK5o8bFsp7L5x28X9GCI-Sd1w,3899
-speedy_utils/common/utils_cache.py,sha256=…
-speedy_utils/common/utils_io.py,sha256=…
-speedy_utils/common/utils_misc.py,sha256=…
+speedy_utils/common/utils_cache.py,sha256=h3JbIi0V5pTaFNJDjfwORSN63bc0SrRq_dm8KZJiL94,27023
+speedy_utils/common/utils_io.py,sha256=E7mbxB_OpLvNWoFM2Qpxi1jaD8VwF-tvNOpGbf7swuU,14849
+speedy_utils/common/utils_misc.py,sha256=yYlyP0eXQuapY1dn5O8-UDePPq5bb6FxKFjb1kfZy5o,2354
 speedy_utils/common/utils_print.py,sha256=syRrnSFtguxrV-elx6DDVcSGu4Qy7D_xVNZhPwbUY4A,4864
 speedy_utils/multi_worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 speedy_utils/multi_worker/process.py,sha256=RGGGnbZXCbEbdmxFVmnNfyccClAlflzRPE0d1C3CeeE,11385
-speedy_utils/multi_worker/thread.py,sha256=…
+speedy_utils/multi_worker/thread.py,sha256=bRjxUHkBjbXHQ2KSsf-Zao28zbSId-8mqMFHwSG1l1s,25206
 speedy_utils/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 speedy_utils/scripts/mpython.py,sha256=IvywP7Y0_V6tWfMP-4MjPvN5_KfxWF21xaLJsCIayCk,3821
 speedy_utils/scripts/openapi_client_codegen.py,sha256=f2125S_q0PILgH5dyzoKRz7pIvNEjCkzpi4Q4pPFRZE,9683
-speedy_utils-1.1.…
-speedy_utils-1.1.…
-speedy_utils-1.1.…
-speedy_utils-1.1.…
+speedy_utils-1.1.25.dist-info/METADATA,sha256=P99_Ej4mZJfouYjbGDsAjqodW4IJz7qBnvfR_V8RZ_k,8028
+speedy_utils-1.1.25.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+speedy_utils-1.1.25.dist-info/entry_points.txt,sha256=1rrFMfqvaMUE9hvwGiD6vnVh98kmgy0TARBj-v0Lfhs,244
+speedy_utils-1.1.25.dist-info/RECORD,,
```
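Each RECORD row has the form `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64 encoding of the file's raw SHA-256, with trailing `=` padding stripped (per PEP 376 and the wheel spec). A small sketch of re-verifying one of the new entries against an unpacked 1.1.25 wheel; the expected digest is taken from the diff above, while the `unpacked/` location is hypothetical:

```python
# Verify a RECORD entry against a file from the unpacked wheel.
import base64
import hashlib
from pathlib import Path

def record_digest(path: Path) -> str:
    """Compute the sha256 value exactly as it appears in RECORD."""
    digest = hashlib.sha256(path.read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Hypothetical unpack location; the expected value is the RECORD line above.
path = Path("unpacked/llm_utils/lm/llm.py")
expected = "uk45JhVcWDMaqezn9Yn_K5hehFSmQ4txU901fn_PcQg"
if path.exists():
    actual = record_digest(path)
    print("match" if actual == expected else f"mismatch: {actual}")
```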
llm_utils/lm/llm_as_a_judge.py
DELETED

@@ -1,390 +0,0 @@

```python
"""
LLM-as-a-Judge implementation with template support and SFT export utilities.

This module provides a base class for creating LLM judges with structured
prompts, variable substitution, and export capabilities for fine-tuning.
"""

import json
from typing import Any, Dict, List, Optional, Type, Union
from pydantic import BaseModel
from ..chat_format import get_conversation_one_turn
from .llm_task import LLMTask
from .signature import Signature


class LLMJudgeBase(LLMTask):
    """Base class for LLM judges with template support and SFT export."""

    def __init__(
        self,
        system_prompt_template: str,
        signature: Optional[Type[Signature]] = None,
        **kwargs
    ):
        """
        Initialize LLMJudgeBase.

        Args:
            system_prompt_template: System prompt template with {variable} placeholders
            signature: Optional Signature class for structured I/O
            **kwargs: Additional arguments passed to LLMTask
        """
        self.system_prompt_template = system_prompt_template
        self.signature = signature
        self.sft_data: List[Dict[str, Any]] = []  # Store SFT training examples

        # Set instruction from signature if available
        if signature is not None:
            instruction = signature.get_instruction()
            kwargs.setdefault('instruction', instruction)
            kwargs.setdefault('output_model', signature.get_output_model())
        else:
            kwargs.setdefault('instruction', system_prompt_template)

        super().__init__(**kwargs)

    def format_system_prompt(self, variables: Dict[str, Any]) -> str:
        """Format system prompt template with provided variables."""
        try:
            return self.system_prompt_template.format(**variables)
        except KeyError as e:
            missing_var = str(e).strip("'")
            raise ValueError(f"Missing required variable '{missing_var}' for system prompt template")

    def judge(
        self,
        input_data: Union[str, Dict[str, Any], BaseModel],
        variables: Optional[Dict[str, Any]] = None,
        **runtime_kwargs
    ) -> List[Dict[str, Any]]:
        """
        Execute judgment with variable substitution in system prompt.

        Args:
            input_data: Input data for the judge
            variables: Variables to substitute in system prompt template
            **runtime_kwargs: Additional runtime arguments

        Returns:
            List of judgment results
        """
        variables = variables or {}

        # Format system prompt with variables
        formatted_prompt = self.format_system_prompt(variables)

        # Temporarily override instruction
        original_instruction = self.instruction
        self.instruction = formatted_prompt

        try:
            # Handle different input types
            if isinstance(input_data, dict):
                processed_input = json.dumps(input_data)
            else:
                processed_input = input_data
            results = self(processed_input, **runtime_kwargs)

            # Store for SFT if needed
            self._store_sft_example(input_data, results, variables, formatted_prompt)

            return results
        finally:
            # Restore original instruction
            self.instruction = original_instruction

    def _store_sft_example(
        self,
        input_data: Union[str, Dict[str, Any], BaseModel],
        results: List[Dict[str, Any]],
        variables: Dict[str, Any],
        formatted_prompt: str
    ) -> None:
        """Store example for SFT export."""
        for result in results:
            # Create input text
            if isinstance(input_data, str):
                input_text = input_data
            elif isinstance(input_data, BaseModel):
                input_text = input_data.model_dump_json()
            elif isinstance(input_data, dict):
                input_text = json.dumps(input_data)
            else:
                input_text = str(input_data)

            # Extract output
            output_text = result['parsed']
            if isinstance(output_text, BaseModel):
                output_text = output_text.model_dump_json()
            elif not isinstance(output_text, str):
                output_text = str(output_text)

            # Create conversation format
            messages = get_conversation_one_turn(
                formatted_prompt,
                input_text,
                output_text
            )

            sft_example = {
                'messages': messages,
                'variables': variables,
                'input_data': input_data,
                'output': result['parsed']
            }

            self.sft_data.append(sft_example)

    def export_sft_data(self, format: str = 'messages') -> List[Dict[str, Any]]:
        """
        Export stored examples in SFT format.

        Args:
            format: Export format ('messages', 'full', or 'sharegpt')

        Returns:
            List of SFT training examples
        """
        if format == 'messages':
            return [{'messages': example['messages']} for example in self.sft_data]
        elif format == 'full':
            return self.sft_data
        elif format == 'sharegpt':
            # Convert to ShareGPT format
            sharegpt_data = []
            for example in self.sft_data:
                conversations = []
                for msg in example['messages']:
                    conversations.append({
                        'from': 'human' if msg['role'] == 'user' else 'gpt' if msg['role'] == 'assistant' else 'system',
                        'value': msg['content']
                    })
                sharegpt_data.append({'conversations': conversations})
            return sharegpt_data
        else:
            raise ValueError(f"Unsupported format: {format}. Choose from 'messages', 'full', or 'sharegpt'")

    def save_sft_data(self, filepath: str, format: str = 'messages') -> None:
        """Save SFT data to file."""
        sft_data = self.export_sft_data(format)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(sft_data, f, indent=2, ensure_ascii=False)

    def clear_sft_data(self) -> None:
        """Clear stored SFT examples."""
        self.sft_data.clear()


class ChainOfThought:
    """DSPy-like ChainOfThought wrapper for signatures."""

    def __init__(self, signature: Type[Signature], **llm_kwargs):
        """
        Initialize ChainOfThought with a signature.

        Args:
            signature: Signature class defining input/output structure
            **llm_kwargs: Arguments passed to LLMJudgeBase
        """
        self.signature = signature

        # Create system prompt from signature
        system_prompt = signature.get_instruction()

        # Add reasoning instruction
        system_prompt += "\n\nThink step by step before providing your final answer."

        self.llm = LLMJudgeBase(
            system_prompt_template=system_prompt,
            signature=signature,
            **llm_kwargs
        )

    def __call__(self, **kwargs) -> Any:
        """Execute chain of thought reasoning."""
        # Format input using signature
        signature_instance = self.signature(**kwargs)
        input_text = signature_instance.format_input(**kwargs)

        results = self.llm.judge(input_text)

        # Return the parsed output
        if results:
            return results[0]['parsed']
        return None


# Example usage classes based on the raw code
class TranslationOutput(BaseModel):
    """Output schema for translation evaluation."""
    structure_score: int  # 0 = wrong, 1 = partially correct, 2 = correct
    translation_score: int  # 0 = not faithful, 1 = somewhat faithful, 2 = fully faithful
    term_score: int  # 0 = glossary not followed, 1 = partially followed, 2 = fully followed or no glossary provided


class TranslationEvaluatorJudge(LLMJudgeBase):
    """Translation evaluator judge based on the raw code example."""

    def __init__(self, **kwargs):
        system_prompt = """You are a careful **translation evaluator**.

You are given five inputs:

* **Source Prompt** (the original text & any constraints)
* **AI Translation** (the machine translation to evaluate)
* **Human Reference** (a reference rendering; use only for guidance, not as ground truth)
* **System Message** (an automated hint about a possible structural error)
* **Glossaries** (optional terminology constraints; may be empty)

## Your tasks

1. **Check structure correctness**:
   - Use the System Message as a hint.
   - Assign a `structure_score`:
     * `0` = structure is clearly wrong or the error flagged is correct.
     * `1` = partially correct but flawed.
     * `2` = structure is correct; the system error is invalid.

2. **Check translation quality**:
   - Compare AI Translation with Source Prompt and Human Reference.
   - Assign a `translation_score`:
     * `0` = unfaithful (major omissions/additions/distortions/repetitions).
     * `1` = somewhat faithful (mostly correct but noticeable issues).
     * `2` = faithful (preserves meaning, scope, nuance; only minor style differences).

3. **Check glossary/terminology adherence**:
   - If no glossary is provided → `term_score = 2`.
   - If glossary exists but only partially followed → `term_score = 1`.
   - If glossary exists but not followed at all → `term_score = 0`.

## Output format (JSON only; no commentary)

Return exactly one JSON object with the three scores.
Do not output any explanations.

---

### Inputs

Source Prompt: {SOURCE_PROMPT}

AI Translation: {AI_TRANSLATION}

Human Reference: {HUMAN_REFERENCE}

System Message: {SYSTEM_MESSAGE}

Glossaries: {GLOSSARIES}
"""

        super().__init__(
            system_prompt_template=system_prompt,
            output_model=TranslationOutput,
            **kwargs
        )

    def evaluate_translation(
        self,
        source_prompt: str,
        ai_translation: str,
        human_reference: str,
        system_message: str,
        glossaries: str
    ) -> TranslationOutput:
        """
        Evaluate a translation with all required parameters.

        Returns:
            TranslationOutput with the three scores
        """
        variables = {
            'SOURCE_PROMPT': source_prompt,
            'AI_TRANSLATION': ai_translation,
            'HUMAN_REFERENCE': human_reference,
            'SYSTEM_MESSAGE': system_message,
            'GLOSSARIES': glossaries
        }

        input_data = {
            'source': source_prompt,
            'target': human_reference,
            'glossaries': glossaries,
            'translation': ai_translation
        }

        results = self.judge(json.dumps(input_data), variables=variables)
        return results[0]['parsed']


# Example usage and testing
if __name__ == "__main__":
    # Test the Signature system
    from .signature import Signature, InputField, OutputField

    # Example 1: Using Signature with ChainOfThought (like DSPy)
    class FactJudge(Signature):
        """Judge if the answer is factually correct based on the context."""

        context: str = InputField(desc="Context for the prediction")  # type: ignore
        question: str = InputField(desc="Question to be answered")  # type: ignore
        answer: str = InputField(desc="Answer for the question")  # type: ignore
        factually_correct: bool = OutputField(desc="Is the answer factually correct based on the context?")  # type: ignore

    print("=== Testing Signature System ===")
    print("Instruction:")
    print(FactJudge.get_instruction())

    # Example 2: Using LLMJudgeBase directly
    judge_prompt = """You are a factual accuracy judge.

Given:
- Context: {context}
- Question: {question}
- Answer: {answer}

Determine if the answer is factually correct based on the context.
Respond with true if correct, false if incorrect."""

    print("\n=== Testing LLMJudgeBase ===")
    print("System prompt template:")
    print(judge_prompt)

    # Example 3: Translation evaluator from raw code
    print("\n=== Translation Evaluator Example ===")
    evaluator = TranslationEvaluatorJudge()
    print("Translation evaluator initialized with structured output schema.")
    print("Output schema:", TranslationOutput.model_json_schema())

    # Test SFT export functionality
    print("\n=== SFT Export Test ===")
    # Create a mock judge with some example data
    mock_judge = LLMJudgeBase("Rate the quality: {text}")
    mock_judge.sft_data = [
        {
            'messages': [
                {'role': 'system', 'content': 'Rate the quality: This is good text'},
                {'role': 'user', 'content': 'Please rate this text'},
                {'role': 'assistant', 'content': '{"quality": "good"}'}
            ],
            'variables': {'text': 'This is good text'},
            'input_data': 'Please rate this text',
            'output': '{"quality": "good"}'
        }
    ]

    sft_formats = ['messages', 'sharegpt']
    for format_name in sft_formats:
        exported = mock_judge.export_sft_data(format_name)
        print(f"SFT export ({format_name} format): {len(exported)} examples")
        if exported:
            print(f"Sample structure: {list(exported[0].keys())}")

    print("\n=== All Tests Completed ===")
    print("The LLMJudgeBase system is ready for use!")
    print("\nKey features:")
    print("- System prompt templating with variables")
    print("- DSPy-like Signature system")
    print("- Automatic SFT data collection")
    print("- Multiple export formats (messages, sharegpt, full)")
    print("- Chain of Thought reasoning support")
```
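For reference, a sketch of how the removed judge API was used, drawn entirely from the deleted module above (it assumes speedy-utils 1.1.23 is still installed; the `{text}` template and the hand-seeded SFT example mirror the module's own `__main__` block, so no LLM backend is needed):

```python
# Usage of the removed API, reconstructed from the deleted module's __main__ block.
# Requires speedy-utils==1.1.23; this import no longer exists in 1.1.25.
from llm_utils.lm.llm_as_a_judge import LLMJudgeBase

judge = LLMJudgeBase("Rate the quality: {text}")

# Template substitution; raised ValueError when a {variable} was missing.
prompt = judge.format_system_prompt({"text": "This is good text"})
print(prompt)  # -> "Rate the quality: This is good text"

# SFT examples accumulated in judge.sft_data during judge() calls;
# here one is seeded by hand so export works without calling a model.
judge.sft_data.append({
    "messages": [
        {"role": "system", "content": prompt},
        {"role": "user", "content": "Please rate this text"},
        {"role": "assistant", "content": '{"quality": "good"}'},
    ],
    "variables": {"text": "This is good text"},
    "input_data": "Please rate this text",
    "output": '{"quality": "good"}',
})

print(judge.export_sft_data("sharegpt"))  # [{'conversations': [...]}]
judge.save_sft_data("judge_sft.json", format="messages")
```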