together-1.5.21-py3-none-any.whl → together-1.5.23-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- together/cli/api/evaluation.py +379 -0
- together/cli/api/finetune.py +0 -14
- together/cli/cli.py +2 -0
- together/client.py +4 -0
- together/filemanager.py +2 -4
- together/legacy/finetune.py +2 -2
- together/resources/__init__.py +3 -0
- together/resources/batch.py +0 -1
- together/resources/evaluation.py +724 -0
- together/resources/finetune.py +13 -26
- together/types/__init__.py +24 -0
- together/types/evaluation.py +87 -0
- together/types/files.py +2 -0
- together/types/finetune.py +1 -1
- together/utils/files.py +178 -73
- {together-1.5.21.dist-info → together-1.5.23.dist-info}/METADATA +28 -1
- {together-1.5.21.dist-info → together-1.5.23.dist-info}/RECORD +20 -17
- {together-1.5.21.dist-info → together-1.5.23.dist-info}/LICENSE +0 -0
- {together-1.5.21.dist-info → together-1.5.23.dist-info}/WHEEL +0 -0
- {together-1.5.21.dist-info → together-1.5.23.dist-info}/entry_points.txt +0 -0
together/resources/evaluation.py (new file)

@@ -0,0 +1,724 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional, Union
+
+from together.abstract import api_requestor
+from together.together_response import TogetherResponse
+from together.types import (
+    TogetherClient,
+    TogetherRequest,
+)
+from together.types.evaluation import (
+    ClassifyParameters,
+    CompareParameters,
+    EvaluationCreateResponse,
+    EvaluationJob,
+    EvaluationStatusResponse,
+    JudgeModelConfig,
+    ModelRequest,
+    ScoreParameters,
+)
+
+
+class Evaluation:
+    def __init__(self, client: TogetherClient) -> None:
+        self._client = client
+
+    def create(
+        self,
+        type: str,
+        judge_model_name: str,
+        judge_system_template: str,
+        input_data_file_path: str,
+        # Classify-specific parameters
+        labels: Optional[List[str]] = None,
+        pass_labels: Optional[List[str]] = None,
+        # Score-specific parameters
+        min_score: Optional[float] = None,
+        max_score: Optional[float] = None,
+        pass_threshold: Optional[float] = None,
+        # Compare-specific parameters (model_a and model_b handled below)
+        # Common optional parameters
+        model_a: Optional[Union[str, Dict[str, Any]]] = None,
+        model_b: Optional[Union[str, Dict[str, Any]]] = None,
+        model_to_evaluate: Optional[Union[str, Dict[str, Any]]] = None,
+    ) -> EvaluationCreateResponse:
+        """
+        Create a new evaluation job.
+
+        Args:
+            type: The type of evaluation ("classify", "score", or "compare")
+            judge_model_name: Name of the judge model
+            judge_system_template: System template for the judge
+            input_data_file_path: Path to input data file
+            labels: List of classification labels (required for classify)
+            pass_labels: List of labels considered as passing (required for classify)
+            min_score: Minimum score value (required for score)
+            max_score: Maximum score value (required for score)
+            pass_threshold: Threshold score for passing (required for score)
+            model_to_evaluate: Model to evaluate for classify/score types
+            model_a: Model A for compare type
+            model_b: Model B for compare type
+
+        Returns:
+            EvaluationCreateResponse with workflow_id and status
+        """
+        requestor = api_requestor.APIRequestor(
+            client=self._client,
+        )
+
+        # Build judge config
+        judge_config = JudgeModelConfig(
+            model_name=judge_model_name,
+            system_template=judge_system_template,
+        )
+        parameters: Union[ClassifyParameters, ScoreParameters, CompareParameters]
+        # Build parameters based on type
+        if type == "classify":
+            if labels is None or pass_labels is None:
+                raise ValueError(
+                    "labels and pass_labels are required for classify evaluation"
+                )
+
+            # Validate that no score-specific parameters are provided
+            if any(
+                [
+                    min_score is not None,
+                    max_score is not None,
+                    pass_threshold is not None,
+                ]
+            ):
+                raise ValueError(
+                    "min_score, max_score, and pass_threshold parameters are exclusive to the score mode"
+                )
+
+            # Validate that no compare-specific parameters are provided
+            if any([model_a is not None, model_b is not None]):
+                raise ValueError(
+                    "model_a and model_b parameters are exclusive to the compare mode"
+                )
+
+            parameters = ClassifyParameters(
+                judge=judge_config,
+                labels=labels,
+                pass_labels=pass_labels,
+                input_data_file_path=input_data_file_path,
+            )
+
+            # Handle model_to_evaluate
+            if model_to_evaluate is not None:
+                if isinstance(model_to_evaluate, str):
+                    parameters.model_to_evaluate = model_to_evaluate
+                elif isinstance(model_to_evaluate, dict):
+                    # Validate that all required fields are present for model config
+                    required_fields = [
+                        "model_name",
+                        "max_tokens",
+                        "temperature",
+                        "system_template",
+                        "input_template",
+                    ]
+                    missing_fields = [
+                        field
+                        for field in required_fields
+                        if field not in model_to_evaluate
+                    ]
+                    if missing_fields:
+                        raise ValueError(
+                            f"All model config parameters are required when using detailed configuration. "
+                            f"Missing: {', '.join(missing_fields)}"
+                        )
+                    parameters.model_to_evaluate = ModelRequest(**model_to_evaluate)
+
+        elif type == "score":
+            if min_score is None or max_score is None or pass_threshold is None:
+                raise ValueError(
+                    "min_score, max_score, and pass_threshold are required for score evaluation"
+                )
+
+            # Validate that no classify-specific parameters are provided
+            if any([labels is not None, pass_labels is not None]):
+                raise ValueError(
+                    "labels and pass_labels parameters are exclusive to the classify mode"
+                )
+
+            # Validate that no compare-specific parameters are provided
+            if any([model_a is not None, model_b is not None]):
+                raise ValueError(
+                    "model_a and model_b parameters are exclusive to the compare mode"
+                )
+
+            parameters = ScoreParameters(
+                judge=judge_config,
+                min_score=min_score,
+                max_score=max_score,
+                pass_threshold=pass_threshold,
+                input_data_file_path=input_data_file_path,
+            )
+
+            # Handle model_to_evaluate
+            if model_to_evaluate is not None:
+                if isinstance(model_to_evaluate, str):
+                    parameters.model_to_evaluate = model_to_evaluate
+                elif isinstance(model_to_evaluate, dict):
+                    # Validate that all required fields are present for model config
+                    required_fields = [
+                        "model_name",
+                        "max_tokens",
+                        "temperature",
+                        "system_template",
+                        "input_template",
+                    ]
+                    missing_fields = [
+                        field
+                        for field in required_fields
+                        if field not in model_to_evaluate
+                    ]
+                    if missing_fields:
+                        raise ValueError(
+                            f"All model config parameters are required when using detailed configuration. "
+                            f"Missing: {', '.join(missing_fields)}"
+                        )
+                    parameters.model_to_evaluate = ModelRequest(**model_to_evaluate)
+
+        elif type == "compare":
+            # Validate that model_a and model_b are provided
+            if model_a is None or model_b is None:
+                raise ValueError(
+                    "model_a and model_b parameters are required for compare evaluation"
+                )
+
+            # Validate that no classify-specific parameters are provided
+            if any([labels is not None, pass_labels is not None]):
+                raise ValueError(
+                    "labels and pass_labels parameters are exclusive to the classify mode"
+                )
+
+            # Validate that no score-specific parameters are provided
+            if any(
+                [
+                    min_score is not None,
+                    max_score is not None,
+                    pass_threshold is not None,
+                ]
+            ):
+                raise ValueError(
+                    "min_score, max_score, and pass_threshold parameters are exclusive to the score mode"
+                )
+
+            # Validate that model_to_evaluate is not provided
+            if model_to_evaluate is not None:
+                raise ValueError(
+                    "model_to_evaluate parameter is exclusive to classify and score modes"
+                )
+
+            parameters = CompareParameters(
+                judge=judge_config,
+                input_data_file_path=input_data_file_path,
+            )
+
+            # Handle model_a
+            if isinstance(model_a, str):
+                parameters.model_a = model_a
+            elif isinstance(model_a, dict):
+                # Validate that all required fields are present for model config
+                required_fields = [
+                    "model_name",
+                    "max_tokens",
+                    "temperature",
+                    "system_template",
+                    "input_template",
+                ]
+                missing_fields = [
+                    field for field in required_fields if field not in model_a
+                ]
+                if missing_fields:
+                    raise ValueError(
+                        f"All model config parameters are required for model_a when using detailed configuration. "
+                        f"Missing: {', '.join(missing_fields)}"
+                    )
+                parameters.model_a = ModelRequest(**model_a)
+
+            # Handle model_b
+            if isinstance(model_b, str):
+                parameters.model_b = model_b
+            elif isinstance(model_b, dict):
+                # Validate that all required fields are present for model config
+                required_fields = [
+                    "model_name",
+                    "max_tokens",
+                    "temperature",
+                    "system_template",
+                    "input_template",
+                ]
+                missing_fields = [
+                    field for field in required_fields if field not in model_b
+                ]
+                if missing_fields:
+                    raise ValueError(
+                        f"All model config parameters are required for model_b when using detailed configuration. "
+                        f"Missing: {', '.join(missing_fields)}"
+                    )
+                parameters.model_b = ModelRequest(**model_b)
+
+        else:
+            raise ValueError(
+                f"Invalid evaluation type: {type}. Must be 'classify', 'score', or 'compare'"
+            )
+
+        payload = {
+            "type": type,
+            "parameters": parameters.model_dump(),
+        }
+
+        response, _, _ = requestor.request(
+            options=TogetherRequest(
+                method="POST",
+                url="evaluation",
+                params=payload,
+            ),
+            stream=False,
+        )
+
+        assert isinstance(response, TogetherResponse)
+        return EvaluationCreateResponse(**response.data)
+
+    def list(
+        self,
+        status: Optional[str] = None,
+        limit: Optional[int] = None,
+    ) -> List[EvaluationJob]:
+        """
+        List evaluation jobs.
+
+        Args:
+            status: Optional filter by job status
+            limit: Optional limit on number of results (max 100)
+
+        Returns:
+            List of EvaluationJob objects
+        """
+        requestor = api_requestor.APIRequestor(
+            client=self._client,
+        )
+
+        params: Dict[str, Any] = {}
+        if status is not None:
+            params["status"] = status
+        if limit is not None:
+            params["limit"] = limit
+
+        response, _, _ = requestor.request(
+            options=TogetherRequest(
+                method="GET",
+                url="evaluations",
+                params=params if params else None,
+            ),
+            stream=False,
+        )
+
+        assert isinstance(response, TogetherResponse)
+        jobs = response.data or []
+        return [EvaluationJob(**job) for job in jobs]
+
+    def retrieve(self, evaluation_id: str) -> EvaluationJob:
+        """
+        Get details of a specific evaluation job.
+
+        Args:
+            evaluation_id: The workflow ID of the evaluation job
+
+        Returns:
+            EvaluationJob object with full details
+        """
+        requestor = api_requestor.APIRequestor(
+            client=self._client,
+        )
+
+        response, _, _ = requestor.request(
+            options=TogetherRequest(
+                method="GET",
+                url=f"evaluation/{evaluation_id}",
+            ),
+            stream=False,
+        )
+
+        assert isinstance(response, TogetherResponse)
+        return EvaluationJob(**response.data)
+
+    def status(self, evaluation_id: str) -> EvaluationStatusResponse:
+        """
+        Get the status and results of a specific evaluation job.
+
+        Args:
+            evaluation_id: The workflow ID of the evaluation job
+
+        Returns:
+            EvaluationStatusResponse with status and results
+        """
+        requestor = api_requestor.APIRequestor(
+            client=self._client,
+        )
+
+        response, _, _ = requestor.request(
+            options=TogetherRequest(
+                method="GET",
+                url=f"evaluation/{evaluation_id}/status",
+            ),
+            stream=False,
+        )
+
+        assert isinstance(response, TogetherResponse)
+        return EvaluationStatusResponse(**response.data)
+
+
+class AsyncEvaluation:
+    def __init__(self, client: TogetherClient) -> None:
+        self._client = client
+
+    async def create(
+        self,
+        type: str,
+        judge_model_name: str,
+        judge_system_template: str,
+        input_data_file_path: str,
+        # Classify-specific parameters
+        labels: Optional[List[str]] = None,
+        pass_labels: Optional[List[str]] = None,
+        # Score-specific parameters
+        min_score: Optional[float] = None,
+        max_score: Optional[float] = None,
+        pass_threshold: Optional[float] = None,
+        # Compare-specific parameters (model_a and model_b handled below)
+        # Common optional parameters
+        model_to_evaluate: Optional[Union[str, Dict[str, Any]]] = None,
+        model_a: Optional[Union[str, Dict[str, Any]]] = None,
+        model_b: Optional[Union[str, Dict[str, Any]]] = None,
+    ) -> EvaluationCreateResponse:
+        """
+        Create a new evaluation job.
+
+        Args:
+            type: The type of evaluation ("classify", "score", or "compare")
+            judge_model_name: Name of the judge model
+            judge_system_template: System template for the judge
+            input_data_file_path: Path to input data file
+            labels: List of classification labels (required for classify)
+            pass_labels: List of labels considered as passing (required for classify)
+            min_score: Minimum score value (required for score)
+            max_score: Maximum score value (required for score)
+            pass_threshold: Threshold score for passing (required for score)
+            model_to_evaluate: Model to evaluate for classify/score types
+            model_a: Model A for compare type
+            model_b: Model B for compare type
+
+        Returns:
+            EvaluationCreateResponse with workflow_id and status
+        """
+        requestor = api_requestor.APIRequestor(
+            client=self._client,
+        )
+
+        # Build judge config
+        judge_config = JudgeModelConfig(
+            model_name=judge_model_name,
+            system_template=judge_system_template,
+        )
+        parameters: Union[ClassifyParameters, ScoreParameters, CompareParameters]
+        # Build parameters based on type
+        if type == "classify":
+            if labels is None or pass_labels is None:
+                raise ValueError(
+                    "labels and pass_labels are required for classify evaluation"
+                )
+
+            # Validate that no score-specific parameters are provided
+            if any(
+                [
+                    min_score is not None,
+                    max_score is not None,
+                    pass_threshold is not None,
+                ]
+            ):
+                raise ValueError(
+                    "min_score, max_score, and pass_threshold parameters are exclusive to the score mode"
+                )
+
+            # Validate that no compare-specific parameters are provided
+            if any([model_a is not None, model_b is not None]):
+                raise ValueError(
+                    "model_a and model_b parameters are exclusive to the compare mode"
+                )
+
+            parameters = ClassifyParameters(
+                judge=judge_config,
+                labels=labels,
+                pass_labels=pass_labels,
+                input_data_file_path=input_data_file_path,
+            )
+
+            # Handle model_to_evaluate
+            if model_to_evaluate is not None:
+                if isinstance(model_to_evaluate, str):
+                    parameters.model_to_evaluate = model_to_evaluate
+                elif isinstance(model_to_evaluate, dict):
+                    # Validate that all required fields are present for model config
+                    required_fields = [
+                        "model_name",
+                        "max_tokens",
+                        "temperature",
+                        "system_template",
+                        "input_template",
+                    ]
+                    missing_fields = [
+                        field
+                        for field in required_fields
+                        if field not in model_to_evaluate
+                    ]
+                    if missing_fields:
+                        raise ValueError(
+                            f"All model config parameters are required when using detailed configuration. "
+                            f"Missing: {', '.join(missing_fields)}"
+                        )
+                    parameters.model_to_evaluate = ModelRequest(**model_to_evaluate)
+
+        elif type == "score":
+            if min_score is None or max_score is None or pass_threshold is None:
+                raise ValueError(
+                    "min_score, max_score, and pass_threshold are required for score evaluation"
+                )
+
+            # Validate that no classify-specific parameters are provided
+            if any([labels is not None, pass_labels is not None]):
+                raise ValueError(
+                    "labels and pass_labels parameters are exclusive to the classify mode"
+                )
+
+            # Validate that no compare-specific parameters are provided
+            if any([model_a is not None, model_b is not None]):
+                raise ValueError(
+                    "model_a and model_b parameters are exclusive to the compare mode"
+                )
+
+            parameters = ScoreParameters(
+                judge=judge_config,
+                min_score=min_score,
+                max_score=max_score,
+                pass_threshold=pass_threshold,
+                input_data_file_path=input_data_file_path,
+            )
+
+            # Handle model_to_evaluate
+            if model_to_evaluate is not None:
+                if isinstance(model_to_evaluate, str):
+                    parameters.model_to_evaluate = model_to_evaluate
+                elif isinstance(model_to_evaluate, dict):
+                    # Validate that all required fields are present for model config
+                    required_fields = [
+                        "model_name",
+                        "max_tokens",
+                        "temperature",
+                        "system_template",
+                        "input_template",
+                    ]
+                    missing_fields = [
+                        field
+                        for field in required_fields
+                        if field not in model_to_evaluate
+                    ]
+                    if missing_fields:
+                        raise ValueError(
+                            f"All model config parameters are required when using detailed configuration. "
+                            f"Missing: {', '.join(missing_fields)}"
+                        )
+                    parameters.model_to_evaluate = ModelRequest(**model_to_evaluate)
+
+        elif type == "compare":
+            parameters = CompareParameters(
+                judge=judge_config,
+                input_data_file_path=input_data_file_path,
+            )
+
+            # Validate that model_a and model_b are provided
+            if model_a is None or model_b is None:
+                raise ValueError(
+                    "model_a and model_b parameters are required for compare evaluation"
+                )
+
+            # Validate that no classify-specific parameters are provided
+            if any([labels is not None, pass_labels is not None]):
+                raise ValueError(
+                    "labels and pass_labels parameters are exclusive to the classify mode"
+                )
+
+            # Validate that no score-specific parameters are provided
+            if any(
+                [
+                    min_score is not None,
+                    max_score is not None,
+                    pass_threshold is not None,
+                ]
+            ):
+                raise ValueError(
+                    "min_score, max_score, and pass_threshold parameters are exclusive to the score mode"
+                )
+
+            # Validate that model_to_evaluate is not provided
+            if model_to_evaluate is not None:
+                raise ValueError(
+                    "model_to_evaluate parameter is exclusive to classify and score modes"
+                )
+
+            # Handle model_a
+            if isinstance(model_a, str):
+                parameters.model_a = model_a
+            elif isinstance(model_a, dict):
+                # Validate that all required fields are present for model config
+                required_fields = [
+                    "model_name",
+                    "max_tokens",
+                    "temperature",
+                    "system_template",
+                    "input_template",
+                ]
+                missing_fields = [
+                    field for field in required_fields if field not in model_a
+                ]
+                if missing_fields:
+                    raise ValueError(
+                        f"All model config parameters are required for model_a when using detailed configuration. "
+                        f"Missing: {', '.join(missing_fields)}"
+                    )
+                parameters.model_a = ModelRequest(**model_a)
+
+            # Handle model_b
+            if isinstance(model_b, str):
+                parameters.model_b = model_b
+            elif isinstance(model_b, dict):
+                # Validate that all required fields are present for model config
+                required_fields = [
+                    "model_name",
+                    "max_tokens",
+                    "temperature",
+                    "system_template",
+                    "input_template",
+                ]
+                missing_fields = [
+                    field for field in required_fields if field not in model_b
+                ]
+                if missing_fields:
+                    raise ValueError(
+                        f"All model config parameters are required for model_b when using detailed configuration. "
+                        f"Missing: {', '.join(missing_fields)}"
+                    )
+                parameters.model_b = ModelRequest(**model_b)
+
+        else:
+            raise ValueError(
+                f"Invalid evaluation type: {type}. Must be 'classify', 'score', or 'compare'"
+            )
+
+        payload = {
+            "type": type,
+            "parameters": parameters.model_dump(),
+        }
+
+        response, _, _ = await requestor.arequest(
+            options=TogetherRequest(
+                method="POST",
+                url="evaluation",
+                params=payload,
+            ),
+            stream=False,
+        )
+
+        assert isinstance(response, TogetherResponse)
+        return EvaluationCreateResponse(**response.data)
+
+    async def list(
+        self,
+        status: Optional[str] = None,
+        limit: Optional[int] = None,
+    ) -> List[EvaluationJob]:
+        """
+        List evaluation jobs.
+
+        Args:
+            status: Optional filter by job status
+            limit: Optional limit on number of results (max 100)
+
+        Returns:
+            List of EvaluationJob objects
+        """
+        requestor = api_requestor.APIRequestor(
+            client=self._client,
+        )
+
+        params: Dict[str, Any] = {}
+        if status is not None:
+            params["status"] = status
+        if limit is not None:
+            params["limit"] = limit
+
+        response, _, _ = await requestor.arequest(
+            options=TogetherRequest(
+                method="GET",
+                url="evaluations",
+                params=params if params else None,
+            ),
+            stream=False,
+        )
+
+        assert isinstance(response, TogetherResponse)
+        jobs = response.data or []
+        return [EvaluationJob(**job) for job in jobs]
+
+    async def retrieve(self, evaluation_id: str) -> EvaluationJob:
+        """
+        Get details of a specific evaluation job.
+
+        Args:
+            evaluation_id: The workflow ID of the evaluation job
+
+        Returns:
+            EvaluationJob object with full details
+        """
+        requestor = api_requestor.APIRequestor(
+            client=self._client,
+        )
+
+        response, _, _ = await requestor.arequest(
+            options=TogetherRequest(
+                method="GET",
+                url=f"evaluation/{evaluation_id}",
+            ),
+            stream=False,
+        )
+
+        assert isinstance(response, TogetherResponse)
+        return EvaluationJob(**response.data)
+
+    async def status(self, evaluation_id: str) -> EvaluationStatusResponse:
+        """
+        Get the status and results of a specific evaluation job.
+
+        Args:
+            evaluation_id: The workflow ID of the evaluation job
+
+        Returns:
+            EvaluationStatusResponse with status and results
+        """
+        requestor = api_requestor.APIRequestor(
+            client=self._client,
+        )
+
+        response, _, _ = await requestor.arequest(
+            options=TogetherRequest(
+                method="GET",
+                url=f"evaluation/{evaluation_id}/status",
+            ),
+            stream=False,
+        )
+
+        assert isinstance(response, TogetherResponse)
+        return EvaluationStatusResponse(**response.data)
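
The docstrings above define the new create/list/retrieve/status surface. Below is a minimal usage sketch of the synchronous resource, assuming the Together client exposes it as `client.evaluation` (the small additions to together/client.py and together/resources/__init__.py in this release suggest it is wired up there, but that attribute name, the judge model, the file ID, and the model name used below are illustrative assumptions, not taken from this diff):

    # Hedged usage sketch of the evaluation resource added in 1.5.23.
    # The `client.evaluation` attribute, judge model, file ID, and model name
    # are assumptions for illustration only.
    from together import Together

    client = Together()

    # Create a "classify" evaluation job: an LLM judge assigns one of `labels`
    # to each row of the uploaded input file; rows whose label is in
    # `pass_labels` count as passing.
    job = client.evaluation.create(
        type="classify",
        judge_model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        judge_system_template="Label the answer as 'helpful' or 'unhelpful'.",
        input_data_file_path="file-abc123",
        labels=["helpful", "unhelpful"],
        pass_labels=["helpful"],
        # Either a plain model name, or a dict with all of: model_name,
        # max_tokens, temperature, system_template, input_template.
        model_to_evaluate="my-org/my-model",
    )

    # create() returns a workflow_id; use it to poll for status and results.
    result = client.evaluation.status(job.workflow_id)
    print(result.status, result.results)

The same calls exist on AsyncEvaluation with `await`; "score" mode takes min_score, max_score, and pass_threshold instead of labels, and "compare" mode takes model_a and model_b instead of model_to_evaluate.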