together 1.5.21__py3-none-any.whl → 1.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- together/cli/api/evaluation.py +379 -0
- together/cli/api/finetune.py +18 -14
- together/cli/cli.py +2 -0
- together/client.py +4 -0
- together/filemanager.py +2 -4
- together/legacy/finetune.py +2 -2
- together/resources/__init__.py +3 -0
- together/resources/batch.py +0 -1
- together/resources/evaluation.py +724 -0
- together/resources/finetune.py +46 -26
- together/types/__init__.py +24 -0
- together/types/evaluation.py +87 -0
- together/types/files.py +2 -0
- together/types/finetune.py +1 -1
- together/utils/files.py +178 -73
- {together-1.5.21.dist-info → together-1.5.24.dist-info}/METADATA +28 -1
- {together-1.5.21.dist-info → together-1.5.24.dist-info}/RECORD +20 -17
- {together-1.5.21.dist-info → together-1.5.24.dist-info}/LICENSE +0 -0
- {together-1.5.21.dist-info → together-1.5.24.dist-info}/WHEEL +0 -0
- {together-1.5.21.dist-info → together-1.5.24.dist-info}/entry_points.txt +0 -0
together/resources/finetune.py
CHANGED
@@ -76,6 +76,8 @@ def create_finetune_request(
     rpo_alpha: float | None = None,
     simpo_gamma: float | None = None,
     from_checkpoint: str | None = None,
+    from_hf_model: str | None = None,
+    hf_model_revision: str | None = None,
     hf_api_token: str | None = None,
     hf_output_repo_name: str | None = None,
 ) -> FinetuneRequest:
@@ -87,20 +89,23 @@ def create_finetune_request(
     if model is None and from_checkpoint is None:
         raise ValueError("You must specify either a model or a checkpoint")

-    model_or_checkpoint = model or from_checkpoint
+    if from_checkpoint is not None and from_hf_model is not None:
+        raise ValueError(
+            "You must specify either a Hugging Face Hub model or a previous checkpoint from "
+            "Together to start a job from, not both"
+        )

-    if batch_size == "max":
-        log_warn_once(
-            "Starting from together>=1.3.0, "
-            "the default batch size is set to the maximum allowed value for each model."
+    if from_hf_model is not None and model is None:
+        raise ValueError(
+            "You must specify the base model to fine-tune a model from the Hugging Face Hub"
         )
+
+    model_or_checkpoint = model or from_checkpoint
+
     if warmup_ratio is None:
         warmup_ratio = 0.0

     training_type: TrainingType = FullTrainingType()
-    max_batch_size: int = 0
-    max_batch_size_dpo: int = 0
-    min_batch_size: int = 0
     if lora:
         if model_limits.lora_training is None:
             raise ValueError(
@@ -133,28 +138,23 @@ def create_finetune_request(
         min_batch_size = model_limits.full_training.min_batch_size
         max_batch_size_dpo = model_limits.full_training.max_batch_size_dpo

-    if batch_size == "max":
-        if training_method == "dpo":
-            batch_size = max_batch_size_dpo
-        else:
-            batch_size = max_batch_size
+    if batch_size != "max":
+        if training_method == "sft":
+            if batch_size > max_batch_size:
+                raise ValueError(
+                    f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size}."
+                )
+        elif training_method == "dpo":
+            if batch_size > max_batch_size_dpo:
+                raise ValueError(
+                    f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size_dpo}."
+                )

-    if training_method == "sft":
-        if batch_size > max_batch_size:
-            raise ValueError(
-                f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size}."
-            )
-    elif training_method == "dpo":
-        if batch_size > max_batch_size_dpo:
+        if batch_size < min_batch_size:
             raise ValueError(
-                f"Requested batch size of {batch_size} is higher that the maximum allowed value of {max_batch_size_dpo}."
+                f"Requested batch size of {batch_size} is lower that the minimum allowed value of {min_batch_size}."
             )

-    if batch_size < min_batch_size:
-        raise ValueError(
-            f"Requested batch size of {batch_size} is lower that the minimum allowed value of {min_batch_size}."
-        )
-
     if warmup_ratio > 1 or warmup_ratio < 0:
         raise ValueError(f"Warmup ratio should be between 0 and 1 (got {warmup_ratio})")

@@ -264,6 +264,8 @@ def create_finetune_request(
         wandb_name=wandb_name,
         training_method=training_method_cls,
         from_checkpoint=from_checkpoint,
+        from_hf_model=from_hf_model,
+        hf_model_revision=hf_model_revision,
         hf_api_token=hf_api_token,
         hf_output_repo_name=hf_output_repo_name,
     )
@@ -345,6 +347,8 @@ class FineTuning:
         rpo_alpha: float | None = None,
         simpo_gamma: float | None = None,
         from_checkpoint: str | None = None,
+        from_hf_model: str | None = None,
+        hf_model_revision: str | None = None,
         hf_api_token: str | None = None,
         hf_output_repo_name: str | None = None,
     ) -> FinetuneResponse:
@@ -403,6 +407,11 @@ class FineTuning:
            from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                The step value is optional, without it the final checkpoint will be used.
+           from_hf_model (str, optional): The Hugging Face Hub repo to start training from.
+               Should be as close as possible to the base model (specified by the `model` argument) in terms of architecture and size.
+           hf_model_revision (str, optional): The revision of the Hugging Face Hub model to continue training from. Defaults to None.
+               Example: hf_model_revision=None (defaults to the latest revision in `main`) or
+               hf_model_revision="607a30d783dfa663caf39e06633721c8d4cfcd7e" (specific commit).
            hf_api_token (str, optional): API key for the Hugging Face Hub. Defaults to None.
            hf_output_repo_name (str, optional): HF repo to upload the fine-tuned model to. Defaults to None.

@@ -458,6 +467,8 @@ class FineTuning:
            rpo_alpha=rpo_alpha,
            simpo_gamma=simpo_gamma,
            from_checkpoint=from_checkpoint,
+           from_hf_model=from_hf_model,
+           hf_model_revision=hf_model_revision,
            hf_api_token=hf_api_token,
            hf_output_repo_name=hf_output_repo_name,
        )
@@ -772,6 +783,8 @@ class AsyncFineTuning:
         rpo_alpha: float | None = None,
         simpo_gamma: float | None = None,
         from_checkpoint: str | None = None,
+        from_hf_model: str | None = None,
+        hf_model_revision: str | None = None,
         hf_api_token: str | None = None,
         hf_output_repo_name: str | None = None,
     ) -> FinetuneResponse:
@@ -830,6 +843,11 @@ class AsyncFineTuning:
            from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                The step value is optional, without it the final checkpoint will be used.
+           from_hf_model (str, optional): The Hugging Face Hub repo to start training from.
+               Should be as close as possible to the base model (specified by the `model` argument) in terms of architecture and size.
+           hf_model_revision (str, optional): The revision of the Hugging Face Hub model to continue training from. Defaults to None.
+               Example: hf_model_revision=None (defaults to the latest revision in `main`) or
+               hf_model_revision="607a30d783dfa663caf39e06633721c8d4cfcd7e" (specific commit).
            hf_api_token (str, optional): API key for the Huggging Face Hub. Defaults to None.
            hf_output_repo_name (str, optional): HF repo to upload the fine-tuned model to. Defaults to None.

@@ -885,6 +903,8 @@ class AsyncFineTuning:
            rpo_alpha=rpo_alpha,
            simpo_gamma=simpo_gamma,
            from_checkpoint=from_checkpoint,
+           from_hf_model=from_hf_model,
+           hf_model_revision=hf_model_revision,
            hf_api_token=hf_api_token,
            hf_output_repo_name=hf_output_repo_name,
        )
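The two new parameters flow from `FineTuning.create` / `AsyncFineTuning.create` into `create_finetune_request`, which enforces that `from_hf_model` excludes `from_checkpoint` and requires `model`. A minimal sketch of a call shaped by those rules, assuming the usual client wiring; the file ID, model names, and token below are placeholders, not values from this diff:

```python
from together import Together

client = Together()

# All IDs, names, and the token below are illustrative placeholders.
job = client.fine_tuning.create(
    training_file="file-abc123",  # an uploaded fine-tuning dataset
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # base model, required with from_hf_model
    from_hf_model="my-org/my-llama-variant",  # Hugging Face Hub repo to start from
    hf_model_revision=None,  # None -> latest revision on `main`
    hf_api_token="hf_xxx",  # only needed for private repos
)
print(job.id)
```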
together/types/__init__.py
CHANGED
@@ -61,6 +61,19 @@ from together.types.images import ImageRequest, ImageResponse
 from together.types.models import ModelObject
 from together.types.rerank import RerankRequest, RerankResponse
 from together.types.batch import BatchJob, BatchJobStatus, BatchEndpoint
+from together.types.evaluation import (
+    EvaluationType,
+    EvaluationStatus,
+    JudgeModelConfig,
+    ModelRequest,
+    ClassifyParameters,
+    ScoreParameters,
+    CompareParameters,
+    EvaluationRequest,
+    EvaluationCreateResponse,
+    EvaluationJob,
+    EvaluationStatusResponse,
+)


 __all__ = [
@@ -124,4 +137,15 @@ __all__ = [
     "BatchJob",
     "BatchJobStatus",
     "BatchEndpoint",
+    "EvaluationType",
+    "EvaluationStatus",
+    "JudgeModelConfig",
+    "ModelRequest",
+    "ClassifyParameters",
+    "ScoreParameters",
+    "CompareParameters",
+    "EvaluationRequest",
+    "EvaluationCreateResponse",
+    "EvaluationJob",
+    "EvaluationStatusResponse",
 ]
together/types/evaluation.py
ADDED
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from datetime import datetime
+from enum import Enum
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+class EvaluationType(str, Enum):
+    CLASSIFY = "classify"
+    SCORE = "score"
+    COMPARE = "compare"
+
+
+class EvaluationStatus(str, Enum):
+    PENDING = "pending"
+    QUEUED = "queued"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    ERROR = "error"
+    USER_ERROR = "user_error"
+
+
+class JudgeModelConfig(BaseModel):
+    model_name: str
+    system_template: str
+
+
+class ModelRequest(BaseModel):
+    model_name: str
+    max_tokens: int
+    temperature: float
+    system_template: str
+    input_template: str
+
+
+class ClassifyParameters(BaseModel):
+    judge: JudgeModelConfig
+    labels: List[str]
+    pass_labels: List[str]
+    model_to_evaluate: Optional[Union[str, ModelRequest]] = None
+    input_data_file_path: str
+
+
+class ScoreParameters(BaseModel):
+    judge: JudgeModelConfig
+    min_score: float
+    max_score: float
+    pass_threshold: float
+    model_to_evaluate: Optional[Union[str, ModelRequest]] = None
+    input_data_file_path: str
+
+
+class CompareParameters(BaseModel):
+    judge: JudgeModelConfig
+    model_a: Optional[Union[str, ModelRequest]] = None
+    model_b: Optional[Union[str, ModelRequest]] = None
+    input_data_file_path: str
+
+
+class EvaluationRequest(BaseModel):
+    type: EvaluationType
+    parameters: Union[ClassifyParameters, ScoreParameters, CompareParameters]
+
+
+class EvaluationCreateResponse(BaseModel):
+    workflow_id: str
+    status: EvaluationStatus
+
+
+class EvaluationJob(BaseModel):
+    workflow_id: str = Field(alias="workflow_id")
+    type: Optional[EvaluationType] = None
+    status: EvaluationStatus
+    results: Optional[Dict[str, Any]] = None
+    parameters: Optional[Dict[str, Any]] = None
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+    class Config:
+        populate_by_name = True
+
+
+class EvaluationStatusResponse(BaseModel):
+    status: EvaluationStatus
+    results: Optional[Dict[str, Any]] = None
together/types/files.py
CHANGED
@@ -14,11 +14,13 @@ from together.types.common import (
 class FilePurpose(str, Enum):
     FineTune = "fine-tune"
     BatchAPI = "batch-api"
+    Eval = "eval"


 class FileType(str, Enum):
     jsonl = "jsonl"
     parquet = "parquet"
+    csv = "csv"


 class FileRequest(BaseModel):
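With `FilePurpose.Eval` and the `csv` file type added, an evaluation dataset can be uploaded the same way the README's batch example uploads JSONL. A short sketch; the filename is a placeholder:

```python
from together import Together

client = Together()

# "eval" is the string value of the new FilePurpose.Eval member.
uploaded = client.files.upload(file="eval_data.csv", purpose="eval")
print(uploaded.id)
```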
together/types/finetune.py
CHANGED
@@ -195,7 +195,7 @@ class FinetuneRequest(BaseModel):
     # number of evaluation loops to run
     n_evals: int | None = None
     # training batch size
-    batch_size: int | None = None
+    batch_size: int | Literal["max"] | None = None
     # up to 40 character suffix for output model name
     suffix: str | None = None
     # weights & biases api key
together/utils/files.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations

 import json
 import os
+import csv
 from pathlib import Path
 from traceback import format_exc
 from typing import Any, Dict, List
@@ -17,6 +18,7 @@ from together.constants import (
     POSSIBLE_ROLES_CONVERSATION,
     DatasetFormat,
 )
+from together.types import FilePurpose


 class InvalidFileFormatError(ValueError):
@@ -36,6 +38,7 @@

 def check_file(
     file: Path | str,
+    purpose: FilePurpose | str = FilePurpose.FineTune,
 ) -> Dict[str, Any]:
     if not isinstance(file, Path):
         file = Path(file)
@@ -52,6 +55,7 @@
         "has_min_samples": None,
         "num_samples": None,
         "load_json": None,
+        "load_csv": None,
     }

     if not file.is_file():
@@ -79,10 +83,13 @@
     data_report_dict = {}
     if file.suffix == ".jsonl":
         report_dict["filetype"] = "jsonl"
-        data_report_dict = _check_jsonl(file)
+        data_report_dict = _check_jsonl(file, purpose)
     elif file.suffix == ".parquet":
         report_dict["filetype"] = "parquet"
-        data_report_dict = _check_parquet(file)
+        data_report_dict = _check_parquet(file, purpose)
+    elif file.suffix == ".csv":
+        report_dict["filetype"] = "csv"
+        data_report_dict = _check_csv(file, purpose)
     else:
         report_dict["filetype"] = (
             f"Unknown extension of file {file}. "
@@ -229,9 +236,15 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
     validate_messages(example["non_preferred_output"], idx)


-def _check_jsonl(file: Path) -> Dict[str, Any]:
+def _check_utf8(file: Path) -> Dict[str, Any]:
+    """Check if the file is UTF-8 encoded.
+
+    Args:
+        file (Path): Path to the file to check.
+    Returns:
+        Dict[str, Any]: A dictionary with the results of the check.
+    """
     report_dict: Dict[str, Any] = {}
-    # Check that the file is UTF-8 encoded. If not report where the error occurs.
     try:
         with file.open(encoding="utf-8") as f:
             f.read()
@@ -240,6 +253,99 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
         report_dict["utf8"] = False
         report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
         report_dict["is_check_passed"] = False
+        return report_dict
+
+
+def _check_samples_count(
+    file: Path, report_dict: Dict[str, Any], idx: int
+) -> Dict[str, Any]:
+    if idx + 1 < MIN_SAMPLES:
+        report_dict["has_min_samples"] = False
+        report_dict["message"] = (
+            f"Processing {file} resulted in only {idx + 1} samples. "
+            f"Our minimum is {MIN_SAMPLES} samples. "
+        )
+        report_dict["is_check_passed"] = False
+    else:
+        report_dict["num_samples"] = idx + 1
+        report_dict["has_min_samples"] = True
+
+    return report_dict
+
+
+def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
+    """Check if the file is a valid CSV file.
+
+    Args:
+        file (Path): Path to the file to check.
+        purpose (FilePurpose | str): Purpose of the file, used to determine if the file should be checked for specific columns.
+
+    Returns:
+        Dict[str, Any]: A dictionary with the results of the check.
+    """
+    report_dict: Dict[str, Any] = {}
+    if purpose != FilePurpose.Eval:
+        report_dict["is_check_passed"] = False
+        report_dict["message"] = (
+            f"CSV files are not supported for {purpose}. "
+            "Only JSONL and Parquet files are supported."
+        )
+        return report_dict
+
+    report_dict.update(_check_utf8(file))
+
+    if not report_dict["utf8"]:
+        return report_dict
+
+    with file.open() as f:
+        reader = csv.DictReader(f)
+        if not reader.fieldnames:
+            report_dict["message"] = "CSV file is empty or has no header."
+            report_dict["is_check_passed"] = False
+            return report_dict
+        idx = -1
+
+        try:
+            # for loop to iterate through the CSV rows
+            for idx, item in enumerate(reader):
+                if None in item.keys() or None in item.values():
+                    raise InvalidFileFormatError(
+                        message=f"CSV file is malformed or the number of columns found on line {idx + 1} is inconsistent with the header",
+                        line_number=idx + 1,
+                        error_source="format",
+                    )
+
+            report_dict.update(_check_samples_count(file, report_dict, idx))
+            report_dict["load_csv"] = True
+
+        except InvalidFileFormatError as e:
+            report_dict["load_csv"] = False
+            report_dict["is_check_passed"] = False
+            report_dict["message"] = e.message
+            if e.line_number is not None:
+                report_dict["line_number"] = e.line_number
+            if e.error_source is not None:
+                report_dict[e.error_source] = False
+        except ValueError:
+            report_dict["load_csv"] = False
+            if idx < 0:
+                report_dict["message"] = (
+                    "Unable to decode file. "
+                    "File may be empty or in an unsupported format. "
+                )
+            else:
+                report_dict["message"] = (
+                    f"Error parsing the CSV file. Unexpected format on line {idx + 1}."
+                )
+            report_dict["is_check_passed"] = False
+
+    return report_dict
+
+
+def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
+    report_dict: Dict[str, Any] = {}
+    report_dict.update(_check_utf8(file))
+    if not report_dict["utf8"]:
         return report_dict

     dataset_format = None
@@ -259,84 +365,75 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                        line_number=idx + 1,
                        error_source="line_type",
                    )
-
-                current_format = None
-                for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                    if all(
-                        column in json_line
-                        for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                    ):
-                        if current_format is None:
-                            current_format = possible_format
-                        elif current_format != possible_format:
-                            raise InvalidFileFormatError(
-                                message="Found multiple dataset formats in the input file. "
-                                f"Got {current_format} and {possible_format} on line {idx + 1}.",
-                                line_number=idx + 1,
-                                error_source="format",
-                            )
-
-                        # Check that there are no extra columns
-                        for column in json_line:
-                            if (
-                                column
-                                not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                            ):
+                # In evals, we don't check the format of the dataset.
+                if purpose != FilePurpose.Eval:
+                    current_format = None
+                    for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
+                        if all(
+                            column in json_line
+                            for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
+                        ):
+                            if current_format is None:
+                                current_format = possible_format
+                            elif current_format != possible_format:
                                raise InvalidFileFormatError(
-                                    message=f'Found extra column "{column}" in the line {idx + 1}.',
+                                    message="Found multiple dataset formats in the input file. "
+                                    f"Got {current_format} and {possible_format} on line {idx + 1}.",
                                    line_number=idx + 1,
                                    error_source="format",
                                )

-                if current_format is None:
-                    raise InvalidFileFormatError(
-                        message=(
-                            f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
-                            f"{json_line.keys()}"
-                        ),
-                        line_number=idx + 1,
-                        error_source="format",
-                    )
-                if current_format == DatasetFormat.PREFERENCE_OPENAI:
-                    validate_preference_openai(json_line, idx)
-                elif current_format == DatasetFormat.CONVERSATION:
-                    message_column = JSONL_REQUIRED_COLUMNS_MAP[
-                        DatasetFormat.CONVERSATION
-                    ][0]
-                    validate_messages(json_line[message_column], idx)
-                else:
-                    for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
-                        if not isinstance(json_line[column], str):
-                            raise InvalidFileFormatError(
-                                message=f'Invalid value type for "{column}" key on line {idx + 1}. '
-                                f"Expected string. Found {type(json_line[column])}.",
-                                line_number=idx + 1,
-                                error_source="key_value",
-                            )
+                            # Check that there are no extra columns
+                            for column in json_line:
+                                if (
+                                    column
+                                    not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
+                                ):
+                                    raise InvalidFileFormatError(
+                                        message=f'Found extra column "{column}" in the line {idx + 1}.',
+                                        line_number=idx + 1,
+                                        error_source="format",
+                                    )
+
+                    if current_format is None:
                        raise InvalidFileFormatError(
-                            message="All samples in the dataset must have the same dataset format. "
-                            f"Got {dataset_format} for the first line and {current_format} "
-                            f"for the line {idx + 1}.",
+                            message=(
+                                f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
+                                f"{json_line.keys()}"
+                            ),
                            line_number=idx + 1,
                            error_source="format",
                        )
+                    if current_format == DatasetFormat.PREFERENCE_OPENAI:
+                        validate_preference_openai(json_line, idx)
+                    elif current_format == DatasetFormat.CONVERSATION:
+                        message_column = JSONL_REQUIRED_COLUMNS_MAP[
+                            DatasetFormat.CONVERSATION
+                        ][0]
+                        validate_messages(json_line[message_column], idx)
+                    else:
+                        for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
+                            if not isinstance(json_line[column], str):
+                                raise InvalidFileFormatError(
+                                    message=f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                    f"Expected string. Found {type(json_line[column])}.",
+                                    line_number=idx + 1,
+                                    error_source="key_value",
+                                )

-            if idx + 1 < MIN_SAMPLES:
-                report_dict["has_min_samples"] = False
-                report_dict["message"] = (
-                    f"Processing {file} resulted in only {idx + 1} samples. "
-                    f"Our minimum is {MIN_SAMPLES} samples. "
-                )
-                report_dict["is_check_passed"] = False
-            else:
-                report_dict["num_samples"] = idx + 1
-                report_dict["has_min_samples"] = True
-
+                if dataset_format is None:
+                    dataset_format = current_format
+                elif current_format is not None:
+                    if current_format != dataset_format:
+                        raise InvalidFileFormatError(
+                            message="All samples in the dataset must have the same dataset format. "
+                            f"Got {dataset_format} for the first line and {current_format} "
+                            f"for the line {idx + 1}.",
+                            line_number=idx + 1,
+                            error_source="format",
+                        )
+
+            report_dict.update(_check_samples_count(file, report_dict, idx))

             report_dict["load_json"] = True

@@ -370,7 +467,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
     return report_dict


-def _check_parquet(file: Path) -> Dict[str, Any]:
+def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     try:
         # Pyarrow is optional as it's large (~80MB) and isn't compatible with older systems.
         from pyarrow import ArrowInvalid, parquet
@@ -380,6 +477,13 @@
         )

     report_dict: Dict[str, Any] = {}
+    if purpose == FilePurpose.Eval:
+        report_dict["is_check_passed"] = False
+        report_dict["message"] = (
+            f"Parquet files are not supported for {purpose}. "
+            "Only JSONL and CSV files are supported."
+        )
+        return report_dict

     try:
         table = parquet.read_table(str(file), memory_map=True)
@@ -399,6 +503,7 @@
         report_dict["is_check_passed"] = False
         return report_dict

+    # Don't check for eval
     for column_name in column_names:
         if column_name not in PARQUET_EXPECTED_COLUMNS:
             report_dict["load_parquet"] = (
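The extended validator can be exercised locally before uploading. A sketch under the new signature, with a placeholder path; per the checks above, a CSV passes only with the eval purpose and needs a header row, consistent columns, and the minimum sample count:

```python
from together.types.files import FilePurpose
from together.utils.files import check_file

# Placeholder path: any local CSV with a header row and consistent columns.
report = check_file("eval_data.csv", purpose=FilePurpose.Eval)
print(report["filetype"], report["is_check_passed"], report.get("load_csv"))
```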
{together-1.5.21.dist-info → together-1.5.24.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: together
-Version: 1.5.21
+Version: 1.5.24
 Summary: Python client for Together's Cloud Platform!
 License: Apache-2.0
 Author: Together AI
@@ -421,6 +421,33 @@ for model in models:
     print(model)
 ```

+### Batch Inference
+
+The batch API allows you to submit larger inference jobs for completion with a 24 hour turn-around time, below is an example. To learn more refer to the [docs here](https://docs.together.ai/docs/batch-inference).
+
+```python
+from together import Together
+
+client = Together()
+
+# Upload the batch file
+batch_file = client.files.upload(file="simpleqa_batch_student.jsonl", purpose="batch-api")
+
+# Create the batch job
+batch = client.batches.create_batch(file_id=batch_file.id, endpoint="/v1/chat/completions")
+
+# Monitor the batch status
+batch_stat = client.batches.get_batch(batch.id)
+
+# List all batches - contains other batches as well
+client.batches.list_batches()
+
+# Download the file content if job completed
+if batch_stat.status == 'COMPLETED':
+    output_response = client.files.retrieve_content(id=batch_stat.output_file_id,
+                                                    output="simpleqa_v3_output.jsonl")
+```
+
 ## Usage – CLI

 ### Chat Completions