together 1.3.2__py3-none-any.whl → 1.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- together/cli/api/finetune.py +21 -18
- together/cli/api/utils.py +35 -7
- together/constants.py +19 -0
- together/resources/finetune.py +19 -1
- together/types/finetune.py +3 -1
- together/utils/files.py +146 -26
- {together-1.3.2.dist-info → together-1.3.4.dist-info}/METADATA +2 -2
- {together-1.3.2.dist-info → together-1.3.4.dist-info}/RECORD +11 -11
- {together-1.3.2.dist-info → together-1.3.4.dist-info}/LICENSE +0 -0
- {together-1.3.2.dist-info → together-1.3.4.dist-info}/WHEEL +0 -0
- {together-1.3.2.dist-info → together-1.3.4.dist-info}/entry_points.txt +0 -0
together/cli/api/finetune.py
CHANGED
@@ -11,8 +11,13 @@ from rich import print as rprint
 from tabulate import tabulate
 
 from together import Together
-from together.cli.api.utils import INT_WITH_MAX
-from together.utils import
+from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX
+from together.utils import (
+    finetune_price_to_dollars,
+    log_warn,
+    log_warn_once,
+    parse_timestamp,
+)
 from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits
 
 
@@ -93,6 +98,13 @@ def fine_tuning(ctx: click.Context) -> None:
     default=False,
     help="Whether to skip the launch confirmation message",
 )
+@click.option(
+    "--train-on-inputs",
+    type=BOOL_WITH_AUTO,
+    default="auto",
+    help="Whether to mask the user messages in conversational data or prompts in instruction data. "
+    "`auto` will automatically determine whether to mask the inputs based on the data format.",
+)
 def create(
     ctx: click.Context,
     training_file: str,
@@ -112,6 +124,7 @@ def create(
     suffix: str,
     wandb_api_key: str,
     confirm: bool,
+    train_on_inputs: bool | Literal["auto"],
 ) -> None:
     """Start fine-tuning"""
     client: Together = ctx.obj
@@ -133,6 +146,7 @@ def create(
         lora_trainable_modules=lora_trainable_modules,
         suffix=suffix,
         wandb_api_key=wandb_api_key,
+        train_on_inputs=train_on_inputs,
     )
 
     model_limits: FinetuneTrainingLimits = client.fine_tuning.get_model_limits(
@@ -150,6 +164,10 @@ def create(
             "batch_size": model_limits.lora_training.max_batch_size,
             "learning_rate": 1e-3,
         }
+        log_warn_once(
+            f"The default LoRA rank for {model} has been changed to {default_values['lora_r']} as the max available.\n"
+            f"Also, the default learning rate for LoRA fine-tuning has been changed to {default_values['learning_rate']}."
+        )
         for arg in default_values:
             arg_source = ctx.get_parameter_source("arg")  # type: ignore[attr-defined]
             if arg_source == ParameterSource.DEFAULT:
@@ -186,22 +204,7 @@ def create(
 
     if confirm or click.confirm(_CONFIRMATION_MESSAGE, default=True, show_default=True):
         response = client.fine_tuning.create(
-
-            model=model,
-            n_epochs=n_epochs,
-            validation_file=validation_file,
-            n_evals=n_evals,
-            n_checkpoints=n_checkpoints,
-            batch_size=batch_size,
-            learning_rate=learning_rate,
-            warmup_ratio=warmup_ratio,
-            lora=lora,
-            lora_r=lora_r,
-            lora_dropout=lora_dropout,
-            lora_alpha=lora_alpha,
-            lora_trainable_modules=lora_trainable_modules,
-            suffix=suffix,
-            wandb_api_key=wandb_api_key,
+            **training_args,
             verbose=True,
         )
 
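The last hunk collapses the long explicit keyword list into a single `**training_args` unpacking, so the new `--train-on-inputs` value travels through the same dict as the other hyperparameters. A minimal sketch of that forwarding pattern, with a hypothetical `submit_job` standing in for `client.fine_tuning.create`:

from __future__ import annotations


def submit_job(model: str, n_epochs: int = 1,
               train_on_inputs: bool | str = "auto",
               verbose: bool = False) -> dict:
    # Hypothetical stand-in for client.fine_tuning.create, not the real API.
    return {"model": model, "n_epochs": n_epochs,
            "train_on_inputs": train_on_inputs, "verbose": verbose}


training_args = {
    "model": "example-model",    # placeholder values
    "n_epochs": 3,
    "train_on_inputs": "auto",   # the new CLI flag flows through unchanged
}
print(submit_job(**training_args, verbose=True))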
together/cli/api/utils.py
CHANGED
@@ -1,23 +1,51 @@
 from __future__ import annotations
 
-import
-
+from gettext import gettext as _
 from typing import Literal
 
+import click
+
 
 class AutoIntParamType(click.ParamType):
-    name = "
+    name = "integer_or_max"
+    _number_class = int
 
     def convert(
         self, value: str, param: click.Parameter | None, ctx: click.Context | None
     ) -> int | Literal["max"] | None:
-        if isinstance(value, int):
-            return value
-
         if value == "max":
             return "max"
+        try:
+            return int(value)
+        except ValueError:
+            self.fail(
+                _("{value!r} is not a valid {number_type}.").format(
+                    value=value, number_type=self.name
+                ),
+                param,
+                ctx,
+            )
 
-
+
+class BooleanWithAutoParamType(click.ParamType):
+    name = "boolean_or_auto"
+
+    def convert(
+        self, value: str, param: click.Parameter | None, ctx: click.Context | None
+    ) -> bool | Literal["auto"] | None:
+        if value == "auto":
+            return "auto"
+        try:
+            return bool(value)
+        except ValueError:
+            self.fail(
+                _("{value!r} is not a valid {type}.").format(
+                    value=value, type=self.name
+                ),
+                param,
+                ctx,
+            )
 
 
 INT_WITH_MAX = AutoIntParamType()
+BOOL_WITH_AUTO = BooleanWithAutoParamType()
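For context, a minimal sketch of how a `click.ParamType` such as the new `BOOL_WITH_AUTO` plugs into an option. The command below is illustrative and not part of the package, and it uses a trimmed copy of the class so it runs on its own:

from __future__ import annotations

from typing import Literal

import click


class BooleanWithAutoParamType(click.ParamType):
    # Trimmed copy for illustration: pass "auto" through, otherwise coerce.
    name = "boolean_or_auto"

    def convert(self, value, param, ctx) -> bool | Literal["auto"] | None:
        if value == "auto":
            return "auto"
        return bool(value)


BOOL_WITH_AUTO = BooleanWithAutoParamType()


@click.command()
@click.option("--train-on-inputs", type=BOOL_WITH_AUTO, default="auto")
def demo(train_on_inputs: bool | Literal["auto"]) -> None:
    click.echo(f"train_on_inputs={train_on_inputs!r}")


if __name__ == "__main__":
    demo()  # e.g. python demo.py --train-on-inputs auto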
together/constants.py
CHANGED
@@ -1,3 +1,5 @@
+import enum
+
 # Session constants
 TIMEOUT_SECS = 600
 MAX_SESSION_LIFETIME_SECS = 180
@@ -29,3 +31,20 @@ MAX_FILE_SIZE_GB = 4.9
 
 # expected columns for Parquet files
 PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"]
+
+
+class DatasetFormat(enum.Enum):
+    """Dataset format enum."""
+
+    GENERAL = "general"
+    CONVERSATION = "conversation"
+    INSTRUCTION = "instruction"
+
+
+JSONL_REQUIRED_COLUMNS_MAP = {
+    DatasetFormat.GENERAL: ["text"],
+    DatasetFormat.CONVERSATION: ["messages"],
+    DatasetFormat.INSTRUCTION: ["prompt", "completion"],
+}
+REQUIRED_COLUMNS_MESSAGE = ["role", "content"]
+POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
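These constants drive the format detection added to together/utils/files.py further down. A minimal sketch of the detection idea, reusing the same mapping; `detect_format` is a simplified illustration, not the package's function (the real checker also rejects lines that match more than one format):

from __future__ import annotations

import enum


class DatasetFormat(enum.Enum):
    GENERAL = "general"
    CONVERSATION = "conversation"
    INSTRUCTION = "instruction"


JSONL_REQUIRED_COLUMNS_MAP = {
    DatasetFormat.GENERAL: ["text"],
    DatasetFormat.CONVERSATION: ["messages"],
    DatasetFormat.INSTRUCTION: ["prompt", "completion"],
}


def detect_format(sample: dict) -> DatasetFormat | None:
    # Return the first format whose required columns are all present, else None.
    for fmt, columns in JSONL_REQUIRED_COLUMNS_MAP.items():
        if all(column in sample for column in columns):
            return fmt
    return None


print(detect_format({"text": "hello"}))                   # DatasetFormat.GENERAL
print(detect_format({"prompt": "Q", "completion": "A"}))  # DatasetFormat.INSTRUCTION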
together/resources/finetune.py
CHANGED
@@ -43,6 +43,7 @@ def createFinetuneRequest(
     lora_trainable_modules: str | None = "all-linear",
     suffix: str | None = None,
     wandb_api_key: str | None = None,
+    train_on_inputs: bool | Literal["auto"] = "auto",
 ) -> FinetuneRequest:
     if batch_size == "max":
         log_warn_once(
@@ -95,6 +96,7 @@ def createFinetuneRequest(
         training_type=training_type,
         suffix=suffix,
         wandb_key=wandb_api_key,
+        train_on_inputs=train_on_inputs,
     )
 
     return finetune_request
@@ -125,6 +127,7 @@ class FineTuning:
         wandb_api_key: str | None = None,
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
+        train_on_inputs: bool | Literal["auto"] = "auto",
     ) -> FinetuneResponse:
         """
         Method to initiate a fine-tuning job
@@ -137,7 +140,7 @@ class FineTuning:
             n_evals (int, optional): Number of evaluation loops to run. Defaults to 0.
             n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
                 Defaults to 1.
-            batch_size (int
+            batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
@@ -154,6 +157,12 @@ class FineTuning:
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
+            train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
+                "auto" will automatically determine whether to mask the inputs based on the data format.
+                For datasets with the "text" field (general format), inputs will not be masked.
+                For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
+                (Instruction format), inputs will be masked.
+                Defaults to "auto".
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
@@ -184,6 +193,7 @@ class FineTuning:
             lora_trainable_modules=lora_trainable_modules,
             suffix=suffix,
             wandb_api_key=wandb_api_key,
+            train_on_inputs=train_on_inputs,
         )
 
         if verbose:
@@ -436,6 +446,7 @@ class AsyncFineTuning:
         wandb_api_key: str | None = None,
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
+        train_on_inputs: bool | Literal["auto"] = "auto",
     ) -> FinetuneResponse:
         """
         Async method to initiate a fine-tuning job
@@ -465,6 +476,12 @@ class AsyncFineTuning:
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
+            train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
+                "auto" will automatically determine whether to mask the inputs based on the data format.
+                For datasets with the "text" field (general format), inputs will not be masked.
+                For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
+                (Instruction format), inputs will be masked.
+                Defaults to "auto".
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
@@ -495,6 +512,7 @@ class AsyncFineTuning:
             lora_trainable_modules=lora_trainable_modules,
             suffix=suffix,
             wandb_api_key=wandb_api_key,
+            train_on_inputs=train_on_inputs,
         )
 
         if verbose:
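A hedged usage sketch of the new keyword from the SDK side. The file ID and model name are placeholders, the `training_file` parameter name is recalled from the SDK's documented usage rather than shown in this diff, and only the `train_on_inputs` argument itself comes from the change above:

from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

response = client.fine_tuning.create(
    training_file="file-xxxxxxxx",        # placeholder for an uploaded file ID
    model="example-org/example-model",    # placeholder model name
    train_on_inputs="auto",               # new: True, False, or "auto" (default)
)
print(response)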
together/types/finetune.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 from enum import Enum
 from typing import List, Literal
 
-from pydantic import Field, validator, field_validator
+from pydantic import StrictBool, Field, validator, field_validator
 
 from together.types.abstract import BaseModel
 from together.types.common import (
@@ -163,6 +163,7 @@ class FinetuneRequest(BaseModel):
     # weights & biases api key
     wandb_key: str | None = None
     training_type: FullTrainingType | LoRATrainingType | None = None
+    train_on_inputs: StrictBool | Literal["auto"] = "auto"
 
 
 class FinetuneResponse(BaseModel):
@@ -230,6 +231,7 @@ class FinetuneResponse(BaseModel):
     # training file metadata
     training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines")
     training_file_size: int | None = Field(None, alias="TrainingFileSize")
+    train_on_inputs: StrictBool | Literal["auto"] | None = "auto"
 
     @field_validator("training_type")
     @classmethod
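The point of `StrictBool` here is that pydantic will not coerce strings or integers into booleans, so the field only accepts a genuine bool or the literal string "auto". A standalone sketch of that behaviour with a plain pydantic v2 model (not the package's `FinetuneRequest`), writing the annotation as an explicit `Union` for portability:

from typing import Literal, Union

from pydantic import BaseModel, StrictBool, ValidationError


class Flags(BaseModel):
    # Same shape as the new field: a real bool or the literal "auto".
    train_on_inputs: Union[StrictBool, Literal["auto"]] = "auto"


print(Flags().train_on_inputs)                       # auto
print(Flags(train_on_inputs=True).train_on_inputs)   # True

try:
    Flags(train_on_inputs="yes")  # neither a bool nor "auto"
except ValidationError as exc:
    print(f"rejected with {exc.error_count()} validation error(s)")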
together/utils/files.py
CHANGED
@@ -13,9 +13,28 @@ from together.constants import (
     MIN_SAMPLES,
     NUM_BYTES_IN_GB,
     PARQUET_EXPECTED_COLUMNS,
+    JSONL_REQUIRED_COLUMNS_MAP,
+    REQUIRED_COLUMNS_MESSAGE,
+    POSSIBLE_ROLES_CONVERSATION,
+    DatasetFormat,
 )
 
 
+class InvalidFileFormatError(ValueError):
+    """Exception raised for invalid file formats during file checks."""
+
+    def __init__(
+        self,
+        message: str = "",
+        line_number: int | None = None,
+        error_source: str | None = None,
+    ) -> None:
+        super().__init__(message)
+        self.message = message
+        self.line_number = line_number
+        self.error_source = error_source
+
+
 def check_file(
     file: Path | str,
 ) -> Dict[str, Any]:
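The new exception exists mainly so the checker can turn a failure into report fields instead of a traceback. A minimal sketch of that raise-and-catch round trip, using a trimmed copy of the class rather than importing it:

from __future__ import annotations


class InvalidFileFormatError(ValueError):
    # Trimmed copy of the new exception, for illustration only.
    def __init__(self, message: str = "", line_number: int | None = None,
                 error_source: str | None = None) -> None:
        super().__init__(message)
        self.message = message
        self.line_number = line_number
        self.error_source = error_source


report: dict = {"is_check_passed": True}
try:
    raise InvalidFileFormatError(
        "Expected a list of messages.", line_number=7, error_source="key_value"
    )
except InvalidFileFormatError as e:
    # Mirrors how the new except-block in _check_jsonl fills the report dict.
    report["is_check_passed"] = False
    report["message"] = e.message
    report["line_number"] = e.line_number
    report[e.error_source] = False

print(report)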
@@ -31,7 +50,7 @@ def check_file(
         "line_type": None,
         "text_field": None,
         "key_value": None,
-        "
+        "has_min_samples": None,
         "num_samples": None,
         "load_json": None,
     }
@@ -58,6 +77,7 @@ def check_file(
     else:
         report_dict["file_size"] = file_size
 
+    data_report_dict = {}
     if file.suffix == ".jsonl":
         report_dict["filetype"] = "jsonl"
         data_report_dict = _check_jsonl(file)
@@ -72,6 +92,7 @@ def check_file(
         report_dict["is_check_passed"] = False
 
     report_dict.update(data_report_dict)
+
     return report_dict
 
 
@@ -88,43 +109,132 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
         report_dict["is_check_passed"] = False
         return report_dict
 
+    dataset_format = None
     with file.open() as f:
-        # idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught
         idx = -1
         try:
             for idx, line in enumerate(f):
-                json_line = json.loads(line)
+                json_line = json.loads(line)
 
                 if not isinstance(json_line, dict):
-
-
-
-
+                    raise InvalidFileFormatError(
+                        message=(
+                            f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
+                            'Example of valid json: {"text": "my sample string"}. '
+                        ),
+                        line_number=idx + 1,
+                        error_source="line_type",
                     )
 
-
+                current_format = None
+                for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
+                    if all(
+                        column in json_line
+                        for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
+                    ):
+                        if current_format is None:
+                            current_format = possible_format
+                        elif current_format != possible_format:
+                            raise InvalidFileFormatError(
+                                message="Found multiple dataset formats in the input file. "
+                                f"Got {current_format} and {possible_format} on line {idx + 1}.",
+                                line_number=idx + 1,
+                                error_source="format",
+                            )
 
-                if
-
-
-
+                if current_format is None:
+                    raise InvalidFileFormatError(
+                        message=(
+                            f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
+                            f"{json_line.keys()}"
+                        ),
+                        line_number=idx + 1,
+                        error_source="format",
                     )
-
-
-
-
-
-
-
-                    f
+
+                if current_format == DatasetFormat.CONVERSATION:
+                    message_column = JSONL_REQUIRED_COLUMNS_MAP[
+                        DatasetFormat.CONVERSATION
+                    ][0]
+                    if not isinstance(json_line[message_column], list):
+                        raise InvalidFileFormatError(
+                            message=f"Invalid format on line {idx + 1} of the input file. "
+                            f"Expected a list of messages. Found {type(json_line[message_column])}",
+                            line_number=idx + 1,
+                            error_source="key_value",
                         )
 
-
+                    for turn_id, turn in enumerate(json_line[message_column]):
+                        if not isinstance(turn, dict):
+                            raise InvalidFileFormatError(
+                                message=f"Invalid format on line {idx + 1} of the input file. "
+                                f"Expected a dictionary in the {turn_id + 1} turn. Found {type(turn)}",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
+                    previous_role = None
+                    for turn in json_line[message_column]:
+                        for column in REQUIRED_COLUMNS_MESSAGE:
+                            if column not in turn:
+                                raise InvalidFileFormatError(
+                                    message=f"Field `{column}` is missing for a turn `{turn}` on line {idx + 1} "
+                                    "of the the input file.",
+                                    line_number=idx + 1,
+                                    error_source="key_value",
+                                )
+                            else:
+                                if not isinstance(turn[column], str):
+                                    raise InvalidFileFormatError(
+                                        message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` "
+                                        f"of the input file. Expected string. Found {type(turn[column])}",
+                                        line_number=idx + 1,
+                                        error_source="text_field",
+                                    )
+                        role = turn["role"]
+
+                        if role not in POSSIBLE_ROLES_CONVERSATION:
+                            raise InvalidFileFormatError(
+                                message=f"Found invalid role `{role}` in the messages on the line {idx + 1}. "
+                                f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
+                        if previous_role == role:
+                            raise InvalidFileFormatError(
+                                message=f"Invalid role turns on line {idx + 1} of the input file. "
+                                "`user` and `assistant` roles must alternate user/assistant/user/assistant/...",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
+                        previous_role = role
+
+                else:
+                    for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
+                        if not isinstance(json_line[column], str):
+                            raise InvalidFileFormatError(
+                                message=f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                f"Expected string. Found {type(json_line[column])}.",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
+                if dataset_format is None:
+                    dataset_format = current_format
+                elif current_format is not None:
+                    if current_format != dataset_format:
+                        raise InvalidFileFormatError(
+                            message="All samples in the dataset must have the same dataset format. "
+                            f"Got {dataset_format} for the first line and {current_format} "
+                            f"for the line {idx + 1}.",
+                            line_number=idx + 1,
+                            error_source="format",
+                        )
 
-        # make sure this is outside the for idx, line in enumerate(f): for loop
         if idx + 1 < MIN_SAMPLES:
-            report_dict["
+            report_dict["has_min_samples"] = False
             report_dict["message"] = (
                 f"Processing {file} resulted in only {idx + 1} samples. "
                 f"Our minimum is {MIN_SAMPLES} samples. "
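To make the conversational-format rules above concrete, a small illustrative sample; the required keys and role names come from the new constants, while the content and the train.jsonl filename are made up:

import json

# Passes the new checks: every turn has "role" and "content", all roles are
# known, and no role repeats back to back.
good_line = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hi!"},
        {"role": "assistant", "content": "Hello! How can I help?"},
    ]
}

# Would be rejected: two consecutive "user" turns trip the alternation check.
bad_line = {
    "messages": [
        {"role": "user", "content": "Hi!"},
        {"role": "user", "content": "Are you there?"},
    ]
}

with open("train.jsonl", "w") as f:  # hypothetical local file name
    f.write(json.dumps(good_line) + "\n")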
@@ -132,10 +242,19 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                report_dict["is_check_passed"] = False
        else:
            report_dict["num_samples"] = idx + 1
-            report_dict["
+            report_dict["has_min_samples"] = True
+            report_dict["is_check_passed"] = True
 
        report_dict["load_json"] = True
 
+    except InvalidFileFormatError as e:
+        report_dict["load_json"] = False
+        report_dict["is_check_passed"] = False
+        report_dict["message"] = e.message
+        if e.line_number is not None:
+            report_dict["line_number"] = e.line_number
+        if e.error_source is not None:
+            report_dict[e.error_source] = False
    except ValueError:
        report_dict["load_json"] = False
        if idx < 0:
@@ -190,7 +309,8 @@ def _check_parquet(file: Path) -> Dict[str, Any]:
 
    num_samples = len(table)
    if num_samples < MIN_SAMPLES:
-        report_dict["
+        report_dict["has_min_samples"] = False
+        report_dict["message"] = (
            f"Processing {file} resulted in only {num_samples} samples. "
            f"Our minimum is {MIN_SAMPLES} samples. "
        )
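A hedged sketch of running the checker end to end; the path is a placeholder, and the keys read back are the report fields touched in this diff (`has_min_samples`, `message`, `line_number`):

from together.utils.files import check_file  # module shown in this diff

report = check_file("train.jsonl")  # placeholder path to a local JSONL dataset

if report["is_check_passed"]:
    print("ok:", report.get("num_samples"), "samples,",
          "has_min_samples =", report.get("has_min_samples"))
else:
    # On failure the new code records a message plus, when available, the
    # offending line number and a per-check flag named by error_source.
    print("failed:", report.get("message"))
    print("line:", report.get("line_number"))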
{together-1.3.2.dist-info → together-1.3.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: together
-Version: 1.3.2
+Version: 1.3.4
 Summary: Python client for Together's Cloud Platform!
 Home-page: https://github.com/togethercomputer/together-python
 License: Apache-2.0
@@ -29,7 +29,7 @@ Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: rich (>=13.8.1,<14.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: tqdm (>=4.66.2,<5.0.0)
-Requires-Dist: typer (>=0.9,<0.
+Requires-Dist: typer (>=0.9,<0.14)
 Project-URL: Bug Tracker, https://github.com/togethercomputer/together-python/issues
 Project-URL: Repository, https://github.com/togethercomputer/together-python
 Description-Content-Type: text/markdown
{together-1.3.2.dist-info → together-1.3.4.dist-info}/RECORD
CHANGED
@@ -6,13 +6,13 @@ together/cli/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 together/cli/api/chat.py,sha256=2PHRb-9T-lUEKhUJFtc7SxJv3shCVx40gq_8pzfsewM,9234
 together/cli/api/completions.py,sha256=l-Zw5t7hojL3w8xd_mitS2NRB72i5Z0xwkzH0rT5XMc,4263
 together/cli/api/files.py,sha256=QLYEXRkY8J2Gg1SbTCtzGfoTMvosoeACNK83L_oLubs,3397
-together/cli/api/finetune.py,sha256=
+together/cli/api/finetune.py,sha256=vl-0cTubZER7wKEPFTFfhe8_Ry_Squ4PypPzR0VHClg,12175
 together/cli/api/images.py,sha256=GADSeaNUHUVMtWovmccGuKc28IJ9E_v4vAEwYHJhu5o,2645
 together/cli/api/models.py,sha256=xWEzu8ZpxM_Pz9KEjRPRVuv_v22RayYZ4QcgiezT5tE,1126
-together/cli/api/utils.py,sha256=
+together/cli/api/utils.py,sha256=IuqYWPnLI38_Bqd7lj8V_SnGdYc59pRmMbQmciS4FsM,1326
 together/cli/cli.py,sha256=RC0tgapkSOFjsRPg8p-8dx9D2LDzm8YmVCHUjk_aVyQ,1977
 together/client.py,sha256=mOlIFjjE9eSTb0o_weaKJwm8qvWNKHDiMmp8kQ7y68I,4946
-together/constants.py,sha256=
+together/constants.py,sha256=0L2R8ftvls9eywQstSsrQcpHIkYsOo473vGw0okArN4,1359
 together/error.py,sha256=emjhTSsLwiZvW0v1EmYemjacCMtcFIKAXWWK_2IdP18,5419
 together/filemanager.py,sha256=QHhBn73oVFdgUpSYXYLmJzHJ9c5wYEMJC0ur6ZgDeYo,11269
 together/legacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -29,7 +29,7 @@ together/resources/chat/completions.py,sha256=jYiNZsWa8RyEacL0VgxWj1egJ857oU4nxI
 together/resources/completions.py,sha256=5Wa-ZjPCxRcam6CDe7KgGYlTA7yJZMmd5TrRgGCL_ug,11726
 together/resources/embeddings.py,sha256=PTvLb82yjG_-iQOyuhsilp77Fr7gZ0o6WD2KeRnKoxs,2675
 together/resources/files.py,sha256=bnPbaF25e4InBRPvHwXHXT-oSX1Z1sZRsnQW5wq82U4,4990
-together/resources/finetune.py,sha256=
+together/resources/finetune.py,sha256=K_jLNeApduKQXtz9rN7V_tG_IZdfwGrmf_zYgJNX9aA,23609
 together/resources/images.py,sha256=LQUjKPaFxWTqOAPnyF1Pp7Rz4NLOYhmoKwshpYiprEM,4923
 together/resources/models.py,sha256=2dtHhXAqTDOOpwSbYLzWcKTC0-m2Szlb7LDYvp7Jr4w,1786
 together/resources/rerank.py,sha256=3Ju_aRSyZ1s_3zCSNZnSnEJErUVmt2xa3M8z1nvejMA,3931
@@ -42,18 +42,18 @@ together/types/completions.py,sha256=o3FR5ixsTUj-a3pmOUzbSQg-hESVhpqrC9UD__VCqr4
 together/types/embeddings.py,sha256=J7grkYYn7xhqeKaBO2T-8XQRtHhkzYzymovtGdIUK5A,751
 together/types/error.py,sha256=OVlCs3cx_2WhZK4JzHT8SQyRIIqKOP1AZQ4y1PydjAE,370
 together/types/files.py,sha256=-rEUfsV6f2vZB9NrFxT4_933ubsDIUNkPB-3OlOFk4A,1954
-together/types/finetune.py,sha256=
+together/types/finetune.py,sha256=1-EZ-HB1wA2fYX2Gt8u-nVPy6UgVyNQwh4aYzvo8eic,8079
 together/types/images.py,sha256=xnC-FZGdZU30WSFTybfGneWxb-kj0ZGufJsgHtB8j0k,980
 together/types/models.py,sha256=K9Om3cCFexy7qzRSEXUj7gpCy1CVb1hHx7MGG-hvTLw,1035
 together/types/rerank.py,sha256=qZfuXOn7MZ6ly8hpJ_MZ7OU_Bi1-cgYNSB20Wja8Qkk,1061
 together/utils/__init__.py,sha256=n1kmLiaExT9YOKT5ye--dC4tW2qcHeicKX0GR86U640,698
 together/utils/_log.py,sha256=5IYNI-jYzxyIS-pUvhb0vE_Muo3MA7GgBhsu66TKP2w,1951
 together/utils/api_helpers.py,sha256=RSF7SRhbjHzroMOSWAXscflByM1r1ta_1SpxkAT22iE,2407
-together/utils/files.py,sha256=
+together/utils/files.py,sha256=rBCwez0i0bcJIgQQsgd-ROgcakR5NfSmUreYPQoE5Nk,13005
 together/utils/tools.py,sha256=3-lXWP3cBCzOVSZg9tr5zOT1jaVeKAKVWxO2fcXZTh8,1788
 together/version.py,sha256=p03ivHyE0SyWU4jAnRTBi_sOwywVWoZPU4g2gzRgG-Y,126
-together-1.3.
-together-1.3.
-together-1.3.
-together-1.3.
-together-1.3.
+together-1.3.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+together-1.3.4.dist-info/METADATA,sha256=4z5uVKF141cdQiwBWGVlpBFvkMAOHb5RDExHDh9UtFg,11829
+together-1.3.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+together-1.3.4.dist-info/entry_points.txt,sha256=G-b5NKW6lUUf1V1fH8IPTBb7jXnK7lhbX9H1zTEJXPs,50
+together-1.3.4.dist-info/RECORD,,
{together-1.3.2.dist-info → together-1.3.4.dist-info}/LICENSE
File without changes

{together-1.3.2.dist-info → together-1.3.4.dist-info}/WHEEL
File without changes

{together-1.3.2.dist-info → together-1.3.4.dist-info}/entry_points.txt
File without changes