together 1.3.3-py3-none-any.whl → 1.3.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

together/cli/api/finetune.py CHANGED
@@ -11,8 +11,13 @@ from rich import print as rprint
  from tabulate import tabulate

  from together import Together
- from together.cli.api.utils import INT_WITH_MAX
- from together.utils import finetune_price_to_dollars, log_warn, parse_timestamp
+ from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX
+ from together.utils import (
+     finetune_price_to_dollars,
+     log_warn,
+     log_warn_once,
+     parse_timestamp,
+ )
  from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits


@@ -60,12 +65,30 @@ def fine_tuning(ctx: click.Context) -> None:
  )
  @click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
  @click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
+ @click.option(
+     "--min-lr-ratio",
+     type=float,
+     default=0.0,
+     help="The ratio of the final learning rate to the peak learning rate",
+ )
  @click.option(
      "--warmup-ratio",
      type=float,
      default=0.0,
      help="Warmup ratio for learning rate scheduler.",
  )
+ @click.option(
+     "--max-grad-norm",
+     type=float,
+     default=1.0,
+     help="Max gradient norm to be used for gradient clipping. Set to 0 to disable.",
+ )
+ @click.option(
+     "--weight-decay",
+     type=float,
+     default=0.0,
+     help="Weight decay",
+ )
  @click.option(
      "--lora/--no-lora",
      type=bool,
@@ -93,6 +116,13 @@ def fine_tuning(ctx: click.Context) -> None:
      default=False,
      help="Whether to skip the launch confirmation message",
  )
+ @click.option(
+     "--train-on-inputs",
+     type=BOOL_WITH_AUTO,
+     default="auto",
+     help="Whether to mask the user messages in conversational data or prompts in instruction data. "
+     "`auto` will automatically determine whether to mask the inputs based on the data format.",
+ )
  def create(
      ctx: click.Context,
      training_file: str,
@@ -103,7 +133,10 @@ def create(
      n_checkpoints: int,
      batch_size: int | Literal["max"],
      learning_rate: float,
+     min_lr_ratio: float,
      warmup_ratio: float,
+     max_grad_norm: float,
+     weight_decay: float,
      lora: bool,
      lora_r: int,
      lora_dropout: float,
@@ -112,6 +145,7 @@ def create(
      suffix: str,
      wandb_api_key: str,
      confirm: bool,
+     train_on_inputs: bool | Literal["auto"],
  ) -> None:
      """Start fine-tuning"""
      client: Together = ctx.obj
@@ -125,7 +159,10 @@ def create(
          n_checkpoints=n_checkpoints,
          batch_size=batch_size,
          learning_rate=learning_rate,
+         min_lr_ratio=min_lr_ratio,
          warmup_ratio=warmup_ratio,
+         max_grad_norm=max_grad_norm,
+         weight_decay=weight_decay,
          lora=lora,
          lora_r=lora_r,
          lora_dropout=lora_dropout,
@@ -133,6 +170,7 @@ def create(
          lora_trainable_modules=lora_trainable_modules,
          suffix=suffix,
          wandb_api_key=wandb_api_key,
+         train_on_inputs=train_on_inputs,
      )

      model_limits: FinetuneTrainingLimits = client.fine_tuning.get_model_limits(
@@ -150,6 +188,10 @@ def create(
              "batch_size": model_limits.lora_training.max_batch_size,
              "learning_rate": 1e-3,
          }
+         log_warn_once(
+             f"The default LoRA rank for {model} has been changed to {default_values['lora_r']} as the max available.\n"
+             f"Also, the default learning rate for LoRA fine-tuning has been changed to {default_values['learning_rate']}."
+         )
          for arg in default_values:
              arg_source = ctx.get_parameter_source("arg") # type: ignore[attr-defined]
              if arg_source == ParameterSource.DEFAULT:
@@ -186,22 +228,7 @@ def create(

      if confirm or click.confirm(_CONFIRMATION_MESSAGE, default=True, show_default=True):
          response = client.fine_tuning.create(
-             training_file=training_file,
-             model=model,
-             n_epochs=n_epochs,
-             validation_file=validation_file,
-             n_evals=n_evals,
-             n_checkpoints=n_checkpoints,
-             batch_size=batch_size,
-             learning_rate=learning_rate,
-             warmup_ratio=warmup_ratio,
-             lora=lora,
-             lora_r=lora_r,
-             lora_dropout=lora_dropout,
-             lora_alpha=lora_alpha,
-             lora_trainable_modules=lora_trainable_modules,
-             suffix=suffix,
-             wandb_api_key=wandb_api_key,
+             **training_args,
              verbose=True,
          )

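
The last hunk above collapses the long repeated keyword list into a single `**training_args` expansion. A minimal sketch of that pattern with the new hyperparameters follows; the file ID and model name are placeholders for illustration, not values taken from this diff.

# Illustration only: the real CLI builds this dict from the click options above
# and forwards it with client.fine_tuning.create(**training_args, verbose=True).
training_args = dict(
    training_file="file-xxxxxxxx",            # placeholder file ID
    model="example-org/example-base-model",   # placeholder model name
    n_epochs=1,
    learning_rate=1e-5,
    min_lr_ratio=0.0,
    warmup_ratio=0.0,
    max_grad_norm=1.0,
    weight_decay=0.0,
    train_on_inputs="auto",
)
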
together/cli/api/utils.py CHANGED
@@ -27,4 +27,25 @@ class AutoIntParamType(click.ParamType):
          )


+ class BooleanWithAutoParamType(click.ParamType):
+     name = "boolean_or_auto"
+
+     def convert(
+         self, value: str, param: click.Parameter | None, ctx: click.Context | None
+     ) -> bool | Literal["auto"] | None:
+         if value == "auto":
+             return "auto"
+         try:
+             return bool(value)
+         except ValueError:
+             self.fail(
+                 _("{value!r} is not a valid {type}.").format(
+                     value=value, type=self.name
+                 ),
+                 param,
+                 ctx,
+             )
+
+
  INT_WITH_MAX = AutoIntParamType()
+ BOOL_WITH_AUTO = BooleanWithAutoParamType()
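
A quick way to see how the new parameter type behaves is to call `convert` directly. The sketch below re-creates the class standalone (dropping the `_()` translation wrapper used in the package) so it can run outside the CLI; note that any non-empty string other than "auto" converts to True, because `bool()` on a string never raises ValueError.

from __future__ import annotations

from typing import Literal

import click


class BooleanWithAutoParamType(click.ParamType):
    """Standalone re-creation of the param type added above, for illustration."""

    name = "boolean_or_auto"

    def convert(
        self, value: str, param: click.Parameter | None, ctx: click.Context | None
    ) -> bool | Literal["auto"] | None:
        if value == "auto":
            return "auto"
        try:
            return bool(value)
        except ValueError:
            self.fail(f"{value!r} is not a valid {self.name}.", param, ctx)


BOOL_WITH_AUTO = BooleanWithAutoParamType()

print(BOOL_WITH_AUTO.convert("auto", None, None))   # -> 'auto'
print(BOOL_WITH_AUTO.convert("false", None, None))  # -> True (non-empty string is truthy)
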
together/constants.py CHANGED
@@ -1,3 +1,5 @@
+ import enum
+
  # Session constants
  TIMEOUT_SECS = 600
  MAX_SESSION_LIFETIME_SECS = 180
@@ -29,3 +31,20 @@ MAX_FILE_SIZE_GB = 4.9

  # expected columns for Parquet files
  PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"]
+
+
+ class DatasetFormat(enum.Enum):
+     """Dataset format enum."""
+
+     GENERAL = "general"
+     CONVERSATION = "conversation"
+     INSTRUCTION = "instruction"
+
+
+ JSONL_REQUIRED_COLUMNS_MAP = {
+     DatasetFormat.GENERAL: ["text"],
+     DatasetFormat.CONVERSATION: ["messages"],
+     DatasetFormat.INSTRUCTION: ["prompt", "completion"],
+ }
+ REQUIRED_COLUMNS_MESSAGE = ["role", "content"]
+ POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
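
For reference, one record per dataset format described by the new constants might look like the illustrative examples below (sample content invented for this sketch; column names come from JSONL_REQUIRED_COLUMNS_MAP and roles from POSSIBLE_ROLES_CONVERSATION).

import json

examples = {
    "general": {"text": "my sample string"},
    "conversation": {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi, how can I help?"},
        ]
    },
    "instruction": {"prompt": "Translate to French: cat", "completion": "chat"},
}

# Each record would be one line of a training .jsonl file.
for fmt, record in examples.items():
    print(fmt, json.dumps(record))
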

together/resources/finetune.py CHANGED
@@ -20,6 +20,8 @@ from together.types import (
      TogetherClient,
      TogetherRequest,
      TrainingType,
+     FinetuneLRScheduler,
+     FinetuneLinearLRSchedulerArgs,
  )
  from together.types.finetune import DownloadCheckpointType
  from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
      n_checkpoints: int | None = 1,
      batch_size: int | Literal["max"] = "max",
      learning_rate: float | None = 0.00001,
-     warmup_ratio: float | None = 0.0,
+     min_lr_ratio: float = 0.0,
+     warmup_ratio: float = 0.0,
+     max_grad_norm: float = 1.0,
+     weight_decay: float = 0.0,
      lora: bool = False,
      lora_r: int | None = None,
      lora_dropout: float | None = 0,
@@ -43,6 +48,7 @@ def createFinetuneRequest(
      lora_trainable_modules: str | None = "all-linear",
      suffix: str | None = None,
      wandb_api_key: str | None = None,
+     train_on_inputs: bool | Literal["auto"] = "auto",
  ) -> FinetuneRequest:
      if batch_size == "max":
          log_warn_once(
@@ -82,6 +88,20 @@ def createFinetuneRequest(
      if warmup_ratio > 1 or warmup_ratio < 0:
          raise ValueError("Warmup ratio should be between 0 and 1")

+     if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
+         raise ValueError("Min learning rate ratio should be between 0 and 1")
+
+     if max_grad_norm < 0:
+         raise ValueError("Max gradient norm should be non-negative")
+
+     if weight_decay is not None and (weight_decay < 0):
+         raise ValueError("Weight decay should be non-negative")
+
+     lrScheduler = FinetuneLRScheduler(
+         lr_scheduler_type="linear",
+         lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
+     )
+
      finetune_request = FinetuneRequest(
          model=model,
          training_file=training_file,
@@ -91,10 +111,14 @@ def createFinetuneRequest(
          n_checkpoints=n_checkpoints,
          batch_size=batch_size,
          learning_rate=learning_rate,
+         lr_scheduler=lrScheduler,
          warmup_ratio=warmup_ratio,
+         max_grad_norm=max_grad_norm,
+         weight_decay=weight_decay,
          training_type=training_type,
          suffix=suffix,
          wandb_key=wandb_api_key,
+         train_on_inputs=train_on_inputs,
      )

      return finetune_request
@@ -115,7 +139,10 @@ class FineTuning:
          n_checkpoints: int | None = 1,
          batch_size: int | Literal["max"] = "max",
          learning_rate: float | None = 0.00001,
-         warmup_ratio: float | None = 0.0,
+         min_lr_ratio: float = 0.0,
+         warmup_ratio: float = 0.0,
+         max_grad_norm: float = 1.0,
+         weight_decay: float = 0.0,
          lora: bool = False,
          lora_r: int | None = None,
          lora_dropout: float | None = 0,
@@ -125,6 +152,7 @@ class FineTuning:
          wandb_api_key: str | None = None,
          verbose: bool = False,
          model_limits: FinetuneTrainingLimits | None = None,
+         train_on_inputs: bool | Literal["auto"] = "auto",
      ) -> FinetuneResponse:
          """
          Method to initiate a fine-tuning job
@@ -137,10 +165,14 @@ class FineTuning:
              n_evals (int, optional): Number of evaluation loops to run. Defaults to 0.
              n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
                  Defaults to 1.
-             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
+             batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
              learning_rate (float, optional): Learning rate multiplier to use for training
                  Defaults to 0.00001.
+             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
+                 the learning rate scheduler. Defaults to 0.0.
              warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
+             weight_decay (float, optional): Weight decay. Defaults to 0.0.
              lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
              lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
              lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -154,6 +186,12 @@ class FineTuning:
                  Defaults to False.
              model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                  Defaults to None.
+             train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
+                 "auto" will automatically determine whether to mask the inputs based on the data format.
+                 For datasets with the "text" field (general format), inputs will not be masked.
+                 For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
+                 (Instruction format), inputs will be masked.
+                 Defaults to "auto".

          Returns:
              FinetuneResponse: Object containing information about fine-tuning job.
@@ -176,7 +214,10 @@ class FineTuning:
              n_checkpoints=n_checkpoints,
              batch_size=batch_size,
              learning_rate=learning_rate,
+             min_lr_ratio=min_lr_ratio,
              warmup_ratio=warmup_ratio,
+             max_grad_norm=max_grad_norm,
+             weight_decay=weight_decay,
              lora=lora,
              lora_r=lora_r,
              lora_dropout=lora_dropout,
@@ -184,6 +225,7 @@ class FineTuning:
              lora_trainable_modules=lora_trainable_modules,
              suffix=suffix,
              wandb_api_key=wandb_api_key,
+             train_on_inputs=train_on_inputs,
          )

          if verbose:
@@ -426,7 +468,10 @@ class AsyncFineTuning:
          n_checkpoints: int | None = 1,
          batch_size: int | Literal["max"] = "max",
          learning_rate: float | None = 0.00001,
-         warmup_ratio: float | None = 0.0,
+         min_lr_ratio: float = 0.0,
+         warmup_ratio: float = 0.0,
+         max_grad_norm: float = 1.0,
+         weight_decay: float = 0.0,
          lora: bool = False,
          lora_r: int | None = None,
          lora_dropout: float | None = 0,
@@ -436,6 +481,7 @@ class AsyncFineTuning:
          wandb_api_key: str | None = None,
          verbose: bool = False,
          model_limits: FinetuneTrainingLimits | None = None,
+         train_on_inputs: bool | Literal["auto"] = "auto",
      ) -> FinetuneResponse:
          """
          Async method to initiate a fine-tuning job
@@ -451,7 +497,11 @@ class AsyncFineTuning:
              batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
              learning_rate (float, optional): Learning rate multiplier to use for training
                  Defaults to 0.00001.
+             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
+                 the learning rate scheduler. Defaults to 0.0.
              warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
+             weight_decay (float, optional): Weight decay. Defaults to 0.0.
              lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
              lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
              lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -465,6 +515,12 @@ class AsyncFineTuning:
                  Defaults to False.
              model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                  Defaults to None.
+             train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
+                 "auto" will automatically determine whether to mask the inputs based on the data format.
+                 For datasets with the "text" field (general format), inputs will not be masked.
+                 For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
+                 (Instruction format), inputs will be masked.
+                 Defaults to "auto".

          Returns:
              FinetuneResponse: Object containing information about fine-tuning job.
@@ -487,7 +543,10 @@ class AsyncFineTuning:
              n_checkpoints=n_checkpoints,
              batch_size=batch_size,
              learning_rate=learning_rate,
+             min_lr_ratio=min_lr_ratio,
              warmup_ratio=warmup_ratio,
+             max_grad_norm=max_grad_norm,
+             weight_decay=weight_decay,
              lora=lora,
              lora_r=lora_r,
              lora_dropout=lora_dropout,
@@ -495,6 +554,7 @@ class AsyncFineTuning:
              lora_trainable_modules=lora_trainable_modules,
              suffix=suffix,
              wandb_api_key=wandb_api_key,
+             train_on_inputs=train_on_inputs,
          )

          if verbose:
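
Putting the expanded signature together, a call that exercises the new knobs might look like the sketch below. This is a hedged example: the file ID and model name are placeholders, and the client is assumed to read its API key from the environment.

from together import Together

client = Together()

job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",            # placeholder ID of an uploaded training file
    model="example-org/example-base-model",   # placeholder model name
    n_epochs=1,
    learning_rate=1e-5,
    min_lr_ratio=0.1,        # final LR is 10% of the peak via the linear scheduler
    warmup_ratio=0.05,
    max_grad_norm=1.0,       # 0 disables gradient clipping
    weight_decay=0.01,
    train_on_inputs="auto",  # mask inputs based on the detected dataset format
)
print(job.id)
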

together/types/__init__.py CHANGED
@@ -30,6 +30,8 @@ from together.types.finetune import (
      LoRATrainingType,
      TrainingType,
      FinetuneTrainingLimits,
+     FinetuneLRScheduler,
+     FinetuneLinearLRSchedulerArgs,
  )
  from together.types.images import (
      ImageRequest,
@@ -57,6 +59,8 @@ __all__ = [
      "FinetuneList",
      "FinetuneListEvents",
      "FinetuneDownloadResult",
+     "FinetuneLRScheduler",
+     "FinetuneLinearLRSchedulerArgs",
      "FileRequest",
      "FileResponse",
      "FileList",

together/types/finetune.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
  from enum import Enum
  from typing import List, Literal

- from pydantic import Field, validator, field_validator
+ from pydantic import StrictBool, Field, validator, field_validator
  from together.types.abstract import BaseModel
  from together.types.common import (
@@ -150,8 +150,14 @@ class FinetuneRequest(BaseModel):
      n_epochs: int
      # training learning rate
      learning_rate: float
+     # learning rate scheduler type and args
+     lr_scheduler: FinetuneLRScheduler | None = None
      # learning rate warmup ratio
      warmup_ratio: float
+     # max gradient norm
+     max_grad_norm: float
+     # weight decay
+     weight_decay: float
      # number of checkpoints to save
      n_checkpoints: int | None = None
      # number of evaluation loops to run
@@ -163,6 +169,7 @@ class FinetuneRequest(BaseModel):
      # weights & biases api key
      wandb_key: str | None = None
      training_type: FullTrainingType | LoRATrainingType | None = None
+     train_on_inputs: StrictBool | Literal["auto"] = "auto"


  class FinetuneResponse(BaseModel):
@@ -192,8 +199,14 @@ class FinetuneResponse(BaseModel):
      batch_size: int | None = None
      # training learning rate
      learning_rate: float | None = None
+     # learning rate scheduler type and args
+     lr_scheduler: FinetuneLRScheduler | None = None
      # learning rate warmup ratio
      warmup_ratio: float | None = None
+     # max gradient norm
+     max_grad_norm: float | None = None
+     # weight decay
+     weight_decay: float | None = None
      # number of steps between evals
      eval_steps: int | None = None
      # training type
@@ -230,6 +243,7 @@ class FinetuneResponse(BaseModel):
      # training file metadata
      training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines")
      training_file_size: int | None = Field(None, alias="TrainingFileSize")
+     train_on_inputs: StrictBool | Literal["auto"] | None = "auto"

      @field_validator("training_type")
      @classmethod
@@ -285,3 +299,12 @@ class FinetuneTrainingLimits(BaseModel):
      min_learning_rate: float
      full_training: FinetuneFullTrainingLimits | None = None
      lora_training: FinetuneLoraTrainingLimits | None = None
+
+
+ class FinetuneLRScheduler(BaseModel):
+     lr_scheduler_type: str
+     lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None
+
+
+ class FinetuneLinearLRSchedulerArgs(BaseModel):
+     min_lr_ratio: float | None = 0.0
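
The scheduler models compose the same way `createFinetuneRequest` uses them above. A small sketch, assuming the re-exports from `together.types` shown earlier in this diff and a pydantic v2 BaseModel (as the `field_validator` usage suggests):

from together.types import FinetuneLinearLRSchedulerArgs, FinetuneLRScheduler

lr_scheduler = FinetuneLRScheduler(
    lr_scheduler_type="linear",
    lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=0.1),
)
# Roughly: {'lr_scheduler_type': 'linear', 'lr_scheduler_args': {'min_lr_ratio': 0.1}}
print(lr_scheduler.model_dump())
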
together/utils/files.py CHANGED
@@ -13,9 +13,28 @@ from together.constants import (
      MIN_SAMPLES,
      NUM_BYTES_IN_GB,
      PARQUET_EXPECTED_COLUMNS,
+     JSONL_REQUIRED_COLUMNS_MAP,
+     REQUIRED_COLUMNS_MESSAGE,
+     POSSIBLE_ROLES_CONVERSATION,
+     DatasetFormat,
  )


+ class InvalidFileFormatError(ValueError):
+     """Exception raised for invalid file formats during file checks."""
+
+     def __init__(
+         self,
+         message: str = "",
+         line_number: int | None = None,
+         error_source: str | None = None,
+     ) -> None:
+         super().__init__(message)
+         self.message = message
+         self.line_number = line_number
+         self.error_source = error_source
+
+
  def check_file(
      file: Path | str,
  ) -> Dict[str, Any]:
@@ -31,7 +50,7 @@ def check_file(
          "line_type": None,
          "text_field": None,
          "key_value": None,
-         "min_samples": None,
+         "has_min_samples": None,
          "num_samples": None,
          "load_json": None,
      }
@@ -58,6 +77,7 @@ def check_file(
      else:
          report_dict["file_size"] = file_size

+     data_report_dict = {}
      if file.suffix == ".jsonl":
          report_dict["filetype"] = "jsonl"
          data_report_dict = _check_jsonl(file)
@@ -72,6 +92,7 @@ def check_file(
          report_dict["is_check_passed"] = False

      report_dict.update(data_report_dict)
+
      return report_dict


@@ -88,43 +109,132 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
          report_dict["is_check_passed"] = False
          return report_dict

+     dataset_format = None
      with file.open() as f:
-         # idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught
          idx = -1
          try:
              for idx, line in enumerate(f):
-                 json_line = json.loads(line) # each line in jsonlines should be a json
+                 json_line = json.loads(line)

                  if not isinstance(json_line, dict):
-                     report_dict["line_type"] = False
-                     report_dict["message"] = (
-                         f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
-                         'Example of valid json: {"text": "my sample string"}. '
+                     raise InvalidFileFormatError(
+                         message=(
+                             f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
+                             'Example of valid json: {"text": "my sample string"}. '
+                         ),
+                         line_number=idx + 1,
+                         error_source="line_type",
                      )

-                     report_dict["is_check_passed"] = False
+                 current_format = None
+                 for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
+                     if all(
+                         column in json_line
+                         for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
+                     ):
+                         if current_format is None:
+                             current_format = possible_format
+                         elif current_format != possible_format:
+                             raise InvalidFileFormatError(
+                                 message="Found multiple dataset formats in the input file. "
+                                 f"Got {current_format} and {possible_format} on line {idx + 1}.",
+                                 line_number=idx + 1,
+                                 error_source="format",
+                             )

-                 if "text" not in json_line.keys():
-                     report_dict["text_field"] = False
-                     report_dict["message"] = (
-                         f"Missing 'text' field was found on line {idx + 1} of the the input file. "
-                         "Expected format: {'text': 'my sample string'}. "
+                 if current_format is None:
+                     raise InvalidFileFormatError(
+                         message=(
+                             f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
+                             f"{json_line.keys()}"
+                         ),
+                         line_number=idx + 1,
+                         error_source="format",
                      )
-                     report_dict["is_check_passed"] = False
-                 else:
-                     # check to make sure the value of the "text" key is a string
-                     if not isinstance(json_line["text"], str):
-                         report_dict["key_value"] = False
-                         report_dict["message"] = (
-                             f'Invalid value type for "text" key on line {idx + 1}. '
-                             f'Expected string. Found {type(json_line["text"])}.'
+
+                 if current_format == DatasetFormat.CONVERSATION:
+                     message_column = JSONL_REQUIRED_COLUMNS_MAP[
+                         DatasetFormat.CONVERSATION
+                     ][0]
+                     if not isinstance(json_line[message_column], list):
+                         raise InvalidFileFormatError(
+                             message=f"Invalid format on line {idx + 1} of the input file. "
+                             f"Expected a list of messages. Found {type(json_line[message_column])}",
+                             line_number=idx + 1,
+                             error_source="key_value",
                          )

-                         report_dict["is_check_passed"] = False
+                     for turn_id, turn in enumerate(json_line[message_column]):
+                         if not isinstance(turn, dict):
+                             raise InvalidFileFormatError(
+                                 message=f"Invalid format on line {idx + 1} of the input file. "
+                                 f"Expected a dictionary in the {turn_id + 1} turn. Found {type(turn)}",
+                                 line_number=idx + 1,
+                                 error_source="key_value",
+                             )
+
+                     previous_role = None
+                     for turn in json_line[message_column]:
+                         for column in REQUIRED_COLUMNS_MESSAGE:
+                             if column not in turn:
+                                 raise InvalidFileFormatError(
+                                     message=f"Field `{column}` is missing for a turn `{turn}` on line {idx + 1} "
+                                     "of the the input file.",
+                                     line_number=idx + 1,
+                                     error_source="key_value",
+                                 )
+                             else:
+                                 if not isinstance(turn[column], str):
+                                     raise InvalidFileFormatError(
+                                         message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` "
+                                         f"of the input file. Expected string. Found {type(turn[column])}",
+                                         line_number=idx + 1,
+                                         error_source="text_field",
+                                     )
+                         role = turn["role"]
+
+                         if role not in POSSIBLE_ROLES_CONVERSATION:
+                             raise InvalidFileFormatError(
+                                 message=f"Found invalid role `{role}` in the messages on the line {idx + 1}. "
+                                 f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
+                                 line_number=idx + 1,
+                                 error_source="key_value",
+                             )
+
+                         if previous_role == role:
+                             raise InvalidFileFormatError(
+                                 message=f"Invalid role turns on line {idx + 1} of the input file. "
+                                 "`user` and `assistant` roles must alternate user/assistant/user/assistant/...",
+                                 line_number=idx + 1,
+                                 error_source="key_value",
+                             )
+
+                         previous_role = role
+
+                 else:
+                     for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
+                         if not isinstance(json_line[column], str):
+                             raise InvalidFileFormatError(
+                                 message=f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                 f"Expected string. Found {type(json_line[column])}.",
+                                 line_number=idx + 1,
+                                 error_source="key_value",
+                             )
+
+                 if dataset_format is None:
+                     dataset_format = current_format
+                 elif current_format is not None:
+                     if current_format != dataset_format:
+                         raise InvalidFileFormatError(
+                             message="All samples in the dataset must have the same dataset format. "
+                             f"Got {dataset_format} for the first line and {current_format} "
+                             f"for the line {idx + 1}.",
+                             line_number=idx + 1,
+                             error_source="format",
+                         )

-             # make sure this is outside the for idx, line in enumerate(f): for loop
              if idx + 1 < MIN_SAMPLES:
-                 report_dict["min_samples"] = False
+                 report_dict["has_min_samples"] = False
                  report_dict["message"] = (
                      f"Processing {file} resulted in only {idx + 1} samples. "
                      f"Our minimum is {MIN_SAMPLES} samples. "
@@ -132,10 +242,19 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                  report_dict["is_check_passed"] = False
              else:
                  report_dict["num_samples"] = idx + 1
-                 report_dict["min_samples"] = True
+                 report_dict["has_min_samples"] = True
+                 report_dict["is_check_passed"] = True

              report_dict["load_json"] = True

+         except InvalidFileFormatError as e:
+             report_dict["load_json"] = False
+             report_dict["is_check_passed"] = False
+             report_dict["message"] = e.message
+             if e.line_number is not None:
+                 report_dict["line_number"] = e.line_number
+             if e.error_source is not None:
+                 report_dict[e.error_source] = False
          except ValueError:
              report_dict["load_json"] = False
              if idx < 0:
@@ -190,7 +309,8 @@ def _check_parquet(file: Path) -> Dict[str, Any]:

      num_samples = len(table)
      if num_samples < MIN_SAMPLES:
-         report_dict["min_samples"] = (
+         report_dict["has_min_samples"] = False
+         report_dict["message"] = (
              f"Processing {file} resulted in only {num_samples} samples. "
              f"Our minimum is {MIN_SAMPLES} samples. "
          )
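
A hedged end-to-end sketch of the upgraded validator: write a small conversational dataset and inspect the report. The file path and contents are arbitrary; the report keys shown come from the diff above (note the rename from `min_samples` to `has_min_samples`), and whether the size check passes depends on the package's MIN_SAMPLES constant.

import json
from pathlib import Path

from together.utils.files import check_file

path = Path("sample_conversation.jsonl")
with path.open("w") as f:
    for i in range(10):
        record = {
            "messages": [
                {"role": "user", "content": f"question {i}"},
                {"role": "assistant", "content": f"answer {i}"},
            ]
        }
        f.write(json.dumps(record) + "\n")

report = check_file(path)
print(report["is_check_passed"], report.get("has_min_samples"), report.get("message"))
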
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: together
- Version: 1.3.3
+ Version: 1.3.5
  Summary: Python client for Together's Cloud Platform!
  Home-page: https://github.com/togethercomputer/together-python
  License: Apache-2.0
@@ -29,7 +29,7 @@ Requires-Dist: requests (>=2.31.0,<3.0.0)
  Requires-Dist: rich (>=13.8.1,<14.0.0)
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
  Requires-Dist: tqdm (>=4.66.2,<5.0.0)
- Requires-Dist: typer (>=0.9,<0.13)
+ Requires-Dist: typer (>=0.9,<0.14)
  Project-URL: Bug Tracker, https://github.com/togethercomputer/together-python/issues
  Project-URL: Repository, https://github.com/togethercomputer/together-python
  Description-Content-Type: text/markdown
@@ -6,13 +6,13 @@ together/cli/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
  together/cli/api/chat.py,sha256=2PHRb-9T-lUEKhUJFtc7SxJv3shCVx40gq_8pzfsewM,9234
  together/cli/api/completions.py,sha256=l-Zw5t7hojL3w8xd_mitS2NRB72i5Z0xwkzH0rT5XMc,4263
  together/cli/api/files.py,sha256=QLYEXRkY8J2Gg1SbTCtzGfoTMvosoeACNK83L_oLubs,3397
- together/cli/api/finetune.py,sha256=rF3PP-BwvrVPPubNn5j-_EToss7sC5pj0-DjsJ7hYxc,12026
+ together/cli/api/finetune.py,sha256=78dJs_hF_gDWQjUT5R3v518GmNQnnB0Qt8CyU68e5jY,12760
  together/cli/api/images.py,sha256=GADSeaNUHUVMtWovmccGuKc28IJ9E_v4vAEwYHJhu5o,2645
  together/cli/api/models.py,sha256=xWEzu8ZpxM_Pz9KEjRPRVuv_v22RayYZ4QcgiezT5tE,1126
- together/cli/api/utils.py,sha256=nWvaCplARQa5e4lrisI8-mwY6031UMaO3wcpVxpUM9I,726
+ together/cli/api/utils.py,sha256=IuqYWPnLI38_Bqd7lj8V_SnGdYc59pRmMbQmciS4FsM,1326
  together/cli/cli.py,sha256=RC0tgapkSOFjsRPg8p-8dx9D2LDzm8YmVCHUjk_aVyQ,1977
  together/client.py,sha256=mOlIFjjE9eSTb0o_weaKJwm8qvWNKHDiMmp8kQ7y68I,4946
- together/constants.py,sha256=6DAvMTrGYI73gUFRbfBdLfDxksucpKjKsiH07PGtSSM,906
+ together/constants.py,sha256=0L2R8ftvls9eywQstSsrQcpHIkYsOo473vGw0okArN4,1359
  together/error.py,sha256=emjhTSsLwiZvW0v1EmYemjacCMtcFIKAXWWK_2IdP18,5419
  together/filemanager.py,sha256=QHhBn73oVFdgUpSYXYLmJzHJ9c5wYEMJC0ur6ZgDeYo,11269
  together/legacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -29,12 +29,12 @@ together/resources/chat/completions.py,sha256=jYiNZsWa8RyEacL0VgxWj1egJ857oU4nxI
  together/resources/completions.py,sha256=5Wa-ZjPCxRcam6CDe7KgGYlTA7yJZMmd5TrRgGCL_ug,11726
  together/resources/embeddings.py,sha256=PTvLb82yjG_-iQOyuhsilp77Fr7gZ0o6WD2KeRnKoxs,2675
  together/resources/files.py,sha256=bnPbaF25e4InBRPvHwXHXT-oSX1Z1sZRsnQW5wq82U4,4990
- together/resources/finetune.py,sha256=AEuX21vTcnBZOvMIFtcV0Y4X008_gF_-Tnz87RjQT-Q,22213
+ together/resources/finetune.py,sha256=UcbPAZ0b_WR3ks754n5fPzDjraNQHSkulaKGmQQZ2Zs,25516
  together/resources/images.py,sha256=LQUjKPaFxWTqOAPnyF1Pp7Rz4NLOYhmoKwshpYiprEM,4923
  together/resources/models.py,sha256=2dtHhXAqTDOOpwSbYLzWcKTC0-m2Szlb7LDYvp7Jr4w,1786
  together/resources/rerank.py,sha256=3Ju_aRSyZ1s_3zCSNZnSnEJErUVmt2xa3M8z1nvejMA,3931
  together/together_response.py,sha256=MhczUCPem93cjX-A1TOAUrRj3sO-o3SLcEcTsZgVzQI,1319
- together/types/__init__.py,sha256=oHZCMC0H3j1ykf7ZRgxIU0QBA534EMpfKqRaa9SdgOo,1739
+ together/types/__init__.py,sha256=jEnnepzUeeYgCNTQIi4EWKaOEsZKYp0vEqzYmP8bK5o,1863
  together/types/abstract.py,sha256=1lFQI_3WjsR_t1128AeKW0aTk6EiM6Gh1J3ZuyLLPao,642
  together/types/chat_completions.py,sha256=d24F3VfT7uVnmaEk7Fn-O7qkGUg_AQQzR7vPwlXVDXw,4882
  together/types/common.py,sha256=4ZeIgqGioqhIC-nNxY90czNPp-kAqboMulw6-1z6ShM,1511
@@ -42,18 +42,18 @@ together/types/completions.py,sha256=o3FR5ixsTUj-a3pmOUzbSQg-hESVhpqrC9UD__VCqr4
  together/types/embeddings.py,sha256=J7grkYYn7xhqeKaBO2T-8XQRtHhkzYzymovtGdIUK5A,751
  together/types/error.py,sha256=OVlCs3cx_2WhZK4JzHT8SQyRIIqKOP1AZQ4y1PydjAE,370
  together/types/files.py,sha256=-rEUfsV6f2vZB9NrFxT4_933ubsDIUNkPB-3OlOFk4A,1954
- together/types/finetune.py,sha256=CVnU20WKBo9mGDu3uErzqqWvP0xvx2aVCIRHpfb0HJI,7942
+ together/types/finetune.py,sha256=17IM5A__GnT6hgMClMz0vESohWI_qh5Eeq3iR9w1ODg,8704
  together/types/images.py,sha256=xnC-FZGdZU30WSFTybfGneWxb-kj0ZGufJsgHtB8j0k,980
  together/types/models.py,sha256=K9Om3cCFexy7qzRSEXUj7gpCy1CVb1hHx7MGG-hvTLw,1035
  together/types/rerank.py,sha256=qZfuXOn7MZ6ly8hpJ_MZ7OU_Bi1-cgYNSB20Wja8Qkk,1061
  together/utils/__init__.py,sha256=n1kmLiaExT9YOKT5ye--dC4tW2qcHeicKX0GR86U640,698
  together/utils/_log.py,sha256=5IYNI-jYzxyIS-pUvhb0vE_Muo3MA7GgBhsu66TKP2w,1951
  together/utils/api_helpers.py,sha256=RSF7SRhbjHzroMOSWAXscflByM1r1ta_1SpxkAT22iE,2407
- together/utils/files.py,sha256=gMLthqfP5hKxVAerHMdy7gLXzdfY6lyOXdpW24Y4X3I,7165
+ together/utils/files.py,sha256=rBCwez0i0bcJIgQQsgd-ROgcakR5NfSmUreYPQoE5Nk,13005
  together/utils/tools.py,sha256=3-lXWP3cBCzOVSZg9tr5zOT1jaVeKAKVWxO2fcXZTh8,1788
  together/version.py,sha256=p03ivHyE0SyWU4jAnRTBi_sOwywVWoZPU4g2gzRgG-Y,126
- together-1.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- together-1.3.3.dist-info/METADATA,sha256=Cz9xAfH5jrXLw1RxPE25ZFzloe72UGfza2o1ltF_nm8,11829
- together-1.3.3.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- together-1.3.3.dist-info/entry_points.txt,sha256=G-b5NKW6lUUf1V1fH8IPTBb7jXnK7lhbX9H1zTEJXPs,50
- together-1.3.3.dist-info/RECORD,,
+ together-1.3.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ together-1.3.5.dist-info/METADATA,sha256=4naWLEoh8icjBGlIVvJSXlNjtwFGdgKpWi-hVEXDo-E,11829
+ together-1.3.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ together-1.3.5.dist-info/entry_points.txt,sha256=G-b5NKW6lUUf1V1fH8IPTBb7jXnK7lhbX9H1zTEJXPs,50
+ together-1.3.5.dist-info/RECORD,,