together 1.5.34__py3-none-any.whl → 2.0.0a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. together/__init__.py +101 -114
  2. together/_base_client.py +1995 -0
  3. together/_client.py +1033 -0
  4. together/_compat.py +219 -0
  5. together/_constants.py +14 -0
  6. together/_exceptions.py +108 -0
  7. together/_files.py +123 -0
  8. together/_models.py +857 -0
  9. together/_qs.py +150 -0
  10. together/_resource.py +43 -0
  11. together/_response.py +830 -0
  12. together/_streaming.py +370 -0
  13. together/_types.py +260 -0
  14. together/_utils/__init__.py +64 -0
  15. together/_utils/_compat.py +45 -0
  16. together/_utils/_datetime_parse.py +136 -0
  17. together/_utils/_logs.py +25 -0
  18. together/_utils/_proxy.py +65 -0
  19. together/_utils/_reflection.py +42 -0
  20. together/_utils/_resources_proxy.py +24 -0
  21. together/_utils/_streams.py +12 -0
  22. together/_utils/_sync.py +58 -0
  23. together/_utils/_transform.py +457 -0
  24. together/_utils/_typing.py +156 -0
  25. together/_utils/_utils.py +421 -0
  26. together/_version.py +4 -0
  27. together/lib/.keep +4 -0
  28. together/lib/__init__.py +23 -0
  29. together/{cli → lib/cli}/api/endpoints.py +65 -81
  30. together/{cli/api/evaluation.py → lib/cli/api/evals.py} +152 -43
  31. together/{cli → lib/cli}/api/files.py +20 -17
  32. together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +116 -172
  33. together/{cli → lib/cli}/api/models.py +34 -27
  34. together/lib/cli/api/utils.py +50 -0
  35. together/{cli → lib/cli}/cli.py +16 -26
  36. together/{constants.py → lib/constants.py} +11 -24
  37. together/lib/resources/__init__.py +11 -0
  38. together/lib/resources/files.py +999 -0
  39. together/lib/resources/fine_tuning.py +280 -0
  40. together/lib/resources/models.py +35 -0
  41. together/lib/types/__init__.py +13 -0
  42. together/lib/types/error.py +9 -0
  43. together/lib/types/fine_tuning.py +397 -0
  44. together/{utils → lib/utils}/__init__.py +6 -14
  45. together/{utils → lib/utils}/_log.py +11 -16
  46. together/{utils → lib/utils}/files.py +90 -288
  47. together/lib/utils/serializer.py +10 -0
  48. together/{utils → lib/utils}/tools.py +19 -55
  49. together/resources/__init__.py +225 -39
  50. together/resources/audio/__init__.py +72 -48
  51. together/resources/audio/audio.py +198 -0
  52. together/resources/audio/speech.py +574 -128
  53. together/resources/audio/transcriptions.py +247 -261
  54. together/resources/audio/translations.py +221 -241
  55. together/resources/audio/voices.py +111 -41
  56. together/resources/batches.py +417 -0
  57. together/resources/chat/__init__.py +30 -21
  58. together/resources/chat/chat.py +102 -0
  59. together/resources/chat/completions.py +1063 -263
  60. together/resources/code_interpreter/__init__.py +33 -0
  61. together/resources/code_interpreter/code_interpreter.py +258 -0
  62. together/resources/code_interpreter/sessions.py +135 -0
  63. together/resources/completions.py +884 -225
  64. together/resources/embeddings.py +172 -68
  65. together/resources/endpoints.py +589 -477
  66. together/resources/evals.py +452 -0
  67. together/resources/files.py +397 -129
  68. together/resources/fine_tuning.py +1033 -0
  69. together/resources/hardware.py +181 -0
  70. together/resources/images.py +258 -104
  71. together/resources/jobs.py +214 -0
  72. together/resources/models.py +223 -193
  73. together/resources/rerank.py +190 -92
  74. together/resources/videos.py +286 -214
  75. together/types/__init__.py +66 -167
  76. together/types/audio/__init__.py +10 -0
  77. together/types/audio/speech_create_params.py +75 -0
  78. together/types/audio/transcription_create_params.py +54 -0
  79. together/types/audio/transcription_create_response.py +111 -0
  80. together/types/audio/translation_create_params.py +40 -0
  81. together/types/audio/translation_create_response.py +70 -0
  82. together/types/audio/voice_list_response.py +23 -0
  83. together/types/audio_speech_stream_chunk.py +16 -0
  84. together/types/autoscaling.py +13 -0
  85. together/types/autoscaling_param.py +15 -0
  86. together/types/batch_create_params.py +24 -0
  87. together/types/batch_create_response.py +14 -0
  88. together/types/batch_job.py +45 -0
  89. together/types/batch_list_response.py +10 -0
  90. together/types/chat/__init__.py +18 -0
  91. together/types/chat/chat_completion.py +60 -0
  92. together/types/chat/chat_completion_chunk.py +61 -0
  93. together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
  94. together/types/chat/chat_completion_structured_message_text_param.py +13 -0
  95. together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
  96. together/types/chat/chat_completion_usage.py +13 -0
  97. together/types/chat/chat_completion_warning.py +9 -0
  98. together/types/chat/completion_create_params.py +329 -0
  99. together/types/code_interpreter/__init__.py +5 -0
  100. together/types/code_interpreter/session_list_response.py +31 -0
  101. together/types/code_interpreter_execute_params.py +45 -0
  102. together/types/completion.py +42 -0
  103. together/types/completion_chunk.py +66 -0
  104. together/types/completion_create_params.py +138 -0
  105. together/types/dedicated_endpoint.py +44 -0
  106. together/types/embedding.py +24 -0
  107. together/types/embedding_create_params.py +31 -0
  108. together/types/endpoint_create_params.py +43 -0
  109. together/types/endpoint_list_avzones_response.py +11 -0
  110. together/types/endpoint_list_params.py +18 -0
  111. together/types/endpoint_list_response.py +41 -0
  112. together/types/endpoint_update_params.py +27 -0
  113. together/types/eval_create_params.py +263 -0
  114. together/types/eval_create_response.py +16 -0
  115. together/types/eval_list_params.py +21 -0
  116. together/types/eval_list_response.py +10 -0
  117. together/types/eval_status_response.py +100 -0
  118. together/types/evaluation_job.py +139 -0
  119. together/types/execute_response.py +108 -0
  120. together/types/file_delete_response.py +13 -0
  121. together/types/file_list.py +12 -0
  122. together/types/file_purpose.py +9 -0
  123. together/types/file_response.py +31 -0
  124. together/types/file_type.py +7 -0
  125. together/types/fine_tuning_cancel_response.py +194 -0
  126. together/types/fine_tuning_content_params.py +24 -0
  127. together/types/fine_tuning_delete_params.py +11 -0
  128. together/types/fine_tuning_delete_response.py +12 -0
  129. together/types/fine_tuning_list_checkpoints_response.py +21 -0
  130. together/types/fine_tuning_list_events_response.py +12 -0
  131. together/types/fine_tuning_list_response.py +199 -0
  132. together/types/finetune_event.py +41 -0
  133. together/types/finetune_event_type.py +33 -0
  134. together/types/finetune_response.py +177 -0
  135. together/types/hardware_list_params.py +16 -0
  136. together/types/hardware_list_response.py +58 -0
  137. together/types/image_data_b64.py +15 -0
  138. together/types/image_data_url.py +15 -0
  139. together/types/image_file.py +23 -0
  140. together/types/image_generate_params.py +85 -0
  141. together/types/job_list_response.py +47 -0
  142. together/types/job_retrieve_response.py +43 -0
  143. together/types/log_probs.py +18 -0
  144. together/types/model_list_response.py +10 -0
  145. together/types/model_object.py +42 -0
  146. together/types/model_upload_params.py +36 -0
  147. together/types/model_upload_response.py +23 -0
  148. together/types/rerank_create_params.py +36 -0
  149. together/types/rerank_create_response.py +36 -0
  150. together/types/tool_choice.py +23 -0
  151. together/types/tool_choice_param.py +23 -0
  152. together/types/tools_param.py +23 -0
  153. together/types/training_method_dpo.py +22 -0
  154. together/types/training_method_sft.py +18 -0
  155. together/types/video_create_params.py +86 -0
  156. together/types/video_create_response.py +10 -0
  157. together/types/video_job.py +57 -0
  158. together-2.0.0a6.dist-info/METADATA +729 -0
  159. together-2.0.0a6.dist-info/RECORD +165 -0
  160. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/WHEEL +1 -1
  161. together-2.0.0a6.dist-info/entry_points.txt +2 -0
  162. {together-1.5.34.dist-info → together-2.0.0a6.dist-info}/licenses/LICENSE +1 -1
  163. together/abstract/api_requestor.py +0 -770
  164. together/cli/api/chat.py +0 -298
  165. together/cli/api/completions.py +0 -119
  166. together/cli/api/images.py +0 -93
  167. together/cli/api/utils.py +0 -139
  168. together/client.py +0 -186
  169. together/error.py +0 -194
  170. together/filemanager.py +0 -635
  171. together/legacy/__init__.py +0 -0
  172. together/legacy/base.py +0 -27
  173. together/legacy/complete.py +0 -93
  174. together/legacy/embeddings.py +0 -27
  175. together/legacy/files.py +0 -146
  176. together/legacy/finetune.py +0 -177
  177. together/legacy/images.py +0 -27
  178. together/legacy/models.py +0 -44
  179. together/resources/batch.py +0 -165
  180. together/resources/code_interpreter.py +0 -82
  181. together/resources/evaluation.py +0 -808
  182. together/resources/finetune.py +0 -1388
  183. together/together_response.py +0 -50
  184. together/types/abstract.py +0 -26
  185. together/types/audio_speech.py +0 -311
  186. together/types/batch.py +0 -54
  187. together/types/chat_completions.py +0 -210
  188. together/types/code_interpreter.py +0 -57
  189. together/types/common.py +0 -67
  190. together/types/completions.py +0 -107
  191. together/types/embeddings.py +0 -35
  192. together/types/endpoints.py +0 -123
  193. together/types/error.py +0 -16
  194. together/types/evaluation.py +0 -93
  195. together/types/files.py +0 -93
  196. together/types/finetune.py +0 -464
  197. together/types/images.py +0 -42
  198. together/types/models.py +0 -96
  199. together/types/rerank.py +0 -43
  200. together/types/videos.py +0 -69
  201. together/utils/api_helpers.py +0 -124
  202. together/version.py +0 -6
  203. together-1.5.34.dist-info/METADATA +0 -583
  204. together-1.5.34.dist-info/RECORD +0 -77
  205. together-1.5.34.dist-info/entry_points.txt +0 -3
  206. /together/{abstract → lib/cli}/__init__.py +0 -0
  207. /together/{cli → lib/cli/api}/__init__.py +0 -0
  208. /together/{cli/api/__init__.py → py.typed} +0 -0
together/{utils → lib/utils}/files.py
@@ -1,37 +1,26 @@
  from __future__ import annotations

+ import os
  import csv
  import json
- import os
+ from typing import Any, Dict, List, cast
  from pathlib import Path
  from traceback import format_exc
- from typing import Any, Dict, List

  from tqdm import tqdm

- from together.constants import (
-     JSONL_REQUIRED_COLUMNS_MAP,
-     MAX_BASE64_IMAGE_LENGTH,
-     MAX_FILE_SIZE_GB,
-     MAX_IMAGES_PER_EXAMPLE,
+ from together.types import FilePurpose
+ from together.lib.constants import (
      MIN_SAMPLES,
+     DISABLE_TQDM,
      NUM_BYTES_IN_GB,
+     MAX_FILE_SIZE_GB,
      PARQUET_EXPECTED_COLUMNS,
-     POSSIBLE_ROLES_CONVERSATION,
      REQUIRED_COLUMNS_MESSAGE,
+     JSONL_REQUIRED_COLUMNS_MAP,
+     POSSIBLE_ROLES_CONVERSATION,
      DatasetFormat,
  )
- from together.types import FilePurpose
-
-
- # MessageContent is a string or a list of dicts with 'type': 'text' or 'image_url', and 'text' or 'image_url.url'
- # Example: "Hello" or [
- #     {"type": "text", "text": "Hello"},
- #     {"type": "image_url", "image_url": {
- #         "url": "data:image/jpeg;base64,..."
- #     }}
- # ]
- MessageContent = str | list[dict[str, Any]]


  class InvalidFileFormatError(ValueError):
@@ -51,12 +40,12 @@ class InvalidFileFormatError(ValueError):

  def check_file(
      file: Path | str,
-     purpose: FilePurpose | str = FilePurpose.FineTune,
+     purpose: FilePurpose | str = "fine-tune",
  ) -> Dict[str, Any]:
      if not isinstance(file, Path):
          file = Path(file)

-     report_dict = {
+     report_dict: Dict[str, Any] = {
          "is_check_passed": True,
          "message": "Checks passed",
          "found": None,
@@ -78,7 +67,7 @@ def check_file(
      else:
          report_dict["found"] = True

-     file_size = os.stat(file).st_size
+     file_size = os.stat(file.as_posix()).st_size

      if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
          report_dict["message"] = (
@@ -105,8 +94,7 @@ def check_file(
          data_report_dict = _check_csv(file, purpose)
      else:
          report_dict["filetype"] = (
-             f"Unknown extension of file {file}. "
-             "Only files with extensions .jsonl and .parquet are supported."
+             f"Unknown extension of file {file}. Only files with extensions .jsonl, .parquet, and .csv are supported."
          )
          report_dict["is_check_passed"] = False

@@ -115,9 +103,7 @@ def check_file(
      return report_dict


- def _check_conversation_type(
-     messages: List[Dict[str, str | int | MessageContent]], idx: int
- ) -> None:
+ def _check_conversation_type(messages: List[Dict[str, str | bool]], idx: int) -> None:
      """Check that the conversation has correct type.

      Args:
@@ -128,23 +114,22 @@ def _check_conversation_type(
      Raises:
          InvalidFileFormatError: If the conversation type is invalid.
      """
-     if not isinstance(messages, list):
-         raise InvalidFileFormatError(
-             message=f"Invalid format on line {idx + 1} of the input file. "
-             f"The `messages` column must be a list. Found {type(messages)}",
-             line_number=idx + 1,
-             error_source="key_value",
-         )
+     # if not isinstance(messages, list):
+     #     raise InvalidFileFormatError(
+     #         message=f"Invalid format on line {idx + 1} of the input file. "
+     #         f"The `messages` column must be a list. Found {type(messages)}",
+     #         line_number=idx + 1,
+     #         error_source="key_value",
+     #     )
      if len(messages) == 0:
          raise InvalidFileFormatError(
-             message=f"Invalid format on line {idx + 1} of the input file. "
-             f"The `messages` column must not be empty.",
+             message=f"Invalid format on line {idx + 1} of the input file. The `messages` column must not be empty.",
              line_number=idx + 1,
              error_source="key_value",
          )

      for message in messages:
-         if not isinstance(message, dict):
+         if not isinstance(cast(Any, message), dict):
              raise InvalidFileFormatError(
                  message=f"Invalid format on line {idx + 1} of the input file. "
                  f"The `messages` column must be a list of dicts. Found {type(message)}",
@@ -159,11 +144,15 @@ def _check_conversation_type(
                      line_number=idx + 1,
                      error_source="key_value",
                  )
+             if not isinstance(message[column], str):
+                 raise InvalidFileFormatError(
+                     message=f"Column `{column}` is not a string on line {idx + 1}. Found {type(message[column])}",
+                     line_number=idx + 1,
+                     error_source="text_field",
+                 )


- def _check_conversation_roles(
-     require_assistant_role: bool, assistant_role_exists: bool, idx: int
- ) -> None:
+ def _check_conversation_roles(require_assistant_role: bool, assistant_role_exists: bool, idx: int) -> None:
      """Check that the conversation has correct roles.

      Args:
@@ -183,9 +172,7 @@ def _check_conversation_roles(
          )


- def _check_message_weight(
-     message: Dict[str, str | int | MessageContent], idx: int
- ) -> int | None:
+ def _check_message_weight(message: Dict[str, str | bool], idx: int) -> None:
      """Check that the message has a weight with the correct type and value.

      Args:
@@ -209,14 +196,9 @@ def _check_message_weight(
                  line_number=idx + 1,
                  error_source="key_value",
              )
-         return weight

-     return None

-
- def _check_message_role(
-     message: Dict[str, str | int | MessageContent], previous_role: str | None, idx: int
- ) -> str:
+ def _check_message_role(message: Dict[str, str | bool], previous_role: str | bool | None, idx: int) -> str | bool:
      """Check that the message has correct roles.

      Args:
@@ -230,14 +212,6 @@ def _check_message_role(
      Raises:
          InvalidFileFormatError: If the message role is invalid.
      """
-     if not isinstance(message["role"], str):
-         raise InvalidFileFormatError(
-             message=f"Invalid role `{message['role']}` in conversation on line {idx + 1}. "
-             f"Role must be a string. Found {type(message['role'])}",
-             line_number=idx + 1,
-             error_source="key_value",
-         )
-
      if message["role"] not in POSSIBLE_ROLES_CONVERSATION:
          raise InvalidFileFormatError(
              message=f"Invalid role `{message['role']}` in conversation on line {idx + 1}. "
@@ -255,135 +229,7 @@ def _check_message_role(
      return message["role"]


- def _check_message_content(
-     message_content: str | int | MessageContent, role: str, idx: int
- ) -> tuple[bool, int]:
-     """Check that the message content has the correct type.
-     Message content can be either a) a string or b) an OpenAI-style multimodal list of content items
-     Example:
-         a) "Hello", or
-         b) [
-             {"type": "text", "text": "Hello"},
-             {"type": "image_url", "image_url": {
-                 "url": "data:image/jpeg;base64,..."
-             }}
-         ]
-
-     Args:
-         message: The message to check.
-         role: The role of the message.
-         idx: Line number in the file.
-
-     Returns:
-         tuple[bool, int]: A tuple with message is multimodal and the number of images in the message content.
-     """
-     # Text-only message content
-     if isinstance(message_content, str):
-         return False, 0
-
-     # Multimodal message content
-     if isinstance(message_content, list):
-         num_images = 0
-         for item in message_content:
-             if not isinstance(item, dict):
-                 raise InvalidFileFormatError(
-                     "The dataset is malformed, the `content` field must be a list of dicts.",
-                     line_number=idx + 1,
-                     error_source="key_value",
-                 )
-             if "type" not in item:
-                 raise InvalidFileFormatError(
-                     "The dataset is malformed, the `content` field must be a list of dicts with a `type` field.",
-                     line_number=idx + 1,
-                     error_source="key_value",
-                 )
-
-             if item["type"] == "text":
-                 if "text" not in item or not isinstance(item["text"], str):
-                     raise InvalidFileFormatError(
-                         "The dataset is malformed, the `text` field must be present in the `content` item field and be"
-                         f" a string. Got '{item.get('text')!r}' instead.",
-                         line_number=idx + 1,
-                         error_source="key_value",
-                     )
-             elif item["type"] == "image_url":
-                 if role != "user":
-                     raise InvalidFileFormatError(
-                         "The dataset is malformed, only user messages can contain images.",
-                         line_number=idx + 1,
-                         error_source="key_value",
-                     )
-
-                 if "image_url" not in item or not isinstance(item["image_url"], dict):
-                     raise InvalidFileFormatError(
-                         "The dataset is malformed, the `image_url` field must be present in the `content` field and "
-                         f"be a dictionary. Got {item.get('image_url')!r} instead.",
-                         line_number=idx + 1,
-                         error_source="key_value",
-                     )
-
-                 image_data = item["image_url"].get("url")
-                 if not image_data or not isinstance(image_data, str):
-                     raise InvalidFileFormatError(
-                         "The dataset is malformed, the `url` field must be present in the `image_url` field and be "
-                         f"a string. Got {image_data!r} instead.",
-                         line_number=idx + 1,
-                         error_source="key_value",
-                     )
-
-                 if not any(
-                     image_data.startswith(f"data:image/{fmt};base64,")
-                     for fmt in ["jpeg", "png", "webp"]
-                 ):
-                     raise InvalidFileFormatError(
-                         "The dataset is malformed, the `url` field must be either a JPEG, PNG or WEBP base64-encoded "
-                         "image in 'data:image/<format>;base64,<base64_encoded_image>' format. "
-                         f"Got '{image_data[:100]}...' instead.",
-                         line_number=idx + 1,
-                     )
-
-                 if len(image_data) > MAX_BASE64_IMAGE_LENGTH:
-                     raise InvalidFileFormatError(
-                         "The dataset is malformed, the `url` field must contain base64-encoded image "
-                         f"that is less than 10MB, found ~{len(image_data) * 3 // 4} bytes.",
-                         line_number=idx + 1,
-                         error_source="key_value",
-                     )
-
-                 num_images += 1
-             else:
-                 raise InvalidFileFormatError(
-                     "The dataset is malformed, the `type` field must be either 'text' or 'image_url'. "
-                     f"Got {item['type']!r}.",
-                     line_number=idx + 1,
-                     error_source="key_value",
-                 )
-
-         if num_images > MAX_IMAGES_PER_EXAMPLE:
-             raise InvalidFileFormatError(
-                 f"The dataset is malformed, the `content` field must contain at most "
-                 f"{MAX_IMAGES_PER_EXAMPLE} images, found {num_images}.",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-
-         # We still consider text-only messages in such format as multimodal, even if they don't have any images
-         # included - so we can process datasets with rather sparse images (i.e. not in each sample) consistently.
-         return True, num_images
-
-     raise InvalidFileFormatError(
-         f"Invalid content type on line {idx + 1} of the input file. Expected string or multimodal list of dicts, "
-         f"found {type(message_content)}",
-         line_number=idx + 1,
-         error_source="key_value",
-     )
-
-
- def validate_messages(
-     messages: List[Dict[str, str | int | MessageContent]],
-     idx: int,
-     require_assistant_role: bool = True,
- ) -> None:
+ def validate_messages(messages: List[Dict[str, str | bool]], idx: int, require_assistant_role: bool = True) -> None:
      """Validate the messages column.

      Args:
@@ -396,45 +242,15 @@ def validate_messages(
      """
      _check_conversation_type(messages, idx)

+     has_weights = any("weight" in message for message in messages)
      previous_role = None
      assistant_role_exists = False

-     messages_are_multimodal: bool | None = None
-     total_number_of_images = 0
-
      for message in messages:
-         message_weight = _check_message_weight(message, idx)
+         if has_weights:
+             _check_message_weight(message, idx)
          previous_role = _check_message_role(message, previous_role, idx)
          assistant_role_exists |= previous_role == "assistant"
-         is_multimodal, number_of_images = _check_message_content(
-             message["content"], role=previous_role, idx=idx
-         )
-         # Multimodal validation
-         if number_of_images > 0 and message_weight is not None and message_weight != 0:
-             raise InvalidFileFormatError(
-                 "Messages with images cannot have non-zero weights.",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-         if messages_are_multimodal is None:
-             # Detect the format of the messages in the conversation.
-             messages_are_multimodal = is_multimodal
-         elif messages_are_multimodal != is_multimodal:
-             # Due to the format limitation, we cannot mix multimodal and text only messages in the same sample.
-             raise InvalidFileFormatError(
-                 "Messages in the conversation must be either all in multimodal or all in text-only format.",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-         total_number_of_images += number_of_images
-
-     if total_number_of_images > MAX_IMAGES_PER_EXAMPLE:
-         raise InvalidFileFormatError(
-             f"The dataset is malformed, the `messages` must contain at most {MAX_IMAGES_PER_EXAMPLE} images. "
-             f"Found {total_number_of_images} images.",
-             line_number=idx + 1,
-             error_source="key_value",
-         )

      _check_conversation_roles(require_assistant_role, assistant_role_exists, idx)

@@ -463,7 +279,8 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
              error_source="key_value",
          )

-     validate_messages(example["input"]["messages"], idx, require_assistant_role=False)
+     messages: List[Dict[str, str | bool]] = cast(Any, example["input"]["messages"])
+     validate_messages(messages, idx, require_assistant_role=False)

      if example["input"]["messages"][-1]["role"] == "assistant":
          raise InvalidFileFormatError(
@@ -524,7 +341,12 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
                  error_source="key_value",
              )

-         _check_message_content(example[key][0]["content"], role="assistant", idx=idx)
+         if not isinstance(example[key][0]["content"], str):
+             raise InvalidFileFormatError(
+                 message=f"The dataset is malformed, the 'content' field in `{key}` must be a string on line {idx + 1}.",
+                 line_number=idx + 1,
+                 error_source="key_value",
+             )


  def _check_utf8(file: Path) -> Dict[str, Any]:
@@ -536,30 +358,24 @@ def _check_utf8(file: Path) -> Dict[str, Any]:
          Dict[str, Any]: A dictionary with the results of the check.
      """
      report_dict: Dict[str, Any] = {}
-
      try:
-         # Dry-run UTF-8 decode: iterate through file to validate encoding
+         # Dry-run UTF-8 decode by iterating through the file to avoid loading it entirely into memory
          with file.open(encoding="utf-8") as f:
              for _ in f:
                  pass
-
          report_dict["utf8"] = True
      except UnicodeDecodeError as e:
          report_dict["utf8"] = False
          report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
          report_dict["is_check_passed"] = False
-
      return report_dict


- def _check_samples_count(
-     file: Path, report_dict: Dict[str, Any], idx: int
- ) -> Dict[str, Any]:
+ def _check_samples_count(file: Path, report_dict: Dict[str, Any], idx: int) -> Dict[str, Any]:
      if idx + 1 < MIN_SAMPLES:
          report_dict["has_min_samples"] = False
          report_dict["message"] = (
-             f"Processing {file} resulted in only {idx + 1} samples. "
-             f"Our minimum is {MIN_SAMPLES} samples. "
+             f"Processing {file} resulted in only {idx + 1} samples. Our minimum is {MIN_SAMPLES} samples. "
          )
          report_dict["is_check_passed"] = False
      else:
@@ -580,11 +396,10 @@ def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
          Dict[str, Any]: A dictionary with the results of the check.
      """
      report_dict: Dict[str, Any] = {}
-     if purpose != FilePurpose.Eval:
+     if purpose != "eval":
          report_dict["is_check_passed"] = False
          report_dict["message"] = (
-             f"CSV files are not supported for {purpose}. "
-             "Only JSONL and Parquet files are supported."
+             f"CSV files are not supported for {purpose}. Only JSONL and Parquet files are supported."
          )
          return report_dict

@@ -625,13 +440,9 @@ def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
      except ValueError:
          report_dict["load_csv"] = False
          if idx < 0:
-             report_dict["message"] = (
-                 "Unable to decode file. File may be empty or in an unsupported format. "
-             )
+             report_dict["message"] = "Unable to decode file. File may be empty or in an unsupported format. "
          else:
-             report_dict["message"] = (
-                 f"Error parsing the CSV file. Unexpected format on line {idx + 1}."
-             )
+             report_dict["message"] = f"Error parsing the CSV file. Unexpected format on line {idx + 1}."
          report_dict["is_check_passed"] = False

      return report_dict
@@ -647,7 +458,12 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
      with file.open() as f:
          idx = -1
          try:
-             for idx, line in tqdm(enumerate(f), desc="Validating file", unit=" lines"):
+             for idx, line in tqdm(
+                 enumerate(f),
+                 desc="Validating file",
+                 unit=" lines",
+                 disable=bool(DISABLE_TQDM),
+             ):
                  json_line = json.loads(line)

                  if not isinstance(json_line, dict):
@@ -661,16 +477,13 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                          error_source="line_type",
                      )
                  # In evals, we don't check the format of the dataset.
-                 if purpose != FilePurpose.Eval:
+                 if purpose != "eval":
                      current_format = None
                      for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                         if all(
-                             column in json_line
-                             for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                         ):
+                         if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]):
                              if current_format is None:
                                  current_format = possible_format
-                             elif current_format != possible_format:
+                             elif current_format != possible_format:  # type: ignore[unreachable]
                                  raise InvalidFileFormatError(
                                      message="Found multiple dataset formats in the input file. "
                                      f"Got {current_format} and {possible_format} on line {idx + 1}.",
@@ -679,11 +492,8 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                                  )

                              # Check that there are no extra columns
-                             for column in json_line:
-                                 if (
-                                     column
-                                     not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                                 ):
+                             for column in cast(List[str], json_line.keys()):
+                                 if column not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]:
                                      raise InvalidFileFormatError(
                                          message=f'Found extra column "{column}" in the line {idx + 1}.',
                                          line_number=idx + 1,
@@ -700,36 +510,36 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                              error_source="format",
                          )
                      if current_format == DatasetFormat.PREFERENCE_OPENAI:
-                         validate_preference_openai(json_line, idx)
+                         validate_preference_openai(cast(Dict[str, Any], json_line), idx)
                      elif current_format == DatasetFormat.CONVERSATION:
-                         message_column = JSONL_REQUIRED_COLUMNS_MAP[
-                             DatasetFormat.CONVERSATION
-                         ][0]
-                         require_assistant = purpose != FilePurpose.Eval
+                         message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION][0]
+                         require_assistant = purpose != "eval"
+                         message: List[Dict[str, str | bool]] = cast(Any, json_line[message_column])
                          validate_messages(
-                             json_line[message_column],
+                             message,
                              idx,
                              require_assistant_role=require_assistant,
                          )
                      else:
                          for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
-                             role = "assistant" if column in {"completion"} else "user"
-                             _check_message_content(
-                                 json_line[column], role=role, idx=idx
-                             )
+                             if not isinstance(json_line[column], str):
+                                 raise InvalidFileFormatError(
+                                     message=f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                     f"Expected string. Found {type(cast(Any, json_line[column]))}.",
+                                     line_number=idx + 1,
+                                     error_source="key_value",
+                                 )

                      if dataset_format is None:
                          dataset_format = current_format
-                     elif current_format is not None:
-                         if current_format != dataset_format:
-                             raise InvalidFileFormatError(
-                                 message="All samples in the dataset must have the same dataset format. "
-                                 f"Got {dataset_format} for the first line and {current_format} "
-                                 f"for the line {idx + 1}.",
-                                 line_number=idx + 1,
-                                 error_source="format",
-                             )
-
+                     elif current_format != dataset_format:  # type: ignore[unreachable]
+                         raise InvalidFileFormatError(
+                             message="All samples in the dataset must have the same dataset format. "
+                             f"Got {dataset_format} for the first line and {current_format} "
+                             f"for the line {idx + 1}.",
+                             line_number=idx + 1,
+                             error_source="format",
+                         )
              report_dict.update(_check_samples_count(file, report_dict, idx))

              report_dict["load_json"] = True
@@ -745,13 +555,9 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
          except ValueError:
              report_dict["load_json"] = False
              if idx < 0:
-                 report_dict["message"] = (
-                     "Unable to decode file. File may be empty or in an unsupported format. "
-                 )
+                 report_dict["message"] = "Unable to decode file. File may be empty or in an unsupported format. "
              else:
-                 report_dict["message"] = (
-                     f"Error parsing json payload. Unexpected format on line {idx + 1}."
-                 )
+                 report_dict["message"] = f"Error parsing json payload. Unexpected format on line {idx + 1}."
              report_dict["is_check_passed"] = False

      if "text_field" not in report_dict:
@@ -767,22 +573,21 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
      try:
          # Pyarrow is optional as it's large (~80MB) and isn't compatible with older systems.
          from pyarrow import ArrowInvalid, parquet
-     except ImportError:
+     except ImportError as e:
          raise ImportError(
              "pyarrow is not installed and is required to use parquet files. Please install it via `pip install together[pyarrow]`"
-         )
+         ) from e

      report_dict: Dict[str, Any] = {}
-     if purpose == FilePurpose.Eval:
+     if purpose == "eval":
          report_dict["is_check_passed"] = False
          report_dict["message"] = (
-             f"Parquet files are not supported for {purpose}. "
-             "Only JSONL and CSV files are supported."
+             f"Parquet files are not supported for {purpose}. Only JSONL and CSV files are supported."
          )
          return report_dict

      try:
-         table = parquet.read_table(str(file), memory_map=True)
+         table = parquet.read_table(str(file), memory_map=True)  # type: ignore[reportUnknownMemberType]
      except ArrowInvalid:
          report_dict["load_parquet"] = (
              f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
@@ -793,9 +598,7 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:

      column_names = table.schema.names
      if "input_ids" not in column_names:
-         report_dict["load_parquet"] = (
-             f"Parquet file {file} does not contain the `input_ids` column."
-         )
+         report_dict["load_parquet"] = f"Parquet file {file} does not contain the `input_ids` column."
          report_dict["is_check_passed"] = False
          return report_dict

@@ -813,8 +616,7 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
      if num_samples < MIN_SAMPLES:
          report_dict["has_min_samples"] = False
          report_dict["message"] = (
-             f"Processing {file} resulted in only {num_samples} samples. "
-             f"Our minimum is {MIN_SAMPLES} samples. "
+             f"Processing {file} resulted in only {num_samples} samples. Our minimum is {MIN_SAMPLES} samples. "
          )
          report_dict["is_check_passed"] = False
          return report_dict
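
The hunks above move the dataset validator to together/lib/utils/files.py, switch check_file to plain-string purposes ("fine-tune", "eval"), and drop the multimodal content checks in favor of string-only fields. A minimal sketch of calling the relocated validator under the new signature; the dataset path is illustrative and not taken from the diff:

from together.lib.utils.files import check_file

# Validate a local JSONL dataset before uploading it for fine-tuning.
report = check_file("training_data.jsonl", purpose="fine-tune")
if not report["is_check_passed"]:
    raise ValueError(report["message"])
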
together/lib/utils/serializer.py
@@ -0,0 +1,10 @@
+ from __future__ import annotations
+
+ from typing import Any
+ from datetime import datetime
+
+
+ def datetime_serializer(obj: Any) -> str:
+     if isinstance(obj, datetime):
+         return obj.isoformat()
+     raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
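
The new together/lib/utils/serializer.py module above is a standard json.dumps default hook. A minimal usage sketch with an illustrative payload (the surrounding call is not taken from the package):

import json
from datetime import datetime, timezone

from together.lib.utils.serializer import datetime_serializer

payload = {"id": "job-123", "created_at": datetime(2024, 1, 1, tzinfo=timezone.utc)}

# datetime values are rendered as ISO 8601 strings; other unsupported types still raise TypeError.
print(json.dumps(payload, default=datetime_serializer))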