together 1.5.35__py3-none-any.whl → 2.0.0a6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- together/__init__.py +101 -114
- together/_base_client.py +1995 -0
- together/_client.py +1033 -0
- together/_compat.py +219 -0
- together/_constants.py +14 -0
- together/_exceptions.py +108 -0
- together/_files.py +123 -0
- together/_models.py +857 -0
- together/_qs.py +150 -0
- together/_resource.py +43 -0
- together/_response.py +830 -0
- together/_streaming.py +370 -0
- together/_types.py +260 -0
- together/_utils/__init__.py +64 -0
- together/_utils/_compat.py +45 -0
- together/_utils/_datetime_parse.py +136 -0
- together/_utils/_logs.py +25 -0
- together/_utils/_proxy.py +65 -0
- together/_utils/_reflection.py +42 -0
- together/_utils/_resources_proxy.py +24 -0
- together/_utils/_streams.py +12 -0
- together/_utils/_sync.py +58 -0
- together/_utils/_transform.py +457 -0
- together/_utils/_typing.py +156 -0
- together/_utils/_utils.py +421 -0
- together/_version.py +4 -0
- together/lib/.keep +4 -0
- together/lib/__init__.py +23 -0
- together/{cli → lib/cli}/api/endpoints.py +66 -84
- together/{cli/api/evaluation.py → lib/cli/api/evals.py} +152 -43
- together/{cli → lib/cli}/api/files.py +20 -17
- together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +116 -172
- together/{cli → lib/cli}/api/models.py +34 -27
- together/lib/cli/api/utils.py +50 -0
- together/{cli → lib/cli}/cli.py +16 -26
- together/{constants.py → lib/constants.py} +11 -24
- together/lib/resources/__init__.py +11 -0
- together/lib/resources/files.py +999 -0
- together/lib/resources/fine_tuning.py +280 -0
- together/lib/resources/models.py +35 -0
- together/lib/types/__init__.py +13 -0
- together/lib/types/error.py +9 -0
- together/lib/types/fine_tuning.py +397 -0
- together/{utils → lib/utils}/__init__.py +6 -14
- together/{utils → lib/utils}/_log.py +11 -16
- together/{utils → lib/utils}/files.py +90 -288
- together/lib/utils/serializer.py +10 -0
- together/{utils → lib/utils}/tools.py +19 -55
- together/resources/__init__.py +225 -39
- together/resources/audio/__init__.py +72 -48
- together/resources/audio/audio.py +198 -0
- together/resources/audio/speech.py +574 -128
- together/resources/audio/transcriptions.py +247 -261
- together/resources/audio/translations.py +221 -241
- together/resources/audio/voices.py +111 -41
- together/resources/batches.py +417 -0
- together/resources/chat/__init__.py +30 -21
- together/resources/chat/chat.py +102 -0
- together/resources/chat/completions.py +1063 -263
- together/resources/code_interpreter/__init__.py +33 -0
- together/resources/code_interpreter/code_interpreter.py +258 -0
- together/resources/code_interpreter/sessions.py +135 -0
- together/resources/completions.py +884 -225
- together/resources/embeddings.py +172 -68
- together/resources/endpoints.py +589 -490
- together/resources/evals.py +452 -0
- together/resources/files.py +397 -129
- together/resources/fine_tuning.py +1033 -0
- together/resources/hardware.py +181 -0
- together/resources/images.py +258 -104
- together/resources/jobs.py +214 -0
- together/resources/models.py +223 -193
- together/resources/rerank.py +190 -92
- together/resources/videos.py +286 -214
- together/types/__init__.py +66 -167
- together/types/audio/__init__.py +10 -0
- together/types/audio/speech_create_params.py +75 -0
- together/types/audio/transcription_create_params.py +54 -0
- together/types/audio/transcription_create_response.py +111 -0
- together/types/audio/translation_create_params.py +40 -0
- together/types/audio/translation_create_response.py +70 -0
- together/types/audio/voice_list_response.py +23 -0
- together/types/audio_speech_stream_chunk.py +16 -0
- together/types/autoscaling.py +13 -0
- together/types/autoscaling_param.py +15 -0
- together/types/batch_create_params.py +24 -0
- together/types/batch_create_response.py +14 -0
- together/types/batch_job.py +45 -0
- together/types/batch_list_response.py +10 -0
- together/types/chat/__init__.py +18 -0
- together/types/chat/chat_completion.py +60 -0
- together/types/chat/chat_completion_chunk.py +61 -0
- together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
- together/types/chat/chat_completion_structured_message_text_param.py +13 -0
- together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
- together/types/chat/chat_completion_usage.py +13 -0
- together/types/chat/chat_completion_warning.py +9 -0
- together/types/chat/completion_create_params.py +329 -0
- together/types/code_interpreter/__init__.py +5 -0
- together/types/code_interpreter/session_list_response.py +31 -0
- together/types/code_interpreter_execute_params.py +45 -0
- together/types/completion.py +42 -0
- together/types/completion_chunk.py +66 -0
- together/types/completion_create_params.py +138 -0
- together/types/dedicated_endpoint.py +44 -0
- together/types/embedding.py +24 -0
- together/types/embedding_create_params.py +31 -0
- together/types/endpoint_create_params.py +43 -0
- together/types/endpoint_list_avzones_response.py +11 -0
- together/types/endpoint_list_params.py +18 -0
- together/types/endpoint_list_response.py +41 -0
- together/types/endpoint_update_params.py +27 -0
- together/types/eval_create_params.py +263 -0
- together/types/eval_create_response.py +16 -0
- together/types/eval_list_params.py +21 -0
- together/types/eval_list_response.py +10 -0
- together/types/eval_status_response.py +100 -0
- together/types/evaluation_job.py +139 -0
- together/types/execute_response.py +108 -0
- together/types/file_delete_response.py +13 -0
- together/types/file_list.py +12 -0
- together/types/file_purpose.py +9 -0
- together/types/file_response.py +31 -0
- together/types/file_type.py +7 -0
- together/types/fine_tuning_cancel_response.py +194 -0
- together/types/fine_tuning_content_params.py +24 -0
- together/types/fine_tuning_delete_params.py +11 -0
- together/types/fine_tuning_delete_response.py +12 -0
- together/types/fine_tuning_list_checkpoints_response.py +21 -0
- together/types/fine_tuning_list_events_response.py +12 -0
- together/types/fine_tuning_list_response.py +199 -0
- together/types/finetune_event.py +41 -0
- together/types/finetune_event_type.py +33 -0
- together/types/finetune_response.py +177 -0
- together/types/hardware_list_params.py +16 -0
- together/types/hardware_list_response.py +58 -0
- together/types/image_data_b64.py +15 -0
- together/types/image_data_url.py +15 -0
- together/types/image_file.py +23 -0
- together/types/image_generate_params.py +85 -0
- together/types/job_list_response.py +47 -0
- together/types/job_retrieve_response.py +43 -0
- together/types/log_probs.py +18 -0
- together/types/model_list_response.py +10 -0
- together/types/model_object.py +42 -0
- together/types/model_upload_params.py +36 -0
- together/types/model_upload_response.py +23 -0
- together/types/rerank_create_params.py +36 -0
- together/types/rerank_create_response.py +36 -0
- together/types/tool_choice.py +23 -0
- together/types/tool_choice_param.py +23 -0
- together/types/tools_param.py +23 -0
- together/types/training_method_dpo.py +22 -0
- together/types/training_method_sft.py +18 -0
- together/types/video_create_params.py +86 -0
- together/types/video_create_response.py +10 -0
- together/types/video_job.py +57 -0
- together-2.0.0a6.dist-info/METADATA +729 -0
- together-2.0.0a6.dist-info/RECORD +165 -0
- {together-1.5.35.dist-info → together-2.0.0a6.dist-info}/WHEEL +1 -1
- together-2.0.0a6.dist-info/entry_points.txt +2 -0
- {together-1.5.35.dist-info → together-2.0.0a6.dist-info}/licenses/LICENSE +1 -1
- together/abstract/api_requestor.py +0 -770
- together/cli/api/chat.py +0 -298
- together/cli/api/completions.py +0 -119
- together/cli/api/images.py +0 -93
- together/cli/api/utils.py +0 -139
- together/client.py +0 -186
- together/error.py +0 -194
- together/filemanager.py +0 -635
- together/legacy/__init__.py +0 -0
- together/legacy/base.py +0 -27
- together/legacy/complete.py +0 -93
- together/legacy/embeddings.py +0 -27
- together/legacy/files.py +0 -146
- together/legacy/finetune.py +0 -177
- together/legacy/images.py +0 -27
- together/legacy/models.py +0 -44
- together/resources/batch.py +0 -165
- together/resources/code_interpreter.py +0 -82
- together/resources/evaluation.py +0 -808
- together/resources/finetune.py +0 -1388
- together/together_response.py +0 -50
- together/types/abstract.py +0 -26
- together/types/audio_speech.py +0 -311
- together/types/batch.py +0 -54
- together/types/chat_completions.py +0 -210
- together/types/code_interpreter.py +0 -57
- together/types/common.py +0 -67
- together/types/completions.py +0 -107
- together/types/embeddings.py +0 -35
- together/types/endpoints.py +0 -123
- together/types/error.py +0 -16
- together/types/evaluation.py +0 -93
- together/types/files.py +0 -93
- together/types/finetune.py +0 -465
- together/types/images.py +0 -42
- together/types/models.py +0 -96
- together/types/rerank.py +0 -43
- together/types/videos.py +0 -69
- together/utils/api_helpers.py +0 -124
- together/version.py +0 -6
- together-1.5.35.dist-info/METADATA +0 -583
- together-1.5.35.dist-info/RECORD +0 -77
- together-1.5.35.dist-info/entry_points.txt +0 -3
- /together/{abstract → lib/cli}/__init__.py +0 -0
- /together/{cli → lib/cli/api}/__init__.py +0 -0
- /together/{cli/api/__init__.py → py.typed} +0 -0
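The rename map above moves the hand-written helpers under together/lib/ (CLI, constants, file utilities), while the top-level together/ package gains new client modules (_base_client.py, _models.py, resources/, types/). A minimal sketch of what that move implies for imports, assuming the helper keeps its name after the relocation (illustrative only, not taken from the package docs):

# together 1.5.35
# from together.utils.files import check_file

# together 2.0.0a6
from together.lib.utils.files import check_file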
together/{utils → lib/utils}/files.py

@@ -1,37 +1,26 @@
 from __future__ import annotations
 
+import os
 import csv
 import json
-import os
+from typing import Any, Dict, List, cast
 from pathlib import Path
 from traceback import format_exc
-from typing import Any, Dict, List
 
 from tqdm import tqdm
 
-from together.constants import (
-    JSONL_REQUIRED_COLUMNS_MAP,
-    MAX_BASE64_IMAGE_LENGTH,
-    MAX_FILE_SIZE_GB,
-    MAX_IMAGES_PER_EXAMPLE,
+from together.types import FilePurpose
+from together.lib.constants import (
     MIN_SAMPLES,
+    DISABLE_TQDM,
     NUM_BYTES_IN_GB,
+    MAX_FILE_SIZE_GB,
     PARQUET_EXPECTED_COLUMNS,
-    POSSIBLE_ROLES_CONVERSATION,
     REQUIRED_COLUMNS_MESSAGE,
+    JSONL_REQUIRED_COLUMNS_MAP,
+    POSSIBLE_ROLES_CONVERSATION,
     DatasetFormat,
 )
-from together.types import FilePurpose
-
-
-# MessageContent is a string or a list of dicts with 'type': 'text' or 'image_url', and 'text' or 'image_url.url'
-# Example: "Hello" or [
-#     {"type": "text", "text": "Hello"},
-#     {"type": "image_url", "image_url": {
-#         "url": "data:image/jpeg;base64,..."
-#     }}
-# ]
-MessageContent = str | list[dict[str, Any]]
 
 
 class InvalidFileFormatError(ValueError):
@@ -51,12 +40,12 @@ class InvalidFileFormatError(ValueError):
 
 def check_file(
     file: Path | str,
-    purpose: FilePurpose | str = FilePurpose.FineTune,
+    purpose: FilePurpose | str = "fine-tune",
 ) -> Dict[str, Any]:
     if not isinstance(file, Path):
         file = Path(file)
 
-    report_dict = {
+    report_dict: Dict[str, Any] = {
         "is_check_passed": True,
         "message": "Checks passed",
         "found": None,
@@ -78,7 +67,7 @@ def check_file(
     else:
         report_dict["found"] = True
 
-        file_size = os.stat(file).st_size
+        file_size = os.stat(file.as_posix()).st_size
 
         if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
             report_dict["message"] = (
@@ -105,8 +94,7 @@ def check_file(
             data_report_dict = _check_csv(file, purpose)
         else:
             report_dict["filetype"] = (
-                f"Unknown extension of file {file}. "
-                "Only files with extensions .jsonl and .parquet are supported."
+                f"Unknown extension of file {file}. Only files with extensions .jsonl, .parquet, and .csv are supported."
             )
             report_dict["is_check_passed"] = False
 
@@ -115,9 +103,7 @@ def check_file(
     return report_dict
 
 
-def _check_conversation_type(
-    messages: List[Dict[str, str | int | MessageContent]], idx: int
-) -> None:
+def _check_conversation_type(messages: List[Dict[str, str | bool]], idx: int) -> None:
     """Check that the conversation has correct type.
 
     Args:
@@ -128,23 +114,22 @@ def _check_conversation_type(
     Raises:
         InvalidFileFormatError: If the conversation type is invalid.
     """
-    if not isinstance(messages, list):
-        raise InvalidFileFormatError(
-            message=f"Invalid format on line {idx + 1} of the input file. "
-            f"The `messages` column must be a list. Found {type(messages)}",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
+    # if not isinstance(messages, list):
+    #     raise InvalidFileFormatError(
+    #         message=f"Invalid format on line {idx + 1} of the input file. "
+    #         f"The `messages` column must be a list. Found {type(messages)}",
+    #         line_number=idx + 1,
+    #         error_source="key_value",
+    #     )
     if len(messages) == 0:
         raise InvalidFileFormatError(
-            message=f"Invalid format on line {idx + 1} of the input file. "
-            f"The `messages` column must not be empty.",
+            message=f"Invalid format on line {idx + 1} of the input file. The `messages` column must not be empty.",
            line_number=idx + 1,
             error_source="key_value",
         )
 
     for message in messages:
-        if not isinstance(message, dict):
+        if not isinstance(cast(Any, message), dict):
             raise InvalidFileFormatError(
                 message=f"Invalid format on line {idx + 1} of the input file. "
                 f"The `messages` column must be a list of dicts. Found {type(message)}",
@@ -159,11 +144,15 @@ def _check_conversation_type(
                     line_number=idx + 1,
                     error_source="key_value",
                 )
+            if not isinstance(message[column], str):
+                raise InvalidFileFormatError(
+                    message=f"Column `{column}` is not a string on line {idx + 1}. Found {type(message[column])}",
+                    line_number=idx + 1,
+                    error_source="text_field",
+                )
 
 
-def _check_conversation_roles(
-    require_assistant_role: bool, assistant_role_exists: bool, idx: int
-) -> None:
+def _check_conversation_roles(require_assistant_role: bool, assistant_role_exists: bool, idx: int) -> None:
     """Check that the conversation has correct roles.
 
     Args:
@@ -183,9 +172,7 @@ def _check_conversation_roles(
         )
 
 
-def _check_message_weight(
-    message: Dict[str, str | int | MessageContent], idx: int
-) -> int | None:
+def _check_message_weight(message: Dict[str, str | bool], idx: int) -> None:
     """Check that the message has a weight with the correct type and value.
 
     Args:
@@ -209,14 +196,9 @@ def _check_message_weight(
             line_number=idx + 1,
             error_source="key_value",
         )
-        return weight
 
-    return None
 
-
-def _check_message_role(
-    message: Dict[str, str | int | MessageContent], previous_role: str | None, idx: int
-) -> str:
+def _check_message_role(message: Dict[str, str | bool], previous_role: str | bool | None, idx: int) -> str | bool:
     """Check that the message has correct roles.
 
     Args:
@@ -230,14 +212,6 @@ def _check_message_role(
     Raises:
         InvalidFileFormatError: If the message role is invalid.
     """
-    if not isinstance(message["role"], str):
-        raise InvalidFileFormatError(
-            message=f"Invalid role `{message['role']}` in conversation on line {idx + 1}. "
-            f"Role must be a string. Found {type(message['role'])}",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
-
     if message["role"] not in POSSIBLE_ROLES_CONVERSATION:
         raise InvalidFileFormatError(
             message=f"Invalid role `{message['role']}` in conversation on line {idx + 1}. "
@@ -255,135 +229,7 @@ def _check_message_role(
     return message["role"]
 
 
-def _check_message_content(
-    message_content: str | int | MessageContent, role: str, idx: int
-) -> tuple[bool, int]:
-    """Check that the message content has the correct type.
-    Message content can be either a) a string or b) an OpenAI-style multimodal list of content items
-    Example:
-        a) "Hello", or
-        b) [
-            {"type": "text", "text": "Hello"},
-            {"type": "image_url", "image_url": {
-                "url": "data:image/jpeg;base64,..."
-            }}
-        ]
-
-    Args:
-        message: The message to check.
-        role: The role of the message.
-        idx: Line number in the file.
-
-    Returns:
-        tuple[bool, int]: A tuple with message is multimodal and the number of images in the message content.
-    """
-    # Text-only message content
-    if isinstance(message_content, str):
-        return False, 0
-
-    # Multimodal message content
-    if isinstance(message_content, list):
-        num_images = 0
-        for item in message_content:
-            if not isinstance(item, dict):
-                raise InvalidFileFormatError(
-                    "The dataset is malformed, the `content` field must be a list of dicts.",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-            if "type" not in item:
-                raise InvalidFileFormatError(
-                    "The dataset is malformed, the `content` field must be a list of dicts with a `type` field.",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-
-            if item["type"] == "text":
-                if "text" not in item or not isinstance(item["text"], str):
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `text` field must be present in the `content` item field and be"
-                        f" a string. Got '{item.get('text')!r}' instead.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-            elif item["type"] == "image_url":
-                if role != "user":
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, only user messages can contain images.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                if "image_url" not in item or not isinstance(item["image_url"], dict):
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `image_url` field must be present in the `content` field and "
-                        f"be a dictionary. Got {item.get('image_url')!r} instead.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                image_data = item["image_url"].get("url")
-                if not image_data or not isinstance(image_data, str):
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `url` field must be present in the `image_url` field and be "
-                        f"a string. Got {image_data!r} instead.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                if not any(
-                    image_data.startswith(f"data:image/{fmt};base64,")
-                    for fmt in ["jpeg", "png", "webp"]
-                ):
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `url` field must be either a JPEG, PNG or WEBP base64-encoded "
-                        "image in 'data:image/<format>;base64,<base64_encoded_image>' format. "
-                        f"Got '{image_data[:100]}...' instead.",
-                        line_number=idx + 1,
-                    )
-
-                if len(image_data) > MAX_BASE64_IMAGE_LENGTH:
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `url` field must contain base64-encoded image "
-                        f"that is less than 10MB, found ~{len(image_data) * 3 // 4} bytes.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                num_images += 1
-            else:
-                raise InvalidFileFormatError(
-                    "The dataset is malformed, the `type` field must be either 'text' or 'image_url'. "
-                    f"Got {item['type']!r}.",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-
-        if num_images > MAX_IMAGES_PER_EXAMPLE:
-            raise InvalidFileFormatError(
-                f"The dataset is malformed, the `content` field must contain at most "
-                f"{MAX_IMAGES_PER_EXAMPLE} images, found {num_images}.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-
-        # We still consider text-only messages in such format as multimodal, even if they don't have any images
-        # included - so we can process datasets with rather sparse images (i.e. not in each sample) consistently.
-        return True, num_images
-
-    raise InvalidFileFormatError(
-        f"Invalid content type on line {idx + 1} of the input file. Expected string or multimodal list of dicts, "
-        f"found {type(message_content)}",
-        line_number=idx + 1,
-        error_source="key_value",
-    )
-
-
-def validate_messages(
-    messages: List[Dict[str, str | int | MessageContent]],
-    idx: int,
-    require_assistant_role: bool = True,
-) -> None:
+def validate_messages(messages: List[Dict[str, str | bool]], idx: int, require_assistant_role: bool = True) -> None:
     """Validate the messages column.
 
     Args:
@@ -396,45 +242,15 @@ def validate_messages(
     """
     _check_conversation_type(messages, idx)
 
+    has_weights = any("weight" in message for message in messages)
     previous_role = None
     assistant_role_exists = False
 
-    messages_are_multimodal: bool | None = None
-    total_number_of_images = 0
-
     for message in messages:
-        message_weight = _check_message_weight(message, idx)
+        if has_weights:
+            _check_message_weight(message, idx)
         previous_role = _check_message_role(message, previous_role, idx)
         assistant_role_exists |= previous_role == "assistant"
-        is_multimodal, number_of_images = _check_message_content(
-            message["content"], role=previous_role, idx=idx
-        )
-        # Multimodal validation
-        if number_of_images > 0 and message_weight is not None and message_weight != 0:
-            raise InvalidFileFormatError(
-                "Messages with images cannot have non-zero weights.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-        if messages_are_multimodal is None:
-            # Detect the format of the messages in the conversation.
-            messages_are_multimodal = is_multimodal
-        elif messages_are_multimodal != is_multimodal:
-            # Due to the format limitation, we cannot mix multimodal and text only messages in the same sample.
-            raise InvalidFileFormatError(
-                "Messages in the conversation must be either all in multimodal or all in text-only format.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-        total_number_of_images += number_of_images
-
-    if total_number_of_images > MAX_IMAGES_PER_EXAMPLE:
-        raise InvalidFileFormatError(
-            f"The dataset is malformed, the `messages` must contain at most {MAX_IMAGES_PER_EXAMPLE} images. "
-            f"Found {total_number_of_images} images.",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
 
     _check_conversation_roles(require_assistant_role, assistant_role_exists, idx)
 
@@ -463,7 +279,8 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
             error_source="key_value",
         )
 
-    validate_messages(example["input"]["messages"], idx, require_assistant_role=False)
+    messages: List[Dict[str, str | bool]] = cast(Any, example["input"]["messages"])
+    validate_messages(messages, idx, require_assistant_role=False)
 
     if example["input"]["messages"][-1]["role"] == "assistant":
         raise InvalidFileFormatError(
@@ -524,7 +341,12 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
                 error_source="key_value",
            )
 
+        if not isinstance(example[key][0]["content"], str):
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the 'content' field in `{key}` must be a string on line {idx + 1}.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
 
 
 def _check_utf8(file: Path) -> Dict[str, Any]:
@@ -536,30 +358,24 @@ def _check_utf8(file: Path) -> Dict[str, Any]:
         Dict[str, Any]: A dictionary with the results of the check.
     """
     report_dict: Dict[str, Any] = {}
-
     try:
-        # Dry-run UTF-8 decode
+        # Dry-run UTF-8 decode by iterating through the file to avoid loading it entirely into memory
        with file.open(encoding="utf-8") as f:
             for _ in f:
                 pass
-
         report_dict["utf8"] = True
     except UnicodeDecodeError as e:
         report_dict["utf8"] = False
         report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
         report_dict["is_check_passed"] = False
-
     return report_dict
 
 
-def _check_samples_count(
-    file: Path, report_dict: Dict[str, Any], idx: int
-) -> Dict[str, Any]:
+def _check_samples_count(file: Path, report_dict: Dict[str, Any], idx: int) -> Dict[str, Any]:
     if idx + 1 < MIN_SAMPLES:
         report_dict["has_min_samples"] = False
         report_dict["message"] = (
-            f"Processing {file} resulted in only {idx + 1} samples. "
-            f"Our minimum is {MIN_SAMPLES} samples. "
+            f"Processing {file} resulted in only {idx + 1} samples. Our minimum is {MIN_SAMPLES} samples. "
         )
         report_dict["is_check_passed"] = False
     else:
@@ -580,11 +396,10 @@ def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
         Dict[str, Any]: A dictionary with the results of the check.
     """
     report_dict: Dict[str, Any] = {}
-    if purpose != FilePurpose.Eval:
+    if purpose != "eval":
         report_dict["is_check_passed"] = False
         report_dict["message"] = (
-            f"CSV files are not supported for {purpose}. "
-            "Only JSONL and Parquet files are supported."
+            f"CSV files are not supported for {purpose}. Only JSONL and Parquet files are supported."
         )
         return report_dict
 
@@ -625,13 +440,9 @@ def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     except ValueError:
         report_dict["load_csv"] = False
         if idx < 0:
-            report_dict["message"] = (
-                "Unable to decode file. File may be empty or in an unsupported format. "
-            )
+            report_dict["message"] = "Unable to decode file. File may be empty or in an unsupported format. "
         else:
-            report_dict["message"] = (
-                f"Error parsing the CSV file. Unexpected format on line {idx + 1}."
-            )
+            report_dict["message"] = f"Error parsing the CSV file. Unexpected format on line {idx + 1}."
         report_dict["is_check_passed"] = False
 
     return report_dict
@@ -647,7 +458,12 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     with file.open() as f:
         idx = -1
         try:
-            for idx, line in tqdm(
+            for idx, line in tqdm(
+                enumerate(f),
+                desc="Validating file",
+                unit=" lines",
+                disable=bool(DISABLE_TQDM),
+            ):
                 json_line = json.loads(line)
 
                 if not isinstance(json_line, dict):
@@ -661,16 +477,13 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                         error_source="line_type",
                     )
                 # In evals, we don't check the format of the dataset.
-                if purpose != FilePurpose.Eval:
+                if purpose != "eval":
                     current_format = None
                     for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                        if all(
-                            column in json_line
-                            for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                        ):
+                        if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]):
                             if current_format is None:
                                 current_format = possible_format
-                            elif current_format != possible_format:
+                            elif current_format != possible_format:  # type: ignore[unreachable]
                                 raise InvalidFileFormatError(
                                     message="Found multiple dataset formats in the input file. "
                                     f"Got {current_format} and {possible_format} on line {idx + 1}.",
@@ -679,11 +492,8 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                                 )
 
                             # Check that there are no extra columns
-                            for column in json_line:
-                                if (
-                                    column
-                                    not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                                ):
+                            for column in cast(List[str], json_line.keys()):
+                                if column not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]:
                                     raise InvalidFileFormatError(
                                         message=f'Found extra column "{column}" in the line {idx + 1}.',
                                         line_number=idx + 1,
@@ -700,36 +510,36 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                            error_source="format",
                         )
                     if current_format == DatasetFormat.PREFERENCE_OPENAI:
-                        validate_preference_openai(json_line, idx)
+                        validate_preference_openai(cast(Dict[str, Any], json_line), idx)
                     elif current_format == DatasetFormat.CONVERSATION:
-                        message_column = JSONL_REQUIRED_COLUMNS_MAP[
-                            DatasetFormat.CONVERSATION
-                        ][0]
-                        require_assistant = purpose != FilePurpose.Eval
+                        message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION][0]
+                        require_assistant = purpose != "eval"
+                        message: List[Dict[str, str | bool]] = cast(Any, json_line[message_column])
                         validate_messages(
-                            json_line[message_column],
+                            message,
                             idx,
                             require_assistant_role=require_assistant,
                         )
                     else:
                         for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
+                            if not isinstance(json_line[column], str):
+                                raise InvalidFileFormatError(
+                                    message=f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                    f"Expected string. Found {type(cast(Any, json_line[column]))}.",
+                                    line_number=idx + 1,
+                                    error_source="key_value",
+                                )
 
                     if dataset_format is None:
                         dataset_format = current_format
-                    elif current_format != dataset_format:
+                    elif current_format != dataset_format:  # type: ignore[unreachable]
+                        raise InvalidFileFormatError(
+                            message="All samples in the dataset must have the same dataset format. "
+                            f"Got {dataset_format} for the first line and {current_format} "
+                            f"for the line {idx + 1}.",
+                            line_number=idx + 1,
+                            error_source="format",
+                        )
             report_dict.update(_check_samples_count(file, report_dict, idx))
 
             report_dict["load_json"] = True
@@ -745,13 +555,9 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
         except ValueError:
            report_dict["load_json"] = False
             if idx < 0:
-                report_dict["message"] = (
-                    "Unable to decode file. File may be empty or in an unsupported format. "
-                )
+                report_dict["message"] = "Unable to decode file. File may be empty or in an unsupported format. "
             else:
-                report_dict["message"] = (
-                    f"Error parsing json payload. Unexpected format on line {idx + 1}."
-                )
+                report_dict["message"] = f"Error parsing json payload. Unexpected format on line {idx + 1}."
             report_dict["is_check_passed"] = False
 
     if "text_field" not in report_dict:
@@ -767,22 +573,21 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     try:
         # Pyarrow is optional as it's large (~80MB) and isn't compatible with older systems.
         from pyarrow import ArrowInvalid, parquet
-    except ImportError:
+    except ImportError as e:
         raise ImportError(
             "pyarrow is not installed and is required to use parquet files. Please install it via `pip install together[pyarrow]`"
-        )
+        ) from e
 
     report_dict: Dict[str, Any] = {}
-    if purpose == FilePurpose.Eval:
+    if purpose == "eval":
         report_dict["is_check_passed"] = False
         report_dict["message"] = (
-            f"Parquet files are not supported for {purpose}. "
-            "Only JSONL and CSV files are supported."
+            f"Parquet files are not supported for {purpose}. Only JSONL and CSV files are supported."
        )
         return report_dict
 
     try:
-        table = parquet.read_table(str(file), memory_map=True)
+        table = parquet.read_table(str(file), memory_map=True)  # type: ignore[reportUnknownMemberType]
     except ArrowInvalid:
         report_dict["load_parquet"] = (
             f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
@@ -793,9 +598,7 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
 
     column_names = table.schema.names
     if "input_ids" not in column_names:
-        report_dict["load_parquet"] = (
-            f"Parquet file {file} does not contain the `input_ids` column."
-        )
+        report_dict["load_parquet"] = f"Parquet file {file} does not contain the `input_ids` column."
         report_dict["is_check_passed"] = False
         return report_dict
 
@@ -813,8 +616,7 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     if num_samples < MIN_SAMPLES:
         report_dict["has_min_samples"] = False
         report_dict["message"] = (
-            f"Processing {file} resulted in only {num_samples} samples. "
-            f"Our minimum is {MIN_SAMPLES} samples. "
+            f"Processing {file} resulted in only {num_samples} samples. Our minimum is {MIN_SAMPLES} samples. "
         )
         report_dict["is_check_passed"] = False
         return report_dict
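For orientation, a small usage sketch of the validator diffed above. The call shape and the report keys (is_check_passed, message) come from the hunks; the dataset path is invented for the example, and the new module location is taken from the rename map:

from pathlib import Path

from together.lib.utils.files import check_file

# "train.jsonl" is a hypothetical local dataset; in 2.0.0a6 the purpose parameter
# defaults to the string "fine-tune" rather than a FilePurpose enum member.
report = check_file(Path("train.jsonl"), purpose="fine-tune")
if not report["is_check_passed"]:
    raise SystemExit(report["message"])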
together/lib/utils/serializer.py (new)

@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from typing import Any
+from datetime import datetime
+
+
+def datetime_serializer(obj: Any) -> str:
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")