together 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
together/constants.py CHANGED
@@ -26,3 +26,6 @@ NUM_BYTES_IN_GB = 2**30
26
26
 
27
27
  # maximum number of GB sized files we support finetuning for
28
28
  MAX_FILE_SIZE_GB = 4.9
29
+
30
+ # expected columns for Parquet files
31
+ PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"]
together/filemanager.py CHANGED
@@ -25,7 +25,13 @@ from together.error import (
25
25
  FileTypeError,
26
26
  )
27
27
  from together.together_response import TogetherResponse
28
- from together.types import FilePurpose, FileResponse, TogetherClient, TogetherRequest
28
+ from together.types import (
29
+ FilePurpose,
30
+ FileResponse,
31
+ FileType,
32
+ TogetherClient,
33
+ TogetherRequest,
34
+ )
29
35
 
30
36
 
31
37
  def chmod_and_replace(src: Path, dst: Path) -> None:
@@ -260,12 +266,17 @@ class UploadManager:
260
266
  http_status=response.status_code,
261
267
  )
262
268
 
263
- def redirect_policy(
264
- self, url: str, file: Path, purpose: FilePurpose
269
+ def get_upload_url(
270
+ self,
271
+ url: str,
272
+ file: Path,
273
+ purpose: FilePurpose,
274
+ filetype: FileType,
265
275
  ) -> Tuple[str, str]:
266
276
  data = {
267
277
  "purpose": purpose.value,
268
278
  "file_name": file.name,
279
+ "file_type": filetype.value,
269
280
  }
270
281
 
271
282
  requestor = api_requestor.APIRequestor(
@@ -324,7 +335,16 @@ class UploadManager:
324
335
 
325
336
  redirect_url = None
326
337
  if redirect:
327
- redirect_url, file_id = self.redirect_policy(url, file, purpose)
338
+ if file.suffix == ".jsonl":
339
+ filetype = FileType.jsonl
340
+ elif file.suffix == ".parquet":
341
+ filetype = FileType.parquet
342
+ else:
343
+ raise FileTypeError(
344
+ f"Unknown extension of file {file}. "
345
+ "Only files with extensions .jsonl and .parquet are supported."
346
+ )
347
+ redirect_url, file_id = self.get_upload_url(url, file, purpose, filetype)
328
348
 
329
349
  file_size = os.stat(file.as_posix()).st_size
330
350
 
@@ -18,6 +18,7 @@ from together.types.files import (
18
18
  FilePurpose,
19
19
  FileRequest,
20
20
  FileResponse,
21
+ FileType,
21
22
  )
22
23
  from together.types.finetune import (
23
24
  FinetuneDownloadResult,
@@ -55,6 +56,7 @@ __all__ = [
55
56
  "FileDeleteResponse",
56
57
  "FileObject",
57
58
  "FilePurpose",
59
+ "FileType",
58
60
  "ImageRequest",
59
61
  "ImageResponse",
60
62
  "ModelObject",
together/types/files.py CHANGED
@@ -15,6 +15,11 @@ class FilePurpose(str, Enum):
15
15
  FineTune = "fine-tune"
16
16
 
17
17
 
18
class FileType(str, Enum):
    """On-disk formats the file-upload path distinguishes between.

    The enum mixes in ``str``, so every member is also a plain string equal
    to its value.
    """

    # Line-delimited JSON file.
    jsonl = "jsonl"
    # Apache Parquet file.
    parquet = "parquet"
21
+
22
+
18
23
  class FileRequest(BaseModel):
19
24
  """
20
25
  Files request type
@@ -43,21 +48,17 @@ class FileResponse(BaseModel):
43
48
  Files API response type
44
49
  """
45
50
 
46
- # file id
47
51
  id: str
48
- # object type
49
52
  object: Literal[ObjectType.File]
50
53
  # created timestamp
51
54
  created_at: int | None = None
52
- # file purpose
55
+ type: FileType | None = None
53
56
  purpose: FilePurpose | None = None
54
- # file-name
55
57
  filename: str | None = None
56
58
  # file byte size
57
59
  bytes: int | None = None
58
60
  # JSONL line count
59
61
  line_count: int | None = Field(None, alias="LineCount")
60
- # is processed
61
62
  processed: bool | None = Field(None, alias="Processed")
62
63
 
63
64
 
together/utils/files.py CHANGED
@@ -3,9 +3,17 @@ from __future__ import annotations
3
3
  import json
4
4
  import os
5
5
  from pathlib import Path
6
+ from traceback import format_exc
6
7
  from typing import Any, Dict
7
8
 
8
- from together.constants import MAX_FILE_SIZE_GB, MIN_SAMPLES, NUM_BYTES_IN_GB
9
+ from pyarrow import ArrowInvalid, parquet
10
+
11
+ from together.constants import (
12
+ MAX_FILE_SIZE_GB,
13
+ MIN_SAMPLES,
14
+ NUM_BYTES_IN_GB,
15
+ PARQUET_EXPECTED_COLUMNS,
16
+ )
9
17
 
10
18
 
11
19
  def check_file(
@@ -50,6 +58,25 @@ def check_file(
50
58
  else:
51
59
  report_dict["file_size"] = file_size
52
60
 
61
+ if file.suffix == ".jsonl":
62
+ report_dict["filetype"] = "jsonl"
63
+ data_report_dict = _check_jsonl(file)
64
+ elif file.suffix == ".parquet":
65
+ report_dict["filetype"] = "parquet"
66
+ data_report_dict = _check_parquet(file)
67
+ else:
68
+ report_dict["filetype"] = (
69
+ f"Unknown extension of file {file}. "
70
+ "Only files with extensions .jsonl and .parquet are supported."
71
+ )
72
+ report_dict["is_check_passed"] = False
73
+
74
+ report_dict.update(data_report_dict)
75
+ return report_dict
76
+
77
+
78
+ def _check_jsonl(file: Path) -> Dict[str, Any]:
79
+ report_dict: Dict[str, Any] = {}
53
80
  # Check that the file is UTF-8 encoded. If not report where the error occurs.
54
81
  try:
55
82
  with file.open(encoding="utf-8") as f:
@@ -71,7 +98,7 @@ def check_file(
71
98
  if not isinstance(json_line, dict):
72
99
  report_dict["line_type"] = False
73
100
  report_dict["message"] = (
74
- f"Error parsing file. Invalid format on line {idx+1} of the input file. "
101
+ f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
75
102
  'Example of valid json: {"text": "my sample string"}. '
76
103
  )
77
104
 
@@ -80,7 +107,7 @@ def check_file(
80
107
  if "text" not in json_line.keys():
81
108
  report_dict["text_field"] = False
82
109
  report_dict["message"] = (
83
- f"Missing 'text' field was found on line {idx+1} of the the input file. "
110
+ f"Missing 'text' field was found on line {idx + 1} of the the input file. "
84
111
  "Expected format: {'text': 'my sample string'}. "
85
112
  )
86
113
  report_dict["is_check_passed"] = False
@@ -89,7 +116,7 @@ def check_file(
89
116
  if not isinstance(json_line["text"], str):
90
117
  report_dict["key_value"] = False
91
118
  report_dict["message"] = (
92
- f'Invalid value type for "text" key on line {idx+1}. '
119
+ f'Invalid value type for "text" key on line {idx + 1}. '
93
120
  f'Expected string. Found {type(json_line["text"])}.'
94
121
  )
95
122
 
@@ -99,7 +126,7 @@ def check_file(
99
126
  if idx + 1 < MIN_SAMPLES:
100
127
  report_dict["min_samples"] = False
101
128
  report_dict["message"] = (
102
- f"Processing {file} resulted in only {idx+1} samples. "
129
+ f"Processing {file} resulted in only {idx + 1} samples. "
103
130
  f"Our minimum is {MIN_SAMPLES} samples. "
104
131
  )
105
132
  report_dict["is_check_passed"] = False
@@ -118,7 +145,7 @@ def check_file(
118
145
  )
119
146
  else:
120
147
  report_dict["message"] = (
121
- f"Error parsing json payload. Unexpected format on line {idx+1}."
148
+ f"Error parsing json payload. Unexpected format on line {idx + 1}."
122
149
  )
123
150
  report_dict["is_check_passed"] = False
124
151
 
@@ -128,5 +155,50 @@ def check_file(
128
155
  report_dict["line_type"] = True
129
156
  if report_dict["key_value"] is not False:
130
157
  report_dict["key_value"] = True
158
+ return report_dict
159
+
160
+
161
def _check_parquet(file: Path) -> Dict[str, Any]:
    """Validate a Parquet file and describe the outcome in a report dict.

    The file is loaded with ``parquet.read_table`` (memory-mapped) and then
    checked, in order, for:

    1. being loadable at all (``ArrowInvalid`` is reported, not raised);
    2. containing an ``input_ids`` column;
    3. containing no column outside ``PARQUET_EXPECTED_COLUMNS``;
    4. holding at least ``MIN_SAMPLES`` rows.

    Args:
        file: Path of the Parquet file to inspect.

    Returns:
        A dict that always carries ``is_check_passed``. On failure it also
        holds a ``load_parquet`` or ``min_samples`` entry with the reason;
        on success it holds ``num_samples`` with the row count.
    """
    outcome: Dict[str, Any] = {}

    def _reject(key: str, reason: str) -> Dict[str, Any]:
        # Store the reason before the verdict so the key order of the
        # returned dict stays the same for every failure path.
        outcome[key] = reason
        outcome["is_check_passed"] = False
        return outcome

    try:
        table = parquet.read_table(str(file), memory_map=True)
    except ArrowInvalid:
        # format_exc() is evaluated here, while the exception is still
        # being handled, so the trace ends up in the report.
        return _reject(
            "load_parquet",
            f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
            f"Exception trace:\n{format_exc()}",
        )

    present_columns = table.schema.names
    if "input_ids" not in present_columns:
        return _reject(
            "load_parquet",
            f"Parquet file {file} does not contain the `input_ids` column.",
        )

    # Only the first offending column (in schema order) is reported.
    unexpected = [
        name for name in present_columns if name not in PARQUET_EXPECTED_COLUMNS
    ]
    if unexpected:
        return _reject(
            "load_parquet",
            f"Parquet file {file} contains an unexpected column {unexpected[0]}. "
            f"Only columns {PARQUET_EXPECTED_COLUMNS} are supported.",
        )

    num_samples = len(table)
    if num_samples < MIN_SAMPLES:
        return _reject(
            "min_samples",
            f"Processing {file} resulted in only {num_samples} samples. "
            f"Our minimum is {MIN_SAMPLES} samples. ",
        )

    outcome["num_samples"] = num_samples
    outcome["is_check_passed"] = True

    return outcome
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: together
3
- Version: 1.0.1
3
+ Version: 1.1.0
4
4
  Summary: Python client for Together's Cloud Platform!
5
5
  Home-page: https://github.com/togethercomputer/together-python
6
6
  License: Apache-2.0
@@ -19,7 +19,10 @@ Requires-Dist: aiohttp (>=3.9.3,<4.0.0)
19
19
  Requires-Dist: click (>=8.1.7,<9.0.0)
20
20
  Requires-Dist: eval-type-backport (>=0.1.3,<0.2.0)
21
21
  Requires-Dist: filelock (>=3.13.1,<4.0.0)
22
+ Requires-Dist: numpy (>=1.23.5) ; python_version < "3.12"
23
+ Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
22
24
  Requires-Dist: pillow (>=10.3.0,<11.0.0)
25
+ Requires-Dist: pyarrow (>=10.0.1)
23
26
  Requires-Dist: pydantic (>=2.6.3,<3.0.0)
24
27
  Requires-Dist: requests (>=2.31.0,<3.0.0)
25
28
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
@@ -11,9 +11,9 @@ together/cli/api/images.py,sha256=01dFYa2sK1HqUwVCD9FlwcjqkYWLoNxFZkzok13EriE,23
11
11
  together/cli/api/models.py,sha256=xWEzu8ZpxM_Pz9KEjRPRVuv_v22RayYZ4QcgiezT5tE,1126
12
12
  together/cli/cli.py,sha256=RC0tgapkSOFjsRPg8p-8dx9D2LDzm8YmVCHUjk_aVyQ,1977
13
13
  together/client.py,sha256=7QT5lwn7-QGf0vtgbhslQujm4986CgfE2spyMYP4JyU,4774
14
- together/constants.py,sha256=YxaViLyvBwch6eKC4FlebIRniHl4gMfav9g6PG73Eyk,801
14
+ together/constants.py,sha256=WHe6JA9TliwgErkCnovWPS9w9xXfA3X5PtKJv_y2JxQ,908
15
15
  together/error.py,sha256=cILzwDde18INNFYgbYdgvVfOhEjRZM9sg9I8Rl6nc_Y,5329
16
- together/filemanager.py,sha256=q518tL5lcqSqGmwa2GzpO6cGn1Yazyzp7h0y_TATaE0,10916
16
+ together/filemanager.py,sha256=tBr9LOqiv6G63rH7HYePKeKhyylJXfZFHW5Aybk8UUw,11438
17
17
  together/legacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  together/legacy/base.py,sha256=ehrX1SCfRbK5OA83wL1q7-tfF-yuZOUxzjxYfFtdvvQ,727
19
19
  together/legacy/complete.py,sha256=8itEMlyuora2EIO-b-tvjvpHfKhxckDEiqaN-z6ccQs,2343
@@ -32,25 +32,25 @@ together/resources/finetune.py,sha256=gxkVOHuBgv19K8EBxhzyrXuJvTVe5b5FHofM9RHHlK
32
32
  together/resources/images.py,sha256=gFzXy7gLzr20KsXJXHEsOZtJJpuR6pH1d8XHCd6Dl9E,4775
33
33
  together/resources/models.py,sha256=2dtHhXAqTDOOpwSbYLzWcKTC0-m2Szlb7LDYvp7Jr4w,1786
34
34
  together/together_response.py,sha256=MhczUCPem93cjX-A1TOAUrRj3sO-o3SLcEcTsZgVzQI,1319
35
- together/types/__init__.py,sha256=L5-JgDRcczrriRYjRb0HzhXPsIgXcMdyjTDyztbfclk,1402
35
+ together/types/__init__.py,sha256=K7Gv6hLmobIfqfmijZbZwFrwxK_YKIDR_v94_ElFmVA,1432
36
36
  together/types/abstract.py,sha256=1lFQI_3WjsR_t1128AeKW0aTk6EiM6Gh1J3ZuyLLPao,642
37
37
  together/types/chat_completions.py,sha256=WoIzd21wXLwen7AIM0Ttq9KFl4GsPOaJs-UMu2Bs110,3630
38
38
  together/types/common.py,sha256=0evjGduXV_tStd0TkGNwvU4fDjfLGUOuUJz6Vl5KYbs,1491
39
39
  together/types/completions.py,sha256=_ll3jCY7OhEJP9md_x2FrmPNGE4P_Nx44klIQ2BrfEc,2173
40
40
  together/types/embeddings.py,sha256=J7grkYYn7xhqeKaBO2T-8XQRtHhkzYzymovtGdIUK5A,751
41
41
  together/types/error.py,sha256=OVlCs3cx_2WhZK4JzHT8SQyRIIqKOP1AZQ4y1PydjAE,370
42
- together/types/files.py,sha256=m2jm6aPPdQ6ieMk2PUKozvFeMSKtWQ9vZugYoIcCaac,1934
42
+ together/types/files.py,sha256=-rEUfsV6f2vZB9NrFxT4_933ubsDIUNkPB-3OlOFk4A,1954
43
43
  together/types/finetune.py,sha256=Td7zEk7ePcR9FTUwiGKl01mT-uRB6R5xzqQM0qmblX4,5779
44
44
  together/types/images.py,sha256=zX4Vt38tFDKU6yGb_hBY_N5eSTn3KPdpP5Ce_qnRHXQ,915
45
45
  together/types/models.py,sha256=3zag9x2fus2McNmLkr3fKzQL-2RNIT1-tiabI-z21p8,1013
46
46
  together/utils/__init__.py,sha256=VpjeRTya1m5eEE-Qe1zYTFsNAvuEA-dy7M2eG9Xu4fc,662
47
47
  together/utils/_log.py,sha256=yzdOV6iBEsyqF8UVvKhZm-ATtRokm34V-dXjTv3WKdE,1665
48
48
  together/utils/api_helpers.py,sha256=RSF7SRhbjHzroMOSWAXscflByM1r1ta_1SpxkAT22iE,2407
49
- together/utils/files.py,sha256=Fn4wNbv2zJPV7AvtZSiCJ8sN7g6xs_PAugwR5Q_G_kY,4835
49
+ together/utils/files.py,sha256=rH407SdONtdqQAUJAFwhTJiLgiLaU5MldvCqsjPfMQ4,7186
50
50
  together/utils/tools.py,sha256=3-lXWP3cBCzOVSZg9tr5zOT1jaVeKAKVWxO2fcXZTh8,1788
51
51
  together/version.py,sha256=p03ivHyE0SyWU4jAnRTBi_sOwywVWoZPU4g2gzRgG-Y,126
52
- together-1.0.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
53
- together-1.0.1.dist-info/METADATA,sha256=J13nSD5FoJQCZOujerIxYDxQD73uTbhuhoE-llSqf84,10963
54
- together-1.0.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
- together-1.0.1.dist-info/entry_points.txt,sha256=G-b5NKW6lUUf1V1fH8IPTBb7jXnK7lhbX9H1zTEJXPs,50
56
- together-1.0.1.dist-info/RECORD,,
52
+ together-1.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
53
+ together-1.1.0.dist-info/METADATA,sha256=Y497GBOVVNallmm3tmuhM6qxa2slJIIeQuBLoyCjyN4,11114
54
+ together-1.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
+ together-1.1.0.dist-info/entry_points.txt,sha256=G-b5NKW6lUUf1V1fH8IPTBb7jXnK7lhbX9H1zTEJXPs,50
56
+ together-1.1.0.dist-info/RECORD,,