together 1.5.17__py3-none-any.whl → 2.0.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. together/__init__.py +101 -63
  2. together/_base_client.py +1995 -0
  3. together/_client.py +1033 -0
  4. together/_compat.py +219 -0
  5. together/_constants.py +14 -0
  6. together/_exceptions.py +108 -0
  7. together/_files.py +123 -0
  8. together/_models.py +857 -0
  9. together/_qs.py +150 -0
  10. together/_resource.py +43 -0
  11. together/_response.py +830 -0
  12. together/_streaming.py +370 -0
  13. together/_types.py +260 -0
  14. together/_utils/__init__.py +64 -0
  15. together/_utils/_compat.py +45 -0
  16. together/_utils/_datetime_parse.py +136 -0
  17. together/_utils/_logs.py +25 -0
  18. together/_utils/_proxy.py +65 -0
  19. together/_utils/_reflection.py +42 -0
  20. together/_utils/_resources_proxy.py +24 -0
  21. together/_utils/_streams.py +12 -0
  22. together/_utils/_sync.py +58 -0
  23. together/_utils/_transform.py +457 -0
  24. together/_utils/_typing.py +156 -0
  25. together/_utils/_utils.py +421 -0
  26. together/_version.py +4 -0
  27. together/lib/.keep +4 -0
  28. together/lib/__init__.py +23 -0
  29. together/{cli → lib/cli}/api/endpoints.py +108 -75
  30. together/lib/cli/api/evals.py +588 -0
  31. together/{cli → lib/cli}/api/files.py +20 -17
  32. together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +161 -120
  33. together/lib/cli/api/models.py +140 -0
  34. together/{cli → lib/cli}/api/utils.py +6 -7
  35. together/{cli → lib/cli}/cli.py +16 -24
  36. together/{constants.py → lib/constants.py} +17 -12
  37. together/lib/resources/__init__.py +11 -0
  38. together/lib/resources/files.py +999 -0
  39. together/lib/resources/fine_tuning.py +280 -0
  40. together/lib/resources/models.py +35 -0
  41. together/lib/types/__init__.py +13 -0
  42. together/lib/types/error.py +9 -0
  43. together/lib/types/fine_tuning.py +455 -0
  44. together/{utils → lib/utils}/__init__.py +6 -14
  45. together/{utils → lib/utils}/_log.py +11 -16
  46. together/lib/utils/files.py +628 -0
  47. together/lib/utils/serializer.py +10 -0
  48. together/{utils → lib/utils}/tools.py +19 -55
  49. together/resources/__init__.py +225 -33
  50. together/resources/audio/__init__.py +72 -21
  51. together/resources/audio/audio.py +198 -0
  52. together/resources/audio/speech.py +574 -122
  53. together/resources/audio/transcriptions.py +282 -0
  54. together/resources/audio/translations.py +256 -0
  55. together/resources/audio/voices.py +135 -0
  56. together/resources/batches.py +417 -0
  57. together/resources/chat/__init__.py +30 -21
  58. together/resources/chat/chat.py +102 -0
  59. together/resources/chat/completions.py +1063 -263
  60. together/resources/code_interpreter/__init__.py +33 -0
  61. together/resources/code_interpreter/code_interpreter.py +258 -0
  62. together/resources/code_interpreter/sessions.py +135 -0
  63. together/resources/completions.py +884 -225
  64. together/resources/embeddings.py +172 -68
  65. together/resources/endpoints.py +598 -395
  66. together/resources/evals.py +452 -0
  67. together/resources/files.py +398 -121
  68. together/resources/fine_tuning.py +1033 -0
  69. together/resources/hardware.py +181 -0
  70. together/resources/images.py +256 -108
  71. together/resources/jobs.py +214 -0
  72. together/resources/models.py +238 -90
  73. together/resources/rerank.py +190 -92
  74. together/resources/videos.py +374 -0
  75. together/types/__init__.py +65 -109
  76. together/types/audio/__init__.py +10 -0
  77. together/types/audio/speech_create_params.py +75 -0
  78. together/types/audio/transcription_create_params.py +54 -0
  79. together/types/audio/transcription_create_response.py +111 -0
  80. together/types/audio/translation_create_params.py +40 -0
  81. together/types/audio/translation_create_response.py +70 -0
  82. together/types/audio/voice_list_response.py +23 -0
  83. together/types/audio_speech_stream_chunk.py +16 -0
  84. together/types/autoscaling.py +13 -0
  85. together/types/autoscaling_param.py +15 -0
  86. together/types/batch_create_params.py +24 -0
  87. together/types/batch_create_response.py +14 -0
  88. together/types/batch_job.py +45 -0
  89. together/types/batch_list_response.py +10 -0
  90. together/types/chat/__init__.py +18 -0
  91. together/types/chat/chat_completion.py +60 -0
  92. together/types/chat/chat_completion_chunk.py +61 -0
  93. together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
  94. together/types/chat/chat_completion_structured_message_text_param.py +13 -0
  95. together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
  96. together/types/chat/chat_completion_usage.py +13 -0
  97. together/types/chat/chat_completion_warning.py +9 -0
  98. together/types/chat/completion_create_params.py +329 -0
  99. together/types/code_interpreter/__init__.py +5 -0
  100. together/types/code_interpreter/session_list_response.py +31 -0
  101. together/types/code_interpreter_execute_params.py +45 -0
  102. together/types/completion.py +42 -0
  103. together/types/completion_chunk.py +66 -0
  104. together/types/completion_create_params.py +138 -0
  105. together/types/dedicated_endpoint.py +44 -0
  106. together/types/embedding.py +24 -0
  107. together/types/embedding_create_params.py +31 -0
  108. together/types/endpoint_create_params.py +43 -0
  109. together/types/endpoint_list_avzones_response.py +11 -0
  110. together/types/endpoint_list_params.py +18 -0
  111. together/types/endpoint_list_response.py +41 -0
  112. together/types/endpoint_update_params.py +27 -0
  113. together/types/eval_create_params.py +263 -0
  114. together/types/eval_create_response.py +16 -0
  115. together/types/eval_list_params.py +21 -0
  116. together/types/eval_list_response.py +10 -0
  117. together/types/eval_status_response.py +100 -0
  118. together/types/evaluation_job.py +139 -0
  119. together/types/execute_response.py +108 -0
  120. together/types/file_delete_response.py +13 -0
  121. together/types/file_list.py +12 -0
  122. together/types/file_purpose.py +9 -0
  123. together/types/file_response.py +31 -0
  124. together/types/file_type.py +7 -0
  125. together/types/fine_tuning_cancel_response.py +194 -0
  126. together/types/fine_tuning_content_params.py +24 -0
  127. together/types/fine_tuning_delete_params.py +11 -0
  128. together/types/fine_tuning_delete_response.py +12 -0
  129. together/types/fine_tuning_list_checkpoints_response.py +21 -0
  130. together/types/fine_tuning_list_events_response.py +12 -0
  131. together/types/fine_tuning_list_response.py +199 -0
  132. together/types/finetune_event.py +41 -0
  133. together/types/finetune_event_type.py +33 -0
  134. together/types/finetune_response.py +177 -0
  135. together/types/hardware_list_params.py +16 -0
  136. together/types/hardware_list_response.py +58 -0
  137. together/types/image_data_b64.py +15 -0
  138. together/types/image_data_url.py +15 -0
  139. together/types/image_file.py +23 -0
  140. together/types/image_generate_params.py +85 -0
  141. together/types/job_list_response.py +47 -0
  142. together/types/job_retrieve_response.py +43 -0
  143. together/types/log_probs.py +18 -0
  144. together/types/model_list_response.py +10 -0
  145. together/types/model_object.py +42 -0
  146. together/types/model_upload_params.py +36 -0
  147. together/types/model_upload_response.py +23 -0
  148. together/types/rerank_create_params.py +36 -0
  149. together/types/rerank_create_response.py +36 -0
  150. together/types/tool_choice.py +23 -0
  151. together/types/tool_choice_param.py +23 -0
  152. together/types/tools_param.py +23 -0
  153. together/types/training_method_dpo.py +22 -0
  154. together/types/training_method_sft.py +18 -0
  155. together/types/video_create_params.py +86 -0
  156. together/types/video_job.py +57 -0
  157. together-2.0.0a8.dist-info/METADATA +680 -0
  158. together-2.0.0a8.dist-info/RECORD +164 -0
  159. {together-1.5.17.dist-info → together-2.0.0a8.dist-info}/WHEEL +1 -1
  160. together-2.0.0a8.dist-info/entry_points.txt +2 -0
  161. {together-1.5.17.dist-info → together-2.0.0a8.dist-info/licenses}/LICENSE +1 -1
  162. together/abstract/api_requestor.py +0 -729
  163. together/cli/api/chat.py +0 -276
  164. together/cli/api/completions.py +0 -119
  165. together/cli/api/images.py +0 -93
  166. together/cli/api/models.py +0 -55
  167. together/client.py +0 -176
  168. together/error.py +0 -194
  169. together/filemanager.py +0 -389
  170. together/legacy/__init__.py +0 -0
  171. together/legacy/base.py +0 -27
  172. together/legacy/complete.py +0 -93
  173. together/legacy/embeddings.py +0 -27
  174. together/legacy/files.py +0 -146
  175. together/legacy/finetune.py +0 -177
  176. together/legacy/images.py +0 -27
  177. together/legacy/models.py +0 -44
  178. together/resources/batch.py +0 -136
  179. together/resources/code_interpreter.py +0 -82
  180. together/resources/finetune.py +0 -1064
  181. together/together_response.py +0 -50
  182. together/types/abstract.py +0 -26
  183. together/types/audio_speech.py +0 -110
  184. together/types/batch.py +0 -53
  185. together/types/chat_completions.py +0 -197
  186. together/types/code_interpreter.py +0 -57
  187. together/types/common.py +0 -66
  188. together/types/completions.py +0 -107
  189. together/types/embeddings.py +0 -35
  190. together/types/endpoints.py +0 -123
  191. together/types/error.py +0 -16
  192. together/types/files.py +0 -90
  193. together/types/finetune.py +0 -398
  194. together/types/images.py +0 -44
  195. together/types/models.py +0 -45
  196. together/types/rerank.py +0 -43
  197. together/utils/api_helpers.py +0 -124
  198. together/utils/files.py +0 -425
  199. together/version.py +0 -6
  200. together-1.5.17.dist-info/METADATA +0 -525
  201. together-1.5.17.dist-info/RECORD +0 -69
  202. together-1.5.17.dist-info/entry_points.txt +0 -3
  203. /together/{abstract → lib/cli}/__init__.py +0 -0
  204. /together/{cli → lib/cli/api}/__init__.py +0 -0
  205. /together/{cli/api/__init__.py → py.typed} +0 -0
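The file list above shows the hand-written 1.x internals (together/client.py, together/abstract/api_requestor.py, the legacy/ package) being replaced by a generated client core (together/_client.py, together/_base_client.py, together/_models.py) with one module per resource under together/resources/. Below is a minimal usage sketch of what that resource layout implies; the Together class and the chat.completions.create method are assumptions carried over from the 1.x SDK surface and the new module names, and are not confirmed by this diff.

# Sketch only: assumes the 2.0.0a8 client keeps the Together()/resource-method
# surface implied by together/_client.py and together/resources/chat/completions.py.
from together import Together

client = Together()  # assumed to read TOGETHER_API_KEY from the environment

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",  # placeholder model id
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)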
together/types/models.py DELETED
@@ -1,45 +0,0 @@
- from __future__ import annotations
-
- from enum import Enum
- from typing import Literal
-
- from together.types.abstract import BaseModel
- from together.types.common import ObjectType
-
-
- class ModelType(str, Enum):
-     CHAT = "chat"
-     LANGUAGE = "language"
-     CODE = "code"
-     IMAGE = "image"
-     EMBEDDING = "embedding"
-     MODERATION = "moderation"
-     RERANK = "rerank"
-     AUDIO = "audio"
-
-
- class PricingObject(BaseModel):
-     input: float | None = None
-     output: float | None = None
-     hourly: float | None = None
-     base: float | None = None
-     finetune: float | None = None
-
-
- class ModelObject(BaseModel):
-     # model id
-     id: str
-     # object type
-     object: Literal[ObjectType.Model]
-     created: int | None = None
-     # model type
-     type: ModelType | None = None
-     # pretty name
-     display_name: str | None = None
-     # model creator organization
-     organization: str | None = None
-     # link to model resource
-     link: str | None = None
-     license: str | None = None
-     context_length: int | None = None
-     pricing: PricingObject
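For context, the deleted model types above were plain Pydantic-style containers; their generated replacement lives in together/types/model_object.py (entry 145 in the file list). A sketch of how the deleted classes fit together, assuming the types above are imported; the model id, timestamp, and pricing numbers are placeholders.

# Sketch: populating the deleted ModelObject/PricingObject types (fields from the diff above).
model = ModelObject(
    id="meta-llama/Llama-3-8b-chat-hf",  # placeholder model id
    object=ObjectType.Model,
    created=1700000000,                  # placeholder timestamp
    type=ModelType.CHAT,
    display_name="Llama 3 8B Chat",
    organization="Meta",
    context_length=8192,
    pricing=PricingObject(input=0.2, output=0.2),  # placeholder prices
)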
together/types/rerank.py DELETED
@@ -1,43 +0,0 @@
- from __future__ import annotations
-
- from typing import List, Literal, Dict, Any
-
- from together.types.abstract import BaseModel
- from together.types.common import UsageData
-
-
- class RerankRequest(BaseModel):
-     # model to query
-     model: str
-     # input or list of inputs
-     query: str
-     # list of documents
-     documents: List[str] | List[Dict[str, Any]]
-     # return top_n results
-     top_n: int | None = None
-     # boolean to return documents
-     return_documents: bool = False
-     # field selector for documents
-     rank_fields: List[str] | None = None
-
-
- class RerankChoicesData(BaseModel):
-     # response index
-     index: int
-     # object type
-     relevance_score: float
-     # rerank response
-     document: Dict[str, Any] | None = None
-
-
- class RerankResponse(BaseModel):
-     # job id
-     id: str | None = None
-     # object type
-     object: Literal["rerank"] | None = None
-     # query model
-     model: str | None = None
-     # list of reranked results
-     results: List[RerankChoicesData] | None = None
-     # usage stats
-     usage: UsageData | None = None
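Similarly, the deleted rerank request/response models above are replaced by generated param/response types (together/types/rerank_create_params.py and rerank_create_response.py, entries 148 and 149 in the file list). A sketch of how the deleted RerankRequest was built; the model id, query, and documents are placeholders.

# Sketch: building the deleted RerankRequest type (fields from the diff above).
request = RerankRequest(
    model="Salesforce/Llama-Rank-V1",  # placeholder rerank model id
    query="What is the capital of France?",
    documents=["Paris is the capital of France.", "Berlin is the capital of Germany."],
    top_n=1,
    return_documents=True,
)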
together/utils/api_helpers.py DELETED
@@ -1,124 +0,0 @@
- from __future__ import annotations
-
- import json
- import os
- import sys
- import platform
- from typing import TYPE_CHECKING, Any, Dict
-
-
- if TYPE_CHECKING:
-     from _typeshed import SupportsKeysAndGetItem
-
- import together
- from together import error
- from together.utils._log import _console_log_level
- from together.utils import log_info
-
-
- def get_headers(
-     method: str | None = None,
-     api_key: str | None = None,
-     extra: "SupportsKeysAndGetItem[str, Any] | None" = None,
- ) -> Dict[str, str]:
-     """
-     Generates request headers with API key, metadata, and supplied headers
-
-     Args:
-         method (str, optional): HTTP request type (POST, GET, etc.)
-             Defaults to None.
-         api_key (str, optional): API key to add as an Authorization header.
-             Defaults to None.
-         extra (SupportsKeysAndGetItem[str, Any], optional): Additional headers to add to request.
-             Defaults to None.
-
-     Returns:
-         headers (Dict[str, str]): Compiled headers from data
-     """
-
-     user_agent = "Together/v1 PythonBindings/%s" % (together.version,)
-
-     uname_without_node = " ".join(
-         v for k, v in platform.uname()._asdict().items() if k != "node"
-     )
-     ua = {
-         "bindings_version": together.version,
-         "httplib": "requests",
-         "lang": "python",
-         "lang_version": platform.python_version(),
-         "platform": platform.platform(),
-         "publisher": "together",
-         "uname": uname_without_node,
-     }
-
-     headers: Dict[str, Any] = {
-         "X-Together-Client-User-Agent": json.dumps(ua),
-         "Authorization": f"Bearer {default_api_key(api_key)}",
-         "User-Agent": user_agent,
-     }
-
-     if _console_log_level():
-         headers["Together-Debug"] = _console_log_level()
-     if extra:
-         headers.update(extra)
-
-     return headers
-
-
- def default_api_key(api_key: str | None = None) -> str | None:
-     """
-     API key fallback logic from input argument and environment variable
-
-     Args:
-         api_key (str, optional): Supplied API key. This argument takes priority over env var
-
-     Returns:
-         together_api_key (str): Returns API key from supplied input or env var
-
-     Raises:
-         together.error.AuthenticationError: if API key not found
-     """
-     if api_key:
-         return api_key
-     if os.environ.get("TOGETHER_API_KEY"):
-         return os.environ.get("TOGETHER_API_KEY")
-
-     raise error.AuthenticationError(together.constants.MISSING_API_KEY_MESSAGE)
-
-
- def get_google_colab_secret(secret_name: str = "TOGETHER_API_KEY") -> str | None:
-     """
-     Checks to see if the user is running in Google Colab, and looks for the Together API Key secret.
-
-     Args:
-         secret_name (str, optional). Defaults to TOGETHER_API_KEY
-
-     Returns:
-         str: if the API key is found; None if an error occurred or the secret was not found.
-     """
-     # If running in Google Colab, check for Together in notebook secrets
-     if "google.colab" in sys.modules:
-         if TYPE_CHECKING:
-             from google.colab import userdata  # type: ignore
-         else:
-             from google.colab import userdata
-
-         try:
-             api_key = userdata.get(secret_name)
-             if not isinstance(api_key, str):
-                 return None
-             else:
-                 return str(api_key)
-         except userdata.NotebookAccessError:
-             log_info(
-                 "The TOGETHER_API_KEY Colab secret was found, but notebook access is disabled. Please enable notebook "
-                 "access for the secret."
-             )
-         except userdata.SecretNotFoundError:
-             # warn and carry on
-             log_info("Colab: No Google Colab secret named TOGETHER_API_KEY was found.")
-
-         return None
-
-     else:
-         return None
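The removed helpers above carried the 1.x auth behavior: an explicit api_key argument wins, then the TOGETHER_API_KEY environment variable, otherwise an AuthenticationError is raised; get_headers then folded the resolved key into the Authorization header alongside client metadata. A small sketch of that fallback, using the deleted function name and a placeholder key value.

# Sketch of the deleted default_api_key fallback order (together/utils/api_helpers.py).
import os

os.environ["TOGETHER_API_KEY"] = "tok-placeholder"  # placeholder value for illustration

key = default_api_key()           # -> "tok-placeholder", taken from the environment
key = default_api_key("tok-arg")  # -> "tok-arg", explicit argument takes priority
# With neither the argument nor the env var set, the deleted code raised
# together.error.AuthenticationError(together.constants.MISSING_API_KEY_MESSAGE).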
together/utils/files.py DELETED
@@ -1,425 +0,0 @@
- from __future__ import annotations
-
- import json
- import os
- from pathlib import Path
- from traceback import format_exc
- from typing import Any, Dict, List
-
-
- from together.constants import (
-     MAX_FILE_SIZE_GB,
-     MIN_SAMPLES,
-     NUM_BYTES_IN_GB,
-     PARQUET_EXPECTED_COLUMNS,
-     JSONL_REQUIRED_COLUMNS_MAP,
-     REQUIRED_COLUMNS_MESSAGE,
-     POSSIBLE_ROLES_CONVERSATION,
-     DatasetFormat,
- )
-
-
- class InvalidFileFormatError(ValueError):
-     """Exception raised for invalid file formats during file checks."""
-
-     def __init__(
-         self,
-         message: str = "",
-         line_number: int | None = None,
-         error_source: str | None = None,
-     ) -> None:
-         super().__init__(message)
-         self.message = message
-         self.line_number = line_number
-         self.error_source = error_source
-
-
- def check_file(
-     file: Path | str,
- ) -> Dict[str, Any]:
-     if not isinstance(file, Path):
-         file = Path(file)
-
-     report_dict = {
-         "is_check_passed": True,
-         "message": "Checks passed",
-         "found": None,
-         "file_size": None,
-         "utf8": None,
-         "line_type": None,
-         "text_field": None,
-         "key_value": None,
-         "has_min_samples": None,
-         "num_samples": None,
-         "load_json": None,
-     }
-
-     if not file.is_file():
-         report_dict["found"] = False
-         report_dict["is_check_passed"] = False
-         return report_dict
-     else:
-         report_dict["found"] = True
-
-     file_size = os.stat(file.as_posix()).st_size
-
-     if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
-         report_dict["message"] = (
-             f"Maximum supported file size is {MAX_FILE_SIZE_GB} GB. Found file with size of {round(file_size / NUM_BYTES_IN_GB ,3)} GB."
-         )
-         report_dict["is_check_passed"] = False
-     elif file_size == 0:
-         report_dict["message"] = "File is empty"
-         report_dict["file_size"] = 0
-         report_dict["is_check_passed"] = False
-         return report_dict
-     else:
-         report_dict["file_size"] = file_size
-
-     data_report_dict = {}
-     if file.suffix == ".jsonl":
-         report_dict["filetype"] = "jsonl"
-         data_report_dict = _check_jsonl(file)
-     elif file.suffix == ".parquet":
-         report_dict["filetype"] = "parquet"
-         data_report_dict = _check_parquet(file)
-     else:
-         report_dict["filetype"] = (
-             f"Unknown extension of file {file}. "
-             "Only files with extensions .jsonl and .parquet are supported."
-         )
-         report_dict["is_check_passed"] = False
-
-     report_dict.update(data_report_dict)
-
-     return report_dict
-
-
- def validate_messages(messages: List[Dict[str, str | bool]], idx: int) -> None:
-     """Validate the messages column."""
-     if not isinstance(messages, list):
-         raise InvalidFileFormatError(
-             message=f"Invalid format on line {idx + 1} of the input file. "
-             f"Expected a list of messages. Found {type(messages)}",
-             line_number=idx + 1,
-             error_source="key_value",
-         )
-     if not messages:
-         raise InvalidFileFormatError(
-             message=f"Invalid format on line {idx + 1} of the input file. "
-             f"Expected a non-empty list of messages. Found empty list",
-             line_number=idx + 1,
-             error_source="key_value",
-         )
-
-     has_weights = any("weight" in message for message in messages)
-
-     previous_role = None
-     for message in messages:
-         if not isinstance(message, dict):
-             raise InvalidFileFormatError(
-                 message=f"Invalid format on line {idx + 1} of the input file. "
-                 f"Expected a dictionary in the messages list. Found {type(message)}",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-         for column in REQUIRED_COLUMNS_MESSAGE:
-             if column not in message:
-                 raise InvalidFileFormatError(
-                     message=f"Field `{column}` is missing for a turn `{message}` on line {idx + 1} "
-                     "of the the input file.",
-                     line_number=idx + 1,
-                     error_source="key_value",
-                 )
-             else:
-                 if not isinstance(message[column], str):
-                     raise InvalidFileFormatError(
-                         message=f"Invalid format on line {idx + 1} in the column {column} for turn `{message}` "
-                         f"of the input file. Expected string. Found {type(message[column])}",
-                         line_number=idx + 1,
-                         error_source="text_field",
-                     )
-
-         if has_weights and "weight" in message:
-             weight = message["weight"]
-             if not isinstance(weight, int):
-                 raise InvalidFileFormatError(
-                     message="Weight must be an integer",
-                     line_number=idx + 1,
-                     error_source="key_value",
-                 )
-             if weight not in {0, 1}:
-                 raise InvalidFileFormatError(
-                     message="Weight must be either 0 or 1",
-                     line_number=idx + 1,
-                     error_source="key_value",
-                 )
-         if message["role"] not in POSSIBLE_ROLES_CONVERSATION:
-             raise InvalidFileFormatError(
-                 message=f"Found invalid role `{message['role']}` in the messages on the line {idx + 1}. "
-                 f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-
-         if previous_role == message["role"]:
-             raise InvalidFileFormatError(
-                 message=f"Invalid role turns on line {idx + 1} of the input file. "
-                 "`user` and `assistant` roles must alternate user/assistant/user/assistant/...",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-         previous_role = message["role"]
-
-
- def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
-     """Validate the OpenAI preference dataset format.
-
-     Args:
-         example (dict): Input entry to be checked.
-         idx (int): Line number in the file.
-
-     Raises:
-         InvalidFileFormatError: If the dataset format is invalid.
-     """
-     if not isinstance(example["input"], dict):
-         raise InvalidFileFormatError(
-             message="The dataset is malformed, the `input` field must be a dictionary.",
-             line_number=idx + 1,
-             error_source="key_value",
-         )
-
-     if "messages" not in example["input"]:
-         raise InvalidFileFormatError(
-             message="The dataset is malformed, the `input` dictionary must contain a `messages` field.",
-             line_number=idx + 1,
-             error_source="key_value",
-         )
-
-     validate_messages(example["input"]["messages"], idx)
-
-     for output_field in ["preferred_output", "non_preferred_output"]:
-         if not isinstance(example[output_field], list):
-             raise InvalidFileFormatError(
-                 message=f"The dataset is malformed, the `{output_field}` field must be a list.",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-
-         if len(example[output_field]) != 1:
-             raise InvalidFileFormatError(
-                 message=f"The dataset is malformed, the `{output_field}` list must contain exactly one message.",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-         if "role" not in example[output_field][0]:
-             raise InvalidFileFormatError(
-                 message=f"The dataset is malformed, the `{output_field}` message is missing the `role` field.",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-         elif example[output_field][0]["role"] != "assistant":
-             raise InvalidFileFormatError(
-                 message=f"The dataset is malformed, the `{output_field}` must contain an assistant message.",
-                 line_number=idx + 1,
-                 error_source="key_value",
-             )
-
-     validate_messages(example["preferred_output"], idx)
-     validate_messages(example["non_preferred_output"], idx)
-
-
- def _check_jsonl(file: Path) -> Dict[str, Any]:
-     report_dict: Dict[str, Any] = {}
-     # Check that the file is UTF-8 encoded. If not report where the error occurs.
-     try:
-         with file.open(encoding="utf-8") as f:
-             f.read()
-         report_dict["utf8"] = True
-     except UnicodeDecodeError as e:
-         report_dict["utf8"] = False
-         report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
-         report_dict["is_check_passed"] = False
-         return report_dict
-
-     dataset_format = None
-     with file.open() as f:
-         idx = -1
-         try:
-             for idx, line in enumerate(f):
-                 json_line = json.loads(line)
-
-                 if not isinstance(json_line, dict):
-                     raise InvalidFileFormatError(
-                         message=(
-                             f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
-                             "Datasets must follow text, conversational, or instruction format. For more"
-                             "information, see https://docs.together.ai/docs/fine-tuning-data-preparation"
-                         ),
-                         line_number=idx + 1,
-                         error_source="line_type",
-                     )
-
-                 current_format = None
-                 for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                     if all(
-                         column in json_line
-                         for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                     ):
-                         if current_format is None:
-                             current_format = possible_format
-                         elif current_format != possible_format:
-                             raise InvalidFileFormatError(
-                                 message="Found multiple dataset formats in the input file. "
-                                 f"Got {current_format} and {possible_format} on line {idx + 1}.",
-                                 line_number=idx + 1,
-                                 error_source="format",
-                             )
-
-                         # Check that there are no extra columns
-                         for column in json_line:
-                             if (
-                                 column
-                                 not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                             ):
-                                 raise InvalidFileFormatError(
-                                     message=f'Found extra column "{column}" in the line {idx + 1}.',
-                                     line_number=idx + 1,
-                                     error_source="format",
-                                 )
-
-                 if current_format is None:
-                     raise InvalidFileFormatError(
-                         message=(
-                             f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
-                             f"{json_line.keys()}"
-                         ),
-                         line_number=idx + 1,
-                         error_source="format",
-                     )
-                 if current_format == DatasetFormat.PREFERENCE_OPENAI:
-                     validate_preference_openai(json_line, idx)
-                 elif current_format == DatasetFormat.CONVERSATION:
-                     message_column = JSONL_REQUIRED_COLUMNS_MAP[
-                         DatasetFormat.CONVERSATION
-                     ][0]
-                     validate_messages(json_line[message_column], idx)
-                 else:
-                     for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
-                         if not isinstance(json_line[column], str):
-                             raise InvalidFileFormatError(
-                                 message=f'Invalid value type for "{column}" key on line {idx + 1}. '
-                                 f"Expected string. Found {type(json_line[column])}.",
-                                 line_number=idx + 1,
-                                 error_source="key_value",
-                             )
-
-                 if dataset_format is None:
-                     dataset_format = current_format
-                 elif current_format is not None:
-                     if current_format != dataset_format:
-                         raise InvalidFileFormatError(
-                             message="All samples in the dataset must have the same dataset format. "
-                             f"Got {dataset_format} for the first line and {current_format} "
-                             f"for the line {idx + 1}.",
-                             line_number=idx + 1,
-                             error_source="format",
-                         )
-
-             if idx + 1 < MIN_SAMPLES:
-                 report_dict["has_min_samples"] = False
-                 report_dict["message"] = (
-                     f"Processing {file} resulted in only {idx + 1} samples. "
-                     f"Our minimum is {MIN_SAMPLES} samples. "
-                 )
-                 report_dict["is_check_passed"] = False
-             else:
-                 report_dict["num_samples"] = idx + 1
-                 report_dict["has_min_samples"] = True
-                 report_dict["is_check_passed"] = True
-
-             report_dict["load_json"] = True
-
-         except InvalidFileFormatError as e:
-             report_dict["load_json"] = False
-             report_dict["is_check_passed"] = False
-             report_dict["message"] = e.message
-             if e.line_number is not None:
-                 report_dict["line_number"] = e.line_number
-             if e.error_source is not None:
-                 report_dict[e.error_source] = False
-         except ValueError:
-             report_dict["load_json"] = False
-             if idx < 0:
-                 report_dict["message"] = (
-                     "Unable to decode file. "
-                     "File may be empty or in an unsupported format. "
-                 )
-             else:
-                 report_dict["message"] = (
-                     f"Error parsing json payload. Unexpected format on line {idx + 1}."
-                 )
-             report_dict["is_check_passed"] = False
-
-     if "text_field" not in report_dict:
-         report_dict["text_field"] = True
-     if "line_type" not in report_dict:
-         report_dict["line_type"] = True
-     if "key_value" not in report_dict:
-         report_dict["key_value"] = True
-     return report_dict
-
-
- def _check_parquet(file: Path) -> Dict[str, Any]:
-     try:
-         # Pyarrow is optional as it's large (~80MB) and isn't compatible with older systems.
-         from pyarrow import ArrowInvalid, parquet
-     except ImportError:
-         raise ImportError(
-             "pyarrow is not installed and is required to use parquet files. Please install it via `pip install together[pyarrow]`"
-         )
-
-     report_dict: Dict[str, Any] = {}
-
-     try:
-         table = parquet.read_table(str(file), memory_map=True)
-     except ArrowInvalid:
-         report_dict["load_parquet"] = (
-             f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
-             f"Exception trace:\n{format_exc()}"
-         )
-         report_dict["is_check_passed"] = False
-         return report_dict
-
-     column_names = table.schema.names
-     if "input_ids" not in column_names:
-         report_dict["load_parquet"] = (
-             f"Parquet file {file} does not contain the `input_ids` column."
-         )
-         report_dict["is_check_passed"] = False
-         return report_dict
-
-     for column_name in column_names:
-         if column_name not in PARQUET_EXPECTED_COLUMNS:
-             report_dict["load_parquet"] = (
-                 f"Parquet file {file} contains an unexpected column {column_name}. "
-                 f"Only columns {PARQUET_EXPECTED_COLUMNS} are supported."
-             )
-             report_dict["is_check_passed"] = False
-             return report_dict
-
-     num_samples = len(table)
-     if num_samples < MIN_SAMPLES:
-         report_dict["has_min_samples"] = False
-         report_dict["message"] = (
-             f"Processing {file} resulted in only {num_samples} samples. "
-             f"Our minimum is {MIN_SAMPLES} samples. "
-         )
-         report_dict["is_check_passed"] = False
-         return report_dict
-     else:
-         report_dict["num_samples"] = num_samples
-
-     report_dict["is_check_passed"] = True
-
-     return report_dict
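The deleted validator above accepted .jsonl and .parquet training files: it checked size and UTF-8 encoding, detected the dataset format from the required columns, and for conversational data required each line to be an object holding a list of {role, content} turns with roles drawn from POSSIBLE_ROLES_CONVERSATION, alternating user/assistant turns, and an optional integer weight of 0 or 1. A sketch of one line that satisfies those per-line checks; the top-level "messages" key is assumed to be the conversational column defined in the deleted JSONL_REQUIRED_COLUMNS_MAP.

# Sketch: writing one conversational sample in the shape the deleted check_file accepted.
import json

sample = {
    "messages": [
        {"role": "user", "content": "What does check_file validate?", "weight": 1},
        {"role": "assistant", "content": "Size, encoding, format, and turn order.", "weight": 1},
    ]
}
with open("train.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(sample) + "\n")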
together/version.py DELETED
@@ -1,6 +0,0 @@
- import importlib.metadata
-
-
- VERSION = importlib.metadata.version(
-     "together"
- ) # gets version number from pyproject.toml