together 1.5.17__py3-none-any.whl → 2.0.0a8__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.
- together/__init__.py +101 -63
- together/_base_client.py +1995 -0
- together/_client.py +1033 -0
- together/_compat.py +219 -0
- together/_constants.py +14 -0
- together/_exceptions.py +108 -0
- together/_files.py +123 -0
- together/_models.py +857 -0
- together/_qs.py +150 -0
- together/_resource.py +43 -0
- together/_response.py +830 -0
- together/_streaming.py +370 -0
- together/_types.py +260 -0
- together/_utils/__init__.py +64 -0
- together/_utils/_compat.py +45 -0
- together/_utils/_datetime_parse.py +136 -0
- together/_utils/_logs.py +25 -0
- together/_utils/_proxy.py +65 -0
- together/_utils/_reflection.py +42 -0
- together/_utils/_resources_proxy.py +24 -0
- together/_utils/_streams.py +12 -0
- together/_utils/_sync.py +58 -0
- together/_utils/_transform.py +457 -0
- together/_utils/_typing.py +156 -0
- together/_utils/_utils.py +421 -0
- together/_version.py +4 -0
- together/lib/.keep +4 -0
- together/lib/__init__.py +23 -0
- together/{cli → lib/cli}/api/endpoints.py +108 -75
- together/lib/cli/api/evals.py +588 -0
- together/{cli → lib/cli}/api/files.py +20 -17
- together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +161 -120
- together/lib/cli/api/models.py +140 -0
- together/{cli → lib/cli}/api/utils.py +6 -7
- together/{cli → lib/cli}/cli.py +16 -24
- together/{constants.py → lib/constants.py} +17 -12
- together/lib/resources/__init__.py +11 -0
- together/lib/resources/files.py +999 -0
- together/lib/resources/fine_tuning.py +280 -0
- together/lib/resources/models.py +35 -0
- together/lib/types/__init__.py +13 -0
- together/lib/types/error.py +9 -0
- together/lib/types/fine_tuning.py +455 -0
- together/{utils → lib/utils}/__init__.py +6 -14
- together/{utils → lib/utils}/_log.py +11 -16
- together/lib/utils/files.py +628 -0
- together/lib/utils/serializer.py +10 -0
- together/{utils → lib/utils}/tools.py +19 -55
- together/resources/__init__.py +225 -33
- together/resources/audio/__init__.py +72 -21
- together/resources/audio/audio.py +198 -0
- together/resources/audio/speech.py +574 -122
- together/resources/audio/transcriptions.py +282 -0
- together/resources/audio/translations.py +256 -0
- together/resources/audio/voices.py +135 -0
- together/resources/batches.py +417 -0
- together/resources/chat/__init__.py +30 -21
- together/resources/chat/chat.py +102 -0
- together/resources/chat/completions.py +1063 -263
- together/resources/code_interpreter/__init__.py +33 -0
- together/resources/code_interpreter/code_interpreter.py +258 -0
- together/resources/code_interpreter/sessions.py +135 -0
- together/resources/completions.py +884 -225
- together/resources/embeddings.py +172 -68
- together/resources/endpoints.py +598 -395
- together/resources/evals.py +452 -0
- together/resources/files.py +398 -121
- together/resources/fine_tuning.py +1033 -0
- together/resources/hardware.py +181 -0
- together/resources/images.py +256 -108
- together/resources/jobs.py +214 -0
- together/resources/models.py +238 -90
- together/resources/rerank.py +190 -92
- together/resources/videos.py +374 -0
- together/types/__init__.py +65 -109
- together/types/audio/__init__.py +10 -0
- together/types/audio/speech_create_params.py +75 -0
- together/types/audio/transcription_create_params.py +54 -0
- together/types/audio/transcription_create_response.py +111 -0
- together/types/audio/translation_create_params.py +40 -0
- together/types/audio/translation_create_response.py +70 -0
- together/types/audio/voice_list_response.py +23 -0
- together/types/audio_speech_stream_chunk.py +16 -0
- together/types/autoscaling.py +13 -0
- together/types/autoscaling_param.py +15 -0
- together/types/batch_create_params.py +24 -0
- together/types/batch_create_response.py +14 -0
- together/types/batch_job.py +45 -0
- together/types/batch_list_response.py +10 -0
- together/types/chat/__init__.py +18 -0
- together/types/chat/chat_completion.py +60 -0
- together/types/chat/chat_completion_chunk.py +61 -0
- together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
- together/types/chat/chat_completion_structured_message_text_param.py +13 -0
- together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
- together/types/chat/chat_completion_usage.py +13 -0
- together/types/chat/chat_completion_warning.py +9 -0
- together/types/chat/completion_create_params.py +329 -0
- together/types/code_interpreter/__init__.py +5 -0
- together/types/code_interpreter/session_list_response.py +31 -0
- together/types/code_interpreter_execute_params.py +45 -0
- together/types/completion.py +42 -0
- together/types/completion_chunk.py +66 -0
- together/types/completion_create_params.py +138 -0
- together/types/dedicated_endpoint.py +44 -0
- together/types/embedding.py +24 -0
- together/types/embedding_create_params.py +31 -0
- together/types/endpoint_create_params.py +43 -0
- together/types/endpoint_list_avzones_response.py +11 -0
- together/types/endpoint_list_params.py +18 -0
- together/types/endpoint_list_response.py +41 -0
- together/types/endpoint_update_params.py +27 -0
- together/types/eval_create_params.py +263 -0
- together/types/eval_create_response.py +16 -0
- together/types/eval_list_params.py +21 -0
- together/types/eval_list_response.py +10 -0
- together/types/eval_status_response.py +100 -0
- together/types/evaluation_job.py +139 -0
- together/types/execute_response.py +108 -0
- together/types/file_delete_response.py +13 -0
- together/types/file_list.py +12 -0
- together/types/file_purpose.py +9 -0
- together/types/file_response.py +31 -0
- together/types/file_type.py +7 -0
- together/types/fine_tuning_cancel_response.py +194 -0
- together/types/fine_tuning_content_params.py +24 -0
- together/types/fine_tuning_delete_params.py +11 -0
- together/types/fine_tuning_delete_response.py +12 -0
- together/types/fine_tuning_list_checkpoints_response.py +21 -0
- together/types/fine_tuning_list_events_response.py +12 -0
- together/types/fine_tuning_list_response.py +199 -0
- together/types/finetune_event.py +41 -0
- together/types/finetune_event_type.py +33 -0
- together/types/finetune_response.py +177 -0
- together/types/hardware_list_params.py +16 -0
- together/types/hardware_list_response.py +58 -0
- together/types/image_data_b64.py +15 -0
- together/types/image_data_url.py +15 -0
- together/types/image_file.py +23 -0
- together/types/image_generate_params.py +85 -0
- together/types/job_list_response.py +47 -0
- together/types/job_retrieve_response.py +43 -0
- together/types/log_probs.py +18 -0
- together/types/model_list_response.py +10 -0
- together/types/model_object.py +42 -0
- together/types/model_upload_params.py +36 -0
- together/types/model_upload_response.py +23 -0
- together/types/rerank_create_params.py +36 -0
- together/types/rerank_create_response.py +36 -0
- together/types/tool_choice.py +23 -0
- together/types/tool_choice_param.py +23 -0
- together/types/tools_param.py +23 -0
- together/types/training_method_dpo.py +22 -0
- together/types/training_method_sft.py +18 -0
- together/types/video_create_params.py +86 -0
- together/types/video_job.py +57 -0
- together-2.0.0a8.dist-info/METADATA +680 -0
- together-2.0.0a8.dist-info/RECORD +164 -0
- {together-1.5.17.dist-info → together-2.0.0a8.dist-info}/WHEEL +1 -1
- together-2.0.0a8.dist-info/entry_points.txt +2 -0
- {together-1.5.17.dist-info → together-2.0.0a8.dist-info/licenses}/LICENSE +1 -1
- together/abstract/api_requestor.py +0 -729
- together/cli/api/chat.py +0 -276
- together/cli/api/completions.py +0 -119
- together/cli/api/images.py +0 -93
- together/cli/api/models.py +0 -55
- together/client.py +0 -176
- together/error.py +0 -194
- together/filemanager.py +0 -389
- together/legacy/__init__.py +0 -0
- together/legacy/base.py +0 -27
- together/legacy/complete.py +0 -93
- together/legacy/embeddings.py +0 -27
- together/legacy/files.py +0 -146
- together/legacy/finetune.py +0 -177
- together/legacy/images.py +0 -27
- together/legacy/models.py +0 -44
- together/resources/batch.py +0 -136
- together/resources/code_interpreter.py +0 -82
- together/resources/finetune.py +0 -1064
- together/together_response.py +0 -50
- together/types/abstract.py +0 -26
- together/types/audio_speech.py +0 -110
- together/types/batch.py +0 -53
- together/types/chat_completions.py +0 -197
- together/types/code_interpreter.py +0 -57
- together/types/common.py +0 -66
- together/types/completions.py +0 -107
- together/types/embeddings.py +0 -35
- together/types/endpoints.py +0 -123
- together/types/error.py +0 -16
- together/types/files.py +0 -90
- together/types/finetune.py +0 -398
- together/types/images.py +0 -44
- together/types/models.py +0 -45
- together/types/rerank.py +0 -43
- together/utils/api_helpers.py +0 -124
- together/utils/files.py +0 -425
- together/version.py +0 -6
- together-1.5.17.dist-info/METADATA +0 -525
- together-1.5.17.dist-info/RECORD +0 -69
- together-1.5.17.dist-info/entry_points.txt +0 -3
- /together/{abstract → lib/cli}/__init__.py +0 -0
- /together/{cli → lib/cli/api}/__init__.py +0 -0
- /together/{cli/api/__init__.py → py.typed} +0 -0
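Taken together, the listing shows a ground-up restructure: the hand-written 1.5.x core (together/client.py, together/abstract/api_requestor.py, together/error.py) is replaced by a generated client stack (together/_client.py, together/_base_client.py, together/_models.py), hand-maintained helpers move under together/lib/, and the monolithic type modules are split into one module per request/response shape. A minimal sketch of the call surface this listing suggests survives the bump, assuming the 2.0.0a8 alpha keeps the 1.5.x top-level entry point (the Together class and chat.completions.create spelling are the 1.5.x names; their 2.0 spelling is inferred from the file list, not verified):

import os
from together import Together  # 1.5.x entry point; assumed unchanged in 2.0.0a8

# The client reads TOGETHER_API_KEY from the environment if no key is passed.
client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# chat/completions.py exists in both versions, so this call shape should carry over.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo",  # any chat model id
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)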
together/types/models.py
DELETED
@@ -1,45 +0,0 @@
-from __future__ import annotations
-
-from enum import Enum
-from typing import Literal
-
-from together.types.abstract import BaseModel
-from together.types.common import ObjectType
-
-
-class ModelType(str, Enum):
-    CHAT = "chat"
-    LANGUAGE = "language"
-    CODE = "code"
-    IMAGE = "image"
-    EMBEDDING = "embedding"
-    MODERATION = "moderation"
-    RERANK = "rerank"
-    AUDIO = "audio"
-
-
-class PricingObject(BaseModel):
-    input: float | None = None
-    output: float | None = None
-    hourly: float | None = None
-    base: float | None = None
-    finetune: float | None = None
-
-
-class ModelObject(BaseModel):
-    # model id
-    id: str
-    # object type
-    object: Literal[ObjectType.Model]
-    created: int | None = None
-    # model type
-    type: ModelType | None = None
-    # pretty name
-    display_name: str | None = None
-    # model creator organization
-    organization: str | None = None
-    # link to model resource
-    link: str | None = None
-    license: str | None = None
-    context_length: int | None = None
-    pricing: PricingObject
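In 1.5.x these types backed the models resource: client.models.list() returned ModelObject instances, with pricing required and most other fields optional. Their replacement in 2.0.0a8 is the generated together/types/model_object.py. A hedged sketch of the old consumption pattern (the models.list() spelling is the 1.5.x method; field access follows the dataclass above):

from together import Together

client = Together()
for model in client.models.list():
    # ModelType subclasses str, so enum members compare equal to their values.
    if model.type == "chat":
        # pricing is the only required field beyond id/object.
        print(model.id, model.pricing.input, model.pricing.output)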
together/types/rerank.py
DELETED
@@ -1,43 +0,0 @@
-from __future__ import annotations
-
-from typing import List, Literal, Dict, Any
-
-from together.types.abstract import BaseModel
-from together.types.common import UsageData
-
-
-class RerankRequest(BaseModel):
-    # model to query
-    model: str
-    # input or list of inputs
-    query: str
-    # list of documents
-    documents: List[str] | List[Dict[str, Any]]
-    # return top_n results
-    top_n: int | None = None
-    # boolean to return documents
-    return_documents: bool = False
-    # field selector for documents
-    rank_fields: List[str] | None = None
-
-
-class RerankChoicesData(BaseModel):
-    # response index
-    index: int
-    # object type
-    relevance_score: float
-    # rerank response
-    document: Dict[str, Any] | None = None
-
-
-class RerankResponse(BaseModel):
-    # job id
-    id: str | None = None
-    # object type
-    object: Literal["rerank"] | None = None
-    # query model
-    model: str | None = None
-    # list of reranked results
-    results: List[RerankChoicesData] | None = None
-    # usage stats
-    usage: UsageData | None = None
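These fields map one-to-one onto the rerank endpoint's request body; in 2.0.0a8 the same shape reappears as together/types/rerank_create_params.py and rerank_create_response.py. A hedged sketch of a call matching RerankRequest (client.rerank.create is the 1.5.x resource method; the model id is illustrative):

from together import Together

client = Together()
response = client.rerank.create(
    model="Salesforce/Llama-Rank-V1",  # illustrative rerank model id
    query="What is the capital of France?",
    documents=["Paris is the capital of France.", "Berlin is in Germany."],
    top_n=1,
    return_documents=True,  # RerankRequest defaults this to False
)
for result in response.results or []:
    print(result.index, result.relevance_score, result.document)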
together/utils/api_helpers.py
DELETED
@@ -1,124 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-import sys
-import platform
-from typing import TYPE_CHECKING, Any, Dict
-
-
-if TYPE_CHECKING:
-    from _typeshed import SupportsKeysAndGetItem
-
-import together
-from together import error
-from together.utils._log import _console_log_level
-from together.utils import log_info
-
-
-def get_headers(
-    method: str | None = None,
-    api_key: str | None = None,
-    extra: "SupportsKeysAndGetItem[str, Any] | None" = None,
-) -> Dict[str, str]:
-    """
-    Generates request headers with API key, metadata, and supplied headers
-
-    Args:
-        method (str, optional): HTTP request type (POST, GET, etc.)
-            Defaults to None.
-        api_key (str, optional): API key to add as an Authorization header.
-            Defaults to None.
-        extra (SupportsKeysAndGetItem[str, Any], optional): Additional headers to add to request.
-            Defaults to None.
-
-    Returns:
-        headers (Dict[str, str]): Compiled headers from data
-    """
-
-    user_agent = "Together/v1 PythonBindings/%s" % (together.version,)
-
-    uname_without_node = " ".join(
-        v for k, v in platform.uname()._asdict().items() if k != "node"
-    )
-    ua = {
-        "bindings_version": together.version,
-        "httplib": "requests",
-        "lang": "python",
-        "lang_version": platform.python_version(),
-        "platform": platform.platform(),
-        "publisher": "together",
-        "uname": uname_without_node,
-    }
-
-    headers: Dict[str, Any] = {
-        "X-Together-Client-User-Agent": json.dumps(ua),
-        "Authorization": f"Bearer {default_api_key(api_key)}",
-        "User-Agent": user_agent,
-    }
-
-    if _console_log_level():
-        headers["Together-Debug"] = _console_log_level()
-    if extra:
-        headers.update(extra)
-
-    return headers
-
-
-def default_api_key(api_key: str | None = None) -> str | None:
-    """
-    API key fallback logic from input argument and environment variable
-
-    Args:
-        api_key (str, optional): Supplied API key. This argument takes priority over env var
-
-    Returns:
-        together_api_key (str): Returns API key from supplied input or env var
-
-    Raises:
-        together.error.AuthenticationError: if API key not found
-    """
-    if api_key:
-        return api_key
-    if os.environ.get("TOGETHER_API_KEY"):
-        return os.environ.get("TOGETHER_API_KEY")
-
-    raise error.AuthenticationError(together.constants.MISSING_API_KEY_MESSAGE)
-
-
-def get_google_colab_secret(secret_name: str = "TOGETHER_API_KEY") -> str | None:
-    """
-    Checks to see if the user is running in Google Colab, and looks for the Together API Key secret.
-
-    Args:
-        secret_name (str, optional). Defaults to TOGETHER_API_KEY
-
-    Returns:
-        str: if the API key is found; None if an error occurred or the secret was not found.
-    """
-    # If running in Google Colab, check for Together in notebook secrets
-    if "google.colab" in sys.modules:
-        if TYPE_CHECKING:
-            from google.colab import userdata  # type: ignore
-        else:
-            from google.colab import userdata
-
-        try:
-            api_key = userdata.get(secret_name)
-            if not isinstance(api_key, str):
-                return None
-            else:
-                return str(api_key)
-        except userdata.NotebookAccessError:
-            log_info(
-                "The TOGETHER_API_KEY Colab secret was found, but notebook access is disabled. Please enable notebook "
-                "access for the secret."
-            )
-        except userdata.SecretNotFoundError:
-            # warn and carry on
-            log_info("Colab: No Google Colab secret named TOGETHER_API_KEY was found.")
-
-        return None
-
-    else:
-        return None
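The key behavior here is the resolution order in default_api_key: an explicit argument wins, then the TOGETHER_API_KEY environment variable, and only then does it raise AuthenticationError. A standalone reproduction of that fallback order (names are local to this sketch; in 2.0.0a8 the equivalent handling lives inside the generated client):

import os


def resolve_api_key(api_key: str | None = None) -> str:
    # Mirrors default_api_key(): argument > TOGETHER_API_KEY > error.
    if api_key:
        return api_key
    env_key = os.environ.get("TOGETHER_API_KEY")
    if env_key:
        return env_key
    raise RuntimeError("No API key supplied and TOGETHER_API_KEY is not set.")


os.environ["TOGETHER_API_KEY"] = "key-from-env"
assert resolve_api_key("key-from-arg") == "key-from-arg"  # argument wins
assert resolve_api_key() == "key-from-env"                # env var is the fallback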
together/utils/files.py
DELETED
@@ -1,425 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-from pathlib import Path
-from traceback import format_exc
-from typing import Any, Dict, List
-
-
-from together.constants import (
-    MAX_FILE_SIZE_GB,
-    MIN_SAMPLES,
-    NUM_BYTES_IN_GB,
-    PARQUET_EXPECTED_COLUMNS,
-    JSONL_REQUIRED_COLUMNS_MAP,
-    REQUIRED_COLUMNS_MESSAGE,
-    POSSIBLE_ROLES_CONVERSATION,
-    DatasetFormat,
-)
-
-
-class InvalidFileFormatError(ValueError):
-    """Exception raised for invalid file formats during file checks."""
-
-    def __init__(
-        self,
-        message: str = "",
-        line_number: int | None = None,
-        error_source: str | None = None,
-    ) -> None:
-        super().__init__(message)
-        self.message = message
-        self.line_number = line_number
-        self.error_source = error_source
-
-
-def check_file(
-    file: Path | str,
-) -> Dict[str, Any]:
-    if not isinstance(file, Path):
-        file = Path(file)
-
-    report_dict = {
-        "is_check_passed": True,
-        "message": "Checks passed",
-        "found": None,
-        "file_size": None,
-        "utf8": None,
-        "line_type": None,
-        "text_field": None,
-        "key_value": None,
-        "has_min_samples": None,
-        "num_samples": None,
-        "load_json": None,
-    }
-
-    if not file.is_file():
-        report_dict["found"] = False
-        report_dict["is_check_passed"] = False
-        return report_dict
-    else:
-        report_dict["found"] = True
-
-    file_size = os.stat(file.as_posix()).st_size
-
-    if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
-        report_dict["message"] = (
-            f"Maximum supported file size is {MAX_FILE_SIZE_GB} GB. Found file with size of {round(file_size / NUM_BYTES_IN_GB ,3)} GB."
-        )
-        report_dict["is_check_passed"] = False
-    elif file_size == 0:
-        report_dict["message"] = "File is empty"
-        report_dict["file_size"] = 0
-        report_dict["is_check_passed"] = False
-        return report_dict
-    else:
-        report_dict["file_size"] = file_size
-
-    data_report_dict = {}
-    if file.suffix == ".jsonl":
-        report_dict["filetype"] = "jsonl"
-        data_report_dict = _check_jsonl(file)
-    elif file.suffix == ".parquet":
-        report_dict["filetype"] = "parquet"
-        data_report_dict = _check_parquet(file)
-    else:
-        report_dict["filetype"] = (
-            f"Unknown extension of file {file}. "
-            "Only files with extensions .jsonl and .parquet are supported."
-        )
-        report_dict["is_check_passed"] = False
-
-    report_dict.update(data_report_dict)
-
-    return report_dict
-
-
-def validate_messages(messages: List[Dict[str, str | bool]], idx: int) -> None:
-    """Validate the messages column."""
-    if not isinstance(messages, list):
-        raise InvalidFileFormatError(
-            message=f"Invalid format on line {idx + 1} of the input file. "
-            f"Expected a list of messages. Found {type(messages)}",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
-    if not messages:
-        raise InvalidFileFormatError(
-            message=f"Invalid format on line {idx + 1} of the input file. "
-            f"Expected a non-empty list of messages. Found empty list",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
-
-    has_weights = any("weight" in message for message in messages)
-
-    previous_role = None
-    for message in messages:
-        if not isinstance(message, dict):
-            raise InvalidFileFormatError(
-                message=f"Invalid format on line {idx + 1} of the input file. "
-                f"Expected a dictionary in the messages list. Found {type(message)}",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-        for column in REQUIRED_COLUMNS_MESSAGE:
-            if column not in message:
-                raise InvalidFileFormatError(
-                    message=f"Field `{column}` is missing for a turn `{message}` on line {idx + 1} "
-                    "of the the input file.",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-            else:
-                if not isinstance(message[column], str):
-                    raise InvalidFileFormatError(
-                        message=f"Invalid format on line {idx + 1} in the column {column} for turn `{message}` "
-                        f"of the input file. Expected string. Found {type(message[column])}",
-                        line_number=idx + 1,
-                        error_source="text_field",
-                    )
-
-        if has_weights and "weight" in message:
-            weight = message["weight"]
-            if not isinstance(weight, int):
-                raise InvalidFileFormatError(
-                    message="Weight must be an integer",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-            if weight not in {0, 1}:
-                raise InvalidFileFormatError(
-                    message="Weight must be either 0 or 1",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-        if message["role"] not in POSSIBLE_ROLES_CONVERSATION:
-            raise InvalidFileFormatError(
-                message=f"Found invalid role `{message['role']}` in the messages on the line {idx + 1}. "
-                f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-
-        if previous_role == message["role"]:
-            raise InvalidFileFormatError(
-                message=f"Invalid role turns on line {idx + 1} of the input file. "
-                "`user` and `assistant` roles must alternate user/assistant/user/assistant/...",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-        previous_role = message["role"]
-
-
-def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
-    """Validate the OpenAI preference dataset format.
-
-    Args:
-        example (dict): Input entry to be checked.
-        idx (int): Line number in the file.
-
-    Raises:
-        InvalidFileFormatError: If the dataset format is invalid.
-    """
-    if not isinstance(example["input"], dict):
-        raise InvalidFileFormatError(
-            message="The dataset is malformed, the `input` field must be a dictionary.",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
-
-    if "messages" not in example["input"]:
-        raise InvalidFileFormatError(
-            message="The dataset is malformed, the `input` dictionary must contain a `messages` field.",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
-
-    validate_messages(example["input"]["messages"], idx)
-
-    for output_field in ["preferred_output", "non_preferred_output"]:
-        if not isinstance(example[output_field], list):
-            raise InvalidFileFormatError(
-                message=f"The dataset is malformed, the `{output_field}` field must be a list.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-
-        if len(example[output_field]) != 1:
-            raise InvalidFileFormatError(
-                message=f"The dataset is malformed, the `{output_field}` list must contain exactly one message.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-        if "role" not in example[output_field][0]:
-            raise InvalidFileFormatError(
-                message=f"The dataset is malformed, the `{output_field}` message is missing the `role` field.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-        elif example[output_field][0]["role"] != "assistant":
-            raise InvalidFileFormatError(
-                message=f"The dataset is malformed, the `{output_field}` must contain an assistant message.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-
-    validate_messages(example["preferred_output"], idx)
-    validate_messages(example["non_preferred_output"], idx)
-
-
-def _check_jsonl(file: Path) -> Dict[str, Any]:
-    report_dict: Dict[str, Any] = {}
-    # Check that the file is UTF-8 encoded. If not report where the error occurs.
-    try:
-        with file.open(encoding="utf-8") as f:
-            f.read()
-        report_dict["utf8"] = True
-    except UnicodeDecodeError as e:
-        report_dict["utf8"] = False
-        report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
-        report_dict["is_check_passed"] = False
-        return report_dict
-
-    dataset_format = None
-    with file.open() as f:
-        idx = -1
-        try:
-            for idx, line in enumerate(f):
-                json_line = json.loads(line)
-
-                if not isinstance(json_line, dict):
-                    raise InvalidFileFormatError(
-                        message=(
-                            f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
-                            "Datasets must follow text, conversational, or instruction format. For more"
-                            "information, see https://docs.together.ai/docs/fine-tuning-data-preparation"
-                        ),
-                        line_number=idx + 1,
-                        error_source="line_type",
-                    )
-
-                current_format = None
-                for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                    if all(
-                        column in json_line
-                        for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                    ):
-                        if current_format is None:
-                            current_format = possible_format
-                        elif current_format != possible_format:
-                            raise InvalidFileFormatError(
-                                message="Found multiple dataset formats in the input file. "
-                                f"Got {current_format} and {possible_format} on line {idx + 1}.",
-                                line_number=idx + 1,
-                                error_source="format",
-                            )
-
-                        # Check that there are no extra columns
-                        for column in json_line:
-                            if (
-                                column
-                                not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                            ):
-                                raise InvalidFileFormatError(
-                                    message=f'Found extra column "{column}" in the line {idx + 1}.',
-                                    line_number=idx + 1,
-                                    error_source="format",
-                                )
-
-                if current_format is None:
-                    raise InvalidFileFormatError(
-                        message=(
-                            f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
-                            f"{json_line.keys()}"
-                        ),
-                        line_number=idx + 1,
-                        error_source="format",
-                    )
-                if current_format == DatasetFormat.PREFERENCE_OPENAI:
-                    validate_preference_openai(json_line, idx)
-                elif current_format == DatasetFormat.CONVERSATION:
-                    message_column = JSONL_REQUIRED_COLUMNS_MAP[
-                        DatasetFormat.CONVERSATION
-                    ][0]
-                    validate_messages(json_line[message_column], idx)
-                else:
-                    for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
-                        if not isinstance(json_line[column], str):
-                            raise InvalidFileFormatError(
-                                message=f'Invalid value type for "{column}" key on line {idx + 1}. '
-                                f"Expected string. Found {type(json_line[column])}.",
-                                line_number=idx + 1,
-                                error_source="key_value",
-                            )
-
-                if dataset_format is None:
-                    dataset_format = current_format
-                elif current_format is not None:
-                    if current_format != dataset_format:
-                        raise InvalidFileFormatError(
-                            message="All samples in the dataset must have the same dataset format. "
-                            f"Got {dataset_format} for the first line and {current_format} "
-                            f"for the line {idx + 1}.",
-                            line_number=idx + 1,
-                            error_source="format",
-                        )
-
-            if idx + 1 < MIN_SAMPLES:
-                report_dict["has_min_samples"] = False
-                report_dict["message"] = (
-                    f"Processing {file} resulted in only {idx + 1} samples. "
-                    f"Our minimum is {MIN_SAMPLES} samples. "
-                )
-                report_dict["is_check_passed"] = False
-            else:
-                report_dict["num_samples"] = idx + 1
-                report_dict["has_min_samples"] = True
-                report_dict["is_check_passed"] = True
-
-            report_dict["load_json"] = True
-
-        except InvalidFileFormatError as e:
-            report_dict["load_json"] = False
-            report_dict["is_check_passed"] = False
-            report_dict["message"] = e.message
-            if e.line_number is not None:
-                report_dict["line_number"] = e.line_number
-            if e.error_source is not None:
-                report_dict[e.error_source] = False
-        except ValueError:
-            report_dict["load_json"] = False
-            if idx < 0:
-                report_dict["message"] = (
-                    "Unable to decode file. "
-                    "File may be empty or in an unsupported format. "
-                )
-            else:
-                report_dict["message"] = (
-                    f"Error parsing json payload. Unexpected format on line {idx + 1}."
-                )
-            report_dict["is_check_passed"] = False
-
-    if "text_field" not in report_dict:
-        report_dict["text_field"] = True
-    if "line_type" not in report_dict:
-        report_dict["line_type"] = True
-    if "key_value" not in report_dict:
-        report_dict["key_value"] = True
-    return report_dict
-
-
-def _check_parquet(file: Path) -> Dict[str, Any]:
-    try:
-        # Pyarrow is optional as it's large (~80MB) and isn't compatible with older systems.
-        from pyarrow import ArrowInvalid, parquet
-    except ImportError:
-        raise ImportError(
-            "pyarrow is not installed and is required to use parquet files. Please install it via `pip install together[pyarrow]`"
-        )
-
-    report_dict: Dict[str, Any] = {}
-
-    try:
-        table = parquet.read_table(str(file), memory_map=True)
-    except ArrowInvalid:
-        report_dict["load_parquet"] = (
-            f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
-            f"Exception trace:\n{format_exc()}"
-        )
-        report_dict["is_check_passed"] = False
-        return report_dict
-
-    column_names = table.schema.names
-    if "input_ids" not in column_names:
-        report_dict["load_parquet"] = (
-            f"Parquet file {file} does not contain the `input_ids` column."
-        )
-        report_dict["is_check_passed"] = False
-        return report_dict
-
-    for column_name in column_names:
-        if column_name not in PARQUET_EXPECTED_COLUMNS:
-            report_dict["load_parquet"] = (
-                f"Parquet file {file} contains an unexpected column {column_name}. "
-                f"Only columns {PARQUET_EXPECTED_COLUMNS} are supported."
-            )
-            report_dict["is_check_passed"] = False
-            return report_dict
-
-    num_samples = len(table)
-    if num_samples < MIN_SAMPLES:
-        report_dict["has_min_samples"] = False
-        report_dict["message"] = (
-            f"Processing {file} resulted in only {num_samples} samples. "
-            f"Our minimum is {MIN_SAMPLES} samples. "
-        )
-        report_dict["is_check_passed"] = False
-        return report_dict
-    else:
-        report_dict["num_samples"] = num_samples
-
-    report_dict["is_check_passed"] = True
-
-    return report_dict
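Unlike the validators above it, check_file never raises: it folds InvalidFileFormatError into a flat report dict, and callers branch on is_check_passed and surface message. Its successor in 2.0.0a8 is together/lib/utils/files.py (+628). A sketch of the old contract, grounded in the signature and report keys above (the "text" column follows the text format referenced in the error messages, and the sample count assumes MIN_SAMPLES is at most 100):

import json
from pathlib import Path

from together.utils.files import check_file  # 1.5.x import path

# Write a tiny text-format JSONL dataset (one JSON object per line).
sample = Path("sample.jsonl")
sample.write_text(
    "\n".join(json.dumps({"text": f"example {i}"}) for i in range(100)),
    encoding="utf-8",
)

report = check_file(sample)
if report["is_check_passed"]:
    print(f"ok: {report['num_samples']} samples, {report['file_size']} bytes")
else:
    print(f"validation failed on line {report.get('line_number')}: {report['message']}")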