together 1.5.35__py3-none-any.whl → 2.0.0a6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- together/__init__.py +101 -114
- together/_base_client.py +1995 -0
- together/_client.py +1033 -0
- together/_compat.py +219 -0
- together/_constants.py +14 -0
- together/_exceptions.py +108 -0
- together/_files.py +123 -0
- together/_models.py +857 -0
- together/_qs.py +150 -0
- together/_resource.py +43 -0
- together/_response.py +830 -0
- together/_streaming.py +370 -0
- together/_types.py +260 -0
- together/_utils/__init__.py +64 -0
- together/_utils/_compat.py +45 -0
- together/_utils/_datetime_parse.py +136 -0
- together/_utils/_logs.py +25 -0
- together/_utils/_proxy.py +65 -0
- together/_utils/_reflection.py +42 -0
- together/_utils/_resources_proxy.py +24 -0
- together/_utils/_streams.py +12 -0
- together/_utils/_sync.py +58 -0
- together/_utils/_transform.py +457 -0
- together/_utils/_typing.py +156 -0
- together/_utils/_utils.py +421 -0
- together/_version.py +4 -0
- together/lib/.keep +4 -0
- together/lib/__init__.py +23 -0
- together/{cli → lib/cli}/api/endpoints.py +66 -84
- together/{cli/api/evaluation.py → lib/cli/api/evals.py} +152 -43
- together/{cli → lib/cli}/api/files.py +20 -17
- together/{cli/api/finetune.py → lib/cli/api/fine_tuning.py} +116 -172
- together/{cli → lib/cli}/api/models.py +34 -27
- together/lib/cli/api/utils.py +50 -0
- together/{cli → lib/cli}/cli.py +16 -26
- together/{constants.py → lib/constants.py} +11 -24
- together/lib/resources/__init__.py +11 -0
- together/lib/resources/files.py +999 -0
- together/lib/resources/fine_tuning.py +280 -0
- together/lib/resources/models.py +35 -0
- together/lib/types/__init__.py +13 -0
- together/lib/types/error.py +9 -0
- together/lib/types/fine_tuning.py +397 -0
- together/{utils → lib/utils}/__init__.py +6 -14
- together/{utils → lib/utils}/_log.py +11 -16
- together/{utils → lib/utils}/files.py +90 -288
- together/lib/utils/serializer.py +10 -0
- together/{utils → lib/utils}/tools.py +19 -55
- together/resources/__init__.py +225 -39
- together/resources/audio/__init__.py +72 -48
- together/resources/audio/audio.py +198 -0
- together/resources/audio/speech.py +574 -128
- together/resources/audio/transcriptions.py +247 -261
- together/resources/audio/translations.py +221 -241
- together/resources/audio/voices.py +111 -41
- together/resources/batches.py +417 -0
- together/resources/chat/__init__.py +30 -21
- together/resources/chat/chat.py +102 -0
- together/resources/chat/completions.py +1063 -263
- together/resources/code_interpreter/__init__.py +33 -0
- together/resources/code_interpreter/code_interpreter.py +258 -0
- together/resources/code_interpreter/sessions.py +135 -0
- together/resources/completions.py +884 -225
- together/resources/embeddings.py +172 -68
- together/resources/endpoints.py +589 -490
- together/resources/evals.py +452 -0
- together/resources/files.py +397 -129
- together/resources/fine_tuning.py +1033 -0
- together/resources/hardware.py +181 -0
- together/resources/images.py +258 -104
- together/resources/jobs.py +214 -0
- together/resources/models.py +223 -193
- together/resources/rerank.py +190 -92
- together/resources/videos.py +286 -214
- together/types/__init__.py +66 -167
- together/types/audio/__init__.py +10 -0
- together/types/audio/speech_create_params.py +75 -0
- together/types/audio/transcription_create_params.py +54 -0
- together/types/audio/transcription_create_response.py +111 -0
- together/types/audio/translation_create_params.py +40 -0
- together/types/audio/translation_create_response.py +70 -0
- together/types/audio/voice_list_response.py +23 -0
- together/types/audio_speech_stream_chunk.py +16 -0
- together/types/autoscaling.py +13 -0
- together/types/autoscaling_param.py +15 -0
- together/types/batch_create_params.py +24 -0
- together/types/batch_create_response.py +14 -0
- together/types/batch_job.py +45 -0
- together/types/batch_list_response.py +10 -0
- together/types/chat/__init__.py +18 -0
- together/types/chat/chat_completion.py +60 -0
- together/types/chat/chat_completion_chunk.py +61 -0
- together/types/chat/chat_completion_structured_message_image_url_param.py +18 -0
- together/types/chat/chat_completion_structured_message_text_param.py +13 -0
- together/types/chat/chat_completion_structured_message_video_url_param.py +18 -0
- together/types/chat/chat_completion_usage.py +13 -0
- together/types/chat/chat_completion_warning.py +9 -0
- together/types/chat/completion_create_params.py +329 -0
- together/types/code_interpreter/__init__.py +5 -0
- together/types/code_interpreter/session_list_response.py +31 -0
- together/types/code_interpreter_execute_params.py +45 -0
- together/types/completion.py +42 -0
- together/types/completion_chunk.py +66 -0
- together/types/completion_create_params.py +138 -0
- together/types/dedicated_endpoint.py +44 -0
- together/types/embedding.py +24 -0
- together/types/embedding_create_params.py +31 -0
- together/types/endpoint_create_params.py +43 -0
- together/types/endpoint_list_avzones_response.py +11 -0
- together/types/endpoint_list_params.py +18 -0
- together/types/endpoint_list_response.py +41 -0
- together/types/endpoint_update_params.py +27 -0
- together/types/eval_create_params.py +263 -0
- together/types/eval_create_response.py +16 -0
- together/types/eval_list_params.py +21 -0
- together/types/eval_list_response.py +10 -0
- together/types/eval_status_response.py +100 -0
- together/types/evaluation_job.py +139 -0
- together/types/execute_response.py +108 -0
- together/types/file_delete_response.py +13 -0
- together/types/file_list.py +12 -0
- together/types/file_purpose.py +9 -0
- together/types/file_response.py +31 -0
- together/types/file_type.py +7 -0
- together/types/fine_tuning_cancel_response.py +194 -0
- together/types/fine_tuning_content_params.py +24 -0
- together/types/fine_tuning_delete_params.py +11 -0
- together/types/fine_tuning_delete_response.py +12 -0
- together/types/fine_tuning_list_checkpoints_response.py +21 -0
- together/types/fine_tuning_list_events_response.py +12 -0
- together/types/fine_tuning_list_response.py +199 -0
- together/types/finetune_event.py +41 -0
- together/types/finetune_event_type.py +33 -0
- together/types/finetune_response.py +177 -0
- together/types/hardware_list_params.py +16 -0
- together/types/hardware_list_response.py +58 -0
- together/types/image_data_b64.py +15 -0
- together/types/image_data_url.py +15 -0
- together/types/image_file.py +23 -0
- together/types/image_generate_params.py +85 -0
- together/types/job_list_response.py +47 -0
- together/types/job_retrieve_response.py +43 -0
- together/types/log_probs.py +18 -0
- together/types/model_list_response.py +10 -0
- together/types/model_object.py +42 -0
- together/types/model_upload_params.py +36 -0
- together/types/model_upload_response.py +23 -0
- together/types/rerank_create_params.py +36 -0
- together/types/rerank_create_response.py +36 -0
- together/types/tool_choice.py +23 -0
- together/types/tool_choice_param.py +23 -0
- together/types/tools_param.py +23 -0
- together/types/training_method_dpo.py +22 -0
- together/types/training_method_sft.py +18 -0
- together/types/video_create_params.py +86 -0
- together/types/video_create_response.py +10 -0
- together/types/video_job.py +57 -0
- together-2.0.0a6.dist-info/METADATA +729 -0
- together-2.0.0a6.dist-info/RECORD +165 -0
- {together-1.5.35.dist-info → together-2.0.0a6.dist-info}/WHEEL +1 -1
- together-2.0.0a6.dist-info/entry_points.txt +2 -0
- {together-1.5.35.dist-info → together-2.0.0a6.dist-info}/licenses/LICENSE +1 -1
- together/abstract/api_requestor.py +0 -770
- together/cli/api/chat.py +0 -298
- together/cli/api/completions.py +0 -119
- together/cli/api/images.py +0 -93
- together/cli/api/utils.py +0 -139
- together/client.py +0 -186
- together/error.py +0 -194
- together/filemanager.py +0 -635
- together/legacy/__init__.py +0 -0
- together/legacy/base.py +0 -27
- together/legacy/complete.py +0 -93
- together/legacy/embeddings.py +0 -27
- together/legacy/files.py +0 -146
- together/legacy/finetune.py +0 -177
- together/legacy/images.py +0 -27
- together/legacy/models.py +0 -44
- together/resources/batch.py +0 -165
- together/resources/code_interpreter.py +0 -82
- together/resources/evaluation.py +0 -808
- together/resources/finetune.py +0 -1388
- together/together_response.py +0 -50
- together/types/abstract.py +0 -26
- together/types/audio_speech.py +0 -311
- together/types/batch.py +0 -54
- together/types/chat_completions.py +0 -210
- together/types/code_interpreter.py +0 -57
- together/types/common.py +0 -67
- together/types/completions.py +0 -107
- together/types/embeddings.py +0 -35
- together/types/endpoints.py +0 -123
- together/types/error.py +0 -16
- together/types/evaluation.py +0 -93
- together/types/files.py +0 -93
- together/types/finetune.py +0 -465
- together/types/images.py +0 -42
- together/types/models.py +0 -96
- together/types/rerank.py +0 -43
- together/types/videos.py +0 -69
- together/utils/api_helpers.py +0 -124
- together/version.py +0 -6
- together-1.5.35.dist-info/METADATA +0 -583
- together-1.5.35.dist-info/RECORD +0 -77
- together-1.5.35.dist-info/entry_points.txt +0 -3
- /together/{abstract → lib/cli}/__init__.py +0 -0
- /together/{cli → lib/cli/api}/__init__.py +0 -0
- /together/{cli/api/__init__.py → py.typed} +0 -0
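The rename map above moves the hand-written helpers under together/lib/ (CLI, constants, file utilities), while the top-level together/ package gains new client modules (_base_client.py, _models.py, resources/, types/). A minimal sketch of what that move implies for imports, assuming the helper keeps its name after the relocation (illustrative only, not taken from the package docs):

# together 1.5.35
# from together.utils.files import check_file

# together 2.0.0a6
from together.lib.utils.files import check_file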
together/{utils → lib/utils}/files.py

@@ -1,37 +1,26 @@
 from __future__ import annotations
 
+import os
 import csv
 import json
-import os
+from typing import Any, Dict, List, cast
 from pathlib import Path
 from traceback import format_exc
-from typing import Any, Dict, List
 
 from tqdm import tqdm
 
-from together.constants import (
-    JSONL_REQUIRED_COLUMNS_MAP,
-    MAX_BASE64_IMAGE_LENGTH,
-    MAX_FILE_SIZE_GB,
-    MAX_IMAGES_PER_EXAMPLE,
+from together.types import FilePurpose
+from together.lib.constants import (
     MIN_SAMPLES,
+    DISABLE_TQDM,
     NUM_BYTES_IN_GB,
+    MAX_FILE_SIZE_GB,
     PARQUET_EXPECTED_COLUMNS,
-    POSSIBLE_ROLES_CONVERSATION,
     REQUIRED_COLUMNS_MESSAGE,
+    JSONL_REQUIRED_COLUMNS_MAP,
+    POSSIBLE_ROLES_CONVERSATION,
     DatasetFormat,
 )
-from together.types import FilePurpose
-
-
-# MessageContent is a string or a list of dicts with 'type': 'text' or 'image_url', and 'text' or 'image_url.url'
-# Example: "Hello" or [
-#     {"type": "text", "text": "Hello"},
-#     {"type": "image_url", "image_url": {
-#         "url": "data:image/jpeg;base64,..."
-#     }}
-# ]
-MessageContent = str | list[dict[str, Any]]
 
 
 class InvalidFileFormatError(ValueError):
@@ -51,12 +40,12 @@ class InvalidFileFormatError(ValueError):
 
 def check_file(
     file: Path | str,
-    purpose: FilePurpose | str = FilePurpose.FineTune,
+    purpose: FilePurpose | str = "fine-tune",
 ) -> Dict[str, Any]:
     if not isinstance(file, Path):
         file = Path(file)
 
-    report_dict = {
+    report_dict: Dict[str, Any] = {
         "is_check_passed": True,
         "message": "Checks passed",
         "found": None,
@@ -78,7 +67,7 @@ def check_file(
     else:
         report_dict["found"] = True
 
-        file_size = os.stat(file).st_size
+        file_size = os.stat(file.as_posix()).st_size
 
         if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
             report_dict["message"] = (
@@ -105,8 +94,7 @@ def check_file(
             data_report_dict = _check_csv(file, purpose)
         else:
             report_dict["filetype"] = (
-                f"Unknown extension of file {file}. "
-                "Only files with extensions .jsonl and .parquet are supported."
+                f"Unknown extension of file {file}. Only files with extensions .jsonl, .parquet, and .csv are supported."
             )
             report_dict["is_check_passed"] = False
 
@@ -115,9 +103,7 @@ def check_file(
     return report_dict
 
 
-def _check_conversation_type(
-    messages: List[Dict[str, str | int | MessageContent]], idx: int
-) -> None:
+def _check_conversation_type(messages: List[Dict[str, str | bool]], idx: int) -> None:
     """Check that the conversation has correct type.
 
     Args:
@@ -128,23 +114,22 @@ def _check_conversation_type(
     Raises:
         InvalidFileFormatError: If the conversation type is invalid.
     """
-    if not isinstance(messages, list):
-        raise InvalidFileFormatError(
-            message=f"Invalid format on line {idx + 1} of the input file. "
-            f"The `messages` column must be a list. Found {type(messages)}",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
+    # if not isinstance(messages, list):
+    #     raise InvalidFileFormatError(
+    #         message=f"Invalid format on line {idx + 1} of the input file. "
+    #         f"The `messages` column must be a list. Found {type(messages)}",
+    #         line_number=idx + 1,
+    #         error_source="key_value",
+    #     )
     if len(messages) == 0:
         raise InvalidFileFormatError(
-            message=f"Invalid format on line {idx + 1} of the input file. "
-            f"The `messages` column must not be empty.",
+            message=f"Invalid format on line {idx + 1} of the input file. The `messages` column must not be empty.",
            line_number=idx + 1,
             error_source="key_value",
         )
 
     for message in messages:
-        if not isinstance(message, dict):
+        if not isinstance(cast(Any, message), dict):
             raise InvalidFileFormatError(
                 message=f"Invalid format on line {idx + 1} of the input file. "
                 f"The `messages` column must be a list of dicts. Found {type(message)}",
@@ -159,11 +144,15 @@ def _check_conversation_type(
                     line_number=idx + 1,
                     error_source="key_value",
                 )
+            if not isinstance(message[column], str):
+                raise InvalidFileFormatError(
+                    message=f"Column `{column}` is not a string on line {idx + 1}. Found {type(message[column])}",
+                    line_number=idx + 1,
+                    error_source="text_field",
+                )
 
 
-def _check_conversation_roles(
-    require_assistant_role: bool, assistant_role_exists: bool, idx: int
-) -> None:
+def _check_conversation_roles(require_assistant_role: bool, assistant_role_exists: bool, idx: int) -> None:
     """Check that the conversation has correct roles.
 
     Args:
@@ -183,9 +172,7 @@ def _check_conversation_roles(
         )
 
 
-def _check_message_weight(
-    message: Dict[str, str | int | MessageContent], idx: int
-) -> int | None:
+def _check_message_weight(message: Dict[str, str | bool], idx: int) -> None:
     """Check that the message has a weight with the correct type and value.
 
     Args:
@@ -209,14 +196,9 @@ def _check_message_weight(
             line_number=idx + 1,
             error_source="key_value",
         )
-        return weight
 
-    return None
 
-
-def _check_message_role(
-    message: Dict[str, str | int | MessageContent], previous_role: str | None, idx: int
-) -> str:
+def _check_message_role(message: Dict[str, str | bool], previous_role: str | bool | None, idx: int) -> str | bool:
     """Check that the message has correct roles.
 
     Args:
@@ -230,14 +212,6 @@ def _check_message_role(
     Raises:
         InvalidFileFormatError: If the message role is invalid.
     """
-    if not isinstance(message["role"], str):
-        raise InvalidFileFormatError(
-            message=f"Invalid role `{message['role']}` in conversation on line {idx + 1}. "
-            f"Role must be a string. Found {type(message['role'])}",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
-
     if message["role"] not in POSSIBLE_ROLES_CONVERSATION:
         raise InvalidFileFormatError(
             message=f"Invalid role `{message['role']}` in conversation on line {idx + 1}. "
@@ -255,135 +229,7 @@ def _check_message_role(
     return message["role"]
 
 
-def _check_message_content(
-    message_content: str | int | MessageContent, role: str, idx: int
-) -> tuple[bool, int]:
-    """Check that the message content has the correct type.
-    Message content can be either a) a string or b) an OpenAI-style multimodal list of content items
-    Example:
-        a) "Hello", or
-        b) [
-            {"type": "text", "text": "Hello"},
-            {"type": "image_url", "image_url": {
-                "url": "data:image/jpeg;base64,..."
-            }}
-        ]
-
-    Args:
-        message: The message to check.
-        role: The role of the message.
-        idx: Line number in the file.
-
-    Returns:
-        tuple[bool, int]: A tuple with message is multimodal and the number of images in the message content.
-    """
-    # Text-only message content
-    if isinstance(message_content, str):
-        return False, 0
-
-    # Multimodal message content
-    if isinstance(message_content, list):
-        num_images = 0
-        for item in message_content:
-            if not isinstance(item, dict):
-                raise InvalidFileFormatError(
-                    "The dataset is malformed, the `content` field must be a list of dicts.",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-            if "type" not in item:
-                raise InvalidFileFormatError(
-                    "The dataset is malformed, the `content` field must be a list of dicts with a `type` field.",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-
-            if item["type"] == "text":
-                if "text" not in item or not isinstance(item["text"], str):
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `text` field must be present in the `content` item field and be"
-                        f" a string. Got '{item.get('text')!r}' instead.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-            elif item["type"] == "image_url":
-                if role != "user":
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, only user messages can contain images.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                if "image_url" not in item or not isinstance(item["image_url"], dict):
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `image_url` field must be present in the `content` field and "
-                        f"be a dictionary. Got {item.get('image_url')!r} instead.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                image_data = item["image_url"].get("url")
-                if not image_data or not isinstance(image_data, str):
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `url` field must be present in the `image_url` field and be "
-                        f"a string. Got {image_data!r} instead.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                if not any(
-                    image_data.startswith(f"data:image/{fmt};base64,")
-                    for fmt in ["jpeg", "png", "webp"]
-                ):
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `url` field must be either a JPEG, PNG or WEBP base64-encoded "
-                        "image in 'data:image/<format>;base64,<base64_encoded_image>' format. "
-                        f"Got '{image_data[:100]}...' instead.",
-                        line_number=idx + 1,
-                    )
-
-                if len(image_data) > MAX_BASE64_IMAGE_LENGTH:
-                    raise InvalidFileFormatError(
-                        "The dataset is malformed, the `url` field must contain base64-encoded image "
-                        f"that is less than 10MB, found ~{len(image_data) * 3 // 4} bytes.",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                num_images += 1
-            else:
-                raise InvalidFileFormatError(
-                    "The dataset is malformed, the `type` field must be either 'text' or 'image_url'. "
-                    f"Got {item['type']!r}.",
-                    line_number=idx + 1,
-                    error_source="key_value",
-                )
-
-        if num_images > MAX_IMAGES_PER_EXAMPLE:
-            raise InvalidFileFormatError(
-                f"The dataset is malformed, the `content` field must contain at most "
-                f"{MAX_IMAGES_PER_EXAMPLE} images, found {num_images}.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-
-        # We still consider text-only messages in such format as multimodal, even if they don't have any images
-        # included - so we can process datasets with rather sparse images (i.e. not in each sample) consistently.
-        return True, num_images
-
-    raise InvalidFileFormatError(
-        f"Invalid content type on line {idx + 1} of the input file. Expected string or multimodal list of dicts, "
-        f"found {type(message_content)}",
-        line_number=idx + 1,
-        error_source="key_value",
-    )
-
-
-def validate_messages(
-    messages: List[Dict[str, str | int | MessageContent]],
-    idx: int,
-    require_assistant_role: bool = True,
-) -> None:
+def validate_messages(messages: List[Dict[str, str | bool]], idx: int, require_assistant_role: bool = True) -> None:
     """Validate the messages column.
 
     Args:
@@ -396,45 +242,15 @@ def validate_messages(
     """
     _check_conversation_type(messages, idx)
 
+    has_weights = any("weight" in message for message in messages)
     previous_role = None
     assistant_role_exists = False
 
-    messages_are_multimodal: bool | None = None
-    total_number_of_images = 0
-
     for message in messages:
-        message_weight = _check_message_weight(message, idx)
+        if has_weights:
+            _check_message_weight(message, idx)
         previous_role = _check_message_role(message, previous_role, idx)
         assistant_role_exists |= previous_role == "assistant"
-        is_multimodal, number_of_images = _check_message_content(
-            message["content"], role=previous_role, idx=idx
-        )
-        # Multimodal validation
-        if number_of_images > 0 and message_weight is not None and message_weight != 0:
-            raise InvalidFileFormatError(
-                "Messages with images cannot have non-zero weights.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-        if messages_are_multimodal is None:
-            # Detect the format of the messages in the conversation.
-            messages_are_multimodal = is_multimodal
-        elif messages_are_multimodal != is_multimodal:
-            # Due to the format limitation, we cannot mix multimodal and text only messages in the same sample.
-            raise InvalidFileFormatError(
-                "Messages in the conversation must be either all in multimodal or all in text-only format.",
-                line_number=idx + 1,
-                error_source="key_value",
-            )
-        total_number_of_images += number_of_images
-
-    if total_number_of_images > MAX_IMAGES_PER_EXAMPLE:
-        raise InvalidFileFormatError(
-            f"The dataset is malformed, the `messages` must contain at most {MAX_IMAGES_PER_EXAMPLE} images. "
-            f"Found {total_number_of_images} images.",
-            line_number=idx + 1,
-            error_source="key_value",
-        )
 
     _check_conversation_roles(require_assistant_role, assistant_role_exists, idx)
 
@@ -463,7 +279,8 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
             error_source="key_value",
         )
 
-    validate_messages(example["input"]["messages"], idx, require_assistant_role=False)
+    messages: List[Dict[str, str | bool]] = cast(Any, example["input"]["messages"])
+    validate_messages(messages, idx, require_assistant_role=False)
 
     if example["input"]["messages"][-1]["role"] == "assistant":
         raise InvalidFileFormatError(
@@ -524,7 +341,12 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
                 error_source="key_value",
            )
 
+        if not isinstance(example[key][0]["content"], str):
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the 'content' field in `{key}` must be a string on line {idx + 1}.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
 
 
 def _check_utf8(file: Path) -> Dict[str, Any]:
@@ -536,30 +358,24 @@ def _check_utf8(file: Path) -> Dict[str, Any]:
         Dict[str, Any]: A dictionary with the results of the check.
     """
     report_dict: Dict[str, Any] = {}
-
     try:
-        # Dry-run UTF-8 decode
+        # Dry-run UTF-8 decode by iterating through the file to avoid loading it entirely into memory
        with file.open(encoding="utf-8") as f:
             for _ in f:
                 pass
-
         report_dict["utf8"] = True
     except UnicodeDecodeError as e:
         report_dict["utf8"] = False
         report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
         report_dict["is_check_passed"] = False
-
     return report_dict
 
 
-def _check_samples_count(
-    file: Path, report_dict: Dict[str, Any], idx: int
-) -> Dict[str, Any]:
+def _check_samples_count(file: Path, report_dict: Dict[str, Any], idx: int) -> Dict[str, Any]:
     if idx + 1 < MIN_SAMPLES:
         report_dict["has_min_samples"] = False
         report_dict["message"] = (
-            f"Processing {file} resulted in only {idx + 1} samples. "
-            f"Our minimum is {MIN_SAMPLES} samples. "
+            f"Processing {file} resulted in only {idx + 1} samples. Our minimum is {MIN_SAMPLES} samples. "
         )
         report_dict["is_check_passed"] = False
     else:
@@ -580,11 +396,10 @@ def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
         Dict[str, Any]: A dictionary with the results of the check.
     """
     report_dict: Dict[str, Any] = {}
-    if purpose != FilePurpose.Eval:
+    if purpose != "eval":
         report_dict["is_check_passed"] = False
         report_dict["message"] = (
-            f"CSV files are not supported for {purpose}. "
-            "Only JSONL and Parquet files are supported."
+            f"CSV files are not supported for {purpose}. Only JSONL and Parquet files are supported."
         )
         return report_dict
 
@@ -625,13 +440,9 @@ def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     except ValueError:
         report_dict["load_csv"] = False
         if idx < 0:
-            report_dict["message"] = (
-                "Unable to decode file. File may be empty or in an unsupported format. "
-            )
+            report_dict["message"] = "Unable to decode file. File may be empty or in an unsupported format. "
         else:
-            report_dict["message"] = (
-                f"Error parsing the CSV file. Unexpected format on line {idx + 1}."
-            )
+            report_dict["message"] = f"Error parsing the CSV file. Unexpected format on line {idx + 1}."
         report_dict["is_check_passed"] = False
 
     return report_dict
@@ -647,7 +458,12 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     with file.open() as f:
         idx = -1
         try:
-            for idx, line in tqdm(
+            for idx, line in tqdm(
+                enumerate(f),
+                desc="Validating file",
+                unit=" lines",
+                disable=bool(DISABLE_TQDM),
+            ):
                 json_line = json.loads(line)
 
                 if not isinstance(json_line, dict):
@@ -661,16 +477,13 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                         error_source="line_type",
                     )
                 # In evals, we don't check the format of the dataset.
-                if purpose != FilePurpose.Eval:
+                if purpose != "eval":
                     current_format = None
                     for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                        if all(
-                            column in json_line
-                            for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                        ):
+                        if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]):
                             if current_format is None:
                                 current_format = possible_format
-                            elif current_format != possible_format:
+                            elif current_format != possible_format:  # type: ignore[unreachable]
                                 raise InvalidFileFormatError(
                                     message="Found multiple dataset formats in the input file. "
                                     f"Got {current_format} and {possible_format} on line {idx + 1}.",
@@ -679,11 +492,8 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                                 )
 
                             # Check that there are no extra columns
-                            for column in json_line:
-                                if (
-                                    column
-                                    not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
-                                ):
+                            for column in cast(List[str], json_line.keys()):
+                                if column not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]:
                                     raise InvalidFileFormatError(
                                         message=f'Found extra column "{column}" in the line {idx + 1}.',
                                         line_number=idx + 1,
@@ -700,36 +510,36 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
                            error_source="format",
                         )
                     if current_format == DatasetFormat.PREFERENCE_OPENAI:
-                        validate_preference_openai(json_line, idx)
+                        validate_preference_openai(cast(Dict[str, Any], json_line), idx)
                     elif current_format == DatasetFormat.CONVERSATION:
-                        message_column = JSONL_REQUIRED_COLUMNS_MAP[
-                            DatasetFormat.CONVERSATION
-                        ][0]
-                        require_assistant = purpose != FilePurpose.Eval
+                        message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION][0]
+                        require_assistant = purpose != "eval"
+                        message: List[Dict[str, str | bool]] = cast(Any, json_line[message_column])
                         validate_messages(
-                            json_line[message_column],
+                            message,
                             idx,
                             require_assistant_role=require_assistant,
                         )
                     else:
                         for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
+                            if not isinstance(json_line[column], str):
+                                raise InvalidFileFormatError(
+                                    message=f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                    f"Expected string. Found {type(cast(Any, json_line[column]))}.",
+                                    line_number=idx + 1,
+                                    error_source="key_value",
+                                )
 
                     if dataset_format is None:
                         dataset_format = current_format
-                    elif current_format != dataset_format:
+                    elif current_format != dataset_format:  # type: ignore[unreachable]
+                        raise InvalidFileFormatError(
+                            message="All samples in the dataset must have the same dataset format. "
+                            f"Got {dataset_format} for the first line and {current_format} "
+                            f"for the line {idx + 1}.",
+                            line_number=idx + 1,
+                            error_source="format",
+                        )
             report_dict.update(_check_samples_count(file, report_dict, idx))
 
             report_dict["load_json"] = True
@@ -745,13 +555,9 @@ def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
         except ValueError:
            report_dict["load_json"] = False
             if idx < 0:
-                report_dict["message"] = (
-                    "Unable to decode file. File may be empty or in an unsupported format. "
-                )
+                report_dict["message"] = "Unable to decode file. File may be empty or in an unsupported format. "
             else:
-                report_dict["message"] = (
-                    f"Error parsing json payload. Unexpected format on line {idx + 1}."
-                )
+                report_dict["message"] = f"Error parsing json payload. Unexpected format on line {idx + 1}."
             report_dict["is_check_passed"] = False
 
     if "text_field" not in report_dict:
@@ -767,22 +573,21 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     try:
         # Pyarrow is optional as it's large (~80MB) and isn't compatible with older systems.
         from pyarrow import ArrowInvalid, parquet
-    except ImportError:
+    except ImportError as e:
         raise ImportError(
             "pyarrow is not installed and is required to use parquet files. Please install it via `pip install together[pyarrow]`"
-        )
+        ) from e
 
     report_dict: Dict[str, Any] = {}
-    if purpose == FilePurpose.Eval:
+    if purpose == "eval":
         report_dict["is_check_passed"] = False
         report_dict["message"] = (
-            f"Parquet files are not supported for {purpose}. "
-            "Only JSONL and CSV files are supported."
+            f"Parquet files are not supported for {purpose}. Only JSONL and CSV files are supported."
        )
         return report_dict
 
     try:
-        table = parquet.read_table(str(file), memory_map=True)
+        table = parquet.read_table(str(file), memory_map=True)  # type: ignore[reportUnknownMemberType]
     except ArrowInvalid:
         report_dict["load_parquet"] = (
             f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
@@ -793,9 +598,7 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
 
     column_names = table.schema.names
     if "input_ids" not in column_names:
-        report_dict["load_parquet"] = (
-            f"Parquet file {file} does not contain the `input_ids` column."
-        )
+        report_dict["load_parquet"] = f"Parquet file {file} does not contain the `input_ids` column."
         report_dict["is_check_passed"] = False
         return report_dict
 
@@ -813,8 +616,7 @@ def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]:
     if num_samples < MIN_SAMPLES:
         report_dict["has_min_samples"] = False
         report_dict["message"] = (
-            f"Processing {file} resulted in only {num_samples} samples. "
-            f"Our minimum is {MIN_SAMPLES} samples. "
+            f"Processing {file} resulted in only {num_samples} samples. Our minimum is {MIN_SAMPLES} samples. "
         )
         report_dict["is_check_passed"] = False
         return report_dict
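For orientation, a small usage sketch of the validator diffed above. The call shape and the report keys (is_check_passed, message) come from the hunks; the dataset path is invented for the example, and the new module location is taken from the rename map:

from pathlib import Path

from together.lib.utils.files import check_file

# "train.jsonl" is a hypothetical local dataset; in 2.0.0a6 the purpose parameter
# defaults to the string "fine-tune" rather than a FilePurpose enum member.
report = check_file(Path("train.jsonl"), purpose="fine-tune")
if not report["is_check_passed"]:
    raise SystemExit(report["message"])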
together/lib/utils/serializer.py (new)

@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from typing import Any
+from datetime import datetime
+
+
+def datetime_serializer(obj: Any) -> str:
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")