together 1.3.3.tar.gz → 1.3.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {together-1.3.3 → together-1.3.5}/PKG-INFO +2 -2
- {together-1.3.3 → together-1.3.5}/pyproject.toml +3 -3
- {together-1.3.3 → together-1.3.5}/src/together/cli/api/finetune.py +45 -18
- {together-1.3.3 → together-1.3.5}/src/together/cli/api/utils.py +21 -0
- {together-1.3.3 → together-1.3.5}/src/together/constants.py +19 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/finetune.py +64 -4
- {together-1.3.3 → together-1.3.5}/src/together/types/__init__.py +4 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/finetune.py +24 -1
- together-1.3.5/src/together/utils/files.py +324 -0
- together-1.3.3/src/together/utils/files.py +0 -204
- {together-1.3.3 → together-1.3.5}/LICENSE +0 -0
- {together-1.3.3 → together-1.3.5}/README.md +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/__init__.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/abstract/__init__.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/abstract/api_requestor.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/cli/__init__.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/cli/api/__init__.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/cli/api/chat.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/cli/api/completions.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/cli/api/files.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/cli/api/images.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/cli/api/models.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/cli/cli.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/client.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/error.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/filemanager.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/legacy/__init__.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/legacy/base.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/legacy/complete.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/legacy/embeddings.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/legacy/files.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/legacy/finetune.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/legacy/images.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/legacy/models.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/__init__.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/chat/__init__.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/chat/completions.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/completions.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/embeddings.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/files.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/images.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/models.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/resources/rerank.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/together_response.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/abstract.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/chat_completions.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/common.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/completions.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/embeddings.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/error.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/files.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/images.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/models.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/types/rerank.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/utils/__init__.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/utils/_log.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/utils/api_helpers.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/utils/tools.py +0 -0
- {together-1.3.3 → together-1.3.5}/src/together/version.py +0 -0
{together-1.3.3 → together-1.3.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: together
-Version: 1.3.3
+Version: 1.3.5
 Summary: Python client for Together's Cloud Platform!
 Home-page: https://github.com/togethercomputer/together-python
 License: Apache-2.0
@@ -29,7 +29,7 @@ Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: rich (>=13.8.1,<14.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: tqdm (>=4.66.2,<5.0.0)
-Requires-Dist: typer (>=0.9,<0.
+Requires-Dist: typer (>=0.9,<0.14)
 Project-URL: Bug Tracker, https://github.com/togethercomputer/together-python/issues
 Project-URL: Repository, https://github.com/togethercomputer/together-python
 Description-Content-Type: text/markdown
{together-1.3.3 → together-1.3.5}/pyproject.toml

@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.3.3"
+version = "1.3.5"
 authors = [
     "Together AI <support@together.ai>"
 ]
@@ -29,7 +29,7 @@ homepage = "https://github.com/togethercomputer/together-python"
 
 [tool.poetry.dependencies]
 python = "^3.8"
-typer = ">=0.9,<0.
+typer = ">=0.9,<0.14"
 requests = "^2.31.0"
 rich = "^13.8.1"
 tqdm = "^4.66.2"
@@ -51,7 +51,7 @@ optional = true
 
 [tool.poetry.group.quality.dependencies]
 black = ">=23.1,<25.0"
-ruff = ">=0.3.2,<0.
+ruff = ">=0.3.2,<0.8.0"
 types-tqdm = "^4.65.0.0"
 types-tabulate = "^0.9.0.3"
 pre-commit = "3.5.0"
{together-1.3.3 → together-1.3.5}/src/together/cli/api/finetune.py

@@ -11,8 +11,13 @@ from rich import print as rprint
 from tabulate import tabulate
 
 from together import Together
-from together.cli.api.utils import INT_WITH_MAX
-from together.utils import
+from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX
+from together.utils import (
+    finetune_price_to_dollars,
+    log_warn,
+    log_warn_once,
+    parse_timestamp,
+)
 from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits
 
 
@@ -60,12 +65,30 @@ def fine_tuning(ctx: click.Context) -> None:
 )
 @click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
 @click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
+@click.option(
+    "--min-lr-ratio",
+    type=float,
+    default=0.0,
+    help="The ratio of the final learning rate to the peak learning rate",
+)
 @click.option(
     "--warmup-ratio",
     type=float,
     default=0.0,
     help="Warmup ratio for learning rate scheduler.",
 )
+@click.option(
+    "--max-grad-norm",
+    type=float,
+    default=1.0,
+    help="Max gradient norm to be used for gradient clipping. Set to 0 to disable.",
+)
+@click.option(
+    "--weight-decay",
+    type=float,
+    default=0.0,
+    help="Weight decay",
+)
 @click.option(
     "--lora/--no-lora",
     type=bool,
@@ -93,6 +116,13 @@ def fine_tuning(ctx: click.Context) -> None:
     default=False,
     help="Whether to skip the launch confirmation message",
 )
+@click.option(
+    "--train-on-inputs",
+    type=BOOL_WITH_AUTO,
+    default="auto",
+    help="Whether to mask the user messages in conversational data or prompts in instruction data. "
+    "`auto` will automatically determine whether to mask the inputs based on the data format.",
+)
 def create(
     ctx: click.Context,
     training_file: str,
@@ -103,7 +133,10 @@
     n_checkpoints: int,
     batch_size: int | Literal["max"],
     learning_rate: float,
+    min_lr_ratio: float,
     warmup_ratio: float,
+    max_grad_norm: float,
+    weight_decay: float,
     lora: bool,
     lora_r: int,
     lora_dropout: float,
@@ -112,6 +145,7 @@
     suffix: str,
     wandb_api_key: str,
    confirm: bool,
+    train_on_inputs: bool | Literal["auto"],
 ) -> None:
     """Start fine-tuning"""
     client: Together = ctx.obj
@@ -125,7 +159,10 @@
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        min_lr_ratio=min_lr_ratio,
         warmup_ratio=warmup_ratio,
+        max_grad_norm=max_grad_norm,
+        weight_decay=weight_decay,
         lora=lora,
         lora_r=lora_r,
         lora_dropout=lora_dropout,
@@ -133,6 +170,7 @@
         lora_trainable_modules=lora_trainable_modules,
         suffix=suffix,
         wandb_api_key=wandb_api_key,
+        train_on_inputs=train_on_inputs,
     )
 
     model_limits: FinetuneTrainingLimits = client.fine_tuning.get_model_limits(
@@ -150,6 +188,10 @@
             "batch_size": model_limits.lora_training.max_batch_size,
             "learning_rate": 1e-3,
         }
+        log_warn_once(
+            f"The default LoRA rank for {model} has been changed to {default_values['lora_r']} as the max available.\n"
+            f"Also, the default learning rate for LoRA fine-tuning has been changed to {default_values['learning_rate']}."
+        )
         for arg in default_values:
             arg_source = ctx.get_parameter_source("arg") # type: ignore[attr-defined]
             if arg_source == ParameterSource.DEFAULT:
@@ -186,22 +228,7 @@
 
     if confirm or click.confirm(_CONFIRMATION_MESSAGE, default=True, show_default=True):
         response = client.fine_tuning.create(
-
-            model=model,
-            n_epochs=n_epochs,
-            validation_file=validation_file,
-            n_evals=n_evals,
-            n_checkpoints=n_checkpoints,
-            batch_size=batch_size,
-            learning_rate=learning_rate,
-            warmup_ratio=warmup_ratio,
-            lora=lora,
-            lora_r=lora_r,
-            lora_dropout=lora_dropout,
-            lora_alpha=lora_alpha,
-            lora_trainable_modules=lora_trainable_modules,
-            suffix=suffix,
-            wandb_api_key=wandb_api_key,
+            **training_args,
             verbose=True,
         )
 
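The CLI above applies model-specific defaults only when an option was left untouched, using click's parameter-source tracking. The following is a minimal, self-contained sketch of that pattern; it is not taken from the package, and the option and fallback value are illustrative.

import click
from click.core import ParameterSource


@click.command()
@click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
@click.pass_context
def train(ctx: click.Context, learning_rate: float) -> None:
    # If the value came from the option's default rather than the command line,
    # substitute a model-specific default, mirroring the logic in `create` above.
    if ctx.get_parameter_source("learning_rate") == ParameterSource.DEFAULT:
        learning_rate = 1e-3  # hypothetical model-specific default
    click.echo(f"learning_rate={learning_rate}")


if __name__ == "__main__":
    train()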
{together-1.3.3 → together-1.3.5}/src/together/cli/api/utils.py

@@ -27,4 +27,25 @@ class AutoIntParamType(click.ParamType):
         )
 
 
+class BooleanWithAutoParamType(click.ParamType):
+    name = "boolean_or_auto"
+
+    def convert(
+        self, value: str, param: click.Parameter | None, ctx: click.Context | None
+    ) -> bool | Literal["auto"] | None:
+        if value == "auto":
+            return "auto"
+        try:
+            return bool(value)
+        except ValueError:
+            self.fail(
+                _("{value!r} is not a valid {type}.").format(
+                    value=value, type=self.name
+                ),
+                param,
+                ctx,
+            )
+
+
 INT_WITH_MAX = AutoIntParamType()
+BOOL_WITH_AUTO = BooleanWithAutoParamType()
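For reference, a short sketch of how the new parameter type converts raw CLI strings; this exercises the class defined above and is not part of the package. Note that Python's bool() treats any non-empty string as True.

from together.cli.api.utils import BOOL_WITH_AUTO

print(BOOL_WITH_AUTO.convert("auto", None, None))  # "auto" is passed through unchanged
print(BOOL_WITH_AUTO.convert("", None, None))      # empty string -> False via bool("")
print(BOOL_WITH_AUTO.convert("true", None, None))  # any non-empty string -> True via bool(...)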
{together-1.3.3 → together-1.3.5}/src/together/constants.py

@@ -1,3 +1,5 @@
+import enum
+
 # Session constants
 TIMEOUT_SECS = 600
 MAX_SESSION_LIFETIME_SECS = 180
@@ -29,3 +31,20 @@ MAX_FILE_SIZE_GB = 4.9
 
 # expected columns for Parquet files
 PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"]
+
+
+class DatasetFormat(enum.Enum):
+    """Dataset format enum."""
+
+    GENERAL = "general"
+    CONVERSATION = "conversation"
+    INSTRUCTION = "instruction"
+
+
+JSONL_REQUIRED_COLUMNS_MAP = {
+    DatasetFormat.GENERAL: ["text"],
+    DatasetFormat.CONVERSATION: ["messages"],
+    DatasetFormat.INSTRUCTION: ["prompt", "completion"],
+}
+REQUIRED_COLUMNS_MESSAGE = ["role", "content"]
+POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
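A small sketch (not part of the package) of how these new constants classify a JSONL sample, mirroring the detection logic added to utils/files.py further below:

from together.constants import JSONL_REQUIRED_COLUMNS_MAP


def detect_formats(sample: dict) -> list:
    # A sample matches a DatasetFormat when all of that format's required columns are present.
    return [
        fmt
        for fmt, columns in JSONL_REQUIRED_COLUMNS_MAP.items()
        if all(column in sample for column in columns)
    ]


print(detect_formats({"text": "hello"}))                                  # [DatasetFormat.GENERAL]
print(detect_formats({"messages": [{"role": "user", "content": "hi"}]}))  # [DatasetFormat.CONVERSATION]
print(detect_formats({"prompt": "2+2?", "completion": "4"}))              # [DatasetFormat.INSTRUCTION]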
{together-1.3.3 → together-1.3.5}/src/together/resources/finetune.py

@@ -20,6 +20,8 @@ from together.types import (
     TogetherClient,
     TogetherRequest,
     TrainingType,
+    FinetuneLRScheduler,
+    FinetuneLinearLRSchedulerArgs,
 )
 from together.types.finetune import DownloadCheckpointType
 from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
     n_checkpoints: int | None = 1,
     batch_size: int | Literal["max"] = "max",
     learning_rate: float | None = 0.00001,
-
+    min_lr_ratio: float = 0.0,
+    warmup_ratio: float = 0.0,
+    max_grad_norm: float = 1.0,
+    weight_decay: float = 0.0,
     lora: bool = False,
     lora_r: int | None = None,
     lora_dropout: float | None = 0,
@@ -43,6 +48,7 @@ def createFinetuneRequest(
     lora_trainable_modules: str | None = "all-linear",
     suffix: str | None = None,
     wandb_api_key: str | None = None,
+    train_on_inputs: bool | Literal["auto"] = "auto",
 ) -> FinetuneRequest:
     if batch_size == "max":
         log_warn_once(
@@ -82,6 +88,20 @@ def createFinetuneRequest(
     if warmup_ratio > 1 or warmup_ratio < 0:
         raise ValueError("Warmup ratio should be between 0 and 1")
 
+    if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
+        raise ValueError("Min learning rate ratio should be between 0 and 1")
+
+    if max_grad_norm < 0:
+        raise ValueError("Max gradient norm should be non-negative")
+
+    if weight_decay is not None and (weight_decay < 0):
+        raise ValueError("Weight decay should be non-negative")
+
+    lrScheduler = FinetuneLRScheduler(
+        lr_scheduler_type="linear",
+        lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
+    )
+
     finetune_request = FinetuneRequest(
         model=model,
         training_file=training_file,
@@ -91,10 +111,14 @@ def createFinetuneRequest(
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        lr_scheduler=lrScheduler,
         warmup_ratio=warmup_ratio,
+        max_grad_norm=max_grad_norm,
+        weight_decay=weight_decay,
         training_type=training_type,
         suffix=suffix,
         wandb_key=wandb_api_key,
+        train_on_inputs=train_on_inputs,
     )
 
     return finetune_request
@@ -115,7 +139,10 @@ class FineTuning:
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
-
+        min_lr_ratio: float = 0.0,
+        warmup_ratio: float = 0.0,
+        max_grad_norm: float = 1.0,
+        weight_decay: float = 0.0,
         lora: bool = False,
         lora_r: int | None = None,
         lora_dropout: float | None = 0,
@@ -125,6 +152,7 @@ class FineTuning:
         wandb_api_key: str | None = None,
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
+        train_on_inputs: bool | Literal["auto"] = "auto",
     ) -> FinetuneResponse:
         """
         Method to initiate a fine-tuning job
@@ -137,10 +165,14 @@ class FineTuning:
             n_evals (int, optional): Number of evaluation loops to run. Defaults to 0.
             n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
                 Defaults to 1.
-            batch_size (int
+            batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
+                the learning rate scheduler. Defaults to 0.0.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
+            weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
             lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
             lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -154,6 +186,12 @@ class FineTuning:
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
+            train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
+                "auto" will automatically determine whether to mask the inputs based on the data format.
+                For datasets with the "text" field (general format), inputs will not be masked.
+                For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
+                (Instruction format), inputs will be masked.
+                Defaults to "auto".
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
@@ -176,7 +214,10 @@ class FineTuning:
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            min_lr_ratio=min_lr_ratio,
             warmup_ratio=warmup_ratio,
+            max_grad_norm=max_grad_norm,
+            weight_decay=weight_decay,
             lora=lora,
             lora_r=lora_r,
             lora_dropout=lora_dropout,
@@ -184,6 +225,7 @@ class FineTuning:
             lora_trainable_modules=lora_trainable_modules,
             suffix=suffix,
             wandb_api_key=wandb_api_key,
+            train_on_inputs=train_on_inputs,
         )
 
         if verbose:
@@ -426,7 +468,10 @@ class AsyncFineTuning:
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
-
+        min_lr_ratio: float = 0.0,
+        warmup_ratio: float = 0.0,
+        max_grad_norm: float = 1.0,
+        weight_decay: float = 0.0,
         lora: bool = False,
         lora_r: int | None = None,
         lora_dropout: float | None = 0,
@@ -436,6 +481,7 @@ class AsyncFineTuning:
         wandb_api_key: str | None = None,
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
+        train_on_inputs: bool | Literal["auto"] = "auto",
     ) -> FinetuneResponse:
         """
         Async method to initiate a fine-tuning job
@@ -451,7 +497,11 @@ class AsyncFineTuning:
             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
+                the learning rate scheduler. Defaults to 0.0.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
+            weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
             lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
             lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -465,6 +515,12 @@ class AsyncFineTuning:
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
+            train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
+                "auto" will automatically determine whether to mask the inputs based on the data format.
+                For datasets with the "text" field (general format), inputs will not be masked.
+                For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
+                (Instruction format), inputs will be masked.
+                Defaults to "auto".
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
@@ -487,7 +543,10 @@ class AsyncFineTuning:
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            min_lr_ratio=min_lr_ratio,
             warmup_ratio=warmup_ratio,
+            max_grad_norm=max_grad_norm,
+            weight_decay=weight_decay,
             lora=lora,
             lora_r=lora_r,
             lora_dropout=lora_dropout,
@@ -495,6 +554,7 @@ class AsyncFineTuning:
             lora_trainable_modules=lora_trainable_modules,
             suffix=suffix,
             wandb_api_key=wandb_api_key,
+            train_on_inputs=train_on_inputs,
         )
 
         if verbose:
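A hedged usage sketch of the extended create() signature shown above. The model name and training file ID are placeholders, TOGETHER_API_KEY is assumed to be set in the environment, and running it would start a real fine-tuning job.

from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment
job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",  # placeholder ID of an uploaded training file
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model name
    min_lr_ratio=0.1,        # final LR of the linear scheduler = 10% of the peak LR
    max_grad_norm=1.0,       # gradient-clipping threshold; 0 disables clipping
    weight_decay=0.01,
    train_on_inputs="auto",  # mask inputs based on the detected dataset format
)
print(job)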
{together-1.3.3 → together-1.3.5}/src/together/types/__init__.py

@@ -30,6 +30,8 @@ from together.types.finetune import (
     LoRATrainingType,
     TrainingType,
     FinetuneTrainingLimits,
+    FinetuneLRScheduler,
+    FinetuneLinearLRSchedulerArgs,
 )
 from together.types.images import (
     ImageRequest,
@@ -57,6 +59,8 @@ __all__ = [
     "FinetuneList",
     "FinetuneListEvents",
     "FinetuneDownloadResult",
+    "FinetuneLRScheduler",
+    "FinetuneLinearLRSchedulerArgs",
     "FileRequest",
     "FileResponse",
     "FileList",
{together-1.3.3 → together-1.3.5}/src/together/types/finetune.py

@@ -3,7 +3,7 @@ from __future__ import annotations
 from enum import Enum
 from typing import List, Literal
 
-from pydantic import Field, validator, field_validator
+from pydantic import StrictBool, Field, validator, field_validator
 
 from together.types.abstract import BaseModel
 from together.types.common import (
@@ -150,8 +150,14 @@ class FinetuneRequest(BaseModel):
     n_epochs: int
     # training learning rate
     learning_rate: float
+    # learning rate scheduler type and args
+    lr_scheduler: FinetuneLRScheduler | None = None
     # learning rate warmup ratio
     warmup_ratio: float
+    # max gradient norm
+    max_grad_norm: float
+    # weight decay
+    weight_decay: float
     # number of checkpoints to save
     n_checkpoints: int | None = None
     # number of evaluation loops to run
@@ -163,6 +169,7 @@ class FinetuneRequest(BaseModel):
     # weights & biases api key
     wandb_key: str | None = None
     training_type: FullTrainingType | LoRATrainingType | None = None
+    train_on_inputs: StrictBool | Literal["auto"] = "auto"
 
 
 class FinetuneResponse(BaseModel):
@@ -192,8 +199,14 @@ class FinetuneResponse(BaseModel):
     batch_size: int | None = None
     # training learning rate
     learning_rate: float | None = None
+    # learning rate scheduler type and args
+    lr_scheduler: FinetuneLRScheduler | None = None
     # learning rate warmup ratio
     warmup_ratio: float | None = None
+    # max gradient norm
+    max_grad_norm: float | None = None
+    # weight decay
+    weight_decay: float | None = None
     # number of steps between evals
     eval_steps: int | None = None
     # training type
@@ -230,6 +243,7 @@ class FinetuneResponse(BaseModel):
     # training file metadata
     training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines")
     training_file_size: int | None = Field(None, alias="TrainingFileSize")
+    train_on_inputs: StrictBool | Literal["auto"] | None = "auto"
 
     @field_validator("training_type")
     @classmethod
@@ -285,3 +299,12 @@ class FinetuneTrainingLimits(BaseModel):
     min_learning_rate: float
     full_training: FinetuneFullTrainingLimits | None = None
     lora_training: FinetuneLoraTrainingLimits | None = None
+
+
+class FinetuneLRScheduler(BaseModel):
+    lr_scheduler_type: str
+    lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None
+
+
+class FinetuneLinearLRSchedulerArgs(BaseModel):
+    min_lr_ratio: float | None = 0.0
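To illustrate the new scheduler models, a minimal construction sketch; the values are illustrative and serialization assumes the pydantic v2 API already used elsewhere in the package.

from together.types import FinetuneLRScheduler, FinetuneLinearLRSchedulerArgs

scheduler = FinetuneLRScheduler(
    lr_scheduler_type="linear",  # the only scheduler type created by createFinetuneRequest above
    lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=0.1),
)
print(scheduler.model_dump())
# expected: {'lr_scheduler_type': 'linear', 'lr_scheduler_args': {'min_lr_ratio': 0.1}}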
together-1.3.5/src/together/utils/files.py (added in 1.3.5)

@@ -0,0 +1,324 @@
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from traceback import format_exc
+from typing import Any, Dict
+
+from pyarrow import ArrowInvalid, parquet
+
+from together.constants import (
+    MAX_FILE_SIZE_GB,
+    MIN_SAMPLES,
+    NUM_BYTES_IN_GB,
+    PARQUET_EXPECTED_COLUMNS,
+    JSONL_REQUIRED_COLUMNS_MAP,
+    REQUIRED_COLUMNS_MESSAGE,
+    POSSIBLE_ROLES_CONVERSATION,
+    DatasetFormat,
+)
+
+
+class InvalidFileFormatError(ValueError):
+    """Exception raised for invalid file formats during file checks."""
+
+    def __init__(
+        self,
+        message: str = "",
+        line_number: int | None = None,
+        error_source: str | None = None,
+    ) -> None:
+        super().__init__(message)
+        self.message = message
+        self.line_number = line_number
+        self.error_source = error_source
+
+
+def check_file(
+    file: Path | str,
+) -> Dict[str, Any]:
+    if not isinstance(file, Path):
+        file = Path(file)
+
+    report_dict = {
+        "is_check_passed": True,
+        "message": "Checks passed",
+        "found": None,
+        "file_size": None,
+        "utf8": None,
+        "line_type": None,
+        "text_field": None,
+        "key_value": None,
+        "has_min_samples": None,
+        "num_samples": None,
+        "load_json": None,
+    }
+
+    if not file.is_file():
+        report_dict["found"] = False
+        report_dict["is_check_passed"] = False
+        return report_dict
+    else:
+        report_dict["found"] = True
+
+    file_size = os.stat(file.as_posix()).st_size
+
+    if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
+        report_dict["message"] = (
+            f"Maximum supported file size is {MAX_FILE_SIZE_GB} GB. Found file with size of {round(file_size / NUM_BYTES_IN_GB ,3)} GB."
+        )
+        report_dict["is_check_passed"] = False
+    elif file_size == 0:
+        report_dict["message"] = "File is empty"
+        report_dict["file_size"] = 0
+        report_dict["is_check_passed"] = False
+        return report_dict
+    else:
+        report_dict["file_size"] = file_size
+
+    data_report_dict = {}
+    if file.suffix == ".jsonl":
+        report_dict["filetype"] = "jsonl"
+        data_report_dict = _check_jsonl(file)
+    elif file.suffix == ".parquet":
+        report_dict["filetype"] = "parquet"
+        data_report_dict = _check_parquet(file)
+    else:
+        report_dict["filetype"] = (
+            f"Unknown extension of file {file}. "
+            "Only files with extensions .jsonl and .parquet are supported."
+        )
+        report_dict["is_check_passed"] = False
+
+    report_dict.update(data_report_dict)
+
+    return report_dict
+
+
+def _check_jsonl(file: Path) -> Dict[str, Any]:
+    report_dict: Dict[str, Any] = {}
+    # Check that the file is UTF-8 encoded. If not report where the error occurs.
+    try:
+        with file.open(encoding="utf-8") as f:
+            f.read()
+        report_dict["utf8"] = True
+    except UnicodeDecodeError as e:
+        report_dict["utf8"] = False
+        report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
+        report_dict["is_check_passed"] = False
+        return report_dict
+
+    dataset_format = None
+    with file.open() as f:
+        idx = -1
+        try:
+            for idx, line in enumerate(f):
+                json_line = json.loads(line)
+
+                if not isinstance(json_line, dict):
+                    raise InvalidFileFormatError(
+                        message=(
+                            f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
+                            'Example of valid json: {"text": "my sample string"}. '
+                        ),
+                        line_number=idx + 1,
+                        error_source="line_type",
+                    )
+
+                current_format = None
+                for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
+                    if all(
+                        column in json_line
+                        for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
+                    ):
+                        if current_format is None:
+                            current_format = possible_format
+                        elif current_format != possible_format:
+                            raise InvalidFileFormatError(
+                                message="Found multiple dataset formats in the input file. "
+                                f"Got {current_format} and {possible_format} on line {idx + 1}.",
+                                line_number=idx + 1,
+                                error_source="format",
+                            )
+
+                if current_format is None:
+                    raise InvalidFileFormatError(
+                        message=(
+                            f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
+                            f"{json_line.keys()}"
+                        ),
+                        line_number=idx + 1,
+                        error_source="format",
+                    )
+
+                if current_format == DatasetFormat.CONVERSATION:
+                    message_column = JSONL_REQUIRED_COLUMNS_MAP[
+                        DatasetFormat.CONVERSATION
+                    ][0]
+                    if not isinstance(json_line[message_column], list):
+                        raise InvalidFileFormatError(
+                            message=f"Invalid format on line {idx + 1} of the input file. "
+                            f"Expected a list of messages. Found {type(json_line[message_column])}",
+                            line_number=idx + 1,
+                            error_source="key_value",
+                        )
+
+                    for turn_id, turn in enumerate(json_line[message_column]):
+                        if not isinstance(turn, dict):
+                            raise InvalidFileFormatError(
+                                message=f"Invalid format on line {idx + 1} of the input file. "
+                                f"Expected a dictionary in the {turn_id + 1} turn. Found {type(turn)}",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
+                    previous_role = None
+                    for turn in json_line[message_column]:
+                        for column in REQUIRED_COLUMNS_MESSAGE:
+                            if column not in turn:
+                                raise InvalidFileFormatError(
+                                    message=f"Field `{column}` is missing for a turn `{turn}` on line {idx + 1} "
+                                    "of the the input file.",
+                                    line_number=idx + 1,
+                                    error_source="key_value",
+                                )
+                            else:
+                                if not isinstance(turn[column], str):
+                                    raise InvalidFileFormatError(
+                                        message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` "
+                                        f"of the input file. Expected string. Found {type(turn[column])}",
+                                        line_number=idx + 1,
+                                        error_source="text_field",
+                                    )
+                        role = turn["role"]
+
+                        if role not in POSSIBLE_ROLES_CONVERSATION:
+                            raise InvalidFileFormatError(
+                                message=f"Found invalid role `{role}` in the messages on the line {idx + 1}. "
+                                f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
+                        if previous_role == role:
+                            raise InvalidFileFormatError(
+                                message=f"Invalid role turns on line {idx + 1} of the input file. "
+                                "`user` and `assistant` roles must alternate user/assistant/user/assistant/...",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
+                        previous_role = role
+
+                else:
+                    for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
+                        if not isinstance(json_line[column], str):
+                            raise InvalidFileFormatError(
+                                message=f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                f"Expected string. Found {type(json_line[column])}.",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
+                if dataset_format is None:
+                    dataset_format = current_format
+                elif current_format is not None:
+                    if current_format != dataset_format:
+                        raise InvalidFileFormatError(
+                            message="All samples in the dataset must have the same dataset format. "
+                            f"Got {dataset_format} for the first line and {current_format} "
+                            f"for the line {idx + 1}.",
+                            line_number=idx + 1,
+                            error_source="format",
+                        )
+
+            if idx + 1 < MIN_SAMPLES:
+                report_dict["has_min_samples"] = False
+                report_dict["message"] = (
+                    f"Processing {file} resulted in only {idx + 1} samples. "
+                    f"Our minimum is {MIN_SAMPLES} samples. "
+                )
+                report_dict["is_check_passed"] = False
+            else:
+                report_dict["num_samples"] = idx + 1
+                report_dict["has_min_samples"] = True
+                report_dict["is_check_passed"] = True
+
+            report_dict["load_json"] = True
+
+        except InvalidFileFormatError as e:
+            report_dict["load_json"] = False
+            report_dict["is_check_passed"] = False
+            report_dict["message"] = e.message
+            if e.line_number is not None:
+                report_dict["line_number"] = e.line_number
+            if e.error_source is not None:
+                report_dict[e.error_source] = False
+        except ValueError:
+            report_dict["load_json"] = False
+            if idx < 0:
+                report_dict["message"] = (
+                    "Unable to decode file. "
+                    "File may be empty or in an unsupported format. "
+                )
+            else:
+                report_dict["message"] = (
+                    f"Error parsing json payload. Unexpected format on line {idx + 1}."
+                )
+            report_dict["is_check_passed"] = False
+
+    if "text_field" not in report_dict:
+        report_dict["text_field"] = True
+    if "line_type" not in report_dict:
+        report_dict["line_type"] = True
+    if "key_value" not in report_dict:
+        report_dict["key_value"] = True
+    return report_dict
+
+
+def _check_parquet(file: Path) -> Dict[str, Any]:
+    report_dict: Dict[str, Any] = {}
+
+    try:
+        table = parquet.read_table(str(file), memory_map=True)
+    except ArrowInvalid:
+        report_dict["load_parquet"] = (
+            f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
+            f"Exception trace:\n{format_exc()}"
+        )
+        report_dict["is_check_passed"] = False
+        return report_dict
+
+    column_names = table.schema.names
+    if "input_ids" not in column_names:
+        report_dict["load_parquet"] = (
+            f"Parquet file {file} does not contain the `input_ids` column."
+        )
+        report_dict["is_check_passed"] = False
+        return report_dict
+
+    for column_name in column_names:
+        if column_name not in PARQUET_EXPECTED_COLUMNS:
+            report_dict["load_parquet"] = (
+                f"Parquet file {file} contains an unexpected column {column_name}. "
+                f"Only columns {PARQUET_EXPECTED_COLUMNS} are supported."
+            )
+            report_dict["is_check_passed"] = False
+            return report_dict
+
+    num_samples = len(table)
+    if num_samples < MIN_SAMPLES:
+        report_dict["has_min_samples"] = False
+        report_dict["message"] = (
+            f"Processing {file} resulted in only {num_samples} samples. "
+            f"Our minimum is {MIN_SAMPLES} samples. "
+        )
+        report_dict["is_check_passed"] = False
+        return report_dict
+    else:
+        report_dict["num_samples"] = num_samples
+
+    report_dict["is_check_passed"] = True
+
+    return report_dict
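A usage sketch for the new validator; the file name and contents are made up for illustration, and the loop writes more rows than any plausible MIN_SAMPLES (the minimum is defined in together.constants).

import json

from together.utils.files import check_file

# Write a tiny conversational dataset with alternating user/assistant turns.
with open("train.jsonl", "w") as f:
    for i in range(100):
        sample = {
            "messages": [
                {"role": "user", "content": f"Question {i}"},
                {"role": "assistant", "content": f"Answer {i}"},
            ]
        }
        f.write(json.dumps(sample) + "\n")

report = check_file("train.jsonl")
print(report["is_check_passed"], report.get("num_samples"), report["message"])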
together-1.3.3/src/together/utils/files.py (removed; replaced by the new files.py above)

@@ -1,204 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-from pathlib import Path
-from traceback import format_exc
-from typing import Any, Dict
-
-from pyarrow import ArrowInvalid, parquet
-
-from together.constants import (
-    MAX_FILE_SIZE_GB,
-    MIN_SAMPLES,
-    NUM_BYTES_IN_GB,
-    PARQUET_EXPECTED_COLUMNS,
-)
-
-
-def check_file(
-    file: Path | str,
-) -> Dict[str, Any]:
-    if not isinstance(file, Path):
-        file = Path(file)
-
-    report_dict = {
-        "is_check_passed": True,
-        "message": "Checks passed",
-        "found": None,
-        "file_size": None,
-        "utf8": None,
-        "line_type": None,
-        "text_field": None,
-        "key_value": None,
-        "min_samples": None,
-        "num_samples": None,
-        "load_json": None,
-    }
-
-    if not file.is_file():
-        report_dict["found"] = False
-        report_dict["is_check_passed"] = False
-        return report_dict
-    else:
-        report_dict["found"] = True
-
-    file_size = os.stat(file.as_posix()).st_size
-
-    if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
-        report_dict["message"] = (
-            f"Maximum supported file size is {MAX_FILE_SIZE_GB} GB. Found file with size of {round(file_size / NUM_BYTES_IN_GB ,3)} GB."
-        )
-        report_dict["is_check_passed"] = False
-    elif file_size == 0:
-        report_dict["message"] = "File is empty"
-        report_dict["file_size"] = 0
-        report_dict["is_check_passed"] = False
-        return report_dict
-    else:
-        report_dict["file_size"] = file_size
-
-    if file.suffix == ".jsonl":
-        report_dict["filetype"] = "jsonl"
-        data_report_dict = _check_jsonl(file)
-    elif file.suffix == ".parquet":
-        report_dict["filetype"] = "parquet"
-        data_report_dict = _check_parquet(file)
-    else:
-        report_dict["filetype"] = (
-            f"Unknown extension of file {file}. "
-            "Only files with extensions .jsonl and .parquet are supported."
-        )
-        report_dict["is_check_passed"] = False
-
-    report_dict.update(data_report_dict)
-    return report_dict
-
-
-def _check_jsonl(file: Path) -> Dict[str, Any]:
-    report_dict: Dict[str, Any] = {}
-    # Check that the file is UTF-8 encoded. If not report where the error occurs.
-    try:
-        with file.open(encoding="utf-8") as f:
-            f.read()
-        report_dict["utf8"] = True
-    except UnicodeDecodeError as e:
-        report_dict["utf8"] = False
-        report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
-        report_dict["is_check_passed"] = False
-        return report_dict
-
-    with file.open() as f:
-        # idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught
-        idx = -1
-        try:
-            for idx, line in enumerate(f):
-                json_line = json.loads(line)  # each line in jsonlines should be a json
-
-                if not isinstance(json_line, dict):
-                    report_dict["line_type"] = False
-                    report_dict["message"] = (
-                        f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
-                        'Example of valid json: {"text": "my sample string"}. '
-                    )
-
-                    report_dict["is_check_passed"] = False
-
-                if "text" not in json_line.keys():
-                    report_dict["text_field"] = False
-                    report_dict["message"] = (
-                        f"Missing 'text' field was found on line {idx + 1} of the the input file. "
-                        "Expected format: {'text': 'my sample string'}. "
-                    )
-                    report_dict["is_check_passed"] = False
-                else:
-                    # check to make sure the value of the "text" key is a string
-                    if not isinstance(json_line["text"], str):
-                        report_dict["key_value"] = False
-                        report_dict["message"] = (
-                            f'Invalid value type for "text" key on line {idx + 1}. '
-                            f'Expected string. Found {type(json_line["text"])}.'
-                        )
-
-                        report_dict["is_check_passed"] = False
-
-            # make sure this is outside the for idx, line in enumerate(f): for loop
-            if idx + 1 < MIN_SAMPLES:
-                report_dict["min_samples"] = False
-                report_dict["message"] = (
-                    f"Processing {file} resulted in only {idx + 1} samples. "
-                    f"Our minimum is {MIN_SAMPLES} samples. "
-                )
-                report_dict["is_check_passed"] = False
-            else:
-                report_dict["num_samples"] = idx + 1
-                report_dict["min_samples"] = True
-
-            report_dict["load_json"] = True
-
-        except ValueError:
-            report_dict["load_json"] = False
-            if idx < 0:
-                report_dict["message"] = (
-                    "Unable to decode file. "
-                    "File may be empty or in an unsupported format. "
-                )
-            else:
-                report_dict["message"] = (
-                    f"Error parsing json payload. Unexpected format on line {idx + 1}."
-                )
-            report_dict["is_check_passed"] = False
-
-    if "text_field" not in report_dict:
-        report_dict["text_field"] = True
-    if "line_type" not in report_dict:
-        report_dict["line_type"] = True
-    if "key_value" not in report_dict:
-        report_dict["key_value"] = True
-    return report_dict
-
-
-def _check_parquet(file: Path) -> Dict[str, Any]:
-    report_dict: Dict[str, Any] = {}
-
-    try:
-        table = parquet.read_table(str(file), memory_map=True)
-    except ArrowInvalid:
-        report_dict["load_parquet"] = (
-            f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
-            f"Exception trace:\n{format_exc()}"
-        )
-        report_dict["is_check_passed"] = False
-        return report_dict
-
-    column_names = table.schema.names
-    if "input_ids" not in column_names:
-        report_dict["load_parquet"] = (
-            f"Parquet file {file} does not contain the `input_ids` column."
-        )
-        report_dict["is_check_passed"] = False
-        return report_dict
-
-    for column_name in column_names:
-        if column_name not in PARQUET_EXPECTED_COLUMNS:
-            report_dict["load_parquet"] = (
-                f"Parquet file {file} contains an unexpected column {column_name}. "
-                f"Only columns {PARQUET_EXPECTED_COLUMNS} are supported."
-            )
-            report_dict["is_check_passed"] = False
-            return report_dict
-
-    num_samples = len(table)
-    if num_samples < MIN_SAMPLES:
-        report_dict["min_samples"] = (
-            f"Processing {file} resulted in only {num_samples} samples. "
-            f"Our minimum is {MIN_SAMPLES} samples. "
-        )
-        report_dict["is_check_passed"] = False
-        return report_dict
-    else:
-        report_dict["num_samples"] = num_samples
-
-    report_dict["is_check_passed"] = True
-
-    return report_dict