dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,94 @@
1
+ from pathlib import Path
2
+ import pandas as pd
3
+
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ from mlebench.utils import read_csv
7
+
8
+
9
def _split_and_save(
    source_df: pd.DataFrame,
    public_dir: Path,
    private_dir: Path,
    test_size: float,
    random_state: int,
) -> pd.DataFrame:
    """
    Split a dataframe into train and test sets and save them to the specified
    public and private directories, mimicking the competition data format.

    Args:
        source_df: The dataframe to be split.
        public_dir: Directory for public-facing files (train set, unlabeled
            test set, sample submission).
        private_dir: Directory for private-facing files (test-set labels).
        test_size: The proportion of the dataset to allocate to the test split.
        random_state: The seed used by the random number generator.

    Returns:
        The training set dataframe resulting from the split.
    """
    # Create train, test split
    new_train, new_test = train_test_split(
        source_df, test_size=test_size, random_state=random_state
    )
    # The public test set must not reveal the answers.
    new_test_without_labels = new_test.drop(columns=["answer_start", "answer_text"])

    # Make the private test set match the submission format (id, PredictionString).
    # Use rename() rather than assigning .columns so we build a fresh frame
    # instead of rewriting an axis on a slice of new_test.
    new_test_labels = new_test[["id", "answer_text"]].rename(
        columns={"answer_text": "PredictionString"}
    )

    # Ensure output directories exist; parents=True so a brand-new nested
    # destination (e.g. a sibling *_val directory) does not fail.
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)

    # Copy over files
    new_train.to_csv(public_dir / "train.csv", index=False)
    new_test_without_labels.to_csv(public_dir / "test.csv", index=False)
    new_test_labels.to_csv(private_dir / "test.csv", index=False)

    # Create sample submission: same ids as the private test set, dummy predictions.
    sample_submission = new_test_labels.copy()
    sample_submission["PredictionString"] = "dummy text"
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # Explicit check (not `assert`, which is stripped under -O).
    if len(sample_submission) != len(new_test_labels):
        raise AssertionError("Sample submission length does not match test length.")

    return new_train
59
+
60
+
61
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the main public/private competition split, then derive a secondary
    validation split from the resulting training data.
    """
    # Output locations for the extra validation split, created as siblings
    # of the main public/private directories.
    val_public_dir = public.parent / "public_val"
    val_private_dir = private.parent / "private_val"

    # The raw competition training data is the source for every split.
    full_train = read_csv(raw / "train.csv")

    # First split: the main competition files in `public` and `private`.
    # The outputs of this step must remain identical to the original script.
    primary_test_fraction = 0.1
    train_after_split = _split_and_save(
        source_df=full_train,
        public_dir=public,
        private_dir=private,
        test_size=primary_test_fraction,
        random_state=0,
    )

    # Second split: carve a validation test set out of the training data from
    # the first split. Its size approximates the original test set:
    #   (0.1 * total) / (0.9 * total) = 0.1 / 0.9
    secondary_test_fraction = primary_test_fraction / (1.0 - primary_test_fraction)
    _split_and_save(
        source_df=train_after_split,
        public_dir=val_public_dir,
        private_dir=val_private_dir,
        test_size=secondary_test_fraction,
        random_state=0,  # same seed for consistency with the first split
    )
@@ -0,0 +1,60 @@
1
+ import numpy as np
2
+ from pandas import DataFrame
3
+ from sklearn.metrics import mean_absolute_error
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def prepare_for_grading(submission: DataFrame, answers: DataFrame, target_column: str) -> DataFrame:
    """Join the submission's predictions onto the answers, keyed on 'id'.

    The returned frame carries the answers' target as ``{target_column}_true``
    and the submission's target as ``{target_column}_pred``.
    """
    # The answers frame is under our control, so violations are plain asserts.
    assert target_column in answers.columns, f"Target column {target_column} not found in answers DataFrame."
    assert "id" in answers.columns, "ID column not found in answers DataFrame."
    assert "type" in answers.columns, "Type column not found in answers DataFrame."

    # The submission is user input, so violations raise InvalidSubmissionError.
    if target_column not in submission.columns:
        raise InvalidSubmissionError(
            f"Target column {target_column} not found in submission DataFrame."
        )
    if "id" not in submission.columns:
        raise InvalidSubmissionError("ID column not found in submission DataFrame.")
    if not set(submission["id"]) == set(answers["id"]):
        raise InvalidSubmissionError(
            f"Submission is missing the following ids: {set(answers['id']) - set(submission['id'])}"
        )

    # Inner join on id; the overlapping target column picks up the suffixes.
    return answers.merge(submission[["id", target_column]], on="id", suffixes=("_true", "_pred"))
36
+
37
+
38
def log_mae(y_true, y_pred):
    """Natural log of the mean absolute error between truth and predictions."""
    # NOTE: an MAE of exactly 0 yields -inf; the competition metric accepts this.
    return np.log(mean_absolute_error(y_true, y_pred))
41
+
42
+
43
def grade(submission: DataFrame, answers: DataFrame) -> float:
    """
    Submissions are evaluated on the Log of the Mean Absolute Error, calculated for each scalar coupling type, and then averaged across types, so that a 1% decrease in MAE for one type provides the same improvement in score as a 1% decrease for another type. https://www.kaggle.com/c/champs-scalar-coupling/overview
    """
    merged = prepare_for_grading(submission, answers, "scalar_coupling_constant")

    # One log-MAE per scalar coupling type, then an unweighted mean over types.
    per_type_scores = [
        log_mae(group["scalar_coupling_constant_true"], group["scalar_coupling_constant_pred"])
        for _, group in merged.groupby("type")
    ]
    return np.mean(per_type_scores)
@@ -0,0 +1,116 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm import tqdm
7
+
8
+ from mlebench.utils import read_csv
9
+
10
+
11
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw CHAMPS data into public train/test files and private answers.

    The split is grouped by molecule so that no molecule appears in both the
    train and test sets; all supplementary CSVs and per-molecule ``.xyz``
    structure files are filtered down to the training molecules only, so the
    held-out targets cannot be recovered from the extra data.
    """
    # Create train, test from train split.
    # Grouping by molecule keeps every molecule entirely on one side of the split.
    old_train = read_csv(raw / "train.csv")
    grouped_by_molecule = list(old_train.groupby("molecule_name"))
    train_groups, test_groups = train_test_split(grouped_by_molecule, test_size=0.1, random_state=0)
    new_train = pd.concat([group for _, group in train_groups])
    answers = pd.concat([group for _, group in test_groups])
    # The public test set is the answers with the target column removed.
    new_test = answers.drop(columns=["scalar_coupling_constant"])

    # Create sample submission (all-zero predictions, one row per test id)
    sample_submission = new_test[["id"]].copy()
    sample_submission["scalar_coupling_constant"] = 0

    # Molecule structure data in CSV format, restricted to training molecules
    structures = read_csv(raw / "structures.csv")
    structures = structures[structures["molecule_name"].isin(new_train["molecule_name"])]

    # Additional data CSVs — each is filtered to training molecules only
    dipole_moments = read_csv(raw / "dipole_moments.csv")
    dipole_moments = dipole_moments[
        dipole_moments["molecule_name"].isin(new_train["molecule_name"])
    ]

    magnetic_shielding_tensors = read_csv(raw / "magnetic_shielding_tensors.csv")
    magnetic_shielding_tensors = magnetic_shielding_tensors[
        magnetic_shielding_tensors["molecule_name"].isin(new_train["molecule_name"])
    ]

    mulliken_charges = read_csv(raw / "mulliken_charges.csv")
    mulliken_charges = mulliken_charges[
        mulliken_charges["molecule_name"].isin(new_train["molecule_name"])
    ]

    potential_energy = read_csv(raw / "potential_energy.csv")
    potential_energy = potential_energy[
        potential_energy["molecule_name"].isin(new_train["molecule_name"])
    ]

    scalar_coupling_contributions = read_csv(raw / "scalar_coupling_contributions.csv")
    scalar_coupling_contributions = scalar_coupling_contributions[
        scalar_coupling_contributions["molecule_name"].isin(new_train["molecule_name"])
    ]

    # Checks before writing.
    # NOTE: this assumes every supplementary table covers every training
    # molecule; a raw-data gap would fail the equality assert below.
    data_csvs = {
        "structures": structures,
        "dipole_moments": dipole_moments,
        "magnetic_shielding_tensors": magnetic_shielding_tensors,
        "mulliken_charges": mulliken_charges,
        "potential_energy": potential_energy,
        "scalar_coupling_contributions": scalar_coupling_contributions,
    }
    for name, dataset in data_csvs.items():
        assert set(dataset["molecule_name"]) == set(
            new_train["molecule_name"]
        ), f"Filtered {name} should exactly match the molecule names present in the new_train set."

    assert set(new_train["molecule_name"]).isdisjoint(
        set(new_test["molecule_name"])
    ), "Train and test sets should not share any samples with the same molecule name."

    assert set(new_train["id"]).isdisjoint(
        set(new_test["id"])
    ), "Train and test sets should not share any samples with the same id."

    assert len(sample_submission) == len(
        new_test
    ), "Sample submission length does not match test length."

    assert (
        sample_submission.shape[1] == 2
    ), f"Sample submission should have 2 columns, but has {sample_submission.shape[1]}"

    # Expected widths: test lacks the target column that train/answers carry.
    assert new_test.shape[1] == 5, f"new_test should have 5 columns, but has {new_test.shape[1]}"

    assert answers.shape[1] == 6, f"answers should have 6 columns, but has {answers.shape[1]}"

    assert new_train.shape[1] == 6, f"new_train should have 6 columns, but has {new_train.shape[1]}"

    # Copy over molecule structure data individual files (one .xyz per molecule)
    for molecule_name in tqdm(
        new_train["molecule_name"].unique(), desc="Copying molecule structure files"
    ):
        src_file = raw / "structures" / f"{molecule_name}.xyz"
        dst_file = public / "structures" / f"{molecule_name}.xyz"
        dst_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(src=src_file, dst=dst_file)

    # Write CSVs: answers are grading-only; everything else is agent-visible.
    answers.to_csv(private / "answers.csv", index=False)

    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    structures.to_csv(public / "structures.csv", index=False)
    dipole_moments.to_csv(public / "dipole_moments.csv", index=False)
    magnetic_shielding_tensors.to_csv(public / "magnetic_shielding_tensors.csv", index=False)
    mulliken_charges.to_csv(public / "mulliken_charges.csv", index=False)
    potential_energy.to_csv(public / "potential_energy.csv", index=False)
    scalar_coupling_contributions.to_csv(public / "scalar_coupling_contributions.csv", index=False)

    # Checks after writing
    assert len(list((public / "structures").glob("*.xyz"))) == len(
        new_train["molecule_name"].unique()
    ), "The number of files in public/structures should match the number of unique molecule names in the train set."
@@ -0,0 +1,155 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import Dict
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from tqdm import tqdm
8
+
9
+ from mlebench.utils import read_csv
10
+
11
+
12
def _process_and_save_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    answers_df: pd.DataFrame,
    public_path: Path,
    private_path: Path,
    raw_path: Path,
    raw_supplementary_data: Dict[str, pd.DataFrame],
):
    """
    Helper function to process and save a single data split (e.g., train/test or train_val/test_val).

    This function takes a pair of train/test sets, filters the supplementary data accordingly,
    performs checks, and writes all the necessary files to the specified public and private directories.

    Args:
        train_df: Rows forming the training portion of the split.
        test_df: Rows forming the test portion, without the target column.
        answers_df: Test rows including the target column (written privately).
        public_path: Destination directory for agent-visible files.
        private_path: Destination directory for grading-only files.
        raw_path: Raw competition data root (source of the per-molecule .xyz files).
        raw_supplementary_data: Unfiltered supplementary tables keyed by file stem.
    """
    public_path.mkdir(exist_ok=True)
    private_path.mkdir(exist_ok=True)

    # Create sample submission for the current test set (all-zero predictions)
    sample_submission = test_df[["id"]].copy()
    sample_submission["scalar_coupling_constant"] = 0

    # Filter supplementary data to only include molecules present in the training set
    train_molecules = set(train_df["molecule_name"])
    filtered_supplementary_data = {}
    for name, df in raw_supplementary_data.items():
        filtered_supplementary_data[name] = df[df["molecule_name"].isin(train_molecules)]

    # Checks before writing.
    # NOTE: assumes every supplementary table covers every training molecule.
    for name, dataset in filtered_supplementary_data.items():
        assert set(dataset["molecule_name"]) == train_molecules, (
            f"[{public_path.name}] Filtered {name} should exactly match the molecule names "
            "present in the train set."
        )

    assert set(train_df["molecule_name"]).isdisjoint(
        set(test_df["molecule_name"])
    ), f"[{public_path.name}] Train and test sets should not share any molecule names."

    assert set(train_df["id"]).isdisjoint(
        set(test_df["id"])
    ), f"[{public_path.name}] Train and test sets should not share any sample ids."

    assert len(sample_submission) == len(
        test_df
    ), f"[{public_path.name}] Sample submission length does not match test length."

    # Write CSVs to public and private directories; answers are grading-only.
    answers_df.to_csv(private_path / "answers.csv", index=False)

    train_df.to_csv(public_path / "train.csv", index=False)
    test_df.to_csv(public_path / "test.csv", index=False)
    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

    for name, df in filtered_supplementary_data.items():
        df.to_csv(public_path / f"{name}.csv", index=False)

    # Copy over molecule structure .xyz files for the training set
    structures_xyz_path = public_path / "structures"
    structures_xyz_path.mkdir(parents=True, exist_ok=True)
    for molecule_name in tqdm(
        train_df["molecule_name"].unique(),
        desc=f"Copying .xyz files to {public_path.name}",
    ):
        src_file = raw_path / "structures" / f"{molecule_name}.xyz"
        dst_file = structures_xyz_path / f"{molecule_name}.xyz"
        shutil.copyfile(src=src_file, dst=dst_file)

    # Checks after writing
    assert len(list(structures_xyz_path.glob("*.xyz"))) == len(
        train_df["molecule_name"].unique()
    ), (
        f"[{public_path.name}] The number of files in {structures_xyz_path} should match the number "
        "of unique molecule names in the train set."
    )
87
+
88
+
89
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepares the data by performing two splits:
    1. A main split of the raw data into a definitive train/test set.
       Outputs are saved to `public/` and `private/`.
    2. A validation split of the main training set into a smaller train/validation set.
       Outputs are saved to `public_val/` and `private_val/`, mirroring the main output structure.
    """
    # Load all data from raw directory first
    old_train = read_csv(raw / "train.csv")

    # Load all supplementary data into a dictionary for easy filtering later
    raw_supplementary_data = {
        "structures": read_csv(raw / "structures.csv"),
        "dipole_moments": read_csv(raw / "dipole_moments.csv"),
        "magnetic_shielding_tensors": read_csv(raw / "magnetic_shielding_tensors.csv"),
        "mulliken_charges": read_csv(raw / "mulliken_charges.csv"),
        "potential_energy": read_csv(raw / "potential_energy.csv"),
        "scalar_coupling_contributions": read_csv(raw / "scalar_coupling_contributions.csv"),
    }

    # --- Create main Train/Test Split (Original Logic) ---
    # The outputs of this split are final and must not be changed.
    # Grouping by molecule keeps every molecule entirely on one side of the split.
    grouped_by_molecule = list(old_train.groupby("molecule_name"))
    train_groups, test_groups = train_test_split(grouped_by_molecule, test_size=0.1, random_state=0)
    new_train = pd.concat([group for _, group in train_groups])
    answers = pd.concat([group for _, group in test_groups])
    # The public test set is the answers without the target column.
    new_test = answers.drop(columns=["scalar_coupling_constant"])

    # Process and save the main split to `public` and `private` directories
    _process_and_save_split(
        train_df=new_train,
        test_df=new_test,
        answers_df=answers,
        public_path=public,
        private_path=private,
        raw_path=raw,
        raw_supplementary_data=raw_supplementary_data,
    )

    # --- Create Validation Split (New Logic) ---
    # This second split uses the `new_train` set from above to create a smaller
    # training set and a new validation set.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Split the `new_train` set by molecule groups again.
    # The original test set was 10% of the total. The new train set is 90% of the total.
    # To get a validation set of a similar size (10% of total), the test_size for this
    # second split should be (10% / 90%) = 1/9.
    train_val_groups, test_val_groups = train_test_split(
        train_groups, test_size=1 / 9, random_state=0
    )
    train_val = pd.concat([group for _, group in train_val_groups])
    answers_val = pd.concat([group for _, group in test_val_groups])
    test_val = answers_val.drop(columns=["scalar_coupling_constant"])

    # Process and save the validation split to `public_val` and `private_val` directories
    _process_and_save_split(
        train_df=train_val,
        test_df=test_val,
        answers_df=answers_val,
        public_path=public_val,
        private_path=private_val,
        raw_path=raw,
        raw_supplementary_data=raw_supplementary_data,
    )
+ )
@@ -0,0 +1,40 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Return accuracy between predicted and true starting boards."""
    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission is missing required column 'id'.")

    # Every ground-truth cell column carries the start_ prefix.
    start_cols = [c for c in answers.columns if c.startswith("start_")]
    if not start_cols:
        raise ValueError("Answers DataFrame does not contain any start_* columns.")

    absent = [c for c in start_cols if c not in submission.columns]
    if absent:
        raise InvalidSubmissionError(
            f"Submission is missing required start columns: {absent[:5]}"
        )

    # Left-join predictions onto the answers so every answer id must be matched.
    joined = answers[["id", *start_cols]].merge(
        submission[["id", *start_cols]],
        on="id",
        how="left",
        suffixes=("_true", "_pred"),
    )

    pred_cols = [f"{c}_pred" for c in start_cols]
    true_cols = [f"{c}_true" for c in start_cols]
    if joined[pred_cols].isna().any().any():
        raise InvalidSubmissionError("Submission contains missing predictions for some ids.")

    predicted = joined[pred_cols].to_numpy(dtype=float)
    actual = joined[true_cols].to_numpy(dtype=float)

    if not np.isin(predicted, [0, 1]).all():
        raise InvalidSubmissionError("Predictions must be binary (0 or 1).")

    # For binary boards, accuracy is 1 minus the mean absolute error.
    return float(1.0 - np.abs(predicted - actual).mean())
@@ -0,0 +1,41 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+
5
+ from mlebench.utils import read_csv
6
+
7
+
8
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Prepare Reverse Game of Life tables for MLE-Bench."""
    for directory in (public, private):
        directory.mkdir(parents=True, exist_ok=True)

    train = read_csv(raw / "train.csv")
    test = read_csv(raw / "test.csv")
    answers = read_csv(raw / "test_answer.csv")

    # Every ground-truth cell column carries the start_ prefix.
    start_cols = [col for col in answers.columns if col.startswith("start_")]

    # Reuse the competition's sample submission when present; otherwise build
    # an all-zeros one from the answer schema.
    sample_path = raw / "sample_submission.csv"
    if sample_path.exists():
        sample_submission = read_csv(sample_path)
    else:
        sample_submission = answers[["id", *start_cols]].copy()
        sample_submission[start_cols] = 0

    # validate="one_to_one" guards against duplicate ids on either side.
    merged_private = test.merge(answers, on="id", how="left", validate="one_to_one")
    if merged_private[start_cols].isna().any().any():
        missing_ids = merged_private.loc[merged_private[start_cols].isna().any(axis=1), "id"].tolist()
        raise ValueError(f"Missing start cells for ids: {missing_ids[:5]}")

    # Agent-visible files
    train.to_csv(public / "train.csv", index=False)
    test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Grading-only files
    merged_private.to_csv(private / "test.csv", index=False)
    answers.to_csv(private / "gold_submission.csv", index=False)

    assert len(test) == len(answers) == len(merged_private), "Public test and answers row counts differ."
    assert sample_submission.columns.tolist()[0] == "id", "Sample submission must start with 'id'."
    for col in start_cols:
        assert col in sample_submission.columns, f"Sample submission missing column {col}."
@@ -0,0 +1,66 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def smape(y_true, y_pred):
    """Calculate SMAPE (Symmetric Mean Absolute Percentage Error)."""
    abs_err = np.abs(y_true - y_pred)
    scale = np.abs(y_true) + np.abs(y_pred)

    # Convention: a point where both actual and predicted are 0 contributes 0
    # rather than 0/0.
    ratio = np.where(scale == 0, 0, abs_err / scale)

    return 100 * np.mean(ratio)
17
+
18
+
19
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """Validate a submission against the answers and return (y_true, y_pred)."""
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'id' column")
    if "sales" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'sales' column")

    # Align rows by id before comparing.
    submission = submission.sort_values("id").reset_index(drop=True)
    answers = answers.sort_values("id").reset_index(drop=True)
    if not (submission["id"] == answers["id"]).all():
        raise InvalidSubmissionError("Submission and answers id mismatch")

    y_pred = submission["sales"].values
    y_true = answers["sales"].values

    # Predictions must be numeric, finite, and non-negative.
    try:
        y_pred = y_pred.astype(float)
    except (ValueError, TypeError):
        raise InvalidSubmissionError("Predictions must be numeric")

    if np.isnan(y_pred).any():
        raise InvalidSubmissionError("Predictions cannot contain NaN values")
    if (y_pred < 0).any():
        raise InvalidSubmissionError("Predictions cannot be negative")

    return y_true, y_pred
57
+
58
+
59
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Calculate SMAPE score."""
    # Validation and alignment live in the helper; this is just the metric.
    y_true, y_pred = prepare_for_metric(submission, answers)
    return smape(y_true, y_pred)
@@ -0,0 +1,27 @@
1
+ from pathlib import Path
2
+ import pandas as pd
3
+
4
+
5
+ def prepare(raw: Path, public: Path, private: Path):
6
+ """
7
+ Prepare demand-forecasting-kernels-only dataset.
8
+ """
9
+ # Read data
10
+ train = pd.read_csv(raw / "train.csv")
11
+ test = pd.read_csv(raw / "test.csv")
12
+ sample_submission = pd.read_csv(raw / "sample_submission.csv")
13
+ test_answer = pd.read_csv(raw / "test_answer.csv")
14
+
15
+ # Public files (visible to agents)
16
+ train.to_csv(public / "train.csv", index=False)
17
+ test.to_csv(public / "test.csv", index=False)
18
+ sample_submission.to_csv(public / "sample_submission.csv", index=False)
19
+
20
+ # Private files (for grading)
21
+ test_answer.to_csv(private / "test.csv", index=False)
22
+
23
+ # Validation checks
24
+ assert len(test_answer) == len(sample_submission), \
25
+ f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
26
+ assert "id" in test_answer.columns, "Test answer must have 'id' column"
27
+ assert "sales" in test_answer.columns, "Test answer must have 'sales' column"
@@ -0,0 +1,66 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, returned in percent."""
    denom = np.abs(y_true) + np.abs(y_pred)
    # A pair of zeros contributes zero error rather than a 0/0 division.
    terms = np.where(denom == 0, 0, np.abs(y_true - y_pred) / denom)
    return 100 * np.mean(terms)
17
+
18
+
19
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """Check the submission's shape and ids, then return aligned (y_true, y_pred)."""
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'id' column")
    if "sales" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'sales' column")

    # Put both frames in id order so rows line up positionally.
    submission = submission.sort_values("id").reset_index(drop=True)
    answers = answers.sort_values("id").reset_index(drop=True)

    if not (submission["id"] == answers["id"]).all():
        raise InvalidSubmissionError("Submission and answers id mismatch")

    y_pred = submission["sales"].values
    y_true = answers["sales"].values

    # Reject non-numeric, NaN, or negative predictions.
    try:
        y_pred = y_pred.astype(float)
    except (ValueError, TypeError):
        raise InvalidSubmissionError("Predictions must be numeric")

    if np.any(np.isnan(y_pred)):
        raise InvalidSubmissionError("Predictions cannot contain NaN values")

    if np.any(y_pred < 0):
        raise InvalidSubmissionError("Predictions cannot be negative")

    return y_true, y_pred
57
+
58
+
59
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Calculate SMAPE score."""
    aligned_true, aligned_pred = prepare_for_metric(submission, answers)
    # The helper has already validated and aligned the inputs.
    return smape(aligned_true, aligned_pred)