dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
+++ b/mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py
@@ -0,0 +1,175 @@
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+ import requests
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import get_logger
+
+ from .vocabulary import CLASS_NAMES
+
+ logger = get_logger(__name__)
+
+
+ def download_file(url, local_filename):
+     with requests.get(url, stream=True) as r:
+         r.raise_for_status()
+         with open(local_filename, "wb") as f:
+             for chunk in r.iter_content(chunk_size=8192):
+                 if chunk:  # filter out keep-alive new chunks
+                     f.write(chunk)
+     return local_filename
+
+
+ def _create_many_hot_labels(df: pd.DataFrame) -> pd.DataFrame:
+     """Converts a DataFrame with comma-separated labels into a many-hot encoded DataFrame."""
+     new_rows = []
+     for _, row in df.iterrows():
+         fname = row["fname"]
+         labels = row["labels"].split(",")
+         labels_one_hot = [1 if class_name in labels else 0 for class_name in CLASS_NAMES]
+         new_rows.append([fname] + labels_one_hot)
+
+     new_df = pd.DataFrame(new_rows, columns=["fname"] + CLASS_NAMES)
+     return new_df
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Straightforward: we have access to the post-competition released test labels, so we don't need
+     to create our own split here. We just copy over the raw data provided by the competition and
+     download the test labels.
+
+     Otherwise, the only work here is to convert the test data into the right format for grading:
+     The original form of `test.csv` is a DataFrame with N rows and 2 columns. The first column is
+     "fname" and the second column is the labels as comma-separated strings (class names). We
+     convert the test labels into a binary many-hot matrix matching the shape of the submission,
+     [N rows, M + 1 columns]: The first column is "fname" and the remaining M columns are the
+     predictions for each class.
+
+     This script also creates a new validation set by splitting the original train_curated set,
+     saving the results in `public_val` and `private_val` directories.
+     """
+     # =================================================================
+     # Original Data Preparation (public/ and private/)
+     # =================================================================
+
+     # Copy over everything in the raw directory
+     logger.info("Copying raw data to public directory")
+     # Don't copy the metadata file if it exists
+     items_to_copy = [item for item in raw.iterdir() if "FSDKaggle2019.meta" not in item.name]
+     for item in items_to_copy:
+         dest = public / item.name
+         if dest.exists():
+             continue
+         if item.is_dir():
+             shutil.copytree(item, dest)
+         else:
+             shutil.copy(item, dest)
+     assert len(list(public.iterdir())) >= len(
+         items_to_copy
+     ), "Expected all files in raw to be copied to public"
+
+     # Download the test labels and metadata that were released after the competition
+     test_url = "https://zenodo.org/records/3612637/files/FSDKaggle2019.meta.zip?download=1"
+     dest_path = raw / "FSDKaggle2019.meta.zip"
+     if not dest_path.exists():
+         download_file(test_url, dest_path)
+         logger.info(f"Downloaded file saved as {dest_path}")
+         # Unzip
+         shutil.unpack_archive(dest_path, raw)
+         logger.info(f"Unzipped file to {raw / 'FSDKaggle2019.meta'}")
+
+     unzipped_path = raw / "FSDKaggle2019.meta"
+
+     # Read test labels
+     test_post_competition = pd.read_csv(unzipped_path / "test_post_competition.csv")
+     private_test_df = test_post_competition[test_post_competition["usage"] == "Private"]
+     # Create a binary many-hot matrix
+     new_test = _create_many_hot_labels(private_test_df)
+     new_test.to_csv(private / "test.csv", index=False)
+
+     # Check that test and submission match
+     submission = pd.read_csv(public / "sample_submission.csv")
+     assert len(submission) == len(
+         new_test
+     ), f"Expected {len(new_test)} rows in test.csv, but got {len(submission)}"
+     assert (
+         submission.columns[1:].tolist() == CLASS_NAMES
+     ), "Expected class names to match between test.csv and sample_submission.csv"
+     assert all(
+         submission.columns == new_test.columns
+     ), "Expected columns to match between test.csv and sample_submission.csv"
+     new_test.sort_values("fname", inplace=True)
+     submission.sort_values("fname", inplace=True)
+     assert (
+         submission["fname"].tolist() == new_test["fname"].tolist()
+     ), "Expected 'fname' to match between test.csv and sample_submission.csv"
+
+     # Remove the downloaded metadata
+     if dest_path.exists():
+         dest_path.unlink()
+     if unzipped_path.exists():
+         shutil.rmtree(unzipped_path)
+
+     # =================================================================
+     # New Validation Set Creation (public_val/ and private_val/)
+     # =================================================================
+     logger.info("Creating new validation set from train_curated.csv")
+
+     # Define paths and create parallel directories for the validation set
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+     public_val.mkdir(exist_ok=True)
+     private_val.mkdir(exist_ok=True)
+
+     # Load original curated training data and final test set (to match size)
+     original_train_df = pd.read_csv(public / "train_curated.csv")
+     original_test_df = pd.read_csv(private / "test.csv")
+
+     # Split the curated training data to create a new train/validation set.
+     # The new validation set ('test_val') will have the same size as the original test set.
+     train_val_df, test_val_df = train_test_split(
+         original_train_df, test_size=len(original_test_df), random_state=42, shuffle=True
+     )
+
+     # --- Populate public_val directory ---
+     logger.info(f"Populating {public_val} with new training and validation data")
+
+     # Save the new, smaller curated training set manifest
+     train_val_df.to_csv(public_val / "train_curated.csv", index=False)
+
+     # To mirror the directory structure, copy the noisy data as-is
+     shutil.copy(public / "train_noisy.csv", public_val / "train_noisy.csv")
+     if (public / "train_noisy").exists():
+         if (public_val / "train_noisy").exists():
+             shutil.rmtree(public_val / "train_noisy")
+         shutil.copytree(public / "train_noisy", public_val / "train_noisy")
+
+     # Create directories for the new audio file splits
+     (public_val / "train_curated").mkdir(exist_ok=True)
+     (public_val / "test").mkdir(exist_ok=True)
+
+     # Copy audio files for the new, smaller training set
+     for fname in train_val_df["fname"]:
+         shutil.copy(public / "train_curated" / fname, public_val / "train_curated" / fname)
+
+     # Copy audio files for the new validation set into its 'test' directory
+     for fname in test_val_df["fname"]:
+         shutil.copy(public / "train_curated" / fname, public_val / "test" / fname)
+
+     # Create a new sample submission file corresponding to the validation set
+     val_submission = pd.DataFrame({"fname": test_val_df["fname"].sort_values()})
+     for col in CLASS_NAMES:
+         val_submission[col] = 0
+     val_submission.to_csv(public_val / "sample_submission.csv", index=False)
+
+     # --- Populate private_val directory ---
+     logger.info(f"Populating {private_val} with new validation labels")
+
+     # Create and save the ground truth labels for the new validation set
+     private_test_val_df = _create_many_hot_labels(test_val_df)
+     private_test_val_df.to_csv(private_val / "test.csv", index=False)
+
+     logger.info("Validation set creation complete.")
+++ b/mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py
@@ -0,0 +1,83 @@
+ CLASS_NAMES = [
+     "Accelerating_and_revving_and_vroom",
+     "Accordion",
+     "Acoustic_guitar",
+     "Applause",
+     "Bark",
+     "Bass_drum",
+     "Bass_guitar",
+     "Bathtub_(filling_or_washing)",
+     "Bicycle_bell",
+     "Burping_and_eructation",
+     "Bus",
+     "Buzz",
+     "Car_passing_by",
+     "Cheering",
+     "Chewing_and_mastication",
+     "Child_speech_and_kid_speaking",
+     "Chink_and_clink",
+     "Chirp_and_tweet",
+     "Church_bell",
+     "Clapping",
+     "Computer_keyboard",
+     "Crackle",
+     "Cricket",
+     "Crowd",
+     "Cupboard_open_or_close",
+     "Cutlery_and_silverware",
+     "Dishes_and_pots_and_pans",
+     "Drawer_open_or_close",
+     "Drip",
+     "Electric_guitar",
+     "Fart",
+     "Female_singing",
+     "Female_speech_and_woman_speaking",
+     "Fill_(with_liquid)",
+     "Finger_snapping",
+     "Frying_(food)",
+     "Gasp",
+     "Glockenspiel",
+     "Gong",
+     "Gurgling",
+     "Harmonica",
+     "Hi-hat",
+     "Hiss",
+     "Keys_jangling",
+     "Knock",
+     "Male_singing",
+     "Male_speech_and_man_speaking",
+     "Marimba_and_xylophone",
+     "Mechanical_fan",
+     "Meow",
+     "Microwave_oven",
+     "Motorcycle",
+     "Printer",
+     "Purr",
+     "Race_car_and_auto_racing",
+     "Raindrop",
+     "Run",
+     "Scissors",
+     "Screaming",
+     "Shatter",
+     "Sigh",
+     "Sink_(filling_or_washing)",
+     "Skateboard",
+     "Slam",
+     "Sneeze",
+     "Squeak",
+     "Stream",
+     "Strum",
+     "Tap",
+     "Tick-tock",
+     "Toilet_flush",
+     "Traffic_noise_and_roadway_noise",
+     "Trickle_and_dribble",
+     "Walk_and_footsteps",
+     "Water_tap_and_faucet",
+     "Waves_and_surf",
+     "Whispering",
+     "Writing",
+     "Yell",
+     "Zipper_(clothing)",
+ ]
+ assert len(CLASS_NAMES) == 80, f"Expected 80 class names, but got {len(CLASS_NAMES)}"
+++ b/mlebench/competitions/google-quest-challenge/classes.py
@@ -0,0 +1,32 @@
+ CLASSES = [
+     "question_asker_intent_understanding",
+     "question_body_critical",
+     "question_conversational",
+     "question_expect_short_answer",
+     "question_fact_seeking",
+     "question_has_commonly_accepted_answer",
+     "question_interestingness_others",
+     "question_interestingness_self",
+     "question_multi_intent",
+     "question_not_really_a_question",
+     "question_opinion_seeking",
+     "question_type_choice",
+     "question_type_compare",
+     "question_type_consequence",
+     "question_type_definition",
+     "question_type_entity",
+     "question_type_instructions",
+     "question_type_procedure",
+     "question_type_reason_explanation",
+     "question_type_spelling",
+     "question_well_written",
+     "answer_helpful",
+     "answer_level_of_information",
+     "answer_plausible",
+     "answer_relevance",
+     "answer_satisfaction",
+     "answer_type_instructions",
+     "answer_type_procedure",
+     "answer_type_reason_explanation",
+     "answer_well_written",
+ ]
+++ b/mlebench/competitions/google-quest-challenge/grade.py
@@ -0,0 +1,45 @@
+ import numpy as np
+ import pandas as pd
+ from scipy.stats import spearmanr
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+ from .classes import CLASSES
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     id_col = "qa_id"
+     classes_set = set(CLASSES)
+
+     # submission checks
+     if id_col not in submission.columns:
+         raise InvalidSubmissionError(f"Submission is missing `{id_col}` column")
+     sub_cols_set = set(submission.columns)
+     if not classes_set.issubset(sub_cols_set):
+         raise InvalidSubmissionError(
+             f"Submission is missing the following columns: {classes_set - sub_cols_set}"
+         )
+     if len(submission.columns) != len(CLASSES) + 1:
+         raise InvalidSubmissionError(
+             f"Submission has {len(submission.columns) - 1} columns, expected {len(CLASSES)}"
+         )
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError("Submission and answers have different lengths")
+
+     # answers checks
+     assert id_col in answers.columns, f"Answers is missing `{id_col}` column"
+     ans_cols_set = set(answers.columns)
+     assert classes_set.issubset(
+         ans_cols_set
+     ), f"Answers is missing the following columns: {classes_set - ans_cols_set}"
+     assert (
+         len(answers.columns) == len(CLASSES) + 1
+     ), f"Answers has {len(answers.columns)} columns, expected {len(CLASSES) + 1}"
+
+     # sort by id to ensure correct order
+     submission = submission.sort_values(id_col)
+     answers = answers.sort_values(id_col)
+
+     spearmans = [spearmanr(submission[col], answers[col]).correlation for col in CLASSES]
+     score = np.mean(spearmans)
+     return score
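
Note: the score above is the unweighted mean of per-column Spearman correlations. A minimal illustration with two hypothetical target columns (Spearman depends only on ranks, so differently scaled but identically ordered columns each score 1.0):

    import numpy as np
    import pandas as pd
    from scipy.stats import spearmanr

    sub = pd.DataFrame({"qa_id": [1, 2, 3], "a": [0.1, 0.5, 0.9], "b": [0.9, 0.1, 0.5]})
    ans = pd.DataFrame({"qa_id": [1, 2, 3], "a": [0.0, 1.0, 2.0], "b": [2.0, 0.0, 1.0]})

    # Mean of per-column rank correlations, as in grade() above
    score = np.mean([spearmanr(sub[c], ans[c]).correlation for c in ["a", "b"]])
    print(score)  # 1.0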
+++ b/mlebench/competitions/google-quest-challenge/prepare.py
@@ -0,0 +1,58 @@
+ from pathlib import Path
+
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import read_csv
+
+ from .classes import CLASSES
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     # Create train and test splits from train set
+     old_train = read_csv(raw / "train.csv")
+     new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+
+     # question_type_spelling is almost always 0; if entirely 0 in test set, swap one row
+     if new_test["question_type_spelling"].nunique() == 1:
+         # need to do this swapping because spearmanr needs variation in the data to work
+         suitable_train_row_index = new_train[new_train["question_type_spelling"] != 0].index[0]
+         suitable_test_row_index = 0
+         temp = new_test.iloc[suitable_test_row_index].copy()
+         new_test.iloc[suitable_test_row_index] = new_train.loc[suitable_train_row_index].copy()
+         new_train.loc[suitable_train_row_index] = temp
+
+     new_test_without_labels = new_test.drop(CLASSES, axis=1, inplace=False)
+
+     # Create sample submission; private test will match this format
+     cols_to_keep = ["qa_id"] + CLASSES
+     new_test = new_test[cols_to_keep]
+     sample_submission = new_test.copy()
+     # spearmanr needs variation in the data to work; make each column increasing from 0 to 1
+     n, M = len(sample_submission), len(CLASSES)
+     sample_submission[CLASSES] = np.tile(np.linspace(0, 1, n)[:, None], (1, M))
+
+     # Create private files
+     new_test.to_csv(private / "test.csv", index=False)
+
+     # Create public files visible to agents
+     new_train.to_csv(public / "train.csv", index=False)
+     new_test_without_labels.to_csv(public / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     # Checks
+     assert new_test_without_labels.shape[1] == 11, "Public test set should have 11 columns"
+     assert new_train.shape[1] == 41, "Public train set should have 41 columns"
+     # each private test set target column should not be constant
+     for column in CLASSES:
+         assert (
+             new_test[column].nunique() > 1
+         ), f"Column {column} should not be constant in the private test set"
+     assert len(new_train) + len(new_test) == len(
+         old_train
+     ), "Length of new_train and new_test should equal length of old_train"
+     assert (
+         sample_submission.columns.to_list() == new_test.columns.to_list()
+     ), "Sample submission columns should match test set"
+     assert len(sample_submission) == len(new_test), "Sample submission length should match test set"
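
Note: the row swap in `prepare` guards against a concrete failure mode: `spearmanr` is undefined on a constant input, and a single NaN column would poison the mean over per-column scores in `grade`. A quick check:

    from scipy.stats import spearmanr

    # A constant column has no rank variation, so the correlation is NaN
    print(spearmanr([0, 0, 0], [0.1, 0.2, 0.3]).correlation)  # nan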
+++ b/mlebench/competitions/google-quest-challenge/prepare_val.py
@@ -0,0 +1,120 @@
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import read_csv
+
+ from .classes import CLASSES
+
+
+ def _create_split_and_write_files(
+     source_df: pd.DataFrame,
+     public_dir: Path,
+     private_dir: Path,
+     test_size: float,
+     random_state: int,
+ ):
+     """
+     Splits a dataframe into train and test sets, and writes them to public and private directories.
+
+     This function encapsulates the core data preparation logic:
+     1. Splits the data.
+     2. Handles a special case for 'question_type_spelling' to ensure variability.
+     3. Creates public-facing test set (without labels) and a sample submission.
+     4. Creates private-facing test set (with labels).
+     5. Writes all files to the specified directories.
+     6. Performs assertions to validate the output.
+     """
+     # Create output directories if they don't exist
+     public_dir.mkdir(parents=True, exist_ok=True)
+     private_dir.mkdir(parents=True, exist_ok=True)
+
+     # Create train and test splits from the source dataframe
+     train_df, test_df = train_test_split(
+         source_df, test_size=test_size, random_state=random_state
+     )
+
+     # question_type_spelling is almost always 0; if entirely 0 in test set, swap one row
+     if test_df["question_type_spelling"].nunique() == 1:
+         # need to do this swapping because spearmanr needs variation in the data to work
+         suitable_train_row_index = train_df[train_df["question_type_spelling"] != 0].index[0]
+         suitable_test_row_index = test_df.index[0]
+         temp = test_df.loc[suitable_test_row_index].copy()
+         test_df.loc[suitable_test_row_index] = train_df.loc[suitable_train_row_index].copy()
+         train_df.loc[suitable_train_row_index] = temp
+
+     test_df_without_labels = test_df.drop(CLASSES, axis=1, inplace=False)
+
+     # Create sample submission; private test will match this format
+     cols_to_keep = ["qa_id"] + CLASSES
+     test_labels = test_df[cols_to_keep]
+     sample_submission = test_labels.copy()
+     # spearmanr needs variation in the data to work; make each column increasing from 0 to 1
+     n, M = len(sample_submission), len(CLASSES)
+     sample_submission[CLASSES] = np.tile(np.linspace(0, 1, n)[:, None], (1, M))
+
+     # Create private files
+     test_labels.to_csv(private_dir / "test.csv", index=False)
+
+     # Create public files visible to agents
+     train_df.to_csv(public_dir / "train.csv", index=False)
+     test_df_without_labels.to_csv(public_dir / "test.csv", index=False)
+     sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+
+     # Checks
+     assert test_df_without_labels.shape[1] == 11, "Public test set should have 11 columns"
+     assert train_df.shape[1] == 41, "Public train set should have 41 columns"
+     # each private test set target column should not be constant
+     for column in CLASSES:
+         assert (
+             test_labels[column].nunique() > 1
+         ), f"Column {column} should not be constant in the private test set"
+     assert len(train_df) + len(test_df) == len(
+         source_df
+     ), "Length of train_df and test_df should equal length of source_df"
+     assert (
+         sample_submission.columns.to_list() == test_labels.columns.to_list()
+     ), "Sample submission columns should match test set"
+     assert len(sample_submission) == len(test_labels), "Sample submission length should match test set"
+
+     return train_df, test_df
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     # Load the raw data from the competition
+     source_data = read_csv(raw / "train.csv")
+
+     # ---- 1. Create the Original Main Split (train/test) ----
+     # This first call generates the primary train and test sets.
+     # The output files are saved to the `public` and `private` directories,
+     # remaining identical to the original script's output.
+     main_train, main_test = _create_split_and_write_files(
+         source_df=source_data,
+         public_dir=public,
+         private_dir=private,
+         test_size=0.1,
+         random_state=0,
+     )
+
+     # ---- 2. Create the New Validation Split (train_val/test_val) ----
+     # This second call takes the `main_train` set from the first split and
+     # splits it again to create a new, smaller training set and a validation set.
+     # The outputs are saved to new, parallel `public_val` and `private_val` dirs.
+     public_val_dir = public.parent / "public_val"
+     private_val_dir = private.parent / "private_val"
+
+     # To make the new validation set (`test_val`) have the same number of samples
+     # as the original test set (`main_test`), we calculate the required `test_size`
+     # relative to the size of the `main_train` dataframe.
+     validation_test_size = len(main_test) / len(main_train)
+
+     _create_split_and_write_files(
+         source_df=main_train,
+         public_dir=public_val_dir,
+         private_dir=private_val_dir,
+         test_size=validation_test_size,
+         random_state=0,  # Use the same random state for consistency
+     )
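
Note: the fractional `test_size` in the second split relies on scikit-learn rounding the test fold size up with `ceil(test_size * n_samples)`, so `test_val` comes out the same size as `main_test`. A sanity check of the arithmetic (the row counts here are made up for illustration):

    import math

    n_train, n_test = 5481, 609            # hypothetical sizes after the first 90/10 split
    test_size = n_test / n_train           # fraction passed to train_test_split
    print(math.ceil(test_size * n_train))  # 609, modulo float rounding at the boundary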
+++ b/mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py
@@ -0,0 +1,77 @@
+ import numpy as np
+ from pandas import DataFrame
+
+ from mlebench.competitions.utils import rle_decode
+ from mlebench.grade_helpers import InvalidSubmissionError
+ from mlebench.metrics import dice_coefficient
+
+
+ def grade(submission: DataFrame, answers: DataFrame) -> float:
+     expected_answer_columns = ["record_id", "encoded_pixels", "height", "width"]
+     expected_submission_columns = ["record_id", "encoded_pixels"]
+
+     assert set(answers.columns) == set(
+         expected_answer_columns
+     ), f"Expected answers to have columns {expected_answer_columns}, but got {answers.columns} instead!"
+
+     if not set(expected_submission_columns).issubset(set(submission.columns)):
+         raise InvalidSubmissionError(
+             f"Expected submission to have columns {expected_submission_columns}, but got {submission.columns} instead!"
+         )
+
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             f"Expected submission to have {len(answers)} rows, but got {len(submission)} instead!"
+         )
+
+     submission = submission.sort_values(by="record_id")
+     answers = answers.sort_values(by="record_id")
+
+     if (submission["record_id"].values != answers["record_id"].values).any():
+         raise InvalidSubmissionError(
+             "Expected submission to have the same record_id's as answers, but got a different set of record_id's!"
+         )
+
+     # Replace "-" (the empty-mask marker) in "encoded_pixels" with ""
+     submission["encoded_pixels"] = submission["encoded_pixels"].replace("-", "")
+     answers["encoded_pixels"] = answers["encoded_pixels"].replace("-", "")
+
+     y_preds_rle = submission["encoded_pixels"].values
+     y_trues_rle, y_true_height, y_true_width = (
+         answers["encoded_pixels"].values,
+         answers["height"].values,
+         answers["width"].values,
+     )
+
+     # Decode the submission's RLE masks; a malformed RLE here is a submission error
+     try:
+         y_preds = np.array(
+             [
+                 rle_decode(rle, height=int(height), width=int(width))
+                 for (rle, height, width) in zip(y_preds_rle, y_true_height, y_true_width)
+             ]
+         )
+     except AssertionError as e:
+         raise InvalidSubmissionError(f"Error decoding RLE masks: {e}")
+
+     # Decode the ground-truth RLE masks
+     y_trues = np.array(
+         [
+             rle_decode(rle, height=int(height), width=int(width))
+             for (rle, height, width) in zip(y_trues_rle, y_true_height, y_true_width)
+         ]
+     )
+
+     # Flatten from [3D array (list of 2D masks)] into [1D array], then compute a global Dice coefficient.
+     # This competition is evaluated on the **global** Dice coefficient, which measures the pixel-wise
+     # agreement between a predicted segmentation and its ground truth:
+     #     2 * |X ∩ Y| / (|X| + |Y|)
+     # where X is the entire set of predicted contrail pixels for **all** observations in the test data
+     # and Y is the ground truth set of **all** contrail pixels in the test data.
+     y_preds_flat = y_preds.ravel()
+     y_trues_flat = y_trues.ravel()
+     score = dice_coefficient(y_trues_flat, y_preds_flat)
+
+     return score
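
Note: `dice_coefficient` is imported from `mlebench.metrics`, which is not shown in this diff. A minimal sketch of the global Dice it presumably computes over the flattened binary masks, per the formula quoted in the comment above (the empty-vs-empty convention below is an assumption):

    import numpy as np

    def dice_coefficient(y_true, y_pred):
        # Global Dice: 2|X ∩ Y| / (|X| + |Y|) over flattened binary masks.
        # Sketch only; the real mlebench.metrics implementation is not shown here.
        y_true, y_pred = y_true.astype(bool), y_pred.astype(bool)
        denom = y_true.sum() + y_pred.sum()
        if denom == 0:
            return 1.0  # assumption: two empty masks count as perfect agreement
        return 2.0 * np.logical_and(y_true, y_pred).sum() / denom

    print(dice_coefficient(np.array([1, 1, 0, 0]), np.array([1, 0, 0, 0])))  # 0.666...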