dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,135 @@
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+
+ def _split_and_save(
+     input_features: np.ndarray,
+     input_labels: np.ndarray,
+     test_proportion: float,
+     random_state: int,
+     output_public_path: Path,
+     output_private_path: Path,
+ ) -> tuple[np.ndarray, np.ndarray]:
+     """
+     Helper function to perform a split, save files to specified paths, and return the new training set.
+
+     This function encapsulates the logic for:
+     1. Splitting input data into training and testing sets.
+     2. Creating the necessary directory structure.
+     3. Saving the training data, (unlabeled) test data, and sample submission to the public path.
+     4. Saving the ground-truth test labels to the private path.
+     5. Performing sanity checks on the created files.
+
+     Args:
+         input_features: The feature data to be split.
+         input_labels: The corresponding labels for the feature data.
+         test_proportion: The proportion of the dataset to allocate to the test split.
+         random_state: The seed used by the random number generator for reproducibility.
+         output_public_path: The base directory for public-facing files.
+         output_private_path: The base directory for private/solution files.
+
+     Returns:
+         A tuple containing the features and labels of the newly created training set,
+         which can be used for subsequent splits.
+     """
+     input_idxs = range(len(input_features))
+
+     # Create new splits
+     new_train_idxs, new_test_idxs = train_test_split(
+         input_idxs, test_size=test_proportion, random_state=random_state
+     )
+
+     new_train = input_features[new_train_idxs]
+     new_train_label = input_labels[new_train_idxs]
+     new_test = input_features[new_test_idxs]
+     new_test_label = input_labels[new_test_idxs]
+
+     answers_df = pd.DataFrame({"Id": range(len(new_test)), "ClassId": new_test_label})
+
+     # Create sample submission
+     sample_submission = answers_df.copy()
+     sample_submission["ClassId"] = 0
+
+     # Create directories
+     (output_public_path / "timit_11" / "timit_11").mkdir(parents=True, exist_ok=True)
+     output_private_path.mkdir(parents=True, exist_ok=True)
+
+
+     # Save files
+     np.save(output_public_path / "timit_11" / "timit_11" / "train_11.npy", new_train)
+     np.save(output_public_path / "timit_11" / "timit_11" / "train_label_11.npy", new_train_label)
+     np.save(output_public_path / "timit_11" / "timit_11" / "test_11.npy", new_test)
+     sample_submission.to_csv(output_public_path / "sampleSubmission.csv", index=False)
+     answers_df.to_csv(output_private_path / "answers.csv", index=False)
+
+     # Sanity checks
+     assert (
+         output_public_path / "timit_11" / "timit_11" / "train_11.npy"
+     ).exists(), f"`train_11.npy` doesn't exist in {output_public_path}!"
+     assert (
+         output_public_path / "timit_11" / "timit_11" / "train_label_11.npy"
+     ).exists(), f"`train_label_11.npy` doesn't exist in {output_public_path}!"
+     assert (
+         output_public_path / "timit_11" / "timit_11" / "test_11.npy"
+     ).exists(), f"`test_11.npy` doesn't exist in {output_public_path}!"
+     assert (
+         output_public_path / "sampleSubmission.csv"
+     ).exists(), f"`sampleSubmission.csv` doesn't exist in {output_public_path}!"
+     assert (
+         output_private_path / "answers.csv"
+     ).exists(), f"`answers.csv` doesn't exist in {output_private_path}!"
+
+     assert len(new_train) + len(new_test) == len(
+         input_features
+     ), f"Expected {len(input_features)} samples in combined new train and test splits, got {len(new_train) + len(new_test)}!"
+
+     # Return the new training set for potential further splitting
+     return new_train, new_train_label
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Create a primary train/test split and a secondary train/validation split.
+     The primary split results are saved to `public` and `private` directories.
+     The secondary split results are saved to parallel `public_val` and `private_val` directories.
+     """
+     # Load original data from raw directory
+     old_train = np.load(raw / "timit_11" / "timit_11" / "train_11.npy")
+     old_train_label = np.load(raw / "timit_11" / "timit_11" / "train_label_11.npy")
+
+     # Determine the test set proportion from the original competition data
+     old_test = np.load(raw / "timit_11" / "timit_11" / "test_11.npy")
+     old_test_prop = len(old_test) / (len(old_train) + len(old_test))  # ~= 0.268
+     RANDOM_STATE = 0
+
+     # --- First Split: Create Original public/private datasets ---
+     # This call generates the exact same output as the original script.
+     # It returns the features/labels of its training set, which we will use for the next split.
+     train_for_val_split, train_label_for_val_split = _split_and_save(
+         input_features=old_train,
+         input_labels=old_train_label,
+         test_proportion=old_test_prop,
+         random_state=RANDOM_STATE,
+         output_public_path=public,
+         output_private_path=private,
+     )
+
+     # --- Second Split: Create New public_val/private_val datasets ---
+     # Define the new parallel directories for the validation set
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Call the helper again, this time splitting the data from the *first* split's training set.
+     # This creates the new training set (train_val) and validation set (test_val).
+     # The splitting logic, proportion, and random state are identical to the first split.
+     _split_and_save(
+         input_features=train_for_val_split,
+         input_labels=train_label_for_val_split,
+         test_proportion=old_test_prop,
+         random_state=RANDOM_STATE,
+         output_public_path=public_val,
+         output_private_path=private_val,
+     )
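For orientation, a minimal usage sketch (not part of the diff) of invoking the prepare() defined above; it assumes a raw/ directory that already contains the extracted timit_11 .npy files, and the public/private directory names are hypothetical.

    from pathlib import Path

    # prepare() writes the primary split to public/ and private/, and the
    # secondary train/validation split to sibling public_val/ and private_val/.
    prepare(raw=Path("raw"), public=Path("public"), private=Path("private"))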
@@ -0,0 +1,11 @@
+ import pandas as pd
+ from sklearn.metrics import roc_auc_score
+
+ from mlebench.competitions.utils import prepare_for_auroc_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     roc_auc_inputs = prepare_for_auroc_metric(
+         submission=submission, answers=answers, id_col="Id", target_col="Probability"
+     )
+     return roc_auc_score(y_true=roc_auc_inputs["y_true"], y_score=roc_auc_inputs["y_score"])
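A toy sketch of how this grader could be exercised (not part of the diff); it assumes prepare_for_auroc_metric, whose body is not shown here, aligns the two frames on "Id" and returns the "y_true"/"y_score" arrays consumed above.

    import pandas as pd

    # Hypothetical frames in the answers.csv / sample_submission.csv format,
    # one row per (rec_id, species_id) pair.
    answers = pd.DataFrame({"Id": [0, 1, 2, 3], "Probability": [1, 0, 0, 1]})
    submission = pd.DataFrame({"Id": [0, 1, 2, 3], "Probability": [0.9, 0.2, 0.4, 0.7]})
    print(grade(submission, answers))  # both positives outrank both negatives -> AUROC 1.0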
@@ -0,0 +1,182 @@
+ import os
+ import shutil
+ from pathlib import Path
+ from typing import Dict
+
+ import pandas as pd
+
+ from mlebench.utils import extract, read_csv
+
+
+ def filter_and_write_file(src: Path, dst: Path, old_id_to_new: Dict[int, int]):
+     """
+     Given a txt file whose column 0 is rec_id, filters out rec_ids that are not in old_id_to_new and writes the result to dst.
+     """
+     history_of_segments = open(src).read().splitlines()
+     history_of_segments = history_of_segments[1:]
+     history_of_segments = [
+         (int(i.split(",")[0]), ",".join(i.split(",")[1:])) for i in history_of_segments
+     ]
+     history_of_segments = [
+         (old_id_to_new[i[0]], i[1]) for i in history_of_segments if i[0] in old_id_to_new.keys()
+     ]
+     with open(dst, "w") as f:
+         f.write("rec_id,[histogram of segment features]\n")
+         for rec_id, labels in history_of_segments:
+             f.write(f"{rec_id},{labels}\n")
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     """
+     # extract only what we need
+     extract(raw / "mlsp_contest_dataset.zip", raw)
+
+     (public / "essential_data").mkdir(exist_ok=True)
+     (public / "supplemental_data").mkdir(exist_ok=True)
+
+     # Create train, test from train split
+     cv_folds = read_csv(raw / "mlsp_contest_dataset/essential_data/CVfolds_2.txt")
+     cv_folds = cv_folds[cv_folds["fold"] == 0].reset_index(drop=True)
+     cv_folds.loc[cv_folds.sample(frac=0.2, random_state=0).index, "fold"] = 1
+
+     old_id_to_new = {old_id: new_id for new_id, old_id in enumerate(cv_folds["rec_id"].values)}
+     cv_folds["rec_id"] = cv_folds.index
+     cv_folds.to_csv(public / "essential_data/CVfolds_2.txt", index=False)
+
+     test_rec_ids = cv_folds[cv_folds["fold"] == 1]["rec_id"].values
+     assert len(test_rec_ids) == 64, f"Expected 64 test rec_ids, got {len(test_rec_ids)}"
+
+     # Update id2filename with new split
+     rec_id2filename = read_csv(raw / "mlsp_contest_dataset/essential_data/rec_id2filename.txt")
+     rec_id2filename = rec_id2filename[rec_id2filename["rec_id"].isin(old_id_to_new.keys())]
+     rec_id2filename["rec_id"] = rec_id2filename["rec_id"].map(old_id_to_new)
+     rec_id2filename.to_csv(public / "essential_data/rec_id2filename.txt", index=False)
+     assert len(rec_id2filename) == len(
+         cv_folds
+     ), f"Expected {len(cv_folds)} entries in rec_id2filename, got {len(rec_id2filename)}"
+
+     # Update labels with new split
+     rec_labels = (
+         open(raw / "mlsp_contest_dataset/essential_data/rec_labels_test_hidden.txt")
+         .read()
+         .splitlines()
+     )
+     rec_labels = rec_labels[1:]  # Ignore header line
+     rec_labels_split = []
+     for i in rec_labels:
+         rec_id = i.split(",")[0]
+         labels = ",".join(i.split(",")[1:]) if len(i.split(",")) > 1 else ""
+         rec_labels_split.append((int(rec_id), labels))
+     rec_labels_split = [i for i in rec_labels_split if i[0] in old_id_to_new.keys()]
+     rec_labels_split = [(old_id_to_new[i[0]], i[1]) for i in rec_labels_split]
+
+     # Public labels
+     with open(public / "essential_data/rec_labels_test_hidden.txt", "w") as f:
+         f.write("rec_id,[labels]\n")
+         for rec_id, labels in rec_labels_split:
+             if rec_id in test_rec_ids:
+                 labels = "?"
+             if labels == "":  # Write without comma
+                 f.write(f"{rec_id}{labels}\n")
+             else:
+                 f.write(f"{rec_id},{labels}\n")
+
+     # Private labels. Create csv, with each row containing the label for a (rec_id, species_id) pair
+     data = {"Id": [], "Probability": []}
+     for rec_id, labels in rec_labels_split:
+         if rec_id not in test_rec_ids:
+             continue
+         species_ids = [int(i) for i in labels.split(",") if i != ""]
+         for species_id in range(0, 19):
+             data["Id"].append(rec_id * 100 + species_id)
+             data["Probability"].append(int(species_id in species_ids))
+
+     pd.DataFrame(data).to_csv(private / "answers.csv", index=False)
+     assert (
+         len(pd.DataFrame(data)) == len(test_rec_ids) * 19
+     ), f"Expected {len(test_rec_ids)*19} entries in answers.csv, got {len(pd.DataFrame(data))}"
+
+     # Create new sample submission, following new submission format
+     # http://www.kaggle.com/c/mlsp-2013-birds/forums/t/4961/new-submission-parser
+     data = {
+         "Id": [rec_id * 100 + species_id for rec_id in test_rec_ids for species_id in range(0, 19)],
+         "Probability": 0,
+     }
+     pd.DataFrame(data).to_csv(public / "sample_submission.csv", index=False)
+     assert (
+         len(pd.DataFrame(data)) == len(test_rec_ids) * 19
+     ), f"Expected {len(test_rec_ids)*19} entries in sample_submission.csv, got {len(pd.DataFrame(data))}"
+
+     # Copy over species list
+     shutil.copyfile(
+         src=raw / "mlsp_contest_dataset/essential_data/species_list.txt",
+         dst=public / "essential_data/species_list.txt",
+     )
+
+     # Copy over all src waves from train+test set
+     (public / "essential_data/src_wavs").mkdir(exist_ok=True)
+     for filename in rec_id2filename["filename"]:
+         shutil.copyfile(
+             src=raw / "mlsp_contest_dataset/essential_data/src_wavs" / f"{filename}.wav",
+             dst=public / "essential_data/src_wavs" / f"{filename}.wav",
+         )
+
+     # Copy over train+test filtered spectrograms, segmentation examples, spectrograms, and supervised segmentation
+     (public / "supplemental_data/filtered_spectrograms").mkdir(exist_ok=True)
+     (public / "supplemental_data/segmentation_examples").mkdir(exist_ok=True)
+     (public / "supplemental_data/spectrograms").mkdir(exist_ok=True)
+     (public / "supplemental_data/supervised_segmentation").mkdir(exist_ok=True)
+     for filename in rec_id2filename["filename"]:
+         shutil.copyfile(
+             src=raw
+             / "mlsp_contest_dataset/supplemental_data/filtered_spectrograms"
+             / f"{filename}.bmp",
+             dst=public / "supplemental_data/filtered_spectrograms" / f"{filename}.bmp",
+         )
+         if os.path.exists(
+             raw / "mlsp_contest_dataset/supplemental_data/segmentation_examples" / f"{filename}.bmp"
+         ):
+             shutil.copyfile(
+                 src=raw
+                 / "mlsp_contest_dataset/supplemental_data/segmentation_examples"
+                 / f"{filename}.bmp",
+                 dst=public / "supplemental_data/segmentation_examples" / f"{filename}.bmp",
+             )
+         shutil.copyfile(
+             src=raw / "mlsp_contest_dataset/supplemental_data/spectrograms" / f"{filename}.bmp",
+             dst=public / "supplemental_data/spectrograms" / f"{filename}.bmp",
+         )
+         shutil.copyfile(
+             src=raw
+             / "mlsp_contest_dataset/supplemental_data/supervised_segmentation"
+             / f"{filename}.bmp",
+             dst=public / "supplemental_data/supervised_segmentation" / f"{filename}.bmp",
+         )
+
+     # Copy over remaining files
+     shutil.copyfile(
+         src=raw / "mlsp_contest_dataset/supplemental_data/segment_clusters.bmp",
+         dst=public / "supplemental_data/segment_clusters.bmp",
+     )
+     shutil.copyfile(
+         src=raw / "mlsp_contest_dataset/supplemental_data/segment_mosaic.bmp",
+         dst=public / "supplemental_data/segment_mosaic.bmp",
+     )
+
+     filter_and_write_file(
+         src=raw / "mlsp_contest_dataset/supplemental_data/histogram_of_segments.txt",
+         dst=public / "supplemental_data/histogram_of_segments.txt",
+         old_id_to_new=old_id_to_new,
+     )
+     filter_and_write_file(
+         src=raw / "mlsp_contest_dataset/supplemental_data/segment_features.txt",
+         dst=public / "supplemental_data/segment_features.txt",
+         old_id_to_new=old_id_to_new,
+     )
+     filter_and_write_file(
+         src=raw / "mlsp_contest_dataset/supplemental_data/segment_rectangles.txt",
+         dst=public / "supplemental_data/segment_rectangles.txt",
+         old_id_to_new=old_id_to_new,
+     )
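One detail worth noting from the answers/sample-submission construction above: each row's Id packs both identifiers as Id = rec_id * 100 + species_id, with species_id in range(19), so the pair can be recovered with divmod. A tiny illustrative sketch (the Id value is hypothetical):

    rec_id, species_id = divmod(4203, 100)  # -> rec_id 42, species_id 3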
@@ -0,0 +1,241 @@
+ import os
+ import shutil
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+
+ import pandas as pd
+
+ from mlebench.utils import extract, read_csv
+
+
+ def filter_and_write_file(src: Path, dst: Path, old_id_to_new: Dict[int, int]):
+     """
+     Given a txt file whose column 0 is rec_id, filters out rec_ids that are not in old_id_to_new and writes the result to dst.
+     """
+     history_of_segments = open(src).read().splitlines()
+     history_of_segments = history_of_segments[1:]
+     history_of_segments = [
+         (int(i.split(",")[0]), ",".join(i.split(",")[1:])) for i in history_of_segments
+     ]
+     history_of_segments = [
+         (old_id_to_new[i[0]], i[1]) for i in history_of_segments if i[0] in old_id_to_new.keys()
+     ]
+     with open(dst, "w") as f:
+         f.write("rec_id,[histogram of segment features]\n")
+         for rec_id, labels in history_of_segments:
+             f.write(f"{rec_id},{labels}\n")
+
+
+ def _process_split(
+     cv_folds: pd.DataFrame,
+     full_rec_id2filename: pd.DataFrame,
+     full_rec_labels_split: List[Tuple[int, str]],
+     raw_path: Path,
+     public_path: Path,
+     private_path: Path,
+ ):
+     """
+     Helper function to process a single data split and generate corresponding public and private files.
+     """
+     public_path.mkdir(exist_ok=True, parents=True)
+     private_path.mkdir(exist_ok=True, parents=True)
+     (public_path / "essential_data").mkdir(exist_ok=True)
+     (public_path / "supplemental_data").mkdir(exist_ok=True)
+
+     # Create a new compact ID mapping for the current subset of data
+     old_ids_in_split = cv_folds["rec_id"].values
+     old_id_to_new = {old_id: new_id for new_id, old_id in enumerate(old_ids_in_split)}
+     cv_folds_processed = cv_folds.copy()
+     cv_folds_processed["rec_id"] = cv_folds_processed["rec_id"].map(old_id_to_new)
+     cv_folds_processed.to_csv(public_path / "essential_data/CVfolds_2.txt", index=False)
+
+     test_rec_ids = cv_folds_processed[cv_folds_processed["fold"] == 1]["rec_id"].values
+
+     # Update id2filename with new split
+     rec_id2filename = full_rec_id2filename[
+         full_rec_id2filename["rec_id"].isin(old_id_to_new.keys())
+     ].copy()
+     rec_id2filename["rec_id"] = rec_id2filename["rec_id"].map(old_id_to_new)
+     rec_id2filename.to_csv(public_path / "essential_data/rec_id2filename.txt", index=False)
+     assert len(rec_id2filename) == len(
+         cv_folds_processed
+     ), f"Expected {len(cv_folds_processed)} entries in rec_id2filename, got {len(rec_id2filename)}"
+
+     # Update labels with new split
+     rec_labels_split = [i for i in full_rec_labels_split if i[0] in old_id_to_new.keys()]
+     rec_labels_split = [(old_id_to_new[i[0]], i[1]) for i in rec_labels_split]
+
+     # Public labels
+     with open(public_path / "essential_data/rec_labels_test_hidden.txt", "w") as f:
+         f.write("rec_id,[labels]\n")
+         for rec_id, labels in rec_labels_split:
+             if rec_id in test_rec_ids:
+                 labels = "?"
+             if labels == "":  # Write without comma
+                 f.write(f"{rec_id}{labels}\n")
+             else:
+                 f.write(f"{rec_id},{labels}\n")
+
+     # Private labels. Create csv, with each row containing the label for a (rec_id, species_id) pair
+     data = {"Id": [], "Probability": []}
+     for rec_id, labels in rec_labels_split:
+         if rec_id not in test_rec_ids:
+             continue
+         species_ids = [int(i) for i in labels.split(",") if i != ""]
+         for species_id in range(0, 19):
+             data["Id"].append(rec_id * 100 + species_id)
+             data["Probability"].append(int(species_id in species_ids))
+
+     pd.DataFrame(data).to_csv(private_path / "answers.csv", index=False)
+     if len(test_rec_ids) > 0:
+         assert (
+             len(pd.DataFrame(data)) == len(test_rec_ids) * 19
+         ), f"Expected {len(test_rec_ids)*19} entries in answers.csv, got {len(pd.DataFrame(data))}"
+
+     # Create new sample submission, following new submission format
+     data = {
+         "Id": [rec_id * 100 + species_id for rec_id in test_rec_ids for species_id in range(0, 19)],
+         "Probability": 0,
+     }
+     pd.DataFrame(data).to_csv(public_path / "sample_submission.csv", index=False)
+     if len(test_rec_ids) > 0:
+         assert (
+             len(pd.DataFrame(data)) == len(test_rec_ids) * 19
+         ), f"Expected {len(test_rec_ids)*19} entries in sample_submission.csv, got {len(pd.DataFrame(data))}"
+
+     # Copy over species list
+     shutil.copyfile(
+         src=raw_path / "mlsp_contest_dataset/essential_data/species_list.txt",
+         dst=public_path / "essential_data/species_list.txt",
+     )
+
+     # Copy over all src waves from train+test set
+     (public_path / "essential_data/src_wavs").mkdir(exist_ok=True)
+     for filename in rec_id2filename["filename"]:
+         shutil.copyfile(
+             src=raw_path / "mlsp_contest_dataset/essential_data/src_wavs" / f"{filename}.wav",
+             dst=public_path / "essential_data/src_wavs" / f"{filename}.wav",
+         )
+
+     # Copy over train+test filtered spectrograms, segmentation examples, spectrograms, and supervised segmentation
+     (public_path / "supplemental_data/filtered_spectrograms").mkdir(exist_ok=True)
+     (public_path / "supplemental_data/segmentation_examples").mkdir(exist_ok=True)
+     (public_path / "supplemental_data/spectrograms").mkdir(exist_ok=True)
+     (public_path / "supplemental_data/supervised_segmentation").mkdir(exist_ok=True)
+     for filename in rec_id2filename["filename"]:
+         shutil.copyfile(
+             src=raw_path
+             / "mlsp_contest_dataset/supplemental_data/filtered_spectrograms"
+             / f"{filename}.bmp",
+             dst=public_path / "supplemental_data/filtered_spectrograms" / f"{filename}.bmp",
+         )
+         if os.path.exists(
+             raw_path / "mlsp_contest_dataset/supplemental_data/segmentation_examples" / f"{filename}.bmp"
+         ):
+             shutil.copyfile(
+                 src=raw_path
+                 / "mlsp_contest_dataset/supplemental_data/segmentation_examples"
+                 / f"{filename}.bmp",
+                 dst=public_path / "supplemental_data/segmentation_examples" / f"{filename}.bmp",
+             )
+         shutil.copyfile(
+             src=raw_path / "mlsp_contest_dataset/supplemental_data/spectrograms" / f"{filename}.bmp",
+             dst=public_path / "supplemental_data/spectrograms" / f"{filename}.bmp",
+         )
+         shutil.copyfile(
+             src=raw_path
+             / "mlsp_contest_dataset/supplemental_data/supervised_segmentation"
+             / f"{filename}.bmp",
+             dst=public_path / "supplemental_data/supervised_segmentation" / f"{filename}.bmp",
+         )
+
+     # Copy over remaining files
+     shutil.copyfile(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/segment_clusters.bmp",
+         dst=public_path / "supplemental_data/segment_clusters.bmp",
+     )
+     shutil.copyfile(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/segment_mosaic.bmp",
+         dst=public_path / "supplemental_data/segment_mosaic.bmp",
+     )
+
+     filter_and_write_file(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/histogram_of_segments.txt",
+         dst=public_path / "supplemental_data/histogram_of_segments.txt",
+         old_id_to_new=old_id_to_new,
+     )
+     filter_and_write_file(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/segment_features.txt",
+         dst=public_path / "supplemental_data/segment_features.txt",
+         old_id_to_new=old_id_to_new,
+     )
+     filter_and_write_file(
+         src=raw_path / "mlsp_contest_dataset/supplemental_data/segment_rectangles.txt",
+         dst=public_path / "supplemental_data/segment_rectangles.txt",
+         old_id_to_new=old_id_to_new,
+     )
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     Also creates a secondary validation split in public_val and private_val directories.
+     """
+     # extract only what we need
+     extract(raw / "mlsp_contest_dataset.zip", raw)
+
+     # --- Load all necessary data once ---
+     full_cv_folds = read_csv(raw / "mlsp_contest_dataset/essential_data/CVfolds_2.txt")
+     full_rec_id2filename = read_csv(raw / "mlsp_contest_dataset/essential_data/rec_id2filename.txt")
+     rec_labels_raw = (
+         open(raw / "mlsp_contest_dataset/essential_data/rec_labels_test_hidden.txt")
+         .read()
+         .splitlines()
+     )
+     rec_labels_raw = rec_labels_raw[1:]  # Ignore header line
+     full_rec_labels_split = []
+     for i in rec_labels_raw:
+         rec_id = i.split(",")[0]
+         labels = ",".join(i.split(",")[1:]) if len(i.split(",")) > 1 else ""
+         full_rec_labels_split.append((int(rec_id), labels))
+
+     # --- 1. Create the original Train/Test split ---
+     # Filter to only the data used in the original split logic and create test set
+     cv_folds = full_cv_folds[full_cv_folds["fold"] == 0].reset_index(drop=True)
+     test_indices = cv_folds.sample(frac=0.2, random_state=0).index
+     cv_folds.loc[test_indices, "fold"] = 1
+     assert len(test_indices) == 64, f"Expected 64 test rec_ids, got {len(test_indices)}"
+
+     # Process and write files for the original public/private split
+     _process_split(
+         cv_folds=cv_folds,
+         full_rec_id2filename=full_rec_id2filename,
+         full_rec_labels_split=full_rec_labels_split,
+         raw_path=raw,
+         public_path=public,
+         private_path=private,
+     )
+
+     # --- 2. Create the new Train/Validation split from the first split's TRAIN set ---
+     # Define new paths for the validation split
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Take the training data from the first split (fold=0)
+     train_from_first_split = cv_folds[cv_folds["fold"] == 0].reset_index(drop=True)
+
+     # Re-split this training data to create a new train_val and test_val set
+     # The new test (validation) set should be ~64 samples.
+     # Original train size = 256. 64/256 = 0.25
+     val_indices = train_from_first_split.sample(frac=0.25, random_state=0).index
+     train_from_first_split.loc[val_indices, "fold"] = 1  # Re-use 'fold=1' to mark test set
+
+     # Process and write files for the new validation split
+     _process_split(
+         cv_folds=train_from_first_split,
+         full_rec_id2filename=full_rec_id2filename,
+         full_rec_labels_split=full_rec_labels_split,
+         raw_path=raw,
+         public_path=public_val,
+         private_path=private_val,
+     )
@@ -0,0 +1,11 @@
+ import pandas as pd
+ from sklearn.metrics import accuracy_score
+
+ from mlebench.competitions.utils import prepare_for_accuracy_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     accuracy_inputs = prepare_for_accuracy_metric(
+         submission=submission, answers=answers, target_column="Sentiment", id_column="PhraseId"
+     )
+     return accuracy_score(**accuracy_inputs)
@@ -0,0 +1,58 @@
+ import shutil
+ from pathlib import Path
+
+ from pandas import read_csv
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import extract
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     # Extract
+     extract(raw / "train.tsv.zip", raw)
+
+     # Create train and test splits from train set
+     test_ratio = 0.3  # 66293/(156061+66293) = 0.3
+     old_train = read_csv(raw / "train.tsv", sep="\t")
+     new_train, answers = train_test_split(old_train, test_size=test_ratio, random_state=0)
+
+     # Create public test
+     new_test = answers.copy()
+     new_test = new_test.drop("Sentiment", axis="columns")
+
+     # Create sample submission
+     sample_submission = answers[["PhraseId", "Sentiment"]].copy()
+     sample_submission["Sentiment"] = 2
+
+     # Checks
+     assert new_train["PhraseId"].is_unique, "PhraseId in new_train should be unique"
+     assert new_test["PhraseId"].is_unique, "PhraseId in new_test should be unique"
+     assert set(new_train["PhraseId"]).isdisjoint(
+         set(new_test["PhraseId"])
+     ), "PhraseId in new_train and new_test should be disjoint"
+     assert (
+         new_train.shape[0] + new_test.shape[0] == old_train.shape[0]
+     ), "New train and new test should have the same number of rows as the old train set"
+     assert (
+         new_train.columns.tolist() == old_train.columns.tolist()
+     ), "New train and old train should have the same columns"
+     assert new_test.columns.tolist() == [
+         "PhraseId",
+         "SentenceId",
+         "Phrase",
+     ], "new_test should have columns ['PhraseId', 'SentenceId', 'Phrase']"
+
+     # Write CSVs
+     answers.to_csv(private / "answers.csv", index=False)
+     new_train.to_csv(public / "train.tsv", index=False, sep="\t")
+     new_test.to_csv(public / "test.tsv", index=False, sep="\t")
+     sample_submission.to_csv(public / "sampleSubmission.csv", index=False)
+
+     # Zip files
+     shutil.make_archive(str(public / "train.tsv"), "zip", public, "train.tsv")
+     shutil.make_archive(str(public / "test.tsv"), "zip", public, "test.tsv")
+
+     # Delete unzipped files
+     (public / "train.tsv").unlink()
+     (public / "test.tsv").unlink()