dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _create_competition_files(
|
|
7
|
+
train_df: pd.DataFrame,
|
|
8
|
+
test_df: pd.DataFrame,
|
|
9
|
+
old_test_columns: pd.Index,
|
|
10
|
+
old_sample_submission: pd.DataFrame,
|
|
11
|
+
public_dir: Path,
|
|
12
|
+
private_dir: Path,
|
|
13
|
+
to_predict: list,
|
|
14
|
+
) -> None:
|
|
15
|
+
"""
|
|
16
|
+
Helper function to generate the set of competition files for a given train/test split.
|
|
17
|
+
This function creates the public and private directories and populates them with:
|
|
18
|
+
- public/train.json
|
|
19
|
+
- public/test.json
|
|
20
|
+
- public/sample_submission.csv
|
|
21
|
+
- private/test.csv (ground truth)
|
|
22
|
+
"""
|
|
23
|
+
public_dir.mkdir(parents=True, exist_ok=True)
|
|
24
|
+
private_dir.mkdir(parents=True, exist_ok=True)
|
|
25
|
+
|
|
26
|
+
# Create `test.csv` by exploding each list in the `reactivity` and `deg_*` columns, analogous
|
|
27
|
+
# to `pd.explode`. Only the first `seq_scored` items are scored out of a possible `seq_length`
|
|
28
|
+
# items. For each row, we keep track of whether it's scored or not with the `keep` column.
|
|
29
|
+
records = []
|
|
30
|
+
|
|
31
|
+
for _, row in test_df.iterrows():
|
|
32
|
+
n = row["seq_scored"]
|
|
33
|
+
|
|
34
|
+
assert len(row["reactivity"]) == n
|
|
35
|
+
assert len(row["deg_Mg_pH10"]) == n
|
|
36
|
+
assert len(row["deg_pH10"]) == n
|
|
37
|
+
assert len(row["deg_Mg_50C"]) == n
|
|
38
|
+
assert len(row["deg_50C"]) == n
|
|
39
|
+
|
|
40
|
+
for j in range(n):
|
|
41
|
+
records.append(
|
|
42
|
+
{
|
|
43
|
+
"id_seqpos": f"{row['id']}_{j}",
|
|
44
|
+
"reactivity": row["reactivity"][j],
|
|
45
|
+
"deg_Mg_pH10": row["deg_Mg_pH10"][j],
|
|
46
|
+
"deg_pH10": row["deg_pH10"][j],
|
|
47
|
+
"deg_Mg_50C": row["deg_Mg_50C"][j],
|
|
48
|
+
"deg_50C": row["deg_50C"][j],
|
|
49
|
+
"keep": True,
|
|
50
|
+
}
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
k = row["seq_length"]
|
|
54
|
+
|
|
55
|
+
assert n < k
|
|
56
|
+
|
|
57
|
+
for j in range(n, k):
|
|
58
|
+
records.append(
|
|
59
|
+
{
|
|
60
|
+
"id_seqpos": f"{row['id']}_{j}",
|
|
61
|
+
"reactivity": 0.0,
|
|
62
|
+
"deg_Mg_pH10": 0.0,
|
|
63
|
+
"deg_pH10": 0.0,
|
|
64
|
+
"deg_Mg_50C": 0.0,
|
|
65
|
+
"deg_50C": 0.0,
|
|
66
|
+
"keep": False,
|
|
67
|
+
}
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Write `answers.csv`
|
|
71
|
+
answers = pd.DataFrame(records)
|
|
72
|
+
answers.to_csv(private_dir / "test.csv", index=False, float_format="%.10f")
|
|
73
|
+
|
|
74
|
+
# Write `train.json`
|
|
75
|
+
train_df["index"] = range(len(train_df))
|
|
76
|
+
train_df.to_json(public_dir / "train.json", orient="records", lines=True)
|
|
77
|
+
|
|
78
|
+
# Write `test.json`
|
|
79
|
+
test_without_labels = test_df[old_test_columns].copy()
|
|
80
|
+
test_without_labels["index"] = range(len(test_without_labels))
|
|
81
|
+
test_without_labels.to_json(public_dir / "test.json", orient="records", lines=True)
|
|
82
|
+
|
|
83
|
+
# Write `sample_submission.csv`
|
|
84
|
+
new_sample_submission = answers[["id_seqpos"] + to_predict].copy()
|
|
85
|
+
new_sample_submission.loc[:, to_predict] = 0.0
|
|
86
|
+
new_sample_submission.to_csv(
|
|
87
|
+
public_dir / "sample_submission.csv", index=False, float_format="%.10f"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Sanity checks
|
|
91
|
+
assert "test" not in train_df.columns
|
|
92
|
+
assert "test" not in test_df.columns
|
|
93
|
+
|
|
94
|
+
assert set(test_without_labels.columns) == set(old_test_columns), (
|
|
95
|
+
f"Expected the columns of the new test to be the same as the old test, but got "
|
|
96
|
+
f"{set(test_without_labels.columns)} instead of {set(old_test_columns)}."
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
assert set(to_predict).intersection(set(test_without_labels.columns)) == set(), (
|
|
100
|
+
f"Expected the columns to predict aren't included in the new test, but got "
|
|
101
|
+
f"{set(to_predict) ^ set(test_without_labels.columns)} instead of the empty set."
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
assert set(new_sample_submission.columns) == set(old_sample_submission.columns), (
|
|
105
|
+
f"Expected the columns of the new sample submission to be the same as the old sample "
|
|
106
|
+
f"submission, but got {set(new_sample_submission.columns)} instead of "
|
|
107
|
+
f"{set(old_sample_submission.columns)}."
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
assert len(answers) == len(new_sample_submission), (
|
|
111
|
+
f"Expected the answers to have the same length as the new sample submission, but got "
|
|
112
|
+
f"{len(answers)} instead of {len(new_sample_submission)}."
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
# we can use [0] because all sequences have the same length
|
|
116
|
+
assert len(new_sample_submission) == (
|
|
117
|
+
len(test_without_labels) * test_without_labels["seq_length"].iloc[0]
|
|
118
|
+
), (
|
|
119
|
+
"Expected new_sample_submission length to be equal to max seq_length * len(new_test)."
|
|
120
|
+
f"Got {len(new_sample_submission)} instead of {len(test_without_labels) * test_without_labels['seq_length']}."
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Build the main competition split plus a parallel validation split.

    Reads the raw train/test JSON and sample submission, carves a held-out
    test set out of the training data (only rows passing the SN filter are
    eligible, per the competition data description), and materialises the
    competition files twice: once into ``public``/``private`` and once more
    into sibling ``public_val``/``private_val`` directories built from a
    second, identically-sized split of the remaining training rows.
    """
    old_train = pd.read_json(raw / "train.json", lines=True)
    old_test = pd.read_json(raw / "test.json", lines=True)
    old_sample_submission = pd.read_csv(raw / "sample_submission.csv")

    to_predict = ["reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C"]
    # Same 10% held-out fraction as the original script.
    n_test_samples = int(len(old_train) * 0.1)

    def split_on_sn_filter(frame: pd.DataFrame):
        # Sample held-out rows from the SN_filter-passing subset only, with a
        # fixed seed so both splits are reproducible and mutually consistent.
        frame = frame.copy()
        frame["test"] = False
        held_out_idx = (
            frame[frame["SN_filter"] > 0].sample(n=n_test_samples, random_state=0).index
        )
        frame.loc[held_out_idx, "test"] = True
        kept = frame[~frame["test"]].copy().drop(columns=["test"])
        held_out = frame[frame["test"]].copy().drop(columns=["test"])
        return kept, held_out

    # First split: main train/test sets from the raw training data.
    new_train, new_test = split_on_sn_filter(old_train)

    # Generate the original competition files. This keeps the contents of the
    # `public` and `private` directories identical to the original output.
    _create_competition_files(
        train_df=new_train,
        test_df=new_test,
        old_test_columns=old_test.columns,
        old_sample_submission=old_sample_submission,
        public_dir=public,
        private_dir=private,
        to_predict=to_predict,
    )

    # Second split: carve a validation set out of the new training set,
    # replicating the first split's size and seed for consistency. The
    # validation outputs land in sibling directories.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    train_val, test_val = split_on_sn_filter(new_train)

    _create_competition_files(
        train_df=train_val,
        test_df=test_val,
        old_test_columns=old_test.columns,
        old_sample_submission=old_sample_submission,
        public_dir=public_val,
        private_dir=private_val,
        to_predict=to_predict,
    )

    # Final sanity checks
    assert len(new_train) + len(new_test) == len(old_train), (
        f"Expected the length of the new train set plus the length of the new test set to be "
        f"equal to the length of the old train set, but got {len(new_train) + len(new_test)} "
        f"instead of {len(old_train)}."
    )
    assert len(train_val) + len(test_val) == len(new_train), (
        f"Expected the length of the validation train set plus the length of the validation test set to be "
        f"equal to the length of the main train set, but got {len(train_val) + len(test_val)} "
        f"instead of {len(new_train)}."
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.metrics import log_loss
|
|
6
|
+
|
|
7
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def prepare_for_metric(
    submission: pd.DataFrame, answers: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray]:
    """Validate a submission against the answers and align both for scoring.

    Returns:
        ``(y_true, y_pred)`` as numpy arrays, both sorted by ``id`` so labels
        and predicted probabilities line up row-for-row (fixes the previous
        annotation/behavior mismatch where pandas Series were returned).

    Raises:
        InvalidSubmissionError: if the submission is malformed — missing
            columns, mismatched or duplicated ids, or probabilities outside
            the [0, 1] interval.
    """
    # answers checks (answers are generated by us, so plain asserts suffice)
    assert "id" in answers.columns, "Answers is missing `id` column"
    assert "is_iceberg" in answers.columns, "Answers is missing `is_iceberg` column"

    # submission checks
    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission is missing `id` column")
    if "is_iceberg" not in submission.columns:
        raise InvalidSubmissionError("Submission is missing `is_iceberg` column")
    if len(submission) != len(answers):
        # A pure set comparison lets duplicated ids slip through and later
        # crash inside the metric, so check lengths explicitly first.
        raise InvalidSubmissionError("Submission and answers have different lengths")
    if set(submission["id"]) != set(answers["id"]):
        raise InvalidSubmissionError("Submission and answers have different id's")
    if not ((submission["is_iceberg"] >= 0) & (submission["is_iceberg"] <= 1)).all():
        raise InvalidSubmissionError(
            "All values in submission `is_iceberg` must be between 0 and 1."
        )

    # sort by id to ensure correct order
    submission = submission.sort_values("id")
    answers = answers.sort_values("id")

    # Return plain numpy arrays, matching the annotated return type.
    y_true = answers["is_iceberg"].to_numpy()
    y_pred = submission["is_iceberg"].to_numpy()
    return y_true, y_pred
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with binary log loss after validation/alignment."""
    aligned = prepare_for_metric(submission, answers)
    return log_loss(*aligned)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import py7zr
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
|
|
9
|
+
from mlebench.utils import extract
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    """
    extract(raw / "train.json.7z", raw)
    extract(raw / "test.json.7z", raw)
    old_train = json.load((raw / "data/processed/train.json").open())
    old_test = json.load((raw / "data/processed/test.json").open())

    # Every record must carry the same set of fields within each raw split.
    all_fields = list(old_train[0].keys())
    assert all(
        set(all_fields) == set(sample.keys()) for sample in old_train
    ), "Inconsistent fields in train set"
    test_fields = list(old_test[0].keys())
    assert all(
        set(test_fields) == set(sample.keys()) for sample in old_test
    ), "Inconsistent fields in test set"

    # Old ratio is Train set - 1,604 samples; Test set - 8,424 samples (~84% ratio)
    # We do a 20% ratio to avoid removing too many samples from train
    new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)
    # Public test set keeps only the fields the real test set exposes.
    new_test_without_labels = [
        {key: sample[key] for key in test_fields} for sample in new_test
    ]

    # Write new train and test splits, compress, then remove the uncompressed files
    tmp_data = private / "tmp_data"
    # parents=True: `private` itself may not exist yet (previously crashed).
    tmp_data.mkdir(parents=True, exist_ok=True)
    try:
        with open(tmp_data / "train.json", "w") as f:
            json.dump(new_train, f)
        with open(tmp_data / "test.json", "w") as f:
            json.dump(new_test_without_labels, f)

        with py7zr.SevenZipFile(public / "train.json.7z", "w") as archive:
            archive.write(tmp_data / "train.json", arcname="train.json")
        with py7zr.SevenZipFile(public / "test.json.7z", "w") as archive:
            archive.write(tmp_data / "test.json", arcname="test.json")
    finally:
        # Always remove the uncompressed files, even if archiving fails
        # (previously they leaked on error).
        shutil.rmtree(tmp_data)

    # Make answers as csv from json
    answers = pd.DataFrame(
        [{"id": sample["id"], "is_iceberg": int(sample["is_iceberg"])} for sample in new_test]
    )
    answers.to_csv(private / "test.csv", index=False)

    # Make sample submission (constant 0.5 prediction for every id)
    sample_submission = answers.copy()
    sample_submission["is_iceberg"] = 0.5
    sample_submission.to_csv(private / "sample_submission.csv", index=False)
    with py7zr.SevenZipFile(public / "sample_submission.csv.7z", "w") as archive:
        archive.write(
            private / "sample_submission.csv",
            arcname="sample_submission.csv",
        )

    # Checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected {len(old_train)} total samples in new_train ({len(new_train)}) and new_test ({len(new_test)})"
    assert len(new_test) == len(
        new_test_without_labels
    ), f"Expected new_test ({len(new_test)}) to have the same length as new_test_without_labels ({len(new_test_without_labels)})"
    assert len(answers) == len(
        new_test
    ), f"Expected answers ({len(answers)}) to have the same length as new_test ({len(new_test)})"
    assert len(sample_submission) == len(
        new_test
    ), f"Expected sample_submission ({len(sample_submission)}) to have the same length as new_test ({len(new_test)})"
    assert set(answers.columns) == set(
        ["id", "is_iceberg"]
    ), "Answers must have 'id' and 'is_iceberg' columns"
    assert set(sample_submission.columns) == set(
        ["id", "is_iceberg"]
    ), "Sample submission must have 'id' and 'is_iceberg' columns"

    new_train_ids = {sample["id"] for sample in new_train}
    new_test_ids = {sample["id"] for sample in new_test}
    assert new_train_ids.isdisjoint(new_test_ids), "Train and test ids should not overlap"
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Dict
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import py7zr
|
|
8
|
+
from sklearn.model_selection import train_test_split
|
|
9
|
+
|
|
10
|
+
from mlebench.utils import extract
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _create_split_files(
    train_set: List[Dict],
    test_set: List[Dict],
    test_fields: List[str],
    public_dir: Path,
    private_dir: Path,
):
    """
    Helper function to generate all files for a given train/test split.

    This function creates:
    - public/{train.json.7z, test.json.7z, sample_submission.csv.7z}
    - private/{test.csv, sample_submission.csv}
    """
    # Make sure both output directories exist before writing anything.
    for directory in (public_dir, private_dir):
        directory.mkdir(exist_ok=True, parents=True)

    # The public test set keeps only the fields the real test set exposes.
    test_set_without_labels = [
        {field: sample[field] for field in test_fields} for sample in test_set
    ]

    # Stage the JSON payloads in a temporary directory, then compress them.
    tmp_data_dir = private_dir / "tmp_data"
    tmp_data_dir.mkdir(exist_ok=True)
    try:
        payloads = {
            "train.json": train_set,
            "test.json": test_set_without_labels,
        }
        for filename, payload in payloads.items():
            with open(tmp_data_dir / filename, "w") as f:
                json.dump(payload, f)
        for filename in payloads:
            with py7zr.SevenZipFile(public_dir / f"{filename}.7z", "w") as archive:
                archive.write(tmp_data_dir / filename, arcname=filename)

        # Answers CSV derived from the labeled test set; private only.
        answers = pd.DataFrame(
            [
                {"id": sample["id"], "is_iceberg": int(sample["is_iceberg"])}
                for sample in test_set
            ]
        )
        answers.to_csv(private_dir / "test.csv", index=False)

        # Sample submission: same ids, constant 0.5 prediction.
        sample_submission = answers.copy()
        sample_submission["is_iceberg"] = 0.5
        sample_submission.to_csv(private_dir / "sample_submission.csv", index=False)
        with py7zr.SevenZipFile(public_dir / "sample_submission.csv.7z", "w") as archive:
            archive.write(
                private_dir / "sample_submission.csv",
                arcname="sample_submission.csv",
            )
    finally:
        # Ensure temporary files are removed
        shutil.rmtree(tmp_data_dir)

    # --- Final checks for this split ---
    assert len(test_set) == len(
        test_set_without_labels
    ), f"Expected test_set ({len(test_set)}) to have the same length as test_set_without_labels ({len(test_set_without_labels)})"
    assert len(answers) == len(
        test_set
    ), f"Expected answers ({len(answers)}) to have the same length as test_set ({len(test_set)})"
    assert len(sample_submission) == len(
        test_set
    ), f"Expected sample_submission ({len(sample_submission)}) to have the same length as test_set ({len(test_set)})"
    assert set(answers.columns) == {
        "id",
        "is_iceberg",
    }, "Answers must have 'id' and 'is_iceberg' columns"
    assert set(sample_submission.columns) == {
        "id",
        "is_iceberg",
    }, "Sample submission must have 'id' and 'is_iceberg' columns"
    train_ids = {sample["id"] for sample in train_set}
    test_ids = {sample["id"] for sample in test_set}
    assert train_ids.isdisjoint(test_ids), "Train and test ids should not overlap"
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split (public_val, private_val) for model development.
    """
    extract(raw / "train.json.7z", raw)
    extract(raw / "test.json.7z", raw)
    old_train = json.load((raw / "data/processed/train.json").open())
    old_test = json.load((raw / "data/processed/test.json").open())

    # Field layout must be consistent within each raw split.
    all_fields = list(old_train[0].keys())
    assert all(
        set(all_fields) == set(sample.keys()) for sample in old_train
    ), "Inconsistent fields in train set"
    test_fields = list(old_test[0].keys())
    assert all(
        set(test_fields) == set(sample.keys()) for sample in old_test
    ), "Inconsistent fields in test set"

    # --- First Split: main train/test sets for the competition ---
    # Original ratio was ~84% test; we use 20% to keep the train set large.
    new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)

    # Generate the original public and private directory files.
    _create_split_files(
        train_set=new_train,
        test_set=new_test,
        test_fields=test_fields,
        public_dir=public,
        private_dir=private,
    )

    # Sample count must be conserved by the first split.
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected {len(old_train)} total samples in new_train ({len(new_train)}) and new_test ({len(new_test)})"

    # --- Second Split: validation set carved from the main training set ---
    # Choosing test_size = len(new_test)/len(new_train) = (0.2*N)/(0.8*N) = 0.25
    # makes the validation test set the same size as the original test set.
    train_val, test_val = train_test_split(
        new_train, test_size=len(new_test) / len(new_train), random_state=0
    )

    # Mirror the public/private layout into sibling *_val directories.
    _create_split_files(
        train_set=train_val,
        test_set=test_val,
        test_fields=test_fields,
        public_dir=public.parent / "public_val",
        private_dir=private.parent / "private_val",
    )
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from sklearn.metrics import accuracy_score
|
|
3
|
+
|
|
4
|
+
from mlebench.competitions.utils import prepare_for_accuracy_metric
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Grade a forest-cover submission by plain classification accuracy."""
    metric_kwargs = prepare_for_accuracy_metric(
        submission=submission,
        answers=answers,
        target_column="Cover_Type",
        id_column="Id",
    )
    return accuracy_score(**metric_kwargs)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from sklearn.model_selection import train_test_split
|
|
5
|
+
|
|
6
|
+
from mlebench.utils import read_csv
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    """
    # 90/10 train/test split of the original training data.
    old_train = read_csv(raw / "train.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
    new_test_without_labels = new_test.drop(columns=["Cover_Type"])

    # The labeled test set is private; the public copy has labels stripped.
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    new_test_without_labels.to_csv(public / "test.csv", index=False)

    # Sample submission: every row predicts cover type 2.
    pd.DataFrame({"Id": new_test["Id"].values, "Cover_Type": 2}).to_csv(
        public / "sample_submission.csv", index=False
    )

    assert len(new_train) + len(new_test) == len(
        old_train
    ), "train and test splits lengths do not add up to original data."
    assert set(new_train["Id"]).isdisjoint(
        set(new_test["Id"])
    ), f"there are Ids in both train and test sets: {set(new_train['Id']).intersection(set(new_test['Id']))}"
    assert len(new_test_without_labels) == len(
        new_test
    ), "public and Private tests should have equal length"
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from sklearn.model_selection import train_test_split
|
|
5
|
+
|
|
6
|
+
from mlebench.utils import read_csv
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _split_and_save(
    df: pd.DataFrame,
    test_size: float,
    public_path: Path,
    private_path: Path,
    random_state: int,
):
    """
    Split ``df`` into train/test, write the split files, and sanity-check them.

    Writes ``train.csv``, an unlabeled ``test.csv``, and a
    ``sample_submission.csv`` into ``public_path``, and the labeled
    ``test.csv`` into ``private_path``.

    Args:
        df (pd.DataFrame): The dataframe to split.
        test_size (float): Fraction of rows assigned to the test split.
        public_path (Path): Destination for public-facing files.
        private_path (Path): Destination for the labeled (private) test set.
        random_state (int): Seed for the random number generator.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: The resulting train and test frames.
    """
    # Make sure both output directories exist.
    public_path.mkdir(parents=True, exist_ok=True)
    private_path.mkdir(parents=True, exist_ok=True)

    # Split, then strip labels from the public copy of the test set.
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    unlabeled_test_df = test_df.drop(columns=["Cover_Type"])

    # Standard filenames in each destination.
    train_df.to_csv(public_path / "train.csv", index=False)
    test_df.to_csv(private_path / "test.csv", index=False)
    unlabeled_test_df.to_csv(public_path / "test.csv", index=False)

    # Sample submission: constant prediction of cover type 2.
    pd.DataFrame({"Id": test_df["Id"].values, "Cover_Type": 2}).to_csv(
        public_path / "sample_submission.csv", index=False
    )

    # Sanity checks on the split.
    assert len(train_df) + len(test_df) == len(
        df
    ), "train and test splits lengths do not add up to original data."
    assert set(train_df["Id"]).isdisjoint(
        set(test_df["Id"])
    ), f"there are Ids in both train and test sets: {set(train_df['Id']).intersection(set(test_df['Id']))}"
    assert len(unlabeled_test_df) == len(
        test_df
    ), "public and Private tests should have equal length"

    return train_df, test_df
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split in public_val/private_val directories.
    """
    old_train = read_csv(raw / "train.csv")

    # --- First Split: the original 90/10 competition split ---
    # This call reproduces the original outputs, which must stay unchanged.
    new_train, new_test = _split_and_save(
        df=old_train,
        test_size=0.1,
        public_path=public,
        private_path=private,
        random_state=0,
    )

    # --- Second Split: a validation split carved from the new train set ---
    # Using test_size = len(new_test)/len(new_train) makes the validation
    # test set the same size as the original test set. The results go into
    # sibling *_val directories with identical structure and filenames.
    _split_and_save(
        df=new_train,
        test_size=len(new_test) / len(new_train),
        public_path=public.parent / "public_val",
        private_path=private.parent / "private_val",
        random_state=0,  # same seed as the first split for consistency
    )
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from sklearn.metrics import roc_auc_score
|
|
3
|
+
|
|
4
|
+
from mlebench.competitions.utils import prepare_for_auroc_metric
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Grade a submission by area under the ROC curve."""
    inputs = prepare_for_auroc_metric(submission, answers, id_col="id", target_col="target")
    return roc_auc_score(y_true=inputs["y_true"], y_score=inputs["y_score"])
|