dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,166 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from mlebench.utils import extract, read_csv
8
+
9
+
10
def _process_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    public_dir: Path,
    private_dir: Path,
    raw_images_dir: Path,
    raw_masks_dir: Path,
    all_depths_df: pd.DataFrame,
) -> None:
    """
    Processes a single data split (e.g., train/test or train_val/test_val),
    saving all required files and performing sanity checks.

    Args:
        train_df: DataFrame for the training set.
        test_df: DataFrame for the test set.
        public_dir: The public output directory for this split.
        private_dir: The private output directory for this split.
        raw_images_dir: Path to the directory containing all source images.
        raw_masks_dir: Path to the directory containing all source masks.
        all_depths_df: DataFrame containing depth information for all samples.

    Raises:
        AssertionError: if the split overlaps, any expected file is missing,
            or the written artifacts are inconsistent with the input frames.
    """
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)

    # Membership tests below are done by id (== image file stem).
    train_ids = set(train_df["id"])
    test_ids = set(test_df["id"])

    assert train_ids.isdisjoint(test_ids), "`id` is not disjoint between train and test sets"

    # Train labels are public; test labels live only in the private dir.
    train_df.sort_values(by="id").to_csv(public_dir / "train.csv", index=False)
    test_df.sort_values(by="id").to_csv(private_dir / "test.csv", index=False)

    # Partition the raw images by id; images whose stem is in neither set are dropped.
    all_source_images = set(raw_images_dir.glob("*.png"))
    train_imgs = set(img for img in all_source_images if img.stem in train_ids)
    test_imgs = set(img for img in all_source_images if img.stem in test_ids)

    assert train_imgs.isdisjoint(test_imgs), "Images are not disjoint between train and test sets"

    (public_dir / "train" / "images").mkdir(parents=True, exist_ok=True)
    (public_dir / "train" / "masks").mkdir(parents=True, exist_ok=True)

    # Train split gets both the image and its mask (mask shares the filename).
    for fpath in train_imgs:
        shutil.copyfile(src=fpath, dst=public_dir / "train" / "images" / fpath.name)
        shutil.copyfile(src=raw_masks_dir / fpath.name, dst=public_dir / "train" / "masks" / fpath.name)

    (public_dir / "test" / "images").mkdir(parents=True, exist_ok=True)

    # Test split gets images only — masks are the hidden answers.
    for fpath in test_imgs:
        shutil.copyfile(src=fpath, dst=public_dir / "test" / "images" / fpath.name)

    # Sample submission: test ids with a constant placeholder RLE mask.
    sample_submission = test_df.drop(columns=["rle_mask"]).copy()
    sample_submission["rle_mask"] = "1 1"
    sample_submission.sort_values(by="id").to_csv(public_dir / "sample_submission.csv", index=False)

    # NOTE(review): depths.csv is restricted to train ids only, so depth
    # information for test images is withheld from the public dir — confirm
    # this matches the intended competition layout (the original Kaggle
    # depths.csv covers all samples).
    depths_mask = all_depths_df["id"].isin(train_ids)
    filtered_depths = all_depths_df[depths_mask]
    filtered_depths.sort_values(by="id").to_csv(public_dir / "depths.csv", index=False)

    # Sanity checks
    assert (public_dir / "train.csv").exists(), "`train.csv` doesn't exist!"
    assert (public_dir / "sample_submission.csv").exists(), "`sample_submission.csv` doesn't exist!"
    assert (public_dir / "depths.csv").exists(), "`depths.csv` doesn't exist!"
    assert (public_dir / "train").exists(), "`train` directory doesn't exist!"
    assert (public_dir / "test").exists(), "`test` directory doesn't exist!"
    assert (private_dir / "test.csv").exists(), "`test.csv` doesn't exist!"

    # Re-scan what was actually written and compare against the input frames.
    actual_train_imgs = set(img.stem for img in (public_dir / "train" / "images").glob("*.png"))
    actual_train_masks = set(img.stem for img in (public_dir / "train" / "masks").glob("*.png"))

    assert len(actual_train_imgs) == len(train_df), "The number of images in the train set doesn't match!"
    assert len(actual_train_masks) == len(train_df), "The number of masks in the train set doesn't match!"

    for train_id in train_df["id"]:
        assert (public_dir / "train" / "images" / f"{train_id}.png").exists()
        assert (public_dir / "train" / "masks" / f"{train_id}.png").exists()

    actual_test_imgs = set(img.stem for img in (public_dir / "test" / "images").glob("*.png"))

    # No mask must ever leak into the public test directory.
    assert not (public_dir / "test" / "masks").exists(), f"Expected `{public_dir}/test/masks` to not exist, but it does!"
    assert len(actual_test_imgs) == len(test_df), "The number of images in the test set doesn't match!"

    for test_id in test_df["id"]:
        assert (public_dir / "test" / "images" / f"{test_id}.png").exists()
        assert not (public_dir / "test" / "masks" / f"{test_id}.png").exists()

    assert actual_train_imgs.isdisjoint(actual_test_imgs), "Image sets overlap!"

    # Cross-check the written CSVs against each other and the copied images.
    actual_sample_submission = read_csv(public_dir / "sample_submission.csv")
    actual_test = read_csv(private_dir / "test.csv")

    assert len(actual_sample_submission) == len(actual_test), "Sample submission and test set lengths differ!"
    assert set(actual_sample_submission["id"]) == set(actual_test["id"]), "Sample submission and test set IDs differ!"
    assert len(actual_test_imgs) == len(actual_test), "Test image count and test set length differ!"
    assert set(actual_test["id"]) == actual_test_imgs, "Test set IDs and test images differ!"
105
+
106
+
107
def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Build the public/private competition splits plus a parallel validation split.

    Extracts the raw archive, splits the original train set 75/25 into new
    train/test sets, then splits the new train set again (2/3 vs 1/3) to form
    a validation-sized train/test pair, and materializes both splits via
    `_process_split`.

    Args:
        raw: Directory holding the raw competition download.
        public: Output directory for public artifacts of the main split.
        private: Output directory for private artifacts of the main split.

    Raises:
        AssertionError: if any split loses samples or a frame holds non-strings.
    """
    extract(raw / "competition_data.zip", raw)

    competition_dir = raw / "competition_data"
    full_train = read_csv(competition_dir / "train.csv").fillna("")
    depths = read_csv(raw / "depths.csv")

    # Original ratio is Train set - 4,000 samples; Test set - ~18,000 samples (82% ratio)
    # We use a 0.25 ratio to get number of test samples into thousand OOM
    new_train, new_test = train_test_split(full_train, test_size=0.25, random_state=0)

    assert len(new_train) + len(new_test) == len(
        full_train
    ), "Some samples were lost when creating the new train and test sets!"

    # Create the new validation split from the `new_train` set.
    # To make `test_val` have the same size as `new_test` (25% of original),
    # we need to take 1/3 of `new_train` (since 1/3 * 75% = 25%).
    train_val, test_val = train_test_split(new_train, test_size=(1/3), random_state=0)

    assert len(train_val) + len(test_val) == len(
        new_train
    ), "Some samples were lost when creating the validation train and test sets!"

    # Source image/mask locations and the sibling output dirs for validation.
    images_dir = competition_dir / "train" / "images"
    masks_dir = competition_dir / "train" / "masks"
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Main split goes to `public`/`private` (original outputs untouched);
    # validation split goes to the `*_val` siblings.
    _process_split(
        train_df=new_train,
        test_df=new_test,
        public_dir=public,
        private_dir=private,
        raw_images_dir=images_dir,
        raw_masks_dir=masks_dir,
        all_depths_df=depths,
    )
    _process_split(
        train_df=train_val,
        test_df=test_val,
        public_dir=public_val,
        private_dir=private_val,
        raw_images_dir=images_dir,
        raw_masks_dir=masks_dir,
        all_depths_df=depths,
    )

    # Final checks on data types, which are consistent across all splits.
    for frame in (new_train, new_test):
        assert frame.applymap(
            lambda value: isinstance(value, str)
        ).values.all(), "Not all elements in the DataFrame are strings!"
@@ -0,0 +1,11 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import roc_auc_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_auroc_metric
5
+
6
+
7
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
8
+ roc_auc_inputs = prepare_for_auroc_metric(
9
+ submission, answers, id_col="clip", target_col="probability"
10
+ )
11
+ return roc_auc_score(y_true=roc_auc_inputs["y_true"], y_score=roc_auc_inputs["y_score"])
@@ -0,0 +1,95 @@
1
+ import re
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+
8
+
9
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Side effects:
        - Writes `train2.zip`, `test2.zip`, and `sampleSubmission.csv` to `public`.
        - Writes the answer file `test.csv` to `private`.
        - Deletes the temporarily extracted `raw / "train2"` directory.
    """
    # Data is in train2.zip - we need to unzip it
    shutil.unpack_archive(raw / "train2.zip", raw)

    # Files are named as
    # Train: "YYYYMMDD_HHMMSS_{seconds}_TRAIN{idx}_{label:0,1}.aif"
    # Test: "YYYYMMDD_HHMMSS_{seconds}_Test{idx}.aif"

    # There are 4 days in Train and 3 days in Test
    # In our new dataset, we'll just split Train_old into 2 days for Train and 2 days for Test

    # Group every extracted file by its YYYYMMDD date prefix, counting the total.
    samples_by_date = {}
    n_train_old = 0
    for sample in (raw / "train2").iterdir():
        date = sample.name.split("_")[0]
        if date not in samples_by_date:
            samples_by_date[date] = []
        samples_by_date[date].append(sample)
        n_train_old += 1

    assert len(samples_by_date) == 4, "Expected 4 days in Train_old"
    # Earliest two days become the new train set, latest two the new test set.
    dates = sorted(list(samples_by_date.keys()))
    new_train = samples_by_date[dates[0]] + samples_by_date[dates[1]]
    new_test = samples_by_date[dates[2]] + samples_by_date[dates[3]]
    # Sort files - filenames have timestamps so we want new idxs to be time-ordered
    new_train = sorted(new_train)
    new_test = sorted(new_test)

    # Copy files to new directories
    (public / "train2").mkdir(exist_ok=True, parents=True)
    for idx, sample in enumerate(tqdm(new_train)):
        # Replace index part of filename with new index
        new_sample_name = re.sub(r"TRAIN\d+", f"TRAIN{idx}", sample.name)
        new_sample = public / "train2" / new_sample_name
        shutil.copy(sample, new_sample)

    answer_rows = []  # While we're at it, collect answers for the new test set
    (public / "test2").mkdir(exist_ok=True, parents=True)
    for idx, sample in enumerate(tqdm(new_test)):
        # Replace everything after the TRAIN{idx} part of the filename
        # (replaces index as well as label part of filename)
        # so the label is not leaked in the public test filenames.
        new_sample_name = sample.name.split("TRAIN")[0] + f"Test{idx}.aif"
        new_sample = public / "test2" / new_sample_name
        shutil.copy(sample, new_sample)

        # Add to new test set answers; a stem ending in "_1" marks label 1.
        answer_rows.append(
            {"clip": new_sample_name, "probability": 1 if sample.stem.endswith("_1") else 0}
        )

    # Verify that no files were lost or duplicated during the copy.
    assert len(new_train) == len(
        list((public / "train2").glob("*.aif"))
    ), f"Expected {len(new_train)} samples in new_train ({len(list((public / 'train2').glob('*.aif')))}"
    assert len(new_test) == len(
        list((public / "test2").glob("*.aif"))
    ), f"Expected {len(new_test)} samples in new_test ({len(list((public / 'test2').glob('*.aif')))}"
    assert (
        len(new_train) + len(new_test) == n_train_old
    ), f"Expected {n_train_old} total samples in new_train ({len(new_train)}) and new_test ({len(new_test)})"

    # Make zipped versions (must happen before the unzipped dirs are removed)
    shutil.make_archive(public / "train2", "zip", public, "train2")
    shutil.make_archive(public / "test2", "zip", public, "test2")
    # Remove unzipped directories (original comp doesn't have these)
    shutil.rmtree(public / "train2")
    shutil.rmtree(public / "test2")
    # we also don't need the raw dirs anymore
    shutil.rmtree(raw / "train2")

    # Create answers
    answers_df = pd.DataFrame(answer_rows)
    answers_df.to_csv(private / "test.csv", index=False)

    # Create sample submission: same clips as the answers, all-zero probabilities
    sample_submission = answers_df.copy()
    sample_submission["probability"] = 0
    sample_submission.to_csv(public / "sampleSubmission.csv", index=False)

    assert set(answers_df.columns) == set(
        ["clip", "probability"]
    ), "Answers must have 'clip' and 'probability' columns"
    assert set(sample_submission.columns) == set(
        ["clip", "probability"]
    ), "Sample submission must have 'clip' and 'probability' columns"
@@ -0,0 +1,141 @@
1
+ import re
2
+ import shutil
3
+ from pathlib import Path
4
+ from typing import List
5
+
6
+ import pandas as pd
7
+ from tqdm import tqdm
8
+
9
+
10
def _create_split(
    train_files: List[Path],
    test_files: List[Path],
    public_path: Path,
    private_path: Path,
):
    """
    Helper function to process and save a single train/test split.

    This function handles file copying, renaming, zipping, and the creation of
    answer and sample submission files for a given set of train/test files.

    Args:
        train_files: Source .aif files to become this split's training set.
        test_files: Source .aif files (TRAIN-style names carrying a trailing
            label) to become this split's hidden test set.
        public_path: Directory that receives train2.zip, test2.zip, and
            sampleSubmission.csv.
        private_path: Directory that receives the answer file test.csv.
    """
    # Create output directories if they don't exist
    public_path.mkdir(exist_ok=True, parents=True)
    private_path.mkdir(exist_ok=True, parents=True)

    # Process and copy train files
    train_output_dir = public_path / "train2"
    train_output_dir.mkdir(exist_ok=True)
    # Sort files to ensure deterministic indexing
    sorted_train_files = sorted(train_files)
    for idx, sample in enumerate(
        tqdm(sorted_train_files, desc=f"Creating train set for {public_path.name}")
    ):
        # Re-number the TRAIN{idx} portion of the filename with the new index.
        new_sample_name = re.sub(r"TRAIN\d+", f"TRAIN{idx}", sample.name)
        new_sample = train_output_dir / new_sample_name
        shutil.copy(sample, new_sample)

    # Process and copy test files, collecting answers
    answer_rows = []
    test_output_dir = public_path / "test2"
    test_output_dir.mkdir(exist_ok=True)
    # Sort files to ensure deterministic indexing
    sorted_test_files = sorted(test_files)
    for idx, sample in enumerate(
        tqdm(sorted_test_files, desc=f"Creating test set for {public_path.name}")
    ):
        # Strip everything from "TRAIN" onward (index and label) and rename the
        # clip Test{idx}.aif so the label is not leaked in the public filename.
        new_sample_name = sample.name.split("TRAIN")[0] + f"Test{idx}.aif"
        new_sample = test_output_dir / new_sample_name
        shutil.copy(sample, new_sample)
        # A stem ending in "_1" marks label 1; otherwise label 0.
        answer_rows.append(
            {"clip": new_sample_name, "probability": 1 if sample.stem.endswith("_1") else 0}
        )

    # Assertions to verify file counts
    assert len(sorted_train_files) == len(list(train_output_dir.glob("*.aif")))
    assert len(sorted_test_files) == len(list(test_output_dir.glob("*.aif")))

    # Create zipped versions and remove temporary unzipped directories
    # (zipping must happen before the directories are deleted)
    shutil.make_archive(public_path / "train2", "zip", public_path, "train2")
    shutil.make_archive(public_path / "test2", "zip", public_path, "test2")
    shutil.rmtree(train_output_dir)
    shutil.rmtree(test_output_dir)

    # Create answer file
    answers_df = pd.DataFrame(answer_rows)
    answers_df.to_csv(private_path / "test.csv", index=False)
    assert set(answers_df.columns) == set(["clip", "probability"])

    # Create sample submission file: same clips, all-zero probabilities
    sample_submission = answers_df.copy()
    sample_submission["probability"] = 0
    sample_submission.to_csv(public_path / "sampleSubmission.csv", index=False)
    assert set(sample_submission.columns) == set(["clip", "probability"])
+
75
+
76
+ def prepare(raw: Path, public: Path, private: Path):
77
+ """
78
+ Splits the data in raw into public and private datasets with appropriate test/train splits.
79
+ Also creates a secondary validation split in public_val/private_val directories.
80
+ """
81
+ # Data is in train2.zip - we need to unzip it
82
+ shutil.unpack_archive(raw / "train2.zip", raw)
83
+
84
+ # Files are named as
85
+ # Train: "YYYYMMDD_HHMMSS_{seconds}_TRAIN{idx}_{label:0,1}.aif"
86
+ # Test: "YYYYMMDD_HHMMSS_{seconds}_Test{idx}.aif"
87
+
88
+ # There are 4 days in Train and 3 days in Test
89
+ # In our new dataset, we'll just split Train_old into 2 days for Train and 2 days for Test
90
+
91
+ samples_by_date = {}
92
+ n_train_old = 0
93
+ for sample in (raw / "train2").iterdir():
94
+ date = sample.name.split("_")[0]
95
+ if date not in samples_by_date:
96
+ samples_by_date[date] = []
97
+ samples_by_date[date].append(sample)
98
+ n_train_old += 1
99
+
100
+ assert len(samples_by_date) == 4, "Expected 4 days in Train_old"
101
+ dates = sorted(list(samples_by_date.keys()))
102
+
103
+ # --- 1. Create the Original Split (public/private) ---
104
+ # This split uses the first two days for training and the last two days for testing.
105
+ # The outputs of this step must remain identical to the original script.
106
+ original_train_files = samples_by_date[dates[0]] + samples_by_date[dates[1]]
107
+ original_test_files = samples_by_date[dates[2]] + samples_by_date[dates[3]]
108
+
109
+ _create_split(
110
+ train_files=original_train_files,
111
+ test_files=original_test_files,
112
+ public_path=public,
113
+ private_path=private,
114
+ )
115
+
116
+ # --- 2. Create the New Validation Split (public_val/private_val) ---
117
+ # This second split takes the original *training* data (the first two days) and
118
+ # splits it again, using the same date-based logic. The first day becomes the
119
+ # new training set, and the second day becomes the new validation (test) set.
120
+ public_val = public.parent / "public_val"
121
+ private_val = private.parent / "private_val"
122
+
123
+ validation_train_files = samples_by_date[dates[0]]
124
+ validation_test_files = samples_by_date[dates[1]]
125
+
126
+ _create_split(
127
+ train_files=validation_train_files,
128
+ test_files=validation_test_files,
129
+ public_path=public_val,
130
+ private_path=private_val,
131
+ )
132
+
133
+ # Final cleanup of the raw unzipped directory after all processing is complete
134
+ shutil.rmtree(raw / "train2")
135
+
136
+ # Final top-level assertions from the original script
137
+ assert (
138
+ len(original_train_files) + len(original_test_files) == n_train_old
139
+ ), f"Expected {n_train_old} total samples in new_train ({len(original_train_files)}) and new_test ({len(original_test_files)})"
140
+ assert (public / "sampleSubmission.csv").exists()
141
+ assert (private / "test.csv").exists()
@@ -0,0 +1,55 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import mean_squared_log_error
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """Prepare submission and answers for RMSLE calculation.

    Validates the submission's length, columns, and prediction values, aligns
    both frames positionally by sorting on ``id``, and returns the pair
    ``(y_true, y_pred)`` as numeric arrays.

    Raises:
        InvalidSubmissionError: if the submission is malformed (wrong length,
            missing 'id'/'revenue' columns, mismatched ids, or non-numeric,
            NaN, or negative predictions).
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'id' column")

    if "revenue" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'revenue' column")

    # Sort by id so both frames line up row-for-row
    submission = submission.sort_values("id").reset_index(drop=True)
    answers = answers.sort_values("id").reset_index(drop=True)

    # Check id alignment
    if not (submission["id"] == answers["id"]).all():
        raise InvalidSubmissionError("Submission and answers id mismatch")

    # Extract predictions and true values
    y_pred = submission["revenue"].values
    y_true = answers["revenue"].values

    # Validate predictions; chain the original conversion error (PEP 3134)
    # so the root cause stays visible in the traceback.
    try:
        y_pred = y_pred.astype(float)
    except (ValueError, TypeError) as err:
        raise InvalidSubmissionError("Predictions must be numeric") from err

    if np.any(np.isnan(y_pred)):
        raise InvalidSubmissionError("Predictions cannot contain NaN values")

    # RMSLE is undefined for negative values, so reject them up front
    if np.any(y_pred < 0):
        raise InvalidSubmissionError("Predictions cannot be negative")

    return y_true, y_pred
46
+
47
+
48
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Calculate RMSLE score.

    RMSLE is the square root of sklearn's mean squared log error between the
    answer revenues and the validated predictions.
    """
    y_true, y_pred = prepare_for_metric(submission, answers)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))
@@ -0,0 +1,35 @@
1
+ from pathlib import Path
2
+ import pandas as pd
3
+
4
+
5
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare tmdb-box-office-prediction dataset.

    Raw data contains:
    - train.csv: training data with movie features and revenue
    - test.csv: test data with movie features only
    - sample_submission.csv: sample submission format
    - test_answer.csv: test revenue (private)
    """
    # Load all raw CSVs up front
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sample_submission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Agent-visible files go to `public`
    public_outputs = {
        "train.csv": train,
        "test.csv": test,
        "sample_submission.csv": sample_submission,
    }
    for filename, frame in public_outputs.items():
        frame.to_csv(public / filename, index=False)

    # Grading answers go to `private`
    test_answer.to_csv(private / "test.csv", index=False)

    # Validation checks: answers and sample submission must agree in shape/schema
    assert len(test_answer) == len(sample_submission), \
        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
    for col in ("id", "revenue"):
        assert col in test_answer.columns, f"Test answer must have '{col}' column"
    for col in ("id", "revenue"):
        assert col in sample_submission.columns, f"Sample submission must have '{col}' column"
@@ -0,0 +1,67 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def jaccard(str1: str, str2: str) -> float:
    """
    Word-level Jaccard similarity between two strings.

    Given by the competition page: https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview
    Strings are lowercased and whitespace-split; the word sets' intersection
    size is divided by their union size.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    # Guard against a zero denominator when both strings contain no tokens
    # (empty or whitespace-only — reachable because the grader fills NaN
    # submissions with ""). The competition's reference metric scores the
    # both-empty case as 0.5 instead of raising ZeroDivisionError.
    if len(a) == 0 and len(b) == 0:
        return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
15
+
16
+
17
def mean_jaccard(preds: list[str], answers: list[str]) -> float:
    """Average the pairwise Jaccard similarity over aligned prediction/answer pairs."""
    scores = [jaccard(pred, ans) for pred, ans in zip(preds, answers)]
    return float(np.mean(scores))
19
+
20
+
21
def prepare_for_metric(
    submission: pd.DataFrame, answers: pd.DataFrame
) -> tuple[list[str], list[str]]:
    """
    Validate and align the submission against the answers, returning the two
    `selected_text` columns as parallel lists of strings.

    Both DataFrames have the shape:
    ```
    textID,selected_text
    2,"very good"
    5,"I don't care"
    6,"bad"
    8,"it was, yes"
    etc.
    ```

    Raises InvalidSubmissionError for a malformed submission; answers are
    trusted internal data, so they are checked with plain asserts.
    """
    required = {
        "textID": "Submission DataFrame must have a 'textID' column.",
        "selected_text": "Submission DataFrame must have a 'selected_text' column.",
    }
    for column, message in required.items():
        if column not in submission.columns:
            raise InvalidSubmissionError(message)
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            "Submission and answers DataFrames must have the same number of rows."
        )
    assert "textID" in answers.columns, "Answers DataFrame must have a 'textID' column."
    assert (
        "selected_text" in answers.columns
    ), "Answers DataFrame must have a 'selected_text' column."

    # Align rows by sorting both frames on textID
    submission = submission.sort_values("textID")
    answers = answers.sort_values("textID")

    if (submission["textID"].values != answers["textID"].values).any():
        raise InvalidSubmissionError(
            "Submission and answers DataFrames must have the same textID values."
        )

    def _as_strings(frame: pd.DataFrame) -> list[str]:
        # Empty strings round-trip through CSV as NaN; restore them before the
        # str conversion so they don't become the literal "nan".
        return frame["selected_text"].fillna("").astype(str).tolist()

    return _as_strings(submission), _as_strings(answers)
63
+
64
+
65
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with the competition's mean word-level Jaccard metric."""
    preds, golds = prepare_for_metric(submission, answers)
    return mean_jaccard(preds, golds)
@@ -0,0 +1,36 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.utils import read_csv
6
+
7
+
8
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Holds out 10% of the original training CSV as the new test set, writing the
    labelled rows to `private` and the label-free artifacts to `public`.
    """
    # Split the original training data 90/10 with a fixed seed for reproducibility
    old_train = read_csv(raw / "train.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)

    # The public copy of the test set must not expose the target column
    new_test_without_labels = new_test.drop(columns=["selected_text"])

    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    new_test_without_labels.to_csv(public / "test.csv", index=False)

    assert len(new_test_without_labels) == len(
        new_test
    ), f"Expected new_test_without_labels ({len(new_test_without_labels)}) == new_test ({len(new_test)})"
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected new_train ({len(new_train)}) + new_test ({len(new_test)}) == old_train ({len(old_train)})"

    # Sample submission: the test set's textIDs with blank predictions
    submission_df = new_test[["textID", "selected_text"]].copy()
    submission_df["selected_text"] = ""
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    assert len(submission_df) == len(
        new_test
    ), f"Expected submission_df ({len(submission_df)}) == new_test ({len(new_test)})"