dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,118 @@
1
+ """
2
+ Adapted from: https://www.kaggle.com/code/metric/kullback-leibler-divergence
3
+ Linked from: https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification
4
+ """
5
+
6
+ from typing import Optional
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import pandas.api.types
11
+
12
+ from . import kaggle_metric_utilities
13
+
14
+
15
class ParticipantVisibleError(Exception):
    """Exception for errors whose message is intended to be shown to participants.

    Follows the Kaggle metric convention of marking user-facing validation
    failures with a dedicated exception type.
    """
17
+
18
+
19
def kl_divergence(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    epsilon: float,
    micro_average: bool,
    sample_weights: Optional[pd.Series],
):
    """Compute the Kullback-Leibler divergence between per-row probability frames.

    NOTE: both ``solution`` and ``submission`` are mutated in place (the
    submission is clipped and the solution columns are overwritten with the
    per-cell loss terms), matching the original Kaggle metric implementation.

    Args:
        solution: True probability distribution, one distribution per row.
        submission: Predicted probabilities, same columns as ``solution``.
        epsilon: Clip bound applied to the submission, ``[epsilon, 1 - epsilon]``.
        micro_average: If True, average row-wise sums (optionally weighted);
            otherwise average the column-wise means.
        sample_weights: Optional per-row weights, only used with micro averaging.

    Returns:
        The (possibly weighted) average KL divergence as a float.
    """
    # Overwrite solution for convenience
    for col in solution.columns:
        # Prevent issue with populating int columns with floats
        if not pandas.api.types.is_float_dtype(solution[col]):
            solution[col] = solution[col].astype(float)

        # Clip both the min and max following Kaggle conventions for related metrics like log loss
        # Clipping the max avoids cases where the loss would be infinite or undefined, clipping the min
        # prevents users from playing games with the 20th decimal place of predictions.
        submission[col] = np.clip(submission[col], epsilon, 1 - epsilon)

        y_nonzero_indices = solution[col] != 0
        # FIX: removed a redundant second `astype(float)` here — the column is
        # already float after the dtype conversion at the top of this loop.
        solution.loc[y_nonzero_indices, col] = solution.loc[y_nonzero_indices, col] * np.log(
            solution.loc[y_nonzero_indices, col] / submission.loc[y_nonzero_indices, col]
        )
        # Set the loss equal to zero where y_true equals zero following the scipy convention:
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr
        solution.loc[~y_nonzero_indices, col] = 0

    if micro_average:
        return np.average(solution.sum(axis=1), weights=sample_weights)
    else:
        return np.average(solution.mean())
50
+
51
+
52
def score(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    row_id_column_name: str,
    epsilon: float = 10**-15,
    micro_average: bool = True,
    sample_weights_column_name: Optional[str] = None,
) -> float:
    """Score a submission with the Kullback-Leibler divergence.

    The KL divergence is technically undefined/infinite where the target
    equals zero; those cells are assigned a score of zero, effectively
    removing them from consideration. Because each row of predictions must
    sum to one, probability placed on a zero-target cell still reduces the
    probability available elsewhere, so there is an important indirect effect.

    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence

    Args:
        solution: Ground-truth probabilities, including the row-id column.
        submission: Predicted probabilities, including the row-id column.
        row_id_column_name: Name of the id column removed from both frames
            (NOTE: both frames are mutated in place).
        epsilon: Submission probabilities are clipped to [epsilon, 1 - epsilon]
            since KL divergence is undefined for p=0 or p=1.
        micro_average: Row-wise average if True, column-wise average if False.
        sample_weights_column_name: Optional solution column holding per-row
            weights; only valid together with micro averaging.

    Returns:
        The KL divergence score as a float.

    Raises:
        ParticipantVisibleError: on a missing weights column, weights combined
            with ``micro_average=False``, or a solution column absent from the
            submission.
    """
    # Strip the id column from both frames (in place, per the original metric).
    del solution[row_id_column_name]
    del submission[row_id_column_name]

    sample_weights = None
    if sample_weights_column_name:
        if sample_weights_column_name not in solution.columns:
            raise ParticipantVisibleError(
                f"{sample_weights_column_name} not found in solution columns"
            )
        sample_weights = solution.pop(sample_weights_column_name)
        if not micro_average:
            raise ParticipantVisibleError("Sample weights are only valid if `micro_average` is `True`")

    # Every solution column must have a matching submission column.
    absent = [col for col in solution.columns if col not in submission.columns]
    if absent:
        raise ParticipantVisibleError(f"Missing submission column {absent[0]}")

    kaggle_metric_utilities.verify_valid_probabilities(solution, "solution")
    kaggle_metric_utilities.verify_valid_probabilities(submission, "submission")

    return kaggle_metric_utilities.safe_call_score(
        kl_divergence,
        solution,
        submission,
        epsilon=epsilon,
        micro_average=micro_average,
        sample_weights=sample_weights,
    )
@@ -0,0 +1,121 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm.auto import tqdm
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+ from .constants import TARGET_COLS
10
+
11
+
12
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw HMS training data into a public train set and a private test set.

    The split is performed on `spectrogram_id`, which is coarser than
    `eeg_id` (itself coarser than `label_id`), so all rows sharing a
    spectrogram land on the same side of the split and leakage is avoided.

    Args:
        raw: Directory holding the original competition download.
        public: Output directory for participant-visible files.
        private: Output directory for grading-only files.
    """
    old_train = read_csv(raw / "train.csv")

    # split based on `spectrogram_id`
    # this is coarser than `eeg_id` which is coarser than `label_id`, so we avoid data leakage
    train_spectrograms, test_spectrograms = train_test_split(
        old_train["spectrogram_id"].unique(), test_size=0.1, random_state=0
    )

    new_train = old_train[old_train["spectrogram_id"].isin(train_spectrograms)]
    new_test = old_train[old_train["spectrogram_id"].isin(test_spectrograms)]
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    new_test_without_labels = new_test.copy()[["spectrogram_id", "eeg_id", "patient_id"]]
    new_test_without_labels.to_csv(public / "test.csv", index=False)

    gold_submission = new_test.copy()[["eeg_id"] + TARGET_COLS]
    # make the votes into probabilities naively
    # https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/468705#2606605
    gold_submission[TARGET_COLS] = gold_submission[TARGET_COLS].div(
        gold_submission[TARGET_COLS].sum(axis=1), axis=0
    )
    gold_submission.to_csv(private / "gold_submission.csv", index=False)

    sample_submission = gold_submission.copy()
    sample_submission[TARGET_COLS] = 1 / len(TARGET_COLS)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    shutil.copytree(raw / "example_figures", public / "example_figures")

    # Compute the unique-id arrays once: each is reused for iteration and for
    # the post-copy assertions below. tqdm infers `total` via len(), so the
    # explicit `total=` arguments are unnecessary.
    train_eeg_ids = new_train["eeg_id"].unique()
    test_eeg_ids = new_test["eeg_id"].unique()

    # Both splits are copied out of the raw `train_eegs` directory.
    (public / "train_eegs").mkdir(parents=True, exist_ok=True)
    for eeg_id in tqdm(train_eeg_ids, desc="Train EEGs"):
        shutil.copy(
            raw / "train_eegs" / f"{eeg_id}.parquet", public / "train_eegs" / f"{eeg_id}.parquet"
        )
    (public / "test_eegs").mkdir(parents=True, exist_ok=True)
    for eeg_id in tqdm(test_eeg_ids, desc="Test EEGs"):
        shutil.copy(
            raw / "train_eegs" / f"{eeg_id}.parquet", public / "test_eegs" / f"{eeg_id}.parquet"
        )

    # Both splits are copied out of the raw `train_spectrograms` directory.
    (public / "train_spectrograms").mkdir(parents=True, exist_ok=True)
    for spectrogram_id in tqdm(train_spectrograms, desc="Train Spectrograms"):
        shutil.copy(
            raw / "train_spectrograms" / f"{spectrogram_id}.parquet",
            public / "train_spectrograms" / f"{spectrogram_id}.parquet",
        )
    (public / "test_spectrograms").mkdir(parents=True, exist_ok=True)
    for spectrogram_id in tqdm(test_spectrograms, desc="Test Spectrograms"):
        shutil.copy(
            raw / "train_spectrograms" / f"{spectrogram_id}.parquet",
            public / "test_spectrograms" / f"{spectrogram_id}.parquet",
        )

    assert len(list((public / "train_eegs").rglob("*"))) == len(
        train_eeg_ids
    ), "Unexpected number of train EEGs Copied"
    assert len(list((public / "test_eegs").rglob("*"))) == len(
        test_eeg_ids
    ), "Unexpected number of test EEGs Copied"

    assert len(list((public / "train_spectrograms").rglob("*"))) == len(
        train_spectrograms
    ), "Unexpected number of train Spectrograms Copied"
    assert len(list((public / "test_spectrograms").rglob("*"))) == len(
        test_spectrograms
    ), "Unexpected number of test Spectrograms Copied"

    assert set(new_train.spectrogram_id).isdisjoint(
        set(new_test.spectrogram_id)
    ), "Some spectrogram_ids are in both train and test"
    assert set(new_train.eeg_id).isdisjoint(
        set(new_test.eeg_id)
    ), "Some eeg_ids are in both train and test"

    assert (
        new_train.columns.tolist() == new_test.columns.tolist()
    ), "Columns mismatch between public train and private test"
    assert len(new_train.columns) == 15, "Unexpected number of columns in public train"
    assert len(new_test.columns) == 15, "Unexpected number of columns in private test"
    assert len(sample_submission.columns) == 7, "Unexpected number of columns in sample submission"
    assert len(gold_submission.columns) == 7, "Unexpected number of columns in gold submission"
    assert len(new_test_without_labels.columns) == 3, "Unexpected number of columns in private test"

    assert len(new_train) + len(new_test) == len(
        old_train
    ), "Expected train + test length to be equal to original train length"
    assert len(new_test_without_labels) == len(
        new_test
    ), "Length mismatch between public test and private test"
    assert len(sample_submission) == len(
        new_test
    ), "Length mismatch between sample submission and private test"
    assert len(gold_submission) == len(
        new_test
    ), "Length mismatch between gold submission and private test"
@@ -0,0 +1,190 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm.auto import tqdm
7
+
8
+ from mlebench.utils import read_csv
9
+
10
+ from .constants import TARGET_COLS
11
+
12
+
13
def _process_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    public_dir: Path,
    private_dir: Path,
    raw_dir: Path,
):
    """
    Helper function to process a single train/test split and save all required files.
    This ensures that the logic for creating the main dataset and the validation dataset is identical.

    Args:
        train_df: Rows assigned to the (public) training set.
        test_df: Rows assigned to the (private) test set.
        public_dir: Output directory for participant-visible files.
        private_dir: Output directory for grading-only files.
        raw_dir: Directory holding the original competition download.
    """
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)

    # Save main data CSVs
    train_df.to_csv(public_dir / "train.csv", index=False)
    test_df.to_csv(private_dir / "test.csv", index=False)

    # Save public test CSV (without labels)
    test_without_labels = test_df.copy()[["spectrogram_id", "eeg_id", "patient_id"]]
    test_without_labels.to_csv(public_dir / "test.csv", index=False)

    # Create and save submission files
    gold_submission = test_df.copy()[["eeg_id"] + TARGET_COLS]
    # make the votes into probabilities naively
    # https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/468705#2606605
    gold_submission[TARGET_COLS] = gold_submission[TARGET_COLS].div(
        gold_submission[TARGET_COLS].sum(axis=1), axis=0
    )
    gold_submission.to_csv(private_dir / "gold_submission.csv", index=False)

    sample_submission = gold_submission.copy()
    sample_submission[TARGET_COLS] = 1 / len(TARGET_COLS)
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # Compute the unique-id arrays once: each is reused for iteration and for
    # the assertions below. tqdm infers `total` via len(), so the explicit
    # `total=` arguments are unnecessary.
    train_eeg_ids = train_df["eeg_id"].unique()
    test_eeg_ids = test_df["eeg_id"].unique()
    train_spectrogram_ids = train_df["spectrogram_id"].unique()
    test_spectrogram_ids = test_df["spectrogram_id"].unique()

    # Copy EEG files (both splits come out of the raw `train_eegs` directory)
    (public_dir / "train_eegs").mkdir(parents=True, exist_ok=True)
    for eeg_id in tqdm(train_eeg_ids, desc=f"Train EEGs ({public_dir.name})"):
        shutil.copy(
            raw_dir / "train_eegs" / f"{eeg_id}.parquet",
            public_dir / "train_eegs" / f"{eeg_id}.parquet",
        )
    (public_dir / "test_eegs").mkdir(parents=True, exist_ok=True)
    for eeg_id in tqdm(test_eeg_ids, desc=f"Test EEGs ({public_dir.name})"):
        shutil.copy(
            raw_dir / "train_eegs" / f"{eeg_id}.parquet",
            public_dir / "test_eegs" / f"{eeg_id}.parquet",
        )

    # Copy Spectrogram files
    (public_dir / "train_spectrograms").mkdir(parents=True, exist_ok=True)
    for spectrogram_id in tqdm(
        train_spectrogram_ids, desc=f"Train Spectrograms ({public_dir.name})"
    ):
        shutil.copy(
            raw_dir / "train_spectrograms" / f"{spectrogram_id}.parquet",
            public_dir / "train_spectrograms" / f"{spectrogram_id}.parquet",
        )
    (public_dir / "test_spectrograms").mkdir(parents=True, exist_ok=True)
    for spectrogram_id in tqdm(
        test_spectrogram_ids, desc=f"Test Spectrograms ({public_dir.name})"
    ):
        shutil.copy(
            raw_dir / "train_spectrograms" / f"{spectrogram_id}.parquet",
            public_dir / "test_spectrograms" / f"{spectrogram_id}.parquet",
        )

    # Assertions for data integrity
    assert len(list((public_dir / "train_eegs").rglob("*"))) == len(
        train_eeg_ids
    ), "Unexpected number of train EEGs Copied"
    assert len(list((public_dir / "test_eegs").rglob("*"))) == len(
        test_eeg_ids
    ), "Unexpected number of test EEGs Copied"

    assert len(list((public_dir / "train_spectrograms").rglob("*"))) == len(
        train_spectrogram_ids
    ), "Unexpected number of train Spectrograms Copied"
    assert len(list((public_dir / "test_spectrograms").rglob("*"))) == len(
        test_spectrogram_ids
    ), "Unexpected number of test Spectrograms Copied"

    assert set(train_df.spectrogram_id).isdisjoint(
        set(test_df.spectrogram_id)
    ), "Some spectrogram_ids are in both train and test"
    assert set(train_df.eeg_id).isdisjoint(
        set(test_df.eeg_id)
    ), "Some eeg_ids are in both train and test"

    assert (
        train_df.columns.tolist() == test_df.columns.tolist()
    ), "Columns mismatch between public train and private test"
    assert len(train_df.columns) == 15, "Unexpected number of columns in public train"
    assert len(test_df.columns) == 15, "Unexpected number of columns in private test"
    assert len(sample_submission.columns) == 7, "Unexpected number of columns in sample submission"
    assert len(gold_submission.columns) == 7, "Unexpected number of columns in gold submission"
    assert len(test_without_labels.columns) == 3, "Unexpected number of columns in private test"

    assert len(test_without_labels) == len(
        test_df
    ), "Length mismatch between public test and private test"
    assert len(sample_submission) == len(
        test_df
    ), "Length mismatch between sample submission and private test"
    assert len(gold_submission) == len(
        test_df
    ), "Length mismatch between gold submission and private test"
132
+
133
+
134
def prepare(raw: Path, public: Path, private: Path):
    """Build the main competition split plus a parallel validation split.

    The main split goes to `public`/`private`; a second split of the main
    training data goes to sibling `public_val`/`private_val` directories.
    """
    full_train = read_csv(raw / "train.csv")

    # === Main Data Split (Train/Test) ===
    # Splitting on `spectrogram_id` (coarser than `eeg_id`, which is coarser
    # than `label_id`) keeps all related rows on one side of the boundary,
    # avoiding leakage.
    main_train_ids, main_test_ids = train_test_split(
        full_train["spectrogram_id"].unique(), test_size=0.1, random_state=0
    )

    main_train = full_train[full_train["spectrogram_id"].isin(main_train_ids)]
    main_test = full_train[full_train["spectrogram_id"].isin(main_test_ids)]

    # Write the main split to the `public` and `private` directories.
    _process_split(main_train, main_test, public, private, raw)

    # Shared assets are copied only into the main public directory here.
    shutil.copytree(raw / "example_figures", public / "example_figures")

    assert len(main_train) + len(main_test) == len(
        full_train
    ), "Expected train + test length to be equal to original train length"

    # === Validation Data Split ===
    # Re-split the main training set to carve out a validation set, written
    # to parallel `public_val`/`private_val` directories.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # The main training set holds 90% of the data; holding out 1/9 of it
    # gives a validation set matching the main test set's 10% share.
    val_train_ids, val_test_ids = train_test_split(
        main_train["spectrogram_id"].unique(), test_size=1 / 9, random_state=0
    )

    val_train = main_train[main_train["spectrogram_id"].isin(val_train_ids)]
    val_test = main_train[main_train["spectrogram_id"].isin(val_test_ids)]

    # Reuse the exact same processing logic as the main split.
    _process_split(val_train, val_test, public_val, private_val, raw)

    # Mirror the shared assets into the validation public directory.
    shutil.copytree(raw / "example_figures", public_val / "example_figures")

    assert len(val_train) + len(val_test) == len(
        main_train
    ), "Expected train_val + test_val length to be equal to new_train length"
@@ -0,0 +1,41 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+ from mlebench.metrics import mean_average_precision_at_k
6
+
7
+
8
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """Validate a submission against the answers and shape both for MAP@k.

    Returns a dict with `actual` (one set of true hotel ids per row) and
    `predicted` (one ranked list of predicted hotel ids per row), aligned
    row-by-row after sorting on the image id.

    Raises:
        InvalidSubmissionError: if the submission is missing a required
            column, or its ids/length disagree with the answers.
    """
    id_col = "image"
    target_col = "hotel_id"

    # Required columns: check the submission first, in a fixed order.
    for col in (id_col, target_col):
        if col not in submission.columns:
            raise InvalidSubmissionError(f"Submission is missing `{col}` column")
    assert id_col in answers.columns, f"Answers is missing `{id_col}` column"
    assert target_col in answers.columns, f"Answers is missing `{target_col}` column"

    if set(submission[id_col]) != set(answers[id_col]):
        raise InvalidSubmissionError("Submission and answers have different ids")

    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission and answers have different lengths")

    # Sort both frames by id so each prediction is graded against the
    # matching answer row.
    aligned_sub = submission.sort_values(id_col).reset_index(drop=True)
    aligned_ans = answers.sort_values(id_col).reset_index(drop=True)

    # Space-separated hotel-id strings become a ranked list (submission)
    # and a set (answers).
    predicted = [value.split(" ") for value in aligned_sub[target_col].astype(str)]
    actual = [set(value.split(" ")) for value in aligned_ans[target_col].astype(str)]

    return {"actual": actual, "predicted": predicted}
37
+
38
+
39
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with mean average precision at k=5."""
    inputs = prepare_for_metric(submission, answers)
    return mean_average_precision_at_k(
        actual=inputs["actual"], predicted=inputs["predicted"], k=5
    )
@@ -0,0 +1,63 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm.auto import tqdm
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw hotel-id training data into a public train set and a private test set.

    Args:
        raw: Directory holding the original competition download.
        public: Output directory for participant-visible files.
        private: Output directory for grading-only files.
    """
    old_train = read_csv(raw / "train.csv")
    # drop image ce27d36c9147cc19.jpg: it appears twice and may occur across train and test when split
    old_train = old_train[old_train["image"] != "ce27d36c9147cc19.jpg"]

    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
    sample_submission = new_test.copy()[["image", "hotel_id"]]
    # Constant placeholder ranking used for every row of the sample submission.
    sample_submission["hotel_id"] = "36363 53586 18807 64314 60181"

    # save public files
    new_train.to_csv(public / "train.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # copy images from raw to prepared/public accordingly;
    # train images keep their per-chain subdirectories
    (public / "train_images").mkdir(exist_ok=True, parents=True)
    for image, chain in tqdm(
        zip(new_train["image"], new_train["chain"]), total=len(new_train), desc="Train images"
    ):
        chain = str(chain)
        (public / "train_images" / chain).mkdir(exist_ok=True, parents=True)
        shutil.copy(raw / "train_images" / chain / image, public / "train_images" / chain / image)

    # Test images are copied into a single flat directory. The directory is
    # created once here (the original re-ran mkdir on every iteration).
    (public / "test_images").mkdir(exist_ok=True, parents=True)
    for image, chain in tqdm(
        zip(new_test["image"], new_test["chain"]), total=len(new_test), desc="Test images"
    ):
        chain = str(chain)
        shutil.copy(raw / "train_images" / chain / image, public / "test_images" / image)

    # save private files
    new_test.to_csv(private / "test.csv", index=False)

    # checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), "Length of new_train and new_test should equal length of old_train"
    assert sample_submission.columns.to_list() == [
        "image",
        "hotel_id",
    ], "Sample submission columns should only be `image` and `hotel_id`"
    assert len(sample_submission) == len(new_test), "Sample submission length should match test set"
    for image, chain in zip(new_train["image"], new_train["chain"]):
        chain = str(chain)
        assert (
            public / "train_images" / chain / image
        ).exists(), f"Image {image} not found in train_images folder"
    for image in new_test["image"]:
        assert (
            public / "test_images" / image
        ).exists(), f"Image {image} not found in test_images folder"
    assert not set(new_train["image"]).intersection(
        set(new_test["image"])
    ), "Train and test ids overlap"
@@ -0,0 +1,132 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm.auto import tqdm
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
def _create_split(
    df_to_split,
    raw_images_base_path: Path,
    public_path: Path,
    private_path: Path,
    test_size: float,
    random_state: int,
):
    """
    Split `df_to_split` into train/test, write the CSV artifacts, and copy images.

    Args:
        df_to_split: The DataFrame to be split into train and test sets.
        raw_images_base_path: Path to the directory containing the original images.
        public_path: The public output directory for this split.
        private_path: The private output directory for this split.
        test_size: The proportion of the dataset to allocate to the test split.
        random_state: The seed used by the random number generator.

    Returns:
        The (train, test) DataFrame pair produced by the split.
    """
    # Ensure both output roots exist.
    for out_dir in (public_path, private_path):
        out_dir.mkdir(exist_ok=True, parents=True)

    # Perform the split.
    train_part, test_part = train_test_split(
        df_to_split, test_size=test_size, random_state=random_state
    )
    sample_submission = test_part.copy()[["image", "hotel_id"]]
    sample_submission["hotel_id"] = "36363 53586 18807 64314 60181"

    # Public CSV artifacts.
    train_part.to_csv(public_path / "train.csv", index=False)
    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

    # Train images keep their per-chain subdirectories.
    (public_path / "train_images").mkdir(exist_ok=True, parents=True)
    for img, chain_id in tqdm(
        zip(train_part["image"], train_part["chain"]),
        total=len(train_part),
        desc=f"Train images for {public_path.name}",
    ):
        chain_dir = public_path / "train_images" / str(chain_id)
        chain_dir.mkdir(exist_ok=True, parents=True)
        shutil.copy(raw_images_base_path / str(chain_id) / img, chain_dir / img)

    # Test images go into one flat directory.
    (public_path / "test_images").mkdir(exist_ok=True, parents=True)
    for img, chain_id in tqdm(
        zip(test_part["image"], test_part["chain"]),
        total=len(test_part),
        desc=f"Test images for {public_path.name}",
    ):
        shutil.copy(
            raw_images_base_path / str(chain_id) / img, public_path / "test_images" / img
        )

    # Private CSV artifact: the labeled test set.
    test_part.to_csv(private_path / "test.csv", index=False)

    # Integrity checks.
    assert len(train_part) + len(test_part) == len(
        df_to_split
    ), "Length of new_train and new_test should equal length of input dataframe"
    assert sample_submission.columns.to_list() == [
        "image",
        "hotel_id",
    ], "Sample submission columns should only be `image` and `hotel_id`"
    assert len(sample_submission) == len(test_part), "Sample submission length should match test set"
    for img, chain_id in zip(train_part["image"], train_part["chain"]):
        assert (
            public_path / "train_images" / str(chain_id) / img
        ).exists(), f"Image {img} not found in train_images folder"
    for img in test_part["image"]:
        assert (
            public_path / "test_images" / img
        ).exists(), f"Image {img} not found in test_images folder"
    assert not set(train_part["image"]) & set(
        test_part["image"]
    ), "Train and test ids overlap"

    return train_part, test_part
+ return new_train, new_test
94
+
95
+
96
def prepare(raw: Path, public: Path, private: Path):
    """Produce the main benchmark split and a parallel validation split.

    The main split is written to `public`/`private`; a second split of the
    main training data is written to sibling `public_val`/`private_val`
    directories.
    """
    full_df = read_csv(raw / "train.csv")
    # drop image ce27d36c9147cc19.jpg: it appears twice and may occur across train and test when split
    full_df = full_df[full_df["image"] != "ce27d36c9147cc19.jpg"]

    # First split: the primary benchmark data. Output in `public`/`private`
    # matches the original preparation script exactly.
    main_train, _ = _create_split(
        df_to_split=full_df,
        raw_images_base_path=raw / "train_images",
        public_path=public,
        private_path=private,
        test_size=0.1,
        random_state=0,
    )

    # Second split: carve a validation set out of the main training data and
    # write it to parallel `public_val`/`private_val` directories.
    val_public = public.parent / "public_val"
    val_private = private.parent / "private_val"

    # The main training data holds 90% of the total, so holding out 1/9 of
    # it yields a validation set matching the original 10% test share.
    _create_split(
        df_to_split=main_train,
        raw_images_base_path=raw / "train_images",
        public_path=val_public,
        private_path=val_private,
        test_size=1 / 9,
        random_state=0,  # same seed for consistency and determinism
    )