dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py
@@ -0,0 +1,129 @@
+import shutil
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from mlebench.utils import read_csv
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    """
+
+    dev = False
+
+    # Create train, test from train split
+    old_train = read_csv(raw / "train.csv")
+    unique_image_ids = old_train["image_id"].unique()
+    # Original train has 15k images, original test has 3k images
+    # Our new train will have 13.5k images, our new test will have 1.5k images
+    expected_train_size = 13500
+    expected_test_size = 1500
+    train_image_ids, test_image_ids = train_test_split(
+        unique_image_ids, test_size=0.1, random_state=0
+    )
+
+    new_train = old_train[old_train["image_id"].isin(train_image_ids)]
+    answers = old_train[old_train["image_id"].isin(test_image_ids)]
+
+    # Create sample submission
+    sample_submission = pd.DataFrame(
+        {
+            "image_id": test_image_ids,
+            "PredictionString": "14 1 0 0 1 1",
+        } # As per the original sample submission
+    )
+
+    # Checks
+    assert (
+        len(set(new_train["image_id"])) == expected_train_size
+    ), f"Expected {expected_train_size} train image_ids, got {len(set(new_train['image_id']))}"
+    assert (
+        len(set(answers["image_id"])) == expected_test_size
+    ), f"Expected {expected_test_size} test image_ids, got {len(set(answers['image_id']))}"
+    assert set(new_train["image_id"]).isdisjoint(
+        set(answers["image_id"])
+    ), f"image_id is not disjoint between train and test sets"
+    assert (
+        new_train.columns.tolist() == old_train.columns.tolist()
+    ), f"Columns of new train and old train are not the same: {new_train.columns.tolist()} vs {old_train.columns.tolist()}"
+    assert len(new_train) + len(answers) == len(
+        old_train
+    ), f"Length of new train and answers should add up to the length of old train, got {len(new_train) + len(answers)} vs {len(old_train)}"
+    assert len(sample_submission) == len(
+        set(answers["image_id"])
+    ), f"Length of sample submission should be equal to the number of unique image_ids in answers, got {len(sample_submission)} vs {len(set(answers['image_id']))}"
+
+    # Reformat answers
+    def _get_consensus_annotation(answers, inspect_duplicates=False):
+        """
+        In the original train, there can be multiple annotations of the same image_id, class_id pair. (Different radiologists draw the bounding boxes differently for the same finding)
+
+        In the original test, there is only one annotation per image_id, class_id pair. The original test set is labeled by consensus of 5 radiologists. (Source: https://www.kaggle.com/competitions/vinbigdata-chest-xray-abnormalities-detection/discussion/207969#1134645)
+
+        We simulate consensus by taking the first annotation for each image_id, class_id pair.
+        """
+
+        if inspect_duplicates:
+            duplicates = answers[answers.duplicated(subset=["image_id", "class_id"], keep=False)]
+            duplicates = duplicates.sort_values(by=["image_id", "class_id"])
+            duplicates.to_csv("duplicates.csv", index=False)
+
+        answers = answers.groupby(by=["image_id", "class_id"]).first().reset_index()
+        return answers
+
+    answers = _get_consensus_annotation(answers)
+    # Filling in missing values for when there is no finding (class_id = 14)
+    answers = answers.fillna(0)
+    answers.loc[answers["class_id"] == 14, ["x_max", "y_max"]] = 1.0
+
+    # Create gold submission
+    gold = answers[["image_id", "class_id", "x_min", "y_min", "x_max", "y_max"]].copy()
+    # Create individual prediction strings
+    gold.loc[:, "PredictionString"] = gold.apply(
+        lambda row: f"{row['class_id']} 1.0 {row['x_min']} {row['y_min']} {row['x_max']} {row['y_max']}",
+        axis=1, # 1.0 is the confidence score
+    )
+    # Group by image_id and concatenate prediction strings
+    gold = gold.groupby("image_id")["PredictionString"].agg(" ".join).reset_index()
+    gold = gold.reset_index(drop=True)
+    assert len(gold) == len(
+        set(answers["image_id"])
+    ), f"Length of gold should be equal to the number of unique image_ids in answers, got {len(gold)} vs {len(set(answers['image_id']))}"
+
+    # Write CSVs
+    new_train.to_csv(public / "train.csv", index=False)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    answers.to_csv(private / "answers.csv", index=False)
+    gold.to_csv(private / "gold_submission.csv", index=False)
+
+    # Copy over files
+    (public / "test").mkdir(exist_ok=True)
+    (public / "train").mkdir(exist_ok=True)
+
+    if dev == True:
+        train_image_ids = train_image_ids[:10]
+        test_image_ids = test_image_ids[:10]
+
+    for file_id in tqdm(train_image_ids, desc="Copying train files"):
+        shutil.copyfile(
+            src=raw / "train" / f"{file_id}.dicom",
+            dst=public / "train" / f"{file_id}.dicom",
+        )
+
+    for file_id in tqdm(test_image_ids, desc="Copying test files"):
+        shutil.copyfile(
+            src=raw / "train" / f"{file_id}.dicom",
+            dst=public / "test" / f"{file_id}.dicom",
+        )
+
+    # Check files
+    assert len(list(public.glob("train/*.dicom"))) == len(
+        train_image_ids
+    ), f"Expected {len(train_image_ids)} train files, got {len(list(public.glob('train/*.dicom')))}"
+    assert len(list(public.glob("test/*.dicom"))) == len(
+        test_image_ids
+    ), f"Expected {len(test_image_ids)} test files, got {len(list(public.glob('test/*.dicom')))}"
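For readers unfamiliar with the submission format used above: each row of the gold file pairs an image_id with a PredictionString that concatenates one "class_id confidence x_min y_min x_max y_max" group per annotation. The following is a minimal sketch of that grouping step on hypothetical toy values (not data from the package), written as a compact re-derivation rather than the package's own apply-based code.

import pandas as pd

# Toy annotations: two findings for image "a", one "no finding" row (class 14) for image "b".
answers = pd.DataFrame(
    {
        "image_id": ["a", "a", "b"],
        "class_id": [0, 3, 14],
        "x_min": [10.0, 50.0, 0.0],
        "y_min": [20.0, 60.0, 0.0],
        "x_max": [30.0, 80.0, 1.0],
        "y_max": [40.0, 90.0, 1.0],
    }
)

# One "class conf x_min y_min x_max y_max" group per annotation, confidence fixed at 1.0.
pred = [
    f"{c} 1.0 {x0} {y0} {x1} {y1}"
    for c, x0, y0, x1, y1 in zip(
        answers["class_id"], answers["x_min"], answers["y_min"],
        answers["x_max"], answers["y_max"],
    )
]

# Concatenate all groups belonging to the same image, as the gold submission does.
gold = (
    answers.assign(PredictionString=pred)
    .groupby("image_id")["PredictionString"]
    .agg(" ".join)
    .reset_index()
)
print(gold)
# image "a" -> "0 1.0 10.0 20.0 30.0 40.0 3 1.0 50.0 60.0 80.0 90.0"
# image "b" -> "14 1.0 0.0 0.0 1.0 1.0"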
mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py
@@ -0,0 +1,204 @@
+import shutil
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from mlebench.utils import read_csv
+
+
+def _get_consensus_annotation(answers, inspect_duplicates=False):
+    """
+    In the original train, there can be multiple annotations of the same image_id, class_id pair. (Different radiologists draw the bounding boxes differently for the same finding)
+
+    In the original test, there is only one annotation per image_id, class_id pair. The original test set is labeled by consensus of 5 radiologists. (Source: https://www.kaggle.com/competitions/vinbigdata-chest-xray-abnormalities-detection/discussion/207969#1134645)
+
+    We simulate consensus by taking the first annotation for each image_id, class_id pair.
+    """
+
+    if inspect_duplicates:
+        duplicates = answers[answers.duplicated(subset=["image_id", "class_id"], keep=False)]
+        duplicates = duplicates.sort_values(by=["image_id", "class_id"])
+        duplicates.to_csv("duplicates.csv", index=False)
+
+    answers = answers.groupby(by=["image_id", "class_id"]).first().reset_index()
+    return answers
+
+
+def _create_split_files(
+    train_df: pd.DataFrame,
+    answers_df: pd.DataFrame,
+    train_image_ids: list,
+    test_image_ids: list,
+    public_path: Path,
+    private_path: Path,
+    raw_images_path: Path,
+    dev: bool = False,
+):
+    """
+    Helper function to process and save a single train/test split to the specified paths.
+    This encapsulates the logic for creating submissions, writing CSVs, and copying image files.
+    """
+    public_path.mkdir(parents=True, exist_ok=True)
+    private_path.mkdir(parents=True, exist_ok=True)
+
+    # Create sample submission for the test set
+    sample_submission = pd.DataFrame(
+        {
+            "image_id": test_image_ids,
+            "PredictionString": "14 1 0 0 1 1",  # As per the original sample submission
+        }
+    )
+
+    # Reformat answers
+    answers = _get_consensus_annotation(answers_df)
+    # Filling in missing values for when there is no finding (class_id = 14)
+    answers = answers.fillna(0)
+    answers.loc[answers["class_id"] == 14, ["x_max", "y_max"]] = 1.0
+
+    # Create gold submission
+    gold = answers[["image_id", "class_id", "x_min", "y_min", "x_max", "y_max"]].copy()
+    # Create individual prediction strings
+    gold.loc[:, "PredictionString"] = gold.apply(
+        lambda row: f"{row['class_id']} 1.0 {row['x_min']} {row['y_min']} {row['x_max']} {row['y_max']}",
+        axis=1,  # 1.0 is the confidence score
+    )
+    # Group by image_id and concatenate prediction strings
+    gold = gold.groupby("image_id")["PredictionString"].agg(" ".join).reset_index()
+    gold = gold.reset_index(drop=True)
+
+    # Checks for this split
+    assert len(set(train_df["image_id"])) == len(
+        train_image_ids
+    ), f"Number of unique image_ids in train_df does not match the provided list for {public_path.name}"
+    assert len(set(answers["image_id"])) == len(
+        test_image_ids
+    ), f"Number of unique image_ids in answers does not match the provided list for {public_path.name}"
+    assert set(train_df["image_id"]).isdisjoint(
+        set(answers["image_id"])
+    ), f"image_id is not disjoint between train and test sets for {public_path.name}"
+    assert len(sample_submission) == len(
+        set(answers["image_id"])
+    ), f"Length of sample submission is incorrect for {public_path.name}"
+    assert len(gold) == len(
+        set(answers["image_id"])
+    ), f"Length of gold submission is incorrect for {public_path.name}"
+
+    # Write CSVs
+    train_df.to_csv(public_path / "train.csv", index=False)
+    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)
+    answers.to_csv(private_path / "answers.csv", index=False)
+    gold.to_csv(private_path / "gold_submission.csv", index=False)
+
+    # Copy over files
+    (public_path / "test").mkdir(exist_ok=True)
+    (public_path / "train").mkdir(exist_ok=True)
+
+    if dev:
+        train_image_ids = train_image_ids[:10]
+        test_image_ids = test_image_ids[:10]
+
+    for file_id in tqdm(train_image_ids, desc=f"Copying {public_path.name} train files"):
+        shutil.copyfile(
+            src=raw_images_path / "train" / f"{file_id}.dicom",
+            dst=public_path / "train" / f"{file_id}.dicom",
+        )
+
+    for file_id in tqdm(test_image_ids, desc=f"Copying {public_path.name} test files"):
+        shutil.copyfile(
+            src=raw_images_path / "train" / f"{file_id}.dicom",
+            dst=public_path / "test" / f"{file_id}.dicom",
+        )
+
+    # Check files
+    assert len(list(public_path.glob("train/*.dicom"))) == len(
+        train_image_ids
+    ), f"Incorrect number of train files copied for {public_path.name}"
+    assert len(list(public_path.glob("test/*.dicom"))) == len(
+        test_image_ids
+    ), f"Incorrect number of test files copied for {public_path.name}"
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    Also creates a parallel validation split (public_val, private_val).
+    """
+
+    dev = False
+
+    # Create train, test from train split
+    old_train = read_csv(raw / "train.csv")
+    unique_image_ids = old_train["image_id"].unique()
+
+    # --- 1. Original Train/Test Split ---
+    # Original train has 15k images, original test has 3k images
+    # Our new train will have 13.5k images, our new test will have 1.5k images
+    expected_train_size = 13500
+    expected_test_size = 1500
+    train_image_ids, test_image_ids = train_test_split(
+        unique_image_ids, test_size=0.1, random_state=0
+    )
+
+    new_train = old_train[old_train["image_id"].isin(train_image_ids)]
+    answers = old_train[old_train["image_id"].isin(test_image_ids)]
+
+    # Checks
+    assert (
+        len(set(new_train["image_id"])) == expected_train_size
+    ), f"Expected {expected_train_size} train image_ids, got {len(set(new_train['image_id']))}"
+    assert (
+        len(set(answers["image_id"])) == expected_test_size
+    ), f"Expected {expected_test_size} test image_ids, got {len(set(answers['image_id']))}"
+    assert set(new_train["image_id"]).isdisjoint(
+        set(answers["image_id"])
+    ), f"image_id is not disjoint between train and test sets"
+    assert len(new_train) + len(answers) == len(
+        old_train
+    ), f"Length of new train and answers should add up to the length of old train, got {len(new_train) + len(answers)} vs {len(old_train)}"
+
+    # Create all files for the original public/private split
+    _create_split_files(
+        train_df=new_train,
+        answers_df=answers,
+        train_image_ids=train_image_ids,
+        test_image_ids=test_image_ids,
+        public_path=public,
+        private_path=private,
+        raw_images_path=raw,
+        dev=dev,
+    )
+
+    # --- 2. New Validation Split (from the `new_train` set) ---
+    print("\nCreating validation split...")
+
+    # Define paths for the new validation set
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # The new validation set should have the same size as the original test set
+    train_for_val_split_ids = new_train["image_id"].unique()
+    val_test_size = expected_test_size / len(train_for_val_split_ids)
+
+    # Split `new_train` again to create a smaller train set and a validation set
+    # We use the same random_state for consistency in splitting methodology
+    train_val_image_ids, test_val_image_ids = train_test_split(
+        train_for_val_split_ids, test_size=val_test_size, random_state=0
+    )
+
+    train_val = new_train[new_train["image_id"].isin(train_val_image_ids)]
+    answers_val = new_train[new_train["image_id"].isin(test_val_image_ids)]
+
+    # Create all files for the new validation split
+    _create_split_files(
+        train_df=train_val,
+        answers_df=answers_val,
+        train_image_ids=train_val_image_ids,
+        test_image_ids=test_val_image_ids,
+        public_path=public_val,
+        private_path=private_val,
+        raw_images_path=raw,
+        dev=dev,
+    )
+    print("Validation split created successfully.")
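The only non-obvious number in prepare_val.py is the second test fraction: because the validation split is drawn from the 13,500-image training portion rather than the full 15,000 images, a fraction of 1500 / 13500 (equivalently 0.1 / 0.9 ≈ 0.111) reproduces the 1,500-image size of the original test set. A quick numeric check of that proportion, using the figures quoted in the script's comments:

n_total = 15_000                      # images in the original train set
test = 0.1 * n_total                  # 1,500 held out as the new test set
remaining = n_total - test            # 13,500 left as the new train set
val_fraction = test / remaining       # 1500 / 13500 = 0.1 / 0.9 ≈ 0.1111
print(val_fraction, val_fraction * remaining)  # ~0.111, 1500.0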
mlebench/competitions/whale-categorization-playground/grade.py
@@ -0,0 +1,41 @@
+import pandas as pd
+
+from mlebench.grade_helpers import InvalidSubmissionError
+from mlebench.metrics import mean_average_precision_at_k
+
+
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
+    # Answers checks
+    assert set(answers.columns) == {
+        "Image",
+        "Id",
+    }, "Answers DataFrame must have 'Image' and 'Id' columns."
+
+    # Submission checks
+    if set(submission.columns) != {"Image", "Id"}:
+        raise InvalidSubmissionError("Submission DataFrame must have 'Image' and 'Id' columns.")
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            "Submission and answers DataFrames must have the same number of rows."
+        )
+
+    # Ensure the IDs match
+    if not set(submission["Image"]) == set(answers["Image"]):
+        raise InvalidSubmissionError("Submission and answers have different Image IDs")
+
+    # Sort to ensure we're grading the right rows
+    submission = submission.sort_values("Image").reset_index(drop=True)
+    answers = answers.sort_values("Image").reset_index(drop=True)
+
+    y_true = answers["Id"].astype(str).str.split(" ").apply(set).tolist()
+    y_pred = submission["Id"].astype(str).str.split(" ").tolist()
+
+    return {"actual": y_true, "predicted": y_pred}
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    # Prepare the data for metric calculation
+    prepped = prepare_for_metric(submission, answers)
+    return mean_average_precision_at_k(
+        actual=prepped["actual"], predicted=prepped["predicted"], k=5
+    )
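The grader above reduces each row to a set of true labels and an ordered list of up to five predicted labels, then scores them with MAP@5. The following self-contained sketch shows that metric on toy whale-ID rows; the apk helper is an illustrative re-implementation of the standard metric, not the mlebench.metrics.mean_average_precision_at_k function itself, and the IDs are taken from the sample-submission string used in prepare.py.

def apk(actual: set, predicted: list, k: int = 5) -> float:
    # Average precision at k for one row: precision at each first-time hit,
    # averaged over min(len(actual), k).
    score, hits = 0.0, 0
    for i, p in enumerate(predicted[:k]):
        if p in actual and p not in predicted[:i]:
            hits += 1
            score += hits / (i + 1)
    return score / min(len(actual), k) if actual else 0.0

# Each answers row holds one true Id; each submission row holds up to five guesses.
actual = [{"w_1287fbc"}, {"new_whale"}]
predicted = [
    ["w_98baff9", "w_1287fbc", "w_7554f44", "w_1eafe46", "new_whale"],  # hit at rank 2
    ["new_whale", "w_1287fbc", "w_98baff9", "w_7554f44", "w_1eafe46"],  # hit at rank 1
]
map_at_5 = sum(apk(a, p) for a, p in zip(actual, predicted)) / len(actual)
print(map_at_5)  # (0.5 + 1.0) / 2 = 0.75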
mlebench/competitions/whale-categorization-playground/prepare.py
@@ -0,0 +1,103 @@
+import shutil
+from pathlib import Path
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from mlebench.utils import read_csv
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    """
+
+    # Create train, test from train split
+    old_train = read_csv(raw / "train.csv")
+
+    old_train["split"] = "undecided"
+    target_test_size = 0.1
+
+    # seeded random generator for numpy
+    np_rng = np.random.default_rng(0)
+
+    # ensure each id occurs in train and test set at least once
+    # when there's only one image for an id, goes randomly to train or test
+    whale_ids = old_train["Id"].unique()
+    for whale_id in whale_ids:
+        whale_images = old_train[old_train["Id"] == whale_id]
+        if len(whale_images) >= 2:
+            # randomly assign one of these to train and one to test
+            selected = whale_images.sample(2, random_state=0)
+            old_train.loc[selected.index[0], "split"] = "train"
+            old_train.loc[selected.index[1], "split"] = "test"
+        else:
+            # randomly assign this one image to train or test
+            old_train.loc[whale_images.index[0], "split"] = np_rng.choice(
+                ["train", "test"], replace=False, p=[1 - target_test_size, target_test_size]
+            )
+
+    # split the remaining data
+    remaining_data = old_train[old_train["split"] == "undecided"]
+    train, test = train_test_split(remaining_data, test_size=target_test_size, random_state=0)
+    old_train.loc[train.index, "split"] = "train"
+    old_train.loc[test.index, "split"] = "test"
+
+    # finally, can split out into separate dataframes
+    new_train = old_train[old_train["split"] == "train"].drop(columns=["split"]).copy()
+    answers = old_train[old_train["split"] == "test"].drop(columns=["split"]).copy()
+
+    # If a whale Id is only in the test set, it should be labeled as new_whale instead
+    ids_in_test_but_not_train = set(answers["Id"]) - set(new_train["Id"])
+    answers.loc[answers["Id"].isin(ids_in_test_but_not_train), "Id"] = "new_whale"
+
+    # Create sample submission
+    sample_submission = answers.copy()
+    sample_submission["Id"] = "new_whale w_1287fbc w_98baff9 w_7554f44 w_1eafe46"
+
+    # Checks
+    assert len(answers) == len(
+        sample_submission
+    ), "Answers and sample submission should have the same length"
+    assert new_train.shape[1] == 2, "Train should have exactly 2 columns"
+    assert sample_submission.shape[1] == 2, "Sample submission should have exactly 2 columns"
+    assert answers.shape[1] == 2, "Answers should have exactly 2 columns"
+    assert (
+        "new_whale" in answers["Id"].values
+    ), "Answers should contain at least some values with 'new_whale' in the 'Id' column"
+    assert len(new_train) + len(answers) == len(
+        old_train
+    ), "The combined length of new_train and answers should equal the length of old_train"
+
+    # Write CSVs
+    answers.to_csv(private / "test.csv", index=False)
+    new_train.to_csv(public / "train.csv", index=False)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+    # Copy over files
+    (public / "test").mkdir(exist_ok=True)
+    (public / "train").mkdir(exist_ok=True)
+
+    for file_id in tqdm(new_train["Image"], desc="Copying train images"):
+        shutil.copyfile(
+            src=raw / "train" / f"{file_id}",
+            dst=public / "train" / f"{file_id}",
+        )
+
+    for file_id in tqdm(answers["Image"], desc="Copying test images"):
+        shutil.copyfile(
+            src=raw / "train" / f"{file_id}",
+            dst=public / "test" / f"{file_id}",
+        )
+
+    # File checks
+    train_files = list(public.glob("train/*.jpg"))
+    test_files = list(public.glob("test/*.jpg"))
+    assert len(train_files) == len(
+        new_train
+    ), "Train dir should have the same number of images as the length of train set"
+    assert len(test_files) == len(
+        answers
+    ), "Test dir should have the same number of images as the length of test set"
+    assert not set(train_files) & set(test_files), "Train and test files should be distinct"
mlebench/competitions/whale-categorization-playground/prepare_val.py
@@ -0,0 +1,196 @@
+import shutil
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from mlebench.utils import read_csv
+
+
+def _perform_split(
+    df: pd.DataFrame, target_test_size: float, random_seed: int
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Performs the custom splitting logic on a given dataframe.
+
+    This logic ensures that for IDs with 2 or more images, one is guaranteed
+    to be in the train set and one in the test set. The remaining data is
+    split randomly.
+
+    Args:
+        df: The input dataframe with "Image" and "Id" columns.
+        target_test_size: The approximate fraction of data to allocate to the test set.
+        random_seed: The random seed for reproducibility.
+
+    Returns:
+        A tuple containing the train dataframe and the test dataframe.
+    """
+    # Make a copy to avoid modifying the original dataframe
+    data_to_split = df.copy()
+    data_to_split["split"] = "undecided"
+
+    # seeded random generator for numpy
+    np_rng = np.random.default_rng(random_seed)
+
+    # ensure each id occurs in train and test set at least once
+    # when there's only one image for an id, goes randomly to train or test
+    whale_ids = data_to_split["Id"].unique()
+    for whale_id in whale_ids:
+        whale_images = data_to_split[data_to_split["Id"] == whale_id]
+        if len(whale_images) >= 2:
+            # randomly assign one of these to train and one to test
+            selected = whale_images.sample(2, random_state=random_seed)
+            data_to_split.loc[selected.index[0], "split"] = "train"
+            data_to_split.loc[selected.index[1], "split"] = "test"
+        else:
+            # randomly assign this one image to train or test
+            data_to_split.loc[whale_images.index[0], "split"] = np_rng.choice(
+                ["train", "test"],
+                replace=False,
+                p=[1 - target_test_size, target_test_size],
+            )
+
+    # split the remaining data
+    remaining_data = data_to_split[data_to_split["split"] == "undecided"]
+    if not remaining_data.empty:
+        train, test = train_test_split(
+            remaining_data, test_size=target_test_size, random_state=random_seed
+        )
+        data_to_split.loc[train.index, "split"] = "train"
+        data_to_split.loc[test.index, "split"] = "test"
+
+    # finally, can split out into separate dataframes
+    train_df = data_to_split[data_to_split["split"] == "train"].drop(
+        columns=["split"]
+    )
+    test_df = data_to_split[data_to_split["split"] == "test"].drop(columns=["split"])
+
+    return train_df.copy(), test_df.copy()
+
+
+def _write_output_files(
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    public_dir: Path,
+    private_dir: Path,
+    raw_dir: Path,
+):
+    """
+    Writes all required output files for a given train/test split.
+    This includes CSVs, images, and a sample submission file.
+    """
+    public_dir.mkdir(exist_ok=True)
+    private_dir.mkdir(exist_ok=True)
+
+    # Make a copy of the test dataframe to create the final answers
+    answers = test_df.copy()
+
+    # If a whale Id is only in the test set, it should be labeled as new_whale instead
+    ids_in_test_but_not_train = set(answers["Id"]) - set(train_df["Id"])
+    answers.loc[answers["Id"].isin(ids_in_test_but_not_train), "Id"] = "new_whale"
+
+    # Create sample submission
+    sample_submission = answers.copy()
+    sample_submission["Id"] = "new_whale w_1287fbc w_98baff9 w_7554f44 w_1eafe46"
+
+    # Checks
+    assert len(answers) == len(
+        sample_submission
+    ), "Answers and sample submission should have the same length"
+    assert train_df.shape[1] == 2, "Train should have exactly 2 columns"
+    assert sample_submission.shape[1] == 2, "Sample submission should have exactly 2 columns"
+    assert answers.shape[1] == 2, "Answers should have exactly 2 columns"
+    assert (
+        "new_whale" in answers["Id"].values
+    ), "Answers should contain at least some values with 'new_whale' in the 'Id' column"
+    assert len(train_df) + len(answers) == len(train_df) + len(
+        test_df
+    ), "The combined length of train_df and answers should equal their original combined length"
+
+    # Write CSVs
+    answers.to_csv(private_dir / "test.csv", index=False)
+    train_df.to_csv(public_dir / "train.csv", index=False)
+    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+
+    # Copy over files
+    (public_dir / "test").mkdir(exist_ok=True)
+    (public_dir / "train").mkdir(exist_ok=True)
+
+    for file_id in tqdm(train_df["Image"], desc=f"Copying train images to {public_dir}"):
+        shutil.copyfile(
+            src=raw_dir / "train" / f"{file_id}",
+            dst=public_dir / "train" / f"{file_id}",
+        )
+
+    for file_id in tqdm(answers["Image"], desc=f"Copying test images to {public_dir}"):
+        shutil.copyfile(
+            src=raw_dir / "train" / f"{file_id}",
+            dst=public_dir / "test" / f"{file_id}",
+        )
+
+    # File checks
+    train_files = list(public_dir.glob("train/*.jpg"))
+    test_files = list(public_dir.glob("test/*.jpg"))
+    assert len(train_files) == len(
+        train_df
+    ), f"Train dir {public_dir / 'train'} should have the same number of images as the length of its train set"
+    assert len(test_files) == len(
+        answers
+    ), f"Test dir {public_dir / 'test'} should have the same number of images as the length of its test set"
+    assert not set(train_files) & set(test_files), "Train and test files should be distinct"
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    Additionally, it creates a second parallel split (train_val, test_val) for validation purposes.
+    """
+    old_train = read_csv(raw / "train.csv")
+    target_test_size = 0.1
+    random_seed = 0
+
+    # --- Step 1: Create the original train/test split ---
+    # This split generates the main competition data.
+    new_train, answers = _perform_split(
+        df=old_train, target_test_size=target_test_size, random_seed=random_seed
+    )
+
+    # Write the original output files. This part is unchanged in its output.
+    _write_output_files(
+        train_df=new_train,
+        test_df=answers,
+        public_dir=public,
+        private_dir=private,
+        raw_dir=raw,
+    )
+
+    # --- Step 2: Create the new validation split ---
+    # This takes the `new_train` set from the first split and splits it again
+    # to create a validation set (`test_val`) of a similar size to the original `answers`.
+    # The new, smaller training set is `train_val`.
+
+    # Define paths for the new validation directories
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # To get a validation test set of size ~0.1*N from a training set of size ~0.9*N,
+    # the new test_size must be 0.1/0.9.
+    val_target_test_size = target_test_size / (1 - target_test_size)
+
+    # Perform the second split on the `new_train` data using the same logic and seed.
+    train_val, test_val = _perform_split(
+        df=new_train,
+        target_test_size=val_target_test_size,
+        random_seed=random_seed,
+    )
+
+    # Write the validation output files to the new directories.
+    _write_output_files(
+        train_df=train_val,
+        test_df=test_val,
+        public_dir=public_val,
+        private_dir=private_val,
+        raw_dir=raw,
+    )