dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py
@@ -0,0 +1,219 @@
+ import random
+ import shutil
+ from pathlib import Path
+
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     rng = random.Random(0)
+
+     # there are two subsets of training data:
+
+     # 1. one of instances that have bounding boxes
+     # 2. one of instances that have segmentations
+
+     # we need to preserve the ratios of the sizes of these subsets to the total train samples
+
+     # additionally, there is an overlap between the two subsets
+     # we need to preserve this overlap
+
+     DEV = False
+     old_train = read_csv(raw / "train.csv")
+
+     num_old_train = len(old_train)
+     if DEV:
+         DEV_RATIO = 0.175
+         num_old_train = round(DEV_RATIO * num_old_train)
+
+     # 2019 train folders (StudyInstanceUIDs), 1500 test folders, 2019 / (1500 + 2019) ~ 0.60 original train ratio
+     # each folder has ~ 300 images
+     # We use 0.1 ratio to avoid taking too many samples out of train
+     TRAIN_RATIO = 0.1
+     num_train_samples = round(num_old_train * TRAIN_RATIO)
+
+     # bboxes
+     old_train_bboxes = read_csv(raw / "train_bounding_boxes.csv")
+     if DEV:
+         old_train_bboxes = old_train_bboxes.sample(frac=DEV_RATIO, random_state=0)
+
+     old_train_bbox_ids = sorted(old_train_bboxes["StudyInstanceUID"].unique())
+     old_num_train_bbox_ids = len(old_train_bbox_ids)  # 235
+     new_num_train_bbox_ids = round(old_num_train_bbox_ids * TRAIN_RATIO)
+
+     # segmentations
+     old_train_segmentation_path = raw / "segmentations"
+     old_train_segmentation_ids = sorted([f.stem for f in old_train_segmentation_path.glob("*.nii")])
+     if DEV:
+         old_train_segmentation_ids = rng.sample(
+             old_train_segmentation_ids, round(DEV_RATIO * len(old_train_segmentation_ids))
+         )
+     old_num_train_segmentation_ids = len(old_train_segmentation_ids)  # 87
+     new_num_train_segmentation_ids = round(old_num_train_segmentation_ids * TRAIN_RATIO)
+
+     # overlap: list of StudyInstanceUIDs that have both bounding boxes and segmentations
+     old_overlap_ids = [uid for uid in old_train_bbox_ids if uid in old_train_segmentation_ids]
+     old_num_overlap = len(old_overlap_ids)  # 40
+     new_num_overlap = round(old_num_overlap * TRAIN_RATIO)
+
+     # start populating new train by picking the overlap instances
+     # sample new_num_overlap instances from the overlap randomly
+     new_overlap_ids = rng.sample(old_overlap_ids, new_num_overlap)
+     new_bboxes_ids = new_overlap_ids.copy()
+     new_segmentations_ids = new_overlap_ids.copy()
+     new_train_ids = new_overlap_ids.copy()
+
+     # add the `new_num_train_segmentation_ids - new_num_overlap` segmentation IDs that are not in the overlap
+     additional_segmentation_ids = rng.sample(
+         [uid for uid in old_train_segmentation_ids if uid not in old_overlap_ids],
+         new_num_train_segmentation_ids - new_num_overlap,
+     )
+     new_segmentations_ids += additional_segmentation_ids
+     new_train_ids += additional_segmentation_ids
+
+     # add the `new_num_train_bbox_ids - new_num_overlap` bounding-box IDs that are not in the overlap
+     additional_bbox_ids = rng.sample(
+         [uid for uid in old_train_bbox_ids if uid not in old_overlap_ids],
+         new_num_train_bbox_ids - new_num_overlap,
+     )
+     new_bboxes_ids += additional_bbox_ids
+     new_train_ids += additional_bbox_ids
+
+     if DEV:
+         # old train has whatever is currently in new_train_ids
+         # + a random sample of the rest, s.t. it's DEV_RATIO of the original train
+         dev_old_train_ids = new_train_ids + rng.sample(
+             [uid for uid in old_train["StudyInstanceUID"] if uid not in new_train_ids],
+             num_old_train - len(new_train_ids),
+         )
+         old_train = old_train[old_train["StudyInstanceUID"].isin(dev_old_train_ids)].copy()
+
+     # then, fill the rest of the new train.
+     new_train_ids += rng.sample(
+         [uid for uid in old_train["StudyInstanceUID"] if uid not in new_train_ids],
+         num_train_samples - len(new_train_ids),
+     )
+
+     train = old_train[old_train["StudyInstanceUID"].isin(new_train_ids)].copy()
+     train.to_csv(public / "train.csv", index=False)
+
+     train_bboxes = old_train_bboxes[
+         old_train_bboxes["StudyInstanceUID"].isin(new_bboxes_ids)
+     ].copy()
+     train_bboxes.to_csv(public / "train_bounding_boxes.csv", index=False)
+
+     answers = old_train[~old_train["StudyInstanceUID"].isin(new_train_ids)].copy()
+     # columns become rows for the test and sample submission, so also for answers
+     answers = answers.melt(
+         id_vars="StudyInstanceUID", var_name="prediction_type", value_name="fractured"
+     )
+     answers["row_id"] = answers["StudyInstanceUID"] + "_" + answers["prediction_type"]
+     answers.to_csv(private / "answers.csv", index=False)
+
+     sample_submission = answers[["row_id", "fractured"]].copy()
+     sample_submission["fractured"] = 0.5
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     public_test = answers.drop(columns=["fractured"]).copy()
+     public_test.to_csv(public / "test.csv", index=False)
+
+     # assert that the melting worked
+     assert answers["StudyInstanceUID"].nunique() * 8 == len(
+         answers
+     ), "Melting failed, incorrect length"
+     assert answers.columns.tolist() == [
+         "StudyInstanceUID",
+         "prediction_type",
+         "fractured",
+         "row_id",
+     ], "Melting went wrong, columns are wrong"
+
+     # column checks
+     train_cols = ["StudyInstanceUID", "patient_overall", "C1", "C2", "C3", "C4", "C5", "C6", "C7"]
+     assert train.columns.tolist() == train_cols, "Train columns are wrong"
+     bbox_cols = ["StudyInstanceUID", "x", "y", "width", "height", "slice_number"]
+     assert train_bboxes.columns.tolist() == bbox_cols, "Bounding box columns are wrong"
+     test_cols = ["StudyInstanceUID", "prediction_type", "row_id"]
+     assert public_test.columns.tolist() == test_cols, "Test columns are wrong"
+     submission_cols = ["row_id", "fractured"]
+     assert sample_submission.columns.tolist() == submission_cols, "Submission columns are wrong"
+
+     # Check that the correct number of training samples is selected
+     assert len(new_train_ids) == round(len(old_train) * TRAIN_RATIO), (
+         "Incorrect number of training samples."
+         " The number of `new_train_ids` doesn't match the expected number given the `TRAIN_RATIO`."
+     )
+     assert len(train) + answers["StudyInstanceUID"].nunique() == len(old_train), (
+         "Incorrect number of training samples."
+         " New train and test splits don't sum to the length of the original train set."
+     )
+
+     # Check that the correct number of bounding box samples is selected
+     assert len(new_bboxes_ids) == round(
+         len(old_train_bbox_ids) * TRAIN_RATIO
+     ), "Incorrect number of bounding box samples"
+
+     # Check that the correct number of segmentation samples is selected
+     assert len(new_segmentations_ids) == round(
+         len(old_train_segmentation_ids) * TRAIN_RATIO
+     ), "Incorrect number of segmentation samples"
+
+     # Check that the overlap is preserved
+     assert len(new_overlap_ids) == round(
+         len(old_overlap_ids) * TRAIN_RATIO
+     ), "Incorrect overlap preservation"
+
+     # check that test and train don't share study instance ids
+     assert set(train["StudyInstanceUID"]).isdisjoint(
+         set(public_test["StudyInstanceUID"].unique())
+     ), "Train and test share study instance ids"
+
+     # Now that splitting is done, copy over images accordingly
+     (public / "segmentations").mkdir(exist_ok=True)
+
+     for file_id in tqdm(
+         new_segmentations_ids, desc="Copying segmentations", total=len(new_segmentations_ids)
+     ):
+         shutil.copyfile(
+             src=old_train_segmentation_path / f"{file_id}.nii",
+             dst=public / "segmentations" / f"{file_id}.nii",
+         )
+
+     (public / "train_images").mkdir(exist_ok=True)
+     for study_id in tqdm(
+         train["StudyInstanceUID"],
+         desc="Copying train images",
+         total=len(train),
+         unit="StudyInstance",
+     ):
+         shutil.copytree(
+             src=raw / "train_images" / study_id,
+             dst=public / "train_images" / study_id,
+             dirs_exist_ok=True,
+         )
+     (public / "test_images").mkdir(exist_ok=True)
+     for study_id in tqdm(
+         public_test["StudyInstanceUID"].unique(),
+         desc="Copying test images",
+         total=public_test["StudyInstanceUID"].nunique(),
+         unit="StudyInstance",
+     ):
+         shutil.copytree(
+             src=raw / "train_images" / study_id,
+             dst=public / "test_images" / study_id,
+             dirs_exist_ok=True,
+         )
+
+     assert len(list((public / "segmentations").glob("*.nii"))) == len(
+         new_segmentations_ids
+     ), "Incorrect number of segmentations copied"
+     # check that all the right image directories are copied
+     assert len(list((public / "train_images").glob("*"))) == len(
+         train
+     ), "Incorrect number of train images copied"
+
+     assert (
+         len(list((public / "test_images").glob("*"))) == public_test["StudyInstanceUID"].nunique()
+     ), "Incorrect number of test images copied"
mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py
@@ -0,0 +1,257 @@
+ import random
+ import shutil
+ from pathlib import Path
+ from typing import List, Tuple
+
+ import pandas as pd
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def _split_and_save(
+     source_train_df: pd.DataFrame,
+     source_bbox_df: pd.DataFrame,
+     source_seg_ids: List[str],
+     raw_images_path: Path,
+     raw_segmentations_path: Path,
+     train_ratio: float,
+     rng: random.Random,
+     public_dir: Path,
+     private_dir: Path,
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
+     """
+     Performs a data split based on provided source data and saves the results.
+
+     This function replicates the original script's logic for splitting data based on
+     ratios of bounding boxes, segmentations, and their overlap. It then saves the
+     resulting train/test sets, metadata, and copies image files to the specified
+     public and private directories.
+
+     Args:
+         source_train_df: DataFrame with the main training metadata to be split.
+         source_bbox_df: DataFrame with bounding box data to be split.
+         source_seg_ids: List of StudyInstanceUIDs that have segmentations.
+         raw_images_path: Path to the original directory of all study images.
+         raw_segmentations_path: Path to the original directory of all segmentations.
+         train_ratio: The ratio of the source data to be used for the new training set.
+         rng: A random number generator instance for deterministic sampling.
+         public_dir: The target public directory for outputs.
+         private_dir: The target private directory for outputs.
+
+     Returns:
+         A tuple containing the data for the *training* portion of the split:
+         (new_train_df, new_train_bboxes_df, new_segmentation_ids)
+     """
+     public_dir.mkdir(exist_ok=True, parents=True)
+     private_dir.mkdir(exist_ok=True, parents=True)
+
+     num_source_train = len(source_train_df)
+     num_train_samples = round(num_source_train * train_ratio)
+
+     # bboxes
+     source_train_bbox_ids = sorted(source_bbox_df["StudyInstanceUID"].unique())
+     source_num_train_bbox_ids = len(source_train_bbox_ids)
+     new_num_train_bbox_ids = round(source_num_train_bbox_ids * train_ratio)
+
+     # segmentations
+     source_num_train_segmentation_ids = len(source_seg_ids)
+     new_num_train_segmentation_ids = round(source_num_train_segmentation_ids * train_ratio)
+
+     # overlap: list of StudyInstanceUIDs that have both bounding boxes and segmentations
+     source_overlap_ids = [uid for uid in source_train_bbox_ids if uid in source_seg_ids]
+     source_num_overlap = len(source_overlap_ids)
+     new_num_overlap = round(source_num_overlap * train_ratio)
+
+     # start populating new train by picking the overlap instances
+     # sample new_num_overlap instances from the overlap randomly
+     new_overlap_ids = rng.sample(source_overlap_ids, new_num_overlap)
+     new_bboxes_ids = new_overlap_ids.copy()
+     new_segmentations_ids = new_overlap_ids.copy()
+     new_train_ids = new_overlap_ids.copy()
+
+     # add the `new_num_train_segmentation_ids - new_num_overlap` segmentation IDs that are not in the overlap
+     additional_segmentation_ids = rng.sample(
+         [uid for uid in source_seg_ids if uid not in source_overlap_ids],
+         new_num_train_segmentation_ids - new_num_overlap,
+     )
+     new_segmentations_ids += additional_segmentation_ids
+     new_train_ids += additional_segmentation_ids
+
+     # add the `new_num_train_bbox_ids - new_num_overlap` bounding-box IDs that are not in the overlap
+     additional_bbox_ids = rng.sample(
+         [uid for uid in source_train_bbox_ids if uid not in source_overlap_ids],
+         new_num_train_bbox_ids - new_num_overlap,
+     )
+     new_bboxes_ids += additional_bbox_ids
+     new_train_ids += additional_bbox_ids
+
+     # then, fill the rest of the new train.
+     num_to_sample = num_train_samples - len(new_train_ids)
+     available_pool = [uid for uid in source_train_df["StudyInstanceUID"] if uid not in new_train_ids]
+     new_train_ids += rng.sample(
+         available_pool,
+         min(num_to_sample, len(available_pool)),  # Avoid sampling more than available
+     )
+
+     train = source_train_df[source_train_df["StudyInstanceUID"].isin(new_train_ids)].copy()
+     train.to_csv(public_dir / "train.csv", index=False)
+
+     train_bboxes = source_bbox_df[
+         source_bbox_df["StudyInstanceUID"].isin(new_bboxes_ids)
+     ].copy()
+     train_bboxes.to_csv(public_dir / "train_bounding_boxes.csv", index=False)
+
+     answers = source_train_df[~source_train_df["StudyInstanceUID"].isin(new_train_ids)].copy()
+     # columns become rows for the test and sample submission, so also for answers
+     answers = answers.melt(
+         id_vars="StudyInstanceUID", var_name="prediction_type", value_name="fractured"
+     )
+     answers["row_id"] = answers["StudyInstanceUID"] + "_" + answers["prediction_type"]
+     answers.to_csv(private_dir / "answers.csv", index=False)
+
+     sample_submission = answers[["row_id", "fractured"]].copy()
+     sample_submission["fractured"] = 0.5
+     sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+
+     public_test = answers.drop(columns=["fractured"]).copy()
+     public_test.to_csv(public_dir / "test.csv", index=False)
+
+     # assert that the melting worked
+     if answers["StudyInstanceUID"].nunique() > 0:
+         assert answers["StudyInstanceUID"].nunique() * 8 == len(
+             answers
+         ), "Melting failed, incorrect length"
+     assert answers.columns.tolist() == [
+         "StudyInstanceUID",
+         "prediction_type",
+         "fractured",
+         "row_id",
+     ], "Melting went wrong, columns are wrong"
+
+     # column checks
+     train_cols = ["StudyInstanceUID", "patient_overall", "C1", "C2", "C3", "C4", "C5", "C6", "C7"]
+     assert train.columns.tolist() == train_cols, "Train columns are wrong"
+     bbox_cols = ["StudyInstanceUID", "x", "y", "width", "height", "slice_number"]
+     assert train_bboxes.columns.tolist() == bbox_cols, "Bounding box columns are wrong"
+     test_cols = ["StudyInstanceUID", "prediction_type", "row_id"]
+     assert public_test.columns.tolist() == test_cols, "Test columns are wrong"
+     submission_cols = ["row_id", "fractured"]
+     assert sample_submission.columns.tolist() == submission_cols, "Submission columns are wrong"
+
+     # check that test and train don't share study instance ids
+     assert set(train["StudyInstanceUID"]).isdisjoint(
+         set(public_test["StudyInstanceUID"].unique())
+     ), "Train and test share study instance ids"
+
+     # Now that splitting is done, copy over images accordingly
+     (public_dir / "segmentations").mkdir(exist_ok=True)
+     for file_id in tqdm(
+         new_segmentations_ids,
+         desc=f"Copying segmentations to {public_dir.name}",
+         total=len(new_segmentations_ids),
+     ):
+         shutil.copyfile(
+             src=raw_segmentations_path / f"{file_id}.nii",
+             dst=public_dir / "segmentations" / f"{file_id}.nii",
+         )
+
+     (public_dir / "train_images").mkdir(exist_ok=True)
+     for study_id in tqdm(
+         train["StudyInstanceUID"],
+         desc=f"Copying train images to {public_dir.name}",
+         total=len(train),
+         unit="StudyInstance",
+     ):
+         shutil.copytree(
+             src=raw_images_path / study_id,
+             dst=public_dir / "train_images" / study_id,
+             dirs_exist_ok=True,
+         )
+     (public_dir / "test_images").mkdir(exist_ok=True)
+     for study_id in tqdm(
+         public_test["StudyInstanceUID"].unique(),
+         desc=f"Copying test images to {public_dir.name}",
+         total=public_test["StudyInstanceUID"].nunique(),
+         unit="StudyInstance",
+     ):
+         shutil.copytree(
+             src=raw_images_path / study_id,
+             dst=public_dir / "test_images" / study_id,
+             dirs_exist_ok=True,
+         )
+
+     return train, train_bboxes, new_segmentations_ids
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     rng = random.Random(0)
+
+     # there are two subsets of training data:
+
+     # 1. one of instances that have bounding boxes
+     # 2. one of instances that have segmentations
+
+     # we need to preserve the ratios of the sizes of these subsets to the total train samples
+
+     # additionally, there is an overlap between the two subsets
+     # we need to preserve this overlap
+
+     DEV = False
+     old_train = read_csv(raw / "train.csv")
+
+     num_old_train = len(old_train)
+     if DEV:
+         # This DEV logic is preserved from the original script to ensure
+         # identical behavior if ever enabled. It is currently inactive.
+         DEV_RATIO = 0.175
+         num_old_train = round(DEV_RATIO * num_old_train)
+         # The complex DEV logic from the original script is not fully ported
+         # as it was intertwined with the main logic and is disabled by default.
+         # This simplified version just subsamples the main dataframe.
+         old_train = old_train.sample(n=num_old_train, random_state=0)
+
+     # 2019 train folders (StudyInstanceUIDs), 1500 test folders, 2019 / (1500 + 2019) ~ 0.60 original train ratio
+     # each folder has ~ 300 images
+     # We use 0.1 ratio to avoid taking too many samples out of train
+     TRAIN_RATIO = 0.1
+
+     # Load all raw source data once
+     old_train_bboxes = read_csv(raw / "train_bounding_boxes.csv")
+     old_train_segmentation_path = raw / "segmentations"
+     old_train_segmentation_ids = sorted([f.stem for f in old_train_segmentation_path.glob("*.nii")])
+
+     # === Step 1: Perform the original data split to create `public` and `private` ===
+     # This call produces the main train/test split. The outputs in `public` and
+     # `private` will be identical to the original script's output.
+     # We capture the resulting training set data to be used as the source for our next split.
+     train_df, train_bboxes_df, train_seg_ids = _split_and_save(
+         source_train_df=old_train,
+         source_bbox_df=old_train_bboxes,
+         source_seg_ids=old_train_segmentation_ids,
+         raw_images_path=raw / "train_images",
+         raw_segmentations_path=raw / "segmentations",
+         train_ratio=TRAIN_RATIO,
+         rng=rng,
+         public_dir=public,
+         private_dir=private,
+     )
+
+     # === Step 2: Perform a second split on the new training set to create a validation set ===
+     # This call takes the *training set* from the first split (`train_df`) and
+     # splits it again using the exact same logic and ratio.
+     # The results are saved to the new `public_val` and `private_val` directories.
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     _split_and_save(
+         source_train_df=train_df,
+         source_bbox_df=train_bboxes_df,
+         source_seg_ids=train_seg_ids,
+         raw_images_path=raw / "train_images",  # Image source is still the main raw folder
+         raw_segmentations_path=raw / "segmentations",
+         train_ratio=TRAIN_RATIO,  # Use the same split ratio
+         rng=rng,
+         public_dir=public_val,
+         private_dir=private_val,
+     )
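One design detail worth noting in prepare_val.py: both _split_and_save calls share the single random.Random(0) instance created in prepare(), so the validation split consumes the generator state left behind by the first split. A tiny self-contained illustration (not from the package) of why that keeps both splits reproducible across runs:

import random

rng = random.Random(0)
first_split = rng.sample(range(100), 5)   # analogous to the public/private sampling
second_split = rng.sample(range(100), 5)  # analogous to the public_val/private_val sampling

# Re-seeding and replaying the same calls reproduces both splits exactly.
fresh = random.Random(0)
assert first_split == fresh.sample(range(100), 5)
assert second_split == fresh.sample(range(100), 5)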
mlebench/competitions/rsna-breast-cancer-detection/grade.py
@@ -0,0 +1,65 @@
+ import pandas as pd
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def pfbeta(labels, predictions, beta):
+     """
+     Adapted from https://www.kaggle.com/code/sohier/probabilistic-f-score
+     adaptations: code formatting, this docstring.
+     """
+     y_true_count = 0
+     ctp = 0
+     cfp = 0
+
+     for idx in range(len(labels)):
+         prediction = min(max(predictions[idx], 0), 1)
+         if labels[idx]:
+             y_true_count += 1
+             ctp += prediction
+         else:
+             cfp += prediction
+
+     beta_squared = beta * beta
+     c_precision = ctp / (ctp + cfp)
+     c_recall = ctp / y_true_count
+     if c_precision > 0 and c_recall > 0:
+         result = (
+             (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)
+         )
+         return result
+     else:
+         return 0
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
+     id_col = "prediction_id"
+     target_col = "cancer"
+
+     assert id_col in answers.columns, f"{id_col} not in answers"
+     assert target_col in answers.columns, f"{target_col} not in answers"
+
+     if id_col not in submission.columns:
+         raise InvalidSubmissionError(f"{id_col} not in submission")
+     if target_col not in submission.columns:
+         raise InvalidSubmissionError(f"{target_col} not in submission")
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError("submission and answers have different lengths")
+
+     submission = submission.sort_values(by=id_col)
+     answers = answers.sort_values(by=id_col)
+
+     if (submission[id_col].values != answers[id_col].values).any():
+         raise InvalidSubmissionError(f"{id_col} not aligned")
+
+     labels = answers[target_col].to_numpy()
+     predictions = submission[target_col].to_numpy()
+
+     return {"labels": labels, "predictions": predictions}
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     pf1_inputs = prepare_for_metric(submission, answers)
+     # F1 score, so beta=1; https://www.kaggle.com/competitions/rsna-breast-cancer-detection/discussion/370123
+     score = pfbeta(**pf1_inputs, beta=1)
+     return score
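pfbeta above is a probabilistic F-beta: it sums the clipped predicted probabilities into ctp/cfp instead of counting thresholded positives. A small worked example (not part of the package, assuming the pfbeta defined above is in scope) for beta=1:

labels = [1, 1, 0, 0]
predictions = [0.9, 0.4, 0.2, 0.1]

# ctp = 0.9 + 0.4 = 1.3, cfp = 0.2 + 0.1 = 0.3, y_true_count = 2
# precision = 1.3 / 1.6 = 0.8125, recall = 1.3 / 2 = 0.65
# pF1 = 2 * 0.8125 * 0.65 / (0.8125 + 0.65) ≈ 0.722
print(pfbeta(labels, predictions, beta=1))  # ≈ 0.722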
mlebench/competitions/rsna-breast-cancer-detection/prepare.py
@@ -0,0 +1,141 @@
+ import shutil
+ from pathlib import Path
+
+ from sklearn.model_selection import train_test_split
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+ # columns in train.csv
+ train_columns = [
+     "site_id",
+     "patient_id",
+     "image_id",
+     "laterality",
+     "view",
+     "age",
+     "cancer",
+     "biopsy",
+     "invasive",
+     "BIRADS",
+     "implant",
+     "density",
+     "machine_id",
+     "difficult_negative_case",
+ ]
+ # columns in test.csv
+ test_columns = [
+     "site_id",
+     "patient_id",
+     "image_id",
+     "laterality",
+     "view",
+     "age",
+     "implant",
+     "machine_id",
+     "prediction_id",
+ ]
+
+ # columns in answers/submission
+ submission_columns = ["prediction_id", "cancer"]
+
+
+ DEV = False
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     old_train = read_csv(raw / "train.csv")
+     # work on 5k samples for now, instead of 54k
+     if DEV:
+         old_train = old_train.sample(5000, random_state=42)
+
+     # "You can expect roughly 8,000 patients" in the test set
+     # so, split on patients. There are 11913 patients in train set
+     patient_ids = old_train["patient_id"].unique()
+     # Original ratio is 8000 / (8000 + 11913) ~ 0.4
+     # We use 0.1 to avoid taking too many samples out of train
+     train_patients, test_patients = train_test_split(patient_ids, test_size=0.1, random_state=42)
+     train_patients = set(train_patients)
+     test_patients = set(test_patients)
+
+     new_train = old_train[old_train["patient_id"].isin(train_patients)].copy()
+     # don't index the `prediction_id` (last col) since it's not in train and we need to build it
+     # index also the `cancer` column, which we'll drop later for the without_labels version
+     answers = old_train[old_train["patient_id"].isin(test_patients)][
+         test_columns[:-1] + ["cancer"]
+     ].copy()
+
+     new_train.to_csv(public / "train.csv", index=False)
+
+     answers["prediction_id"] = answers["patient_id"].astype(str) + "_" + answers["laterality"]
+     new_test_without_labels = answers.drop(columns=["cancer"])
+     new_test_without_labels.to_csv(public / "test.csv", index=False)
+
+     # merge multiple prediction_ids for the same patient into one for sample_submission and private test
+     answers = answers[submission_columns].copy()
+     # just take the first label for each prediction id -- the rest will be identical duplicates
+     answers = answers.groupby("prediction_id").first().reset_index()
+     answers.to_csv(private / "answers.csv", index=False)
+
+     sample_submission = answers.copy()
+     sample_submission["cancer"] = new_train.cancer.mean()  # mean cancer rate in train set
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     assert len(sample_submission) == len(
+         answers
+     ), "sample_submission and test.csv should have the same number of rows"
+     assert len(new_test_without_labels) + len(new_train) == len(
+         old_train
+     ), "The sum of the rows in new_test_without_labels and new_train should be equal to the number of rows in old_train"
+     # because of the merging
+     assert len(answers) != len(
+         new_test_without_labels
+     ), "new_test and new_test_without_labels should have different number of rows"
+
+     assert (
+         answers.columns.tolist() == submission_columns
+     ), f"answers should have columns {submission_columns}"
+     assert (
+         sample_submission.columns.tolist() == submission_columns
+     ), f"sample_submission should have columns {submission_columns}"
+
+     assert (
+         new_train.columns.tolist() == old_train.columns.tolist()
+     ), f"new_train should have columns {old_train.columns.tolist()}, got {new_train.columns.tolist()}"
+     assert (
+         new_test_without_labels.columns.tolist() == test_columns
+     ), f"new_test_without_labels should have columns {test_columns}, got {new_test_without_labels.columns.tolist()}"
+
+     assert set(new_test_without_labels["patient_id"]).isdisjoint(
+         set(new_train["patient_id"])
+     ), "new_test_without_labels and new_train should have disjoint patient_ids"
+
+     # finally, split the images
+     (public / "train_images").mkdir(exist_ok=True)
+     for patient_id in tqdm(train_patients, total=len(train_patients)):
+         patient_id_str = str(patient_id)
+         patient_dir = public / "train_images" / patient_id_str
+         patient_dir.mkdir(exist_ok=True)
+         image_ids = new_train[new_train["patient_id"] == patient_id]["image_id"].to_list()
+         for image_id in image_ids:
+             shutil.copy(raw / "train_images" / patient_id_str / f"{image_id}.dcm", patient_dir)
+
+     (public / "test_images").mkdir(exist_ok=True)
+     for patient_id in tqdm(test_patients, total=len(test_patients)):
+         patient_id_str = str(patient_id)
+         patient_dir = public / "test_images" / patient_id_str
+         patient_dir.mkdir(exist_ok=True)
+         image_ids = new_test_without_labels[new_test_without_labels["patient_id"] == patient_id][
+             "image_id"
+         ].to_list()
+         for image_id in image_ids:
+             shutil.copy(raw / "train_images" / patient_id_str / f"{image_id}.dcm", patient_dir)
+
+     # final checks
+     assert len(list((public / "train_images").rglob("*.dcm"))) == len(
+         new_train
+     ), "Number of images in train_images should be equal to the number of rows in new_train"
+     assert len(list((public / "test_images").rglob("*.dcm"))) == len(
+         new_test_without_labels
+     ), "Number of images in test_images should be equal to the number of rows in new_test_without_labels"