PyPI - dslighting - Versions diffs - 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl - Mend

dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (352) hide show

mlebench/competitions/hubmap-kidney-segmentation/grade.py ADDED Viewed

@@ -0,0 +1,62 @@
+import numpy as np
+import pandas as pd
+from tqdm.auto import tqdm
+from mlebench.competitions.utils import rles_to_masks
+from mlebench.grade_helpers import InvalidSubmissionError
+from mlebench.metrics import dice_coefficient
+def prepare_for_metric(
+    submission: pd.DataFrame, answers: pd.DataFrame
+) -> tuple[list[np.ndarray], list[np.ndarray]]:
+    id_col = "id"
+    if id_col not in submission.columns:
+        raise InvalidSubmissionError(f"Submission is missing the '{id_col}' column")
+    if "predicted" not in submission.columns:
+        raise InvalidSubmissionError("Submission is missing the 'predicted' column")
+    assert id_col in answers.columns, f"Answers is missing the '{id_col}' column"
+    assert "encoding" in answers.columns, "Answers is missing the 'encoded' column"
+    assert "width_pixels" in answers.columns, "Answers is missing the 'width_pixels' column"
+    assert "height_pixels" in answers.columns, "Answers is missing the 'height_pixels' column"
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Submission has {len(submission)} rows while answers has {len(answers)} rows"
+        )
+    if set(submission[id_col]) != set(answers[id_col]):
+        raise InvalidSubmissionError(f"Submission and answers have different {id_col} values")
+    submission = submission.sort_values(id_col).reset_index(drop=True)
+    answers = answers.sort_values(id_col).reset_index(drop=True)
+    # pandas reads empty cells as nan, we mark them as empty RLE strings
+    submission["predicted"] = submission["predicted"].fillna("")
+    submission["answers"] = submission["predicted"].fillna("")
+    image_heights = answers["height_pixels"].tolist()
+    image_widths = answers["width_pixels"].tolist()
+    prediced_masks = rles_to_masks(submission["predicted"].to_list(), image_heights, image_widths)
+    true_masks = rles_to_masks(answers["encoding"], image_heights, image_widths)
+    return prediced_masks, true_masks
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    """
+    Computes the mean dice coefficient for the submission and answers.
+    """
+    predicted_masks, true_masks = prepare_for_metric(submission, answers)
+    return np.mean(
+        [
+            dice_coefficient(predicted_mask, true_mask, both_empty_value=1.0)
+            for predicted_mask, true_mask in tqdm(
+                zip(predicted_masks, true_masks), total=len(predicted_masks)
+            )
+        ]
+    )

mlebench/competitions/hubmap-kidney-segmentation/prepare.py ADDED Viewed

@@ -0,0 +1,108 @@
+import shutil
+from pathlib import Path
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+from mlebench.utils import read_csv
+def prepare(raw: Path, public: Path, private: Path):
+    old_train = read_csv(raw / "train.csv")
+    old_dataset_info = read_csv(raw / "HuBMAP-20-dataset_information.csv")
+    new_train, new_test = train_test_split(old_train, train_size=12, test_size=3, random_state=0)
+    # dataset info doesnt have an id column, so quickly add it so that we can filter out old test
+    old_dataset_info["id"] = old_dataset_info["image_file"].str.replace(".tiff", "")
+    dataset_info = old_dataset_info[old_dataset_info["id"].isin(old_train["id"])]
+    # put height and width in new_test, for grading
+    new_test = new_test.merge(dataset_info[["id", "width_pixels", "height_pixels"]], on="id")
+    dataset_info = dataset_info.drop(columns=["id"], inplace=False)
+    dataset_info.to_csv(public / "HuBMAP-20-dataset_information.csv", index=False)
+    new_train.to_csv(public / "train.csv", index=False)
+    new_test.to_csv(private / "test.csv", index=False)
+    sample_submission = new_test[["id"]].copy()
+    sample_submission["predicted"] = ""
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    # basically the same as new_test but with a different column name
+    gold_submission = sample_submission.copy()
+    gold_submission["predicted"] = new_test["encoding"]
+    gold_submission.to_csv(private / "gold_submission.csv", index=False)
+    (public / "train").mkdir(parents=True, exist_ok=True)
+    for image_id in tqdm(new_train["id"], desc="Copying train images"):
+        shutil.copy(raw / "train" / f"{image_id}.tiff", public / "train" / f"{image_id}.tiff")
+        shutil.copy(raw / "train" / f"{image_id}.json", public / "train" / f"{image_id}.json")
+        shutil.copy(
+            raw / "train" / f"{image_id}-anatomical-structure.json",
+            public / "train" / f"{image_id}-anatomical-structure.json",
+        )
+    (public / "test").mkdir(parents=True, exist_ok=True)
+    for image_id in tqdm(new_test["id"], desc="Copying test images"):
+        shutil.copy(raw / "train" / f"{image_id}.tiff", public / "test" / f"{image_id}.tiff")
+        shutil.copy(raw / "train" / f"{image_id}.json", public / "test" / f"{image_id}.json")
+        shutil.copy(
+            raw / "train" / f"{image_id}-anatomical-structure.json",
+            public / "test" / f"{image_id}-anatomical-structure.json",
+        )
+    # for some reason sample_submission.csv is also in test/
+    shutil.copy(public / "sample_submission.csv", public / "test" / "sample_submission.csv")
+    # Checks
+    assert len(new_train) + len(new_test) == len(
+        old_train
+    ), "Length of new_train and new_test should equal length of old_train"
+    assert new_train.columns.to_list() == [
+        "id",
+        "encoding",
+    ], "Public train set should have 2 columns, called 'id' and 'encoding'"
+    assert new_test.columns.to_list() == [
+        "id",
+        "encoding",
+        "width_pixels",
+        "height_pixels",
+    ], "Private test set should have 2 columns called 'id' and 'encoding'"
+    assert len(sample_submission) == len(new_test), "Sample submission length should match test set"
+    assert sample_submission.columns.to_list() == [
+        "id",
+        "predicted",
+    ], "Sample submissions should have two columns, 'id' and 'predicted'"
+    assert len(gold_submission) == len(new_test), "Gold submission length should match test set"
+    assert gold_submission.columns.to_list() == [
+        "id",
+        "predicted",
+    ], "Gold submissions should have two columns, 'id' and 'predicted'"
+    assert gold_submission["predicted"].equals(
+        new_test["encoding"]
+    ), "Gold submission should match private test set"
+    # assert no overlap in ids between train and test
+    assert set(new_train["id"]).isdisjoint(
+        set(new_test["id"])
+    ), "Train and test ids should not overlap"
+    # check that the images are copied correctly
+    assert len(list((public / "train").glob("*.tiff"))) == len(
+        new_train
+    ), "Missing train tiff files"
+    assert len(list((public / "train").glob("*-anatomical-structure.json"))) == len(
+        new_train
+    ), "Missing train structure json files"
+    assert (
+        len(list((public / "train").glob("*.json"))) == len(new_train) * 2
+    ), "Missing train json files"
+    assert len(list((public / "test").glob("*.tiff"))) == len(new_test), "Missing test tiff files"
+    assert len(list((public / "test").glob("*-anatomical-structure.json"))) == len(
+        new_test
+    ), "Missing test structure json files"

mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py ADDED Viewed

@@ -0,0 +1,153 @@
+import shutil
+from pathlib import Path
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+from mlebench.utils import read_csv
+def _create_split_files(
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    full_dataset_info: pd.DataFrame,
+    raw_path: Path,
+    public_path: Path,
+    private_path: Path,
+):
+    """
+    Helper function to generate all required files for a given train/test split.
+    This function encapsulates the logic of creating CSVs, sample submissions,
+    and copying image files to ensure that the process is identical for both
+    the main split (public/private) and the validation split (public_val/private_val).
+    """
+    # Create output directories
+    public_path.mkdir(parents=True, exist_ok=True)
+    private_path.mkdir(parents=True, exist_ok=True)
+    (public_path / "train").mkdir(parents=True, exist_ok=True)
+    (public_path / "test").mkdir(parents=True, exist_ok=True)
+    # Process and save data files
+    dataset_info = full_dataset_info.drop(columns=["id"], inplace=False)
+    dataset_info.to_csv(public_path / "HuBMAP-20-dataset_information.csv", index=False)
+    train_df.to_csv(public_path / "train.csv", index=False)
+    # Put height and width in test_df, for grading
+    private_test_df = test_df.merge(full_dataset_info[["id", "width_pixels", "height_pixels"]], on="id")
+    private_test_df.to_csv(private_path / "test.csv", index=False)
+    sample_submission = private_test_df[["id"]].copy()
+    sample_submission["predicted"] = ""
+    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)
+    # for some reason sample_submission.csv is also in test/
+    shutil.copy(public_path / "sample_submission.csv", public_path / "test" / "sample_submission.csv")
+    # basically the same as private_test_df but with a different column name
+    gold_submission = sample_submission.copy()
+    gold_submission["predicted"] = private_test_df["encoding"]
+    gold_submission.to_csv(private_path / "gold_submission.csv", index=False)
+    # Copy image files
+    for image_id in tqdm(train_df["id"], desc=f"Copying train images to {public_path.name}"):
+        shutil.copy(raw_path / "train" / f"{image_id}.tiff", public_path / "train" / f"{image_id}.tiff")
+        shutil.copy(raw_path / "train" / f"{image_id}.json", public_path / "train" / f"{image_id}.json")
+        shutil.copy(
+            raw_path / "train" / f"{image_id}-anatomical-structure.json",
+            public_path / "train" / f"{image_id}-anatomical-structure.json",
+        )
+    for image_id in tqdm(private_test_df["id"], desc=f"Copying test images to {public_path.name}"):
+        shutil.copy(raw_path / "train" / f"{image_id}.tiff", public_path / "test" / f"{image_id}.tiff")
+        shutil.copy(raw_path / "train" / f"{image_id}.json", public_path / "test" / f"{image_id}.json")
+        shutil.copy(
+            raw_path / "train" / f"{image_id}-anatomical-structure.json",
+            public_path / "test" / f"{image_id}-anatomical-structure.json",
+        )
+    # Checks
+    assert train_df.columns.to_list() == [
+        "id",
+        "encoding",
+    ], f"Public train set in {public_path.name} should have 2 columns, called 'id' and 'encoding'"
+    assert private_test_df.columns.to_list() == [
+        "id",
+        "encoding",
+        "width_pixels",
+        "height_pixels",
+    ], f"Private test set in {private_path.name} should have 4 columns"
+    assert len(sample_submission) == len(private_test_df), "Sample submission length should match test set"
+    assert sample_submission.columns.to_list() == [
+        "id",
+        "predicted",
+    ], "Sample submissions should have two columns, 'id' and 'predicted'"
+    assert len(gold_submission) == len(private_test_df), "Gold submission length should match test set"
+    assert gold_submission.columns.to_list() == [
+        "id",
+        "predicted",
+    ], "Gold submissions should have two columns, 'id' and 'predicted'"
+    assert gold_submission["predicted"].equals(
+        private_test_df["encoding"]
+    ), "Gold submission should match private test set"
+    assert set(train_df["id"]).isdisjoint(
+        set(private_test_df["id"])
+    ), "Train and test ids should not overlap"
+    assert len(list((public_path / "train").glob("*.tiff"))) == len(
+        train_df
+    ), f"Missing train tiff files in {public_path.name}"
+    assert len(list((public_path / "train").glob("*-anatomical-structure.json"))) == len(
+        train_df
+    ), f"Missing train structure json files in {public_path.name}"
+    assert (
+        len(list((public_path / "train").glob("*.json"))) == len(train_df) * 2
+    ), f"Missing train json files in {public_path.name}"
+    assert len(list((public_path / "test").glob("*.tiff"))) == len(private_test_df), f"Missing test tiff files in {public_path.name}"
+    assert len(list((public_path / "test").glob("*-anatomical-structure.json"))) == len(
+        private_test_df
+    ), f"Missing test structure json files in {public_path.name}"
+def prepare(raw: Path, public: Path, private: Path):
+    old_train = read_csv(raw / "train.csv")
+    old_dataset_info = read_csv(raw / "HuBMAP-20-dataset_information.csv")
+    # --- First Split: Create the main train and test sets ---
+    # This split is identical to the original script to ensure public/private are not changed.
+    new_train, new_test = train_test_split(old_train, train_size=12, test_size=3, random_state=0)
+    # Process dataset_info once. This info is based on the full original train set
+    # and will be used for both the main and validation splits.
+    old_dataset_info["id"] = old_dataset_info["image_file"].str.replace(".tiff", "")
+    dataset_info = old_dataset_info[old_dataset_info["id"].isin(old_train["id"])]
+    # Create the original public and private directories and their contents.
+    # The results of this call will be IDENTICAL to the original script's output.
+    _create_split_files(new_train, new_test, dataset_info, raw, public, private)
+    # --- Second Split: Create a validation set from the main train set ---
+    # This creates a new, smaller training set and a validation set.
+    # The outputs are saved to parallel 'public_val' and 'private_val' directories.
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+    # Split new_train (12 samples) into train_val (9) and test_val (3).
+    # This replicates the test set size (3) and random_state (0) from the first split.
+    train_val, test_val = train_test_split(new_train, train_size=9, test_size=3, random_state=0)
+    # Create the new validation directories and their contents.
+    # The file structure and names inside these directories will mirror the original ones.
+    _create_split_files(train_val, test_val, dataset_info, raw, public_val, private_val)
+    # Final check from original script
+    assert len(new_train) + len(new_test) == len(
+        old_train
+    ), "Length of new_train and new_test should equal length of old_train"

mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py ADDED Viewed

@@ -0,0 +1,111 @@
+import numpy as np
+import pandas as pd
+from mlebench.grade_helpers import InvalidSubmissionError
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
+    # submission
+    if set(submission.columns) != {"azimuth", "zenith", "event_id"}:
+        raise InvalidSubmissionError(
+            "Submission must contain columns 'azimuth','zenith' and 'event_id'"
+        )
+    if not np.issubdtype(submission["azimuth"].dtype, np.number):
+        raise InvalidSubmissionError("Azimuth must be a number")
+    if not np.issubdtype(submission["zenith"].dtype, np.number):
+        raise InvalidSubmissionError("Zenith must be a number")
+    if not np.all(np.isfinite(submission["azimuth"])):
+        raise InvalidSubmissionError("Azimuth must not be infinite")
+    if not np.all(np.isfinite(submission["zenith"])):
+        raise InvalidSubmissionError("Zenith must not be infinite")
+    if submission["azimuth"].isnull().any():
+        raise InvalidSubmissionError("Azimuth must not be NaN")
+    if submission["zenith"].isnull().any():
+        raise InvalidSubmissionError("Zenith must not be NaN")
+    # answers
+    assert set(answers.columns) == {
+        "azimuth",
+        "zenith",
+        "event_id",
+    }, "Answers must contain columns 'azimuth','zenith' and 'event_id'"
+    assert np.issubdtype(answers["azimuth"].dtype, np.number), "Azimuth must be a number"
+    assert np.issubdtype(answers["zenith"].dtype, np.number), "Zenith must be a number"
+    assert np.all(np.isfinite(answers["azimuth"])), "Azimuth must not be infinite"
+    assert np.all(np.isfinite(answers["zenith"])), "Zenith must not be infinite"
+    assert not answers["azimuth"].isnull().any(), "Azimuth must not be NaN"
+    assert not answers["zenith"].isnull().any(), "Zenith must not be NaN"
+    # both
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError("Submission and answers must have the same length")
+    if set(submission["event_id"]) != set(answers["event_id"]):
+        raise InvalidSubmissionError("Submission and answers must have the same event_ids")
+    # sort values by id so that the order is correct
+    submission = submission.sort_values("event_id")
+    answers = answers.sort_values("event_id")
+    return {
+        "az_true": answers["azimuth"].to_numpy(),
+        "zen_true": answers["zenith"].to_numpy(),
+        "az_pred": submission["azimuth"].to_numpy(),
+        "zen_pred": submission["zenith"].to_numpy(),
+    }
+# courtesy of notebook from competition host: https://www.kaggle.com/code/sohier/mean-angular-error
+def angular_dist_score(az_true, zen_true, az_pred, zen_pred):
+    """
+    calculate the MAE of the angular distance between two directions.
+    The two vectors are first converted to cartesian unit vectors,
+    and then their scalar product is computed, which is equal to
+    the cosine of the angle between the two vectors. The inverse
+    cosine (arccos) thereof is then the angle between the two input vectors
+    Parameters:
+    -----------
+    az_true : float (or array thereof)
+        true azimuth value(s) in radian
+    zen_true : float (or array thereof)
+        true zenith value(s) in radian
+    az_pred : float (or array thereof)
+        predicted azimuth value(s) in radian
+    zen_pred : float (or array thereof)
+        predicted zenith value(s) in radian
+    Returns:
+    --------
+    dist : float
+        mean over the angular distance(s) in radian
+    """
+    # pre-compute all sine and cosine values
+    sa1 = np.sin(az_true)
+    ca1 = np.cos(az_true)
+    sz1 = np.sin(zen_true)
+    cz1 = np.cos(zen_true)
+    sa2 = np.sin(az_pred)
+    ca2 = np.cos(az_pred)
+    sz2 = np.sin(zen_pred)
+    cz2 = np.cos(zen_pred)
+    # scalar product of the two cartesian vectors (x = sz*ca, y = sz*sa, z = cz)
+    scalar_prod = sz1 * sz2 * (ca1 * ca2 + sa1 * sa2) + (cz1 * cz2)
+    # scalar product of two unit vectors is always between -1 and 1, this is against nummerical instability
+    # that might otherwise occure from the finite precision of the sine and cosine functions
+    scalar_prod = np.clip(scalar_prod, -1, 1)
+    # convert back to an angle (in radian)
+    return np.average(np.abs(np.arccos(scalar_prod)))
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    ang_dist_score_inputs = prepare_for_metric(submission, answers)
+    score = angular_dist_score(**ang_dist_score_inputs)
+    return score

mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py ADDED Viewed

@@ -0,0 +1,127 @@
+import shutil
+from pathlib import Path
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+from mlebench.utils import get_logger
+logger = get_logger(__name__)
+def prepare(raw: Path, public: Path, private: Path):
+    DEV = False
+    if DEV:
+        batch_cutoff = 66  # 66 instead of 660 when in dev mode
+    else:
+        batch_cutoff = None
+    logger.info("Loading raw metadata")
+    old_train = pd.read_parquet(raw / "train_meta.parquet")
+    # this has batch_id and event_id, we will do a test-train split based on batch_id
+    # each batch id is equally sized so we can proceed with a simple split
+    batch_ids = old_train["batch_id"].unique()[:batch_cutoff]
+    logger.info("Splitting batches into train and test")
+    train_batch_ids, test_batch_ids = train_test_split(batch_ids, test_size=0.1, random_state=0)
+    # new column tracking the split
+    old_train["split"] = None
+    old_train.loc[old_train["batch_id"].isin(train_batch_ids), "split"] = "train"
+    old_train.loc[old_train["batch_id"].isin(test_batch_ids), "split"] = "test"
+    new_train = (
+        old_train[old_train["split"] == "train"]
+        .drop(columns=["split"])
+        .reset_index(drop=True)
+        .copy()
+    )
+    new_test = (
+        old_train[old_train["split"] == "test"]
+        .drop(columns=["split"])
+        .reset_index(drop=True)
+        .copy()
+    )
+    logger.info("Creating label-less test and sample submission")
+    new_test_without_labels = new_test.drop(columns=["azimuth", "zenith"])
+    # match sample submission format
+    new_test = new_test[["event_id", "azimuth", "zenith"]]
+    # copy the format as the private test and fill dummy values like kaggle.com
+    sample_submission = new_test.copy()
+    sample_submission["azimuth"] = 1
+    sample_submission["zenith"] = 1
+    logger.info("Saving files")
+    # save the prepared tables
+    new_train.to_parquet(public / "train_meta.parquet", index=False, engine="fastparquet")
+    new_test_without_labels.to_parquet(
+        public / "test_meta.parquet", index=False, engine="fastparquet"
+    )
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    new_test.to_csv(private / "test.csv", index=False)
+    logger.info("Copying remaining files")
+    # sensor_geometry can be copied as is
+    shutil.copy(raw / "sensor_geometry.csv", public / "sensor_geometry.csv")
+    # copy the raw train files to train and test folders respectively
+    train_batch_ids = set(train_batch_ids)
+    train_dest = public / "train"
+    train_dest.mkdir(exist_ok=True, parents=True)
+    test_batch_ids = set(test_batch_ids)
+    test_dest = public / "test"
+    test_dest.mkdir(exist_ok=True, parents=True)
+    for batch_file in tqdm(
+        sorted((raw / "train").glob("*.parquet")), desc="Copying batch parquet files"
+    ):
+        batch_id = int(
+            batch_file.stem.split("_")[-1]
+        )  # i.e. go from e.g. 'train_000.parquet' to '000' to 0
+        if batch_id in train_batch_ids:
+            shutil.copy(batch_file, train_dest / batch_file.name)
+        elif batch_id in test_batch_ids:
+            shutil.copy(batch_file, test_dest / batch_file.name)
+    logger.info("Running checks")
+    # Asserts
+    assert len(list(public.glob("train/*.parquet"))) == len(
+        train_batch_ids
+    ), "Not all train batches copied"
+    assert len(list(public.glob("test/*.parquet"))) == len(
+        test_batch_ids
+    ), "Not all test batches copied"
+    assert len(train_batch_ids) + len(test_batch_ids) == len(
+        batch_ids
+    ), "Something went wrong with splitting the batches"
+    assert len(new_train) + len(new_test) == len(
+        old_train[old_train["split"].notnull()]
+    ), "Expected train + test to equal the original data"
+    assert len(sample_submission) == len(
+        new_test
+    ), "Length mismatch between private test and sample submission"
+    assert sample_submission.columns.equals(
+        new_test.columns
+    ), f"Column mismatch between sample_submission and private test"
+    assert new_train.columns.equals(
+        old_train.drop(columns=["split"]).columns
+    ), f"Unexpected columns in train, expected {old_train.columns}, got {new_train.columns}"
+    assert new_test_without_labels.columns.equals(
+        old_train.drop(columns=["azimuth", "zenith", "split"]).columns
+    ), f"Unexpected columns in test, expected {old_train.drop(columns=['azimuth', 'zenith']).columns}, got {new_test_without_labels.columns}"
+    assert (
+        len(set(new_train["event_id"]).intersection(set(new_test["event_id"]))) == 0
+    ), "Event ids overlap between train and test"
+    assert set(new_test["event_id"]) == set(
+        sample_submission["event_id"]
+    ), "Event ids mismatch between test and sample submission"
+    logger.info("Done.")

dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl