dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from pandas import DataFrame, Series
|
|
4
|
+
from sklearn.metrics import fbeta_score
|
|
5
|
+
|
|
6
|
+
from mlebench.competitions.utils import rle_decode
|
|
7
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def grade(submission: DataFrame, answers: DataFrame) -> float:
    """Return the mean per-row score of a submission against the answers.

    Each submission row is graded against the answer row at the same
    position via `grade_row`, and the per-row scores are averaged.

    Args:
        submission: DataFrame of submitted rows.
        answers: DataFrame of ground-truth rows, same length and order.

    Returns:
        The mean of the per-row scores.

    Raises:
        InvalidSubmissionError: If the submission row count does not match
            the answer row count.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Expected {len(answers)} rows in the submission DataFrame, but got {len(submission)} "
            f"rows."
        )

    # Rows are paired positionally; grade_row checks that the Ids agree.
    row_scores = [
        grade_row(sub_row, ans_row)
        for (_, sub_row), (_, ans_row) in zip(submission.iterrows(), answers.iterrows())
    ]

    return np.mean(row_scores)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def grade_row(submission: Series, answer: Series) -> float:
    """Score a single submission row against the corresponding answer row.

    Decodes the run-length-encoded masks from both rows and returns the
    F0.5 score (beta=0.5, weighting precision over recall) of the predicted
    mask against the ground-truth mask.

    Args:
        submission: Row with "Id" and "Predicted" (an RLE mask string).
        answer: Row with "Id", "Predicted", plus the "width" and "height"
            of the mask image.

    Returns:
        The F0.5 score for this row.

    Raises:
        InvalidSubmissionError: If the submission row is malformed — wrong
            number of indices, missing columns, mismatched Id, non-string
            RLE, or an RLE string that fails to decode.
    """
    expected_submission_indices = ["Id", "Predicted"]
    expected_answer_indices = expected_submission_indices + ["width", "height"]

    # Answer-side problems are bugs in our own data preparation, so plain
    # asserts (internal invariants) are appropriate for them.
    assert set(answer.index) == set(
        expected_answer_indices
    ), f"Expected indices {set(expected_answer_indices)}, got {set(answer.index)}."

    if len(submission) != len(expected_submission_indices):
        raise InvalidSubmissionError(
            f"Expected {len(expected_submission_indices)} indices in the submission DataFrame, "
            f"but got {len(submission)} indices."
        )

    if not set(submission.index).issuperset(set(expected_submission_indices)):
        raise InvalidSubmissionError(
            f"Expected columns {expected_submission_indices}, got {set(submission.index)}."
        )

    # A mismatched Id is a problem with the *submission*, not an internal
    # invariant, so report it through InvalidSubmissionError like the other
    # submission checks (a bare assert would also be stripped under -O).
    if submission["Id"] != answer["Id"]:
        raise InvalidSubmissionError(
            "Expected 'Id' column to be the same in both DataFrames."
        )

    submission_rle = submission["Predicted"]
    answer_rle = answer["Predicted"]

    if not isinstance(submission_rle, str):
        raise InvalidSubmissionError(
            f"Expected 'Predicted' column to be a string, got {type(submission_rle)}."
        )

    assert isinstance(
        answer_rle, str
    ), f"Expected 'Predicted' column in answer to be a string, got {type(answer_rle)}."

    assert "width" in answer.index, "Expected 'width' index in answer."
    assert "height" in answer.index, "Expected 'height' index in answer."

    width = int(answer["width"])
    height = int(answer["height"])

    # The `rle_decode` assumes the run-length encoded string is ordered top to bottom then left to right,
    # that is, 1 is (1,1), 2 is (2,1) and so on. The Vesuvius Challenge requires the opposite order, that
    # is, 1 is (1,1), 2 is (1,2) and so on. To fix this, we decode the transpose of the matrix, then
    # transpose it back.
    try:
        submission_matrix = rle_decode(submission_rle, height=width, width=height).T
    except AssertionError as e:
        raise InvalidSubmissionError(f"Error decoding RLE masks: {e}")

    answer_matrix = rle_decode(answer_rle, height=width, width=height).T

    assert submission_matrix.shape == (
        height,
        width,
    ), f"Expected submission matrix to have shape ({height}, {width}), got {submission_matrix.shape}."

    assert (
        submission_matrix.shape == answer_matrix.shape
    ), f"Expected submission matrix to have shape {answer_matrix.shape}, got {submission_matrix.shape}."

    y_pred = submission_matrix.flatten().astype(bool)
    y_true = answer_matrix.flatten().astype(bool)

    # Guards against masks containing values other than 0/1, whose sum
    # would change when cast to bool.
    assert np.isclose(submission_matrix.sum().sum(), y_pred.sum()), (
        f"Expected the sum of the submission matrix to be preserved when flattening and converting "
        f"to bool, but got {np.sum(y_pred)} instead of {np.sum(submission_matrix)}."
    )

    score = fbeta_score(y_true=y_true, y_pred=y_pred, beta=0.5)

    return score
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from PIL import Image
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
from mlebench.utils import read_csv
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Split the raw competition data into public and private directories.

    Fragments 1 and 2 become the visible training data under `public/train/`;
    fragment 3 becomes the hidden test fragment (anonymized as "a"), whose
    images go to `public/test/a/` and whose labels go only to `private/`.

    Args:
        raw: Directory containing the original data (`raw/train/{1,2,3}`).
        public: Output directory for participant-visible files.
        private: Output directory for grading-only files.
    """
    # Copy train images to `public/train/{1,2}/`
    shutil.copytree(
        src=raw / "train" / "1",
        dst=public / "train" / "1",
    )

    shutil.copytree(
        src=raw / "train" / "2",
        dst=public / "train" / "2",
    )

    # Create test `inklabels_rle.csv`
    inklabels_rle = read_csv(raw / "train" / "3" / "inklabels_rle.csv")

    assert (
        len(inklabels_rle) == 1
    ), f"Expected a single row in `inklabels_rle.csv`, got {len(inklabels_rle)} rows."

    img_path = raw / "train" / "3" / "ir.png"

    assert img_path.is_file(), f"Expected image file at {img_path}, but it does not exist."

    # The IR image fixes the fragment's pixel dimensions, which the grader
    # needs to reconstruct the ink mask from the RLE string.
    with Image.open(img_path) as img:
        width, height = img.size

    inklabels_rle["width"] = width
    inklabels_rle["height"] = height
    inklabels_rle["Id"] = "a"  # the single test fragment is anonymized as "a"

    inklabels_rle.to_csv(private / "inklabels_rle.csv", index=False)

    # Write `gold_submission.csv` (labels only, in submission format)
    inklabels_rle.drop(columns=["width", "height"]).to_csv(
        private / "gold_submission.csv",
        index=False,
    )

    # Copy test images to `public/test/a/`
    test_imgs = list((raw / "train" / "3").rglob("*"))

    for fpath in tqdm(test_imgs, desc="Creating test images"):
        if not fpath.is_file():
            continue

        assert fpath.suffix in [
            ".png",
            ".csv",
            ".tif",
        ], f"Expected file with extension png, csv, or tif, got `{fpath.suffix}` for file `{fpath}`"

        if fpath.name in ["inklabels.png", "inklabels_rle.csv", "ir.png"]:
            continue  # skip test labels and the IR image

        relative_path = fpath.relative_to(raw / "train" / "3")
        dst = public / "test" / "a" / relative_path
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(fpath, dst)  # everything else to `public`

    sample_submission = pd.DataFrame({"Id": ["a"], "Predicted": ["1 1 5 1"]})
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Sanity checks
    assert len(sample_submission) == len(inklabels_rle), (
        f"Expected {len(inklabels_rle)} rows in `sample_submission.csv`, got "
        f"{len(sample_submission)} rows."
    )

    actual_sample_submission = read_csv(public / "sample_submission.csv")
    actual_inklabels_rle = read_csv(private / "inklabels_rle.csv")

    assert (
        "Id" in actual_sample_submission.columns
    ), "Expected column `Id` in `sample_submission.csv`."
    assert (
        "Predicted" in actual_sample_submission.columns
    ), "Expected column `Predicted` in `sample_submission.csv`."

    assert "Id" in actual_inklabels_rle.columns, "Expected column `Id` in `inklabels_rle.csv`."
    assert (
        "Predicted" in actual_inklabels_rle.columns
    ), "Expected column `Predicted` in `inklabels_rle.csv`."
    assert (
        "width" in actual_inklabels_rle.columns
    ), "Expected column `width` in `inklabels_rle.csv`."
    assert (
        "height" in actual_inklabels_rle.columns
    ), "Expected column `height` in `inklabels_rle.csv`."

    # BUG FIX: the original error messages below were written as
    # `len(list(raw / 'train' / '1').rglob('*'))` — misplaced parentheses that
    # call `.rglob` on a `list`, so any assertion failure would raise a
    # TypeError instead of showing the message. The Path expression is now
    # grouped before `.rglob`, matching the asserted expressions themselves.
    assert len(list((public / "train" / "1").rglob("*"))) == len(
        list((raw / "train" / "1").rglob("*"))
    ), (
        f"Expected {len(list((raw / 'train' / '1').rglob('*')))} files in `public/train/1`, got "
        f"{len(list((public / 'train' / '1').rglob('*')))} files."
    )

    assert len(list((public / "train" / "2").rglob("*"))) == len(
        list((raw / "train" / "2").rglob("*"))
    ), (
        f"Expected {len(list((raw / 'train' / '2').rglob('*')))} files in `public/train/2`, got "
        f"{len(list((public / 'train' / '2').rglob('*')))} files."
    )

    n_test_actual = len(list((public / "test" / "a").rglob("*")))
    n_test_expected = len(list((raw / "train" / "3").rglob("*"))) - len(
        ["inklabels.png", "inklabels_rle.csv", "ir.png"]
    )

    assert n_test_actual == n_test_expected, (
        f"Expected {n_test_expected} files in `public/test/a`, got {n_test_actual} files."
    )
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from PIL import Image
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
from mlebench.utils import read_csv
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Prepares the data by creating a primary train/test split and a secondary
    train/validation split.

    The primary split uses fragments {1, 2} for training and {3} for testing,
    outputting to `public/` and `private/` directories.

    The secondary split uses fragment {1} for training and {2} for validation,
    outputting to `public_val/` and `private_val/` directories, mirroring the
    structure and logic of the primary split.
    """

    def _create_split_data(
        train_fragment_ids: List[str],
        test_fragment_id: str,
        raw_path: Path,
        public_path: Path,
        private_path: Path,
    ) -> None:
        """
        A generic helper function to process a set of train/test fragments
        and generate the corresponding public and private data.
        """
        # Ensure destination directories exist
        public_path.mkdir(parents=True, exist_ok=True)
        private_path.mkdir(parents=True, exist_ok=True)

        # Copy train images to `public_path/train/{id}/`
        for frag_id in train_fragment_ids:
            shutil.copytree(
                src=raw_path / "train" / frag_id,
                dst=public_path / "train" / frag_id,
                dirs_exist_ok=True,  # Make script re-runnable
            )

        test_fragment_path = raw_path / "train" / test_fragment_id

        # Create test `inklabels_rle.csv`
        inklabels_rle = read_csv(test_fragment_path / "inklabels_rle.csv")

        assert (
            len(inklabels_rle) == 1
        ), f"Expected a single row in `inklabels_rle.csv`, got {len(inklabels_rle)} rows."

        # The IR image fixes the test fragment's pixel dimensions, which the
        # grader needs to reconstruct the ink mask from the RLE string.
        img_path = test_fragment_path / "ir.png"

        assert img_path.is_file(), f"Expected image file at {img_path}, but it does not exist."

        with Image.open(img_path) as img:
            width, height = img.size

        inklabels_rle["width"] = width
        inklabels_rle["height"] = height
        # The single test fragment is always anonymized under the id "a".
        inklabels_rle["Id"] = "a"

        inklabels_rle.to_csv(private_path / "inklabels_rle.csv", index=False)

        # Write `gold_submission.csv` (labels only, in submission format)
        inklabels_rle.drop(columns=["width", "height"]).to_csv(
            private_path / "gold_submission.csv",
            index=False,
        )

        # Copy test images to `public_path/test/a/`
        test_imgs = list(test_fragment_path.rglob("*"))

        for fpath in tqdm(test_imgs, desc=f"Creating test images for {public_path.name}"):
            if not fpath.is_file():
                continue

            assert fpath.suffix in [
                ".png",
                ".csv",
                ".tif",
            ], f"Expected file with extension png, csv, or tif, got `{fpath.suffix}` for file `{fpath}`"

            relative_path = fpath.relative_to(test_fragment_path)

            if fpath.name in ["inklabels.png", "inklabels_rle.csv", "ir.png"]:
                continue  # skip test images and labels

            dst = public_path / "test" / "a" / relative_path
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(fpath, dst)  # everything else to `public_path`

        sample_submission = pd.DataFrame({"Id": ["a"], "Predicted": ["1 1 5 1"]})
        sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

        # Sanity checks
        assert len(sample_submission) == len(inklabels_rle), (
            f"Expected {len(inklabels_rle)} rows in `sample_submission.csv`, got "
            f"{len(sample_submission)} rows."
        )

        # Re-read the files we just wrote to validate the on-disk schema.
        actual_sample_submission = read_csv(public_path / "sample_submission.csv")
        actual_inklabels_rle = read_csv(private_path / "inklabels_rle.csv")

        assert (
            "Id" in actual_sample_submission.columns
        ), f"Expected column `Id` in `sample_submission.csv`."
        assert (
            "Predicted" in actual_sample_submission.columns
        ), f"Expected column `Predicted` in `sample_submission.csv`."

        assert "Id" in actual_inklabels_rle.columns, f"Expected column `Id` in `inklabels_rle.csv`."
        assert (
            "Predicted" in actual_inklabels_rle.columns
        ), f"Expected column `Predicted` in `inklabels_rle.csv`."
        assert (
            "width" in actual_inklabels_rle.columns
        ), f"Expected column `width` in `inklabels_rle.csv`."
        assert (
            "height" in actual_inklabels_rle.columns
        ), f"Expected column `height` in `inklabels_rle.csv`."

        # Every train fragment must have been copied in full.
        for frag_id in train_fragment_ids:
            assert len(list((public_path / "train" / frag_id).rglob("*"))) == len(
                list((raw_path / "train" / frag_id).rglob("*"))
            ), (
                f"Expected {len(list((raw_path / 'train' / frag_id).rglob('*')))} files in `{public_path}/train/{frag_id}`, got "
                f"{len(list((public_path / 'train' / frag_id).rglob('*')))} files."
            )

        # All test-fragment entries except the three label/IR files must appear.
        n_test_actual = len(list((public_path / "test" / "a").rglob("*")))
        n_test_expected = len(list(test_fragment_path.rglob("*"))) - len(
            ["inklabels.png", "inklabels_rle.csv", "ir.png"]
        )

        assert n_test_actual == n_test_expected, (
            f"Expected " f"{n_test_expected} " f"files in `{public_path}/test/a`, got {n_test_actual} files."
        )

    # --- Primary Split (Original Behavior) ---
    # This call generates the original `public` and `private` directories.
    # The contents will be identical to the original script's output.
    _create_split_data(
        train_fragment_ids=["1", "2"],
        test_fragment_id="3",
        raw_path=raw,
        public_path=public,
        private_path=private,
    )

    # --- New Validation Split ---
    # Define paths for the new validation set directories.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # This call splits the original training data ({1, 2}) into a new, smaller
    # training set ({1}) and a validation set ({2}). The output structure
    # in `public_val` and `private_val` will mirror the original one.
    _create_split_data(
        train_fragment_ids=["1"],
        test_fragment_id="2",
        raw_path=raw,
        public_path=public_val,
        private_path=private_val,
    )
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from pycocotools.coco import COCO
|
|
4
|
+
from pycocotools.cocoeval import COCOeval
|
|
5
|
+
|
|
6
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """Validate both dataframes and bundle them for the metric.

    Problems with the ground truth are treated as hard assertion failures,
    while problems with the participant's submission raise
    `InvalidSubmissionError` so they can be reported back gracefully.
    """
    bbox_columns = ["x_min", "y_min", "x_max", "y_max"]

    # The answers must carry a class, a box, and an image reference.
    for column in ["class_id", *bbox_columns, "image_id"]:
        assert column in answers.columns, f"Answers must have a `{column}` column"

    # Box coordinates in the ground truth must be fully populated.
    for column in bbox_columns:
        nan_mask = answers[column].isnull().values
        assert (
            not nan_mask.any()
        ), f"Answers must not contain any NaN values in `{column}` column, but got {nan_mask}"

    # The submission must provide predictions keyed by image.
    for column in ("PredictionString", "image_id"):
        if column not in submission.columns:
            raise InvalidSubmissionError(f"Submission must have a `{column}` column")

    # Predictions must cover exactly the graded images — no more, no fewer.
    if set(submission["image_id"]) != set(answers["image_id"]):
        raise InvalidSubmissionError(
            "The set of image_ids in the submission must match the set of image_ids in the answers"
        )

    return {"true_df": answers, "pred_df": submission}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission against the answers.

    Validates both dataframes, runs the COCO-style evaluation, and returns
    the first summary statistic from the COCOeval results.
    """
    prepared = prepare_for_metric(submission, answers)
    evaluator = VinBigDataEval(prepared["true_df"])
    results = evaluator.evaluate(prepared["pred_df"])
    return results.stats[0]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class VinBigDataEval:
    """Helper class for calculating the competition metric.

    You should remove the duplicated annotations from the `true_df` dataframe
    before using this script. Otherwise it may give incorrect results.

    >>> vineval = VinBigDataEval(valid_df)
    >>> cocoEvalResults = vineval.evaluate(pred_df)

    Arguments:
        true_df: pd.DataFrame Clean (no duplication) Training/Validating dataframe.

    Authors:
        Peter (https://kaggle.com/pestipeti)

    See:
        https://www.kaggle.com/pestipeti/competition-metric-map-0-4

    Returns: None

    NOTE(review): boxes are stored here as [x_min, y_min, x_max, y_max] in
    both the ground-truth and prediction dicts, while pycocotools documents
    its "bbox" field as [x, y, width, height]. Both sides use the same
    convention, which reproduces the original Kaggle metric — do not "fix"
    one side in isolation, as that would change historical scores.
    """

    def __init__(self, true_df):

        # Ground-truth dataframe. NOTE(review): __gen_categories may mutate it
        # in place by adding a `class_name` column if one is missing.
        self.true_df = true_df

        # Order of first appearance; positions in this array become the COCO
        # image ids used by both annotations and predictions.
        self.image_ids = true_df["image_id"].unique()
        # COCO-format ground-truth dataset dict.
        self.annotations = {
            "type": "instances",
            "images": self.__gen_images(self.image_ids),
            "categories": self.__gen_categories(self.true_df),
            "annotations": self.__gen_annotations(self.true_df, self.image_ids),
        }

        # COCO-format detections dict; `annotations` is filled in later by
        # evaluate() from the submitted predictions.
        self.predictions = {
            "images": self.annotations["images"].copy(),
            "categories": self.annotations["categories"].copy(),
            "annotations": None,
        }

    def __gen_categories(self, df):
        """Build the COCO `categories` list from the unique class ids."""
        print("Generating category data...")

        # Fall back to the numeric class id as the display name.
        if "class_name" not in df.columns:
            df["class_name"] = df["class_id"]

        cats = df[["class_name", "class_id"]]
        cats = cats.drop_duplicates().sort_values(by="class_id").values

        results = []

        for cat in cats:
            results.append(
                {
                    "id": cat[1],
                    "name": cat[0],
                    "supercategory": "none",
                }
            )

        return results

    def __gen_images(self, image_ids):
        """Build the COCO `images` list; each image's id is its position."""
        print("Generating image data...")
        results = []

        for idx, image_id in enumerate(image_ids):

            # Add image identification.
            results.append(
                {
                    "id": idx,
                }
            )

        return results

    def __gen_annotations(self, df, image_ids):
        """Build the COCO ground-truth `annotations` list."""
        print("Generating annotation data...")
        k = 0  # running annotation id, unique across all images
        results = []

        for idx, image_id in enumerate(image_ids):

            # Add image annotations
            for i, row in df[df["image_id"] == image_id].iterrows():

                results.append(
                    {
                        "id": k,
                        "image_id": idx,
                        "category_id": row["class_id"],
                        # Corner coordinates; mirrored in __gen_predictions.
                        "bbox": np.array([row["x_min"], row["y_min"], row["x_max"], row["y_max"]]),
                        "segmentation": [],
                        "ignore": 0,
                        "area": (row["x_max"] - row["x_min"]) * (row["y_max"] - row["y_min"]),
                        "iscrowd": 0,
                    }
                )

                k += 1

        return results

    def __decode_prediction_string(self, pred_str):
        """Parse a space-separated PredictionString into an (N, 6) float array
        of [class_id, score, x_min, y_min, x_max, y_max] rows."""
        data = list(map(float, pred_str.split(" ")))
        data = np.array(data)

        return data.reshape(-1, 6)

    def __gen_predictions(self, df, image_ids):
        """Build the COCO detections list from the submission dataframe."""
        print("Generating prediction data...")
        k = 0  # running detection id, unique across all images
        results = []

        for i, row in df.iterrows():

            image_id = row["image_id"]
            preds = self.__decode_prediction_string(row["PredictionString"])

            for j, pred in enumerate(preds):

                results.append(
                    {
                        "id": k,
                        # Map back to the positional id assigned in __gen_images.
                        "image_id": int(np.where(image_ids == image_id)[0]),
                        "category_id": int(pred[0]),
                        "bbox": np.array([pred[2], pred[3], pred[4], pred[5]]),
                        "segmentation": [],
                        "ignore": 0,
                        "area": (pred[4] - pred[2]) * (pred[5] - pred[3]),
                        "iscrowd": 0,
                        "score": pred[1],
                    }
                )

                k += 1

        return results

    def evaluate(self, pred_df, n_imgs=-1):
        """Evaluating your results

        Arguments:
            pred_df: pd.DataFrame your predicted results in the
                     competition output format.

            n_imgs: int Number of images use for calculating the
                    result. All of the images if `n_imgs` <= 0

        Returns:
            COCOEval object
        """

        if pred_df is not None:
            self.predictions["annotations"] = self.__gen_predictions(pred_df, self.image_ids)

        # Ground-truth COCO dataset, built from the in-memory dict.
        coco_ds = COCO()
        coco_ds.dataset = self.annotations
        coco_ds.createIndex()

        # Detections COCO dataset.
        coco_dt = COCO()
        coco_dt.dataset = self.predictions
        coco_dt.createIndex()

        imgIds = sorted(coco_ds.getImgIds())

        if n_imgs > 0:
            # Evaluate on a random subset of images only.
            imgIds = np.random.choice(imgIds, n_imgs)

        cocoEval = COCOeval(coco_ds, coco_dt, "bbox")
        cocoEval.params.imgIds = imgIds
        cocoEval.params.useCats = True
        cocoEval.params.iouType = "bbox"
        # Single IoU threshold of 0.4, per the competition metric (mAP@0.4).
        cocoEval.params.iouThrs = np.array([0.4])

        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()

        return cocoEval
|