dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import random
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from mlebench.utils import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _organize_data_by_category(metadata: dict) -> dict:
|
|
16
|
+
"""Organizes image and annotation data by category ID."""
|
|
17
|
+
annotations_images_by_category = {}
|
|
18
|
+
for annotation, image in list(zip(metadata["annotations"], metadata["images"])):
|
|
19
|
+
assert annotation["image_id"] == image["image_id"]
|
|
20
|
+
category_id = annotation["category_id"]
|
|
21
|
+
if category_id not in annotations_images_by_category:
|
|
22
|
+
annotations_images_by_category[category_id] = []
|
|
23
|
+
annotations_images_by_category[category_id].append(
|
|
24
|
+
{
|
|
25
|
+
"annotation": annotation,
|
|
26
|
+
"image": image,
|
|
27
|
+
}
|
|
28
|
+
)
|
|
29
|
+
return annotations_images_by_category
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _split_data(data_by_category: dict, test_size: float, random_state: int) -> tuple[dict, dict]:
    """Split each category's samples into train and test subsets.

    The split happens independently per category so every category is
    represented on both sides; an empty half on either side is a hard error.
    """
    train_by_category: dict = {}
    test_by_category: dict = {}
    for category_id, samples in data_by_category.items():
        # Splitting within the category guarantees it appears in both
        # the train and the test mapping.
        train_part, test_part = train_test_split(
            samples, test_size=test_size, random_state=random_state
        )
        assert len(train_part) > 0 and len(test_part) > 0
        train_by_category[category_id] = train_part
        test_by_category[category_id] = test_part
    return train_by_category, test_by_category
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _process_train_set(
    train_data: dict, base_metadata: dict, raw_path: Path, output_public_path: Path
):
    """Processes and writes the training set data, images, and metadata.

    Copies every training image into a category-keyed subdirectory layout,
    re-assigns per-category image ids (1-based within the category), and
    writes the rebuilt ``train_metadata.json`` next to the images.
    """
    logger.info(f"Processing train set for output: {output_public_path}")

    # Keep the dataset-wide metadata fields, but rebuild the per-sample lists.
    new_train_metadata = base_metadata.copy()
    new_train_metadata.update({"annotations": [], "images": []})

    total_samples = sum(len(samples) for samples in train_data.values())
    output_train_images_path = output_public_path / "train_images"

    with tqdm(
        desc=f"Creating train dataset in {output_public_path.name}",
        total=total_samples,
    ) as pbar:
        for category_id, annotations_images in train_data.items():
            # Nested layout: {category_id[:3]}/{category_id[3:]}
            category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
            (output_train_images_path / category_subdir).mkdir(exist_ok=True, parents=True)

            for idx, sample in enumerate(annotations_images):
                # Re-index within the category so original indices leak nothing.
                new_image_id = f"{category_id:05d}__{(idx + 1):03d}"
                new_file_name = f"{category_subdir}/{new_image_id}.jpg"

                annotation_copy = sample["annotation"].copy()
                annotation_copy["image_id"] = new_image_id
                new_train_metadata["annotations"].append(annotation_copy)

                image_copy = sample["image"].copy()
                image_copy["image_id"] = new_image_id
                image_copy["file_name"] = new_file_name
                new_train_metadata["images"].append(image_copy)

                shutil.copyfile(
                    src=raw_path / "train_images" / sample["image"]["file_name"],
                    dst=output_train_images_path / new_file_name,
                )

                pbar.update(1)

    with open(output_public_path / "train_metadata.json", "w") as f:
        json.dump(new_train_metadata, f, indent=4, sort_keys=True)

    # Sanity checks: one copied file per metadata entry, one annotation per image.
    assert len(list(output_train_images_path.glob("**/*.jpg"))) == len(
        new_train_metadata["images"]
    )
    assert len(new_train_metadata["annotations"]) == len(new_train_metadata["images"])
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _process_test_set(
    test_data: dict, raw_path: Path, output_public_path: Path, output_private_path: Path
):
    """Processes and writes the test set data, images, metadata, and private answers.

    Flattens and deterministically shuffles the per-category samples, assigns
    fresh flat image ids (so neither filename nor index reveals the label),
    copies the images, then writes the public metadata and sample submission
    plus the private answers CSV.
    """
    logger.info(
        f"Processing test set for outputs: {output_public_path} and {output_private_path}"
    )
    new_test_metadata = {"annotations": [], "images": []}

    # Flatten, then shuffle with a fixed seed so category blocks are not
    # contiguous in the output ordering (and runs are reproducible).
    flattened = [sample for samples in test_data.values() for sample in samples]
    random.Random(0).shuffle(flattened)

    output_test_images_path = output_public_path / "test_images"

    for idx, sample in tqdm(
        enumerate(flattened),
        desc=f"Creating test dataset in {output_public_path.name}",
        total=len(flattened),
    ):
        new_image_id = str(idx)
        # Nested layout keyed by the flat index only, never the category.
        new_file_name = f"{idx // 1000:03d}/test-{idx:06d}.jpg"

        annotation_copy = sample["annotation"].copy()
        annotation_copy["image_id"] = new_image_id
        new_test_metadata["annotations"].append(annotation_copy)

        image_copy = sample["image"].copy()
        image_copy["image_id"] = new_image_id
        image_copy["file_name"] = new_file_name
        new_test_metadata["images"].append(image_copy)

        dst_path = output_test_images_path / new_file_name
        dst_path.parent.mkdir(exist_ok=True, parents=True)
        shutil.copyfile(src=raw_path / "train_images" / sample["image"]["file_name"], dst=dst_path)

    # Only the image entries are published; annotations stay private.
    with open(output_public_path / "test_metadata.json", "w") as f:
        json.dump(new_test_metadata["images"], f, indent=4, sort_keys=True)

    answers_df = pd.DataFrame(
        [
            {"Id": image["image_id"], "Predicted": annotation["category_id"]}
            for image, annotation in zip(
                new_test_metadata["images"], new_test_metadata["annotations"]
            )
        ]
    )
    answers_df.to_csv(output_private_path / "answers.csv", index=False)

    sample_df = pd.DataFrame(
        [{"Id": image["image_id"], "Predicted": 42} for image in new_test_metadata["images"]]
    )
    sample_df.to_csv(output_public_path / "sample_submission.csv", index=False)

    # Sanity checks: counts must agree across images, annotations, and CSVs.
    assert len(list(output_test_images_path.glob("**/*.jpg"))) == len(new_test_metadata["images"])
    assert len(new_test_metadata["annotations"]) == len(new_test_metadata["images"])
    assert len(answers_df) == len(new_test_metadata["images"])
    assert len(sample_df) == len(answers_df)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the public/private competition datasets plus a validation variant.

    The raw `train_metadata.json` (see
    https://www.kaggle.com/competitions/herbarium-2022-fgvc9/data) holds
    parallel `images`/`annotations` lists for the N samples plus dataset-wide
    metadata (`categories`, `genera`, `distances`, `licenses`,
    `institutions`). Samples are grouped per category, split per category so
    every category appears in both train and test, and the resulting sets are
    materialized via `_process_train_set` / `_process_test_set`. Image indices
    are re-assigned on write so the new test indices do not leak labels.

    Two rounds of splitting are performed:
      1. raw -> `public` / `private` (identical to the original outputs), and
      2. the round-1 train set -> `public_val` / `private_val`, a smaller
         train set plus a validation set mirroring the same structure.
    """
    # Load the raw "table of contents" and bucket samples per category.
    with open(raw / "train_metadata.json") as f:
        raw_metadata = json.load(f)
    by_category = _organize_data_by_category(raw_metadata)

    # Round 1: the main train/test split, identical to the original script.
    logger.info("--- Creating original train/test split for 'public' and 'private' directories ---")
    main_train, main_test = _split_data(by_category, test_size=0.2, random_state=0)

    _process_train_set(main_train, raw_metadata, raw, public)
    _process_test_set(main_test, raw, public, private)
    logger.info("Finished creating original 'public' and 'private' datasets.")

    # Round 2: split the round-1 train set again to produce validation outputs.
    logger.info("--- Creating validation split for 'public_val' and 'private_val' directories ---")
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    public_val.mkdir(exist_ok=True, parents=True)
    private_val.mkdir(exist_ok=True, parents=True)

    # Same splitting logic as round 1 for consistency, applied to the
    # smaller pool (the round-1 train set).
    val_train, val_test = _split_data(main_train, test_size=0.2, random_state=0)

    _process_train_set(val_train, raw_metadata, raw, public_val)
    _process_test_set(val_test, raw, public_val, private_val)
    logger.info("Finished creating validation 'public_val' and 'private_val' datasets.")
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from sklearn.metrics import roc_auc_score
|
|
4
|
+
|
|
5
|
+
from mlebench.competitions.utils import prepare_for_auroc_metric
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score the submission against the answers using AUROC on the `label` column."""
    inputs = prepare_for_auroc_metric(
        submission=submission, answers=answers, id_col="id", target_col="label"
    )
    return roc_auc_score(y_true=inputs["y_true"], y_score=inputs["y_score"])
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.model_selection import train_test_split
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Re-split the raw training data into new public train/test sets.

    The raw test set has no labels, so the labeled train set is split again;
    the new test split matches the original test set's proportion of the data.

    Args:
        raw: Directory containing the raw competition data
            (`train_labels.csv`, `train/`, `test/`).
        public: Output directory for participant-visible files.
        private: Output directory for the grading answers.
    """
    old_train = pd.read_csv(raw / "train_labels.csv")

    # Size the new test split to mirror the original train/test ratio.
    num_test = len(list((raw / "test").glob("*.tif")))
    test_ratio = num_test / (len(old_train) + num_test)

    new_train_ids, new_test_ids = train_test_split(
        old_train["id"], test_size=test_ratio, random_state=0
    )
    new_train = old_train[old_train["id"].isin(new_train_ids)]
    new_test = old_train[old_train["id"].isin(new_test_ids)]

    assert set(new_train["id"]).isdisjoint(
        set(new_test["id"])
    ), "Train should not contain id's of test images"
    assert len(new_train_ids) + len(new_test_ids) == len(
        old_train
    ), "The combined length of new_train_ids and new_test_ids should equal the length of old_train"

    # Copy over image files. `parents=True` so this also works when the
    # output directories themselves do not exist yet (the original code
    # failed with FileNotFoundError in that case).
    (public / "train").mkdir(exist_ok=True, parents=True)
    (public / "test").mkdir(exist_ok=True, parents=True)
    private.mkdir(exist_ok=True, parents=True)

    for file_id in tqdm(new_train_ids, desc="Copying train images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.tif",
            dst=public / "train" / f"{file_id}.tif",
        )
    for file_id in tqdm(new_test_ids, desc="Copying test images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.tif",
            dst=public / "test" / f"{file_id}.tif",
        )

    # Sample submission: same ids as the new test set, constant placeholder label.
    sample_submission = new_test.copy()
    sample_submission["label"] = 0

    # Write the label, answer, and sample-submission CSVs.
    new_train.to_csv(public / "train_labels.csv", index=False)
    new_test.to_csv(private / "answers.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Check integrity of files copied
    assert len(list(public.glob("train/*.tif"))) == len(
        new_train_ids
    ), "Number of train images should be equal to the number of unique id's in the train set"
    assert len(list(public.glob("test/*.tif"))) == len(
        new_test_ids
    ), "Number of test images should be equal to the number of unique id's in the test set"
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _create_split(
    df_to_split: pd.DataFrame,
    image_source_dir: Path,
    test_split_size: float,
    public_dest: Path,
    private_dest: Path,
    random_state: int,
) -> pd.DataFrame:
    """
    Split a labels dataframe into train/test, copy the matching images, and
    write out the label, answer, and sample-submission files.

    Args:
        df_to_split: Labels (with an "id" column) to partition.
        image_source_dir: Directory holding the source `.tif` images.
        test_split_size: Fraction of rows assigned to the test split.
        public_dest: Destination for participant-visible outputs
            (train set, test images, sample submission).
        private_dest: Destination for the private answers file.
        random_state: Seed making the split reproducible.

    Returns:
        The training portion of the split as a pandas DataFrame.
    """
    # Make sure every destination directory exists.
    for root in (public_dest, private_dest):
        root.mkdir(exist_ok=True, parents=True)
    (public_dest / "train").mkdir(exist_ok=True)
    (public_dest / "test").mkdir(exist_ok=True)

    # Partition the ids, then select the matching label rows.
    ids_train, ids_test = train_test_split(
        df_to_split["id"], test_size=test_split_size, random_state=random_state
    )
    split_train = df_to_split[df_to_split["id"].isin(ids_train)]
    split_test = df_to_split[df_to_split["id"].isin(ids_test)]

    assert set(split_train["id"]).isdisjoint(
        set(split_test["id"])
    ), "Train should not contain id's of test images"
    assert len(ids_train) + len(ids_test) == len(
        df_to_split
    ), "The combined length of train_ids and test_ids should equal the length of the source df"

    # Copy over image files
    for file_id in tqdm(ids_train, desc=f"Copying train images to {public_dest}"):
        shutil.copyfile(
            src=image_source_dir / f"{file_id}.tif",
            dst=public_dest / "train" / f"{file_id}.tif",
        )
    for file_id in tqdm(ids_test, desc=f"Copying test images to {public_dest}"):
        shutil.copyfile(
            src=image_source_dir / f"{file_id}.tif",
            dst=public_dest / "test" / f"{file_id}.tif",
        )

    # The sample submission mirrors the test ids with a constant placeholder label.
    placeholder = split_test.copy()
    placeholder["label"] = 0

    split_train.to_csv(public_dest / "train_labels.csv", index=False)
    split_test.to_csv(private_dest / "answers.csv", index=False)
    placeholder.to_csv(public_dest / "sample_submission.csv", index=False)

    # Verify that the copied files match the split sizes.
    assert len(list(public_dest.glob("train/*.tif"))) == len(
        ids_train
    ), "Number of train images should be equal to the number of unique id's in the train set"
    assert len(list(public_dest.glob("test/*.tif"))) == len(
        ids_test
    ), "Number of test images should be equal to the number of unique id's in the test set"

    return split_train
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare the competition data via two successive splits.

    1. Main split: raw data -> primary train/test sets, written to
       `public/` and `private/` (byte-identical to the original script).
    2. Validation split: the primary train set is split again into a smaller
       train set plus a validation set, written to `public_val/` and
       `private_val/` with the same directory layout.
    """
    # Common setup.
    seed = 0
    labels = pd.read_csv(raw / "train_labels.csv")
    source_images = raw / "train"

    # --- 1. Main split ---
    # Match the original test-set proportion exactly so outputs are unchanged.
    pool_test_count = len(list((raw / "test").glob("*.tif")))
    main_ratio = pool_test_count / (len(labels) + pool_test_count)

    # `primary_train_df` is the larger portion of this first split.
    primary_train_df = _create_split(
        df_to_split=labels,
        image_source_dir=source_images,
        test_split_size=main_ratio,
        public_dest=public,
        private_dest=private,
        random_state=seed,
    )

    # --- 2. Validation split ---
    # The second split runs on the (smaller) primary train set, so its test
    # fraction is scaled up to keep the same absolute number of test samples.
    # Guard against division by zero in the degenerate all-test case.
    val_ratio = 0 if (1 - main_ratio) <= 0 else main_ratio / (1 - main_ratio)

    _create_split(
        df_to_split=primary_train_df,
        image_source_dir=source_images,
        test_split_size=val_ratio,
        public_dest=public.parent / "public_val",
        private_dest=private.parent / "private_val",
        random_state=seed,
    )
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
4
|
+
|
|
5
|
+
from . import kullback_leibler_divergence as kl_divergence
|
|
6
|
+
from .constants import ID_COL, TARGET_COLS
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> tuple:
    """Validate the submission against the answers and align both for scoring.

    Raises InvalidSubmissionError for malformed submissions. Returns the pair
    (submission, answers) sorted by ID, with the answers' vote counts
    normalized into probabilities.
    """
    # Validate submission structure, length, probability sums, and IDs.
    if ID_COL not in submission.columns:
        raise InvalidSubmissionError(f"Submission must contain {ID_COL} column")
    if any(col not in submission.columns for col in TARGET_COLS):
        raise InvalidSubmissionError(f"Submission must contain all target columns: {TARGET_COLS}")
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission and answers must have the same length")
    row_sums = submission[TARGET_COLS].sum(axis=1)
    if not row_sums.apply(lambda s: round(s, 5) == 1).all():
        raise InvalidSubmissionError("Submission probabilities must add to 1 for each row")
    if set(answers[ID_COL]) != set(submission[ID_COL]):
        raise InvalidSubmissionError("Submission and answers must have the same IDs")

    # Answers are produced by us, so failures here are internal errors.
    assert ID_COL in answers.columns, f"Answers must contain {ID_COL} column"
    assert all(
        col in answers.columns for col in TARGET_COLS
    ), f"Answers must contain all target columns: {TARGET_COLS}"

    # Align row order between the two frames.
    submission = submission.sort_values(ID_COL).reset_index(drop=True)
    answers = answers.sort_values(ID_COL).reset_index(drop=True)

    answers = answers.copy()[[ID_COL] + TARGET_COLS]
    # Normalize answer vote counts into probabilities (vote / sum(votes)):
    # https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/468705#2606605
    vote_totals = answers[TARGET_COLS].sum(axis=1)
    answers[TARGET_COLS] = answers[TARGET_COLS].div(vote_totals, axis=0)

    return submission, answers
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with KL divergence after validation and normalization."""
    prepared_submission, prepared_answers = prepare_for_metric(submission, answers)
    return kl_divergence.score(prepared_answers, prepared_submission, ID_COL)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script exists to reduce code duplication across metrics.
|
|
3
|
+
Source: https://www.kaggle.com/code/metric/kaggle-metric-utilities
|
|
4
|
+
Linked from: https://www.kaggle.com/code/metric/kullback-leibler-divergence
|
|
5
|
+
Linked from: https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Union
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import pandas.api.types
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ParticipantVisibleError(Exception):
    """Error whose message is safe to surface to competition participants."""

    pass
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HostVisibleError(Exception):
    """Error whose message should only be shown to the competition host."""

    pass
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def treat_as_participant_error(
    error_message: str, solution: Union[pd.DataFrame, np.ndarray]
) -> bool:
    """Decide whether an arbitrary metric error can be shown to participants.

    Many metrics can raise more errors than can be handled manually. This
    function attempts to identify errors that can be treated as
    ParticipantVisibleError without leaking any competition data.

    If the solution is purely numeric and there are no numbers in the error
    message, the message is sufficiently unlikely to leak usable data and can
    be shown to participants. This filter rejects many safe messages; it is
    intended only to reduce the number of errors managed manually.

    Args:
        error_message: The stringified exception raised by the metric.
        solution: The ground-truth data the metric was scoring against.

    Returns:
        True if the message is safe to show to participants, False otherwise.
    """
    # This check treats bools as numeric.
    if isinstance(solution, pd.DataFrame):
        solution_is_all_numeric = all(
            pandas.api.types.is_numeric_dtype(x) for x in solution.dtypes.values
        )
        solution_has_bools = any(
            pandas.api.types.is_bool_dtype(x) for x in solution.dtypes.values
        )
    elif isinstance(solution, np.ndarray):
        solution_is_all_numeric = pandas.api.types.is_numeric_dtype(solution)
        solution_has_bools = pandas.api.types.is_bool_dtype(solution)
    else:
        # Bug fix: previously an unsupported solution type left the two flags
        # unbound, so the function raised UnboundLocalError instead of
        # classifying the error. Unknown types cannot be proven safe, so be
        # conservative and keep the message host-only.
        return False

    if not solution_is_all_numeric:
        return False

    # Any digit in the message could be leaked solution data.
    for char in error_message:
        if char.isnumeric():
            return False
    # For boolean solutions, literal True/False in the message could leak too.
    if solution_has_bools:
        if "true" in error_message.lower() or "false" in error_message.lower():
            return False
    return True
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def safe_call_score(metric_function, solution, submission, **metric_func_kwargs):
    """Invoke the metric and classify any failure before re-raising it.

    Calls `metric_function(solution, submission, **metric_func_kwargs)` and
    returns its result. Errors already raised as ParticipantVisibleError or
    HostVisibleError (matched by class name) are re-raised under the local
    exception types; any other error is conservatively screened via
    `treat_as_participant_error` to decide which form to surface.
    """
    try:
        return metric_function(solution, submission, **metric_func_kwargs)
    except Exception as err:
        message = str(err)
        kind = type(err).__name__
        if kind == "ParticipantVisibleError":
            raise ParticipantVisibleError(message)
        if kind == "HostVisibleError":
            raise HostVisibleError(message)
        if treat_as_participant_error(message, solution):
            raise ParticipantVisibleError(message)
        raise err
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def verify_valid_probabilities(df: pd.DataFrame, df_name: str):
    """Verify that the dataframe contains valid probabilities.

    The dataframe must be limited to the target columns; do not pass in any
    ID columns. Raises ParticipantVisibleError on the first violated rule.
    """
    # All values must be numeric before any range checks make sense.
    if not pandas.api.types.is_numeric_dtype(df.values):
        raise ParticipantVisibleError(f"All target values in {df_name} must be numeric")

    lowest = df.min().min()
    if lowest < 0:
        raise ParticipantVisibleError(f"All target values in {df_name} must be at least zero")

    highest = df.max().max()
    if highest > 1:
        raise ParticipantVisibleError(f"All target values in {df_name} must be no greater than one")

    # Each row must form a probability distribution (sum to one).
    row_totals = df.sum(axis=1)
    if not np.allclose(row_totals, 1):
        raise ParticipantVisibleError(
            f"Target values in {df_name} do not add to one within all rows"
        )
|