dslighting 1.7.1-py3-none-any.whl → 1.7.8-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
mlebench/competitions/random-acts-of-pizza/prepare_val.py
@@ -0,0 +1,144 @@
+import json
+import shutil
+from pathlib import Path
+from typing import List, Dict, Any
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+def _create_split(
+    data_to_split: List[Dict],
+    test_size: float,
+    test_fields: List[str],
+    public_path: Path,
+    private_path: Path,
+    random_state: int,
+) -> List[Dict]:
+    """
+    Helper function to perform a data split and create all required files.
+
+    Args:
+        data_to_split: The list of data samples to be split.
+        test_size: The proportion of the dataset to allocate to the test split.
+        test_fields: The list of fields to keep in the test set.
+        public_path: The directory for public-facing files (train set, unlabeled test set).
+        private_path: The directory for private files (test set labels).
+        random_state: The seed for the random number generator.
+
+    Returns:
+        The training portion of the split, to be used for subsequent splits if needed.
+    """
+    # Create output directories if they don't exist
+    public_path.mkdir(parents=True, exist_ok=True)
+    private_path.mkdir(parents=True, exist_ok=True)
+
+    # Create train, test from the provided data
+    train_set, test_set = train_test_split(
+        data_to_split, test_size=test_size, random_state=random_state
+    )
+
+    test_set_without_labels = []
+    for sample in test_set:
+        # Keep only the fields that should be in the test set
+        sample_unlabeled = {key: sample[key] for key in test_fields}
+        test_set_without_labels.append(sample_unlabeled)
+
+    # Save the new train and test splits to the public directory
+    with open(public_path / "train.json", "w") as f:
+        json.dump(train_set, f, indent=4)
+    with open(public_path / "test.json", "w") as f:
+        json.dump(test_set_without_labels, f, indent=4)
+    # Also save zipped versions
+    shutil.make_archive(public_path / "train", "zip", public_path, "train.json")
+    shutil.make_archive(public_path / "test", "zip", public_path, "test.json")
+
+    # Create answers for the private directory
+    answers_rows = []
+    for sample in test_set:
+        answers_rows.append(
+            {
+                "request_id": sample["request_id"],
+                "requester_received_pizza": int(sample["requester_received_pizza"]),
+            }
+        )
+    answers = pd.DataFrame(answers_rows)
+    answers.to_csv(private_path / "test.csv", index=False)
+
+    # Create sample submission for the public directory
+    sample_submission = answers.copy()
+    sample_submission["requester_received_pizza"] = 0
+    sample_submission.to_csv(public_path / "sampleSubmission.csv", index=False)
+
+    # Perform checks
+    assert len(train_set) + len(test_set) == len(
+        data_to_split
+    ), f"Expected {len(data_to_split)} total samples, but got {len(train_set)} in train and {len(test_set)} in test"
+    assert len(test_set) == len(
+        test_set_without_labels
+    ), "Test set and unlabeled test set must have the same length"
+    assert len(answers) == len(test_set), "Answers must have the same length as the test set"
+    assert len(sample_submission) == len(
+        test_set
+    ), "Sample submission must have the same length as the test set"
+    assert set(answers.columns) == set(
+        ["request_id", "requester_received_pizza"]
+    ), "Answers must have 'request_id' and 'requester_received_pizza' columns"
+    assert set(sample_submission.columns) == set(
+        ["request_id", "requester_received_pizza"]
+    ), "Sample submission must have 'request_id' and 'requester_received_pizza' columns"
+
+    return train_set
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    Then, it creates a second, parallel validation split from the first training set.
+    """
+
+    # Load data
+    with open(raw / "train.json") as f:
+        old_train = json.load(f)
+    with open(raw / "test.json") as f:
+        old_test = json.load(f)
+
+    test_ratio = len(old_test) / (len(old_train) + len(old_test))
+
+    all_fields = list([key for key in old_train[0].keys()])
+    assert all(set(all_fields) == set([key for key in sample.keys()]) for sample in old_train)
+    test_fields = list([key for key in old_test[0].keys()])
+    assert all(set(test_fields) == set([key for key in sample.keys()]) for sample in old_test)
+
+    # --- Original Split ---
+    # This split creates the primary `public` and `private` competition data.
+    # The returned `new_train` set will be used for the subsequent validation split.
+    new_train = _create_split(
+        data_to_split=old_train,
+        test_size=test_ratio,
+        test_fields=test_fields,
+        public_path=public,
+        private_path=private,
+        random_state=0,
+    )
+
+    # --- New Validation Split ---
+    # Define new directories for the validation set, parallel to the original ones.
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # To keep the new test set (`test_val`) size consistent with the original test set,
+    # we adjust the test ratio for the second split.
+    # val_test_size = original_test_size / original_train_size
+    val_test_size = test_ratio / (1.0 - test_ratio)
+
+    # Create the validation split using the same logic, but on the `new_train` data
+    # and saving to the new `_val` directories.
+    _create_split(
+        data_to_split=new_train,
+        test_size=val_test_size,
+        test_fields=test_fields,
+        public_path=public_val,
+        private_path=private_val,
+        random_state=0,
+    )
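The `val_test_size` adjustment above is plain proportion arithmetic: once the first split removes a `test_ratio` fraction of the training data, splitting the remainder with ratio `test_ratio / (1 - test_ratio)` yields a validation test set of the same size as the first test split. The same reasoning is behind the hard-coded `test_val_size = 1 / 9` in the ranzcr-clip prepare_val.py further down. A minimal sketch, assuming hypothetical counts (900 train / 100 test samples are made up, not taken from the actual dataset):

# Sketch only: why test_ratio / (1 - test_ratio) reproduces the first split's test-set size.
# The counts below are hypothetical.
n_train, n_test = 900, 100
test_ratio = n_test / (n_train + n_test)         # 0.1
first_test = n_train * test_ratio                # 90 samples held out by the first split
new_train = n_train - first_test                 # 810 samples remain for the second split
val_test_size = test_ratio / (1.0 - test_ratio)  # 1/9
second_test = new_train * val_test_size          # 810 * 1/9, same size as first_test
print(first_test, second_test)                   # both ≈ 90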
mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py
@@ -0,0 +1,31 @@
+import numpy as np
+import pandas as pd
+from sklearn.metrics import roc_auc_score
+
+from mlebench.competitions.utils import prepare_for_auroc_metric
+from mlebench.grade_helpers import InvalidSubmissionError
+
+from .classes import CLASSES
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    for class_name in CLASSES:
+        assert class_name in answers.columns, f"Missing class {class_name} in answers."
+        if class_name not in submission.columns:
+            raise InvalidSubmissionError(f"Class {class_name} is not in the submission.")
+    assert len(submission) == len(
+        answers
+    ), f"Expected {len(answers)} rows in submission, got {len(submission)}."
+
+    roc_auc_scores = []
+    for class_name in CLASSES:
+        roc_auc_inputs = prepare_for_auroc_metric(
+            submission=submission,
+            answers=answers,
+            id_col="StudyInstanceUID",
+            target_col=class_name,
+        )
+        roc_auc_scores.append(roc_auc_score(roc_auc_inputs["y_true"], roc_auc_inputs["y_score"]))
+
+    score = np.average(roc_auc_scores)
+    return score
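This grader reports the mean of per-class ROC AUC over the label columns in CLASSES, pairing submission and answer rows on `StudyInstanceUID`. The diff does not show `prepare_for_auroc_metric`, so the sketch below reproduces the idea with plain pandas and scikit-learn instead of that helper; the class names and probabilities are made up for illustration.

# Illustration only: column-wise ROC AUC averaged over label columns.
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

classes = ["CVC - Normal", "CVC - Abnormal"]  # hypothetical label columns
answers = pd.DataFrame({
    "StudyInstanceUID": ["a", "b", "c", "d"],
    "CVC - Normal": [1, 0, 1, 0],
    "CVC - Abnormal": [0, 1, 0, 0],
})
submission = pd.DataFrame({
    "StudyInstanceUID": ["d", "c", "b", "a"],  # row order may differ from answers
    "CVC - Normal": [0.4, 0.7, 0.2, 0.9],
    "CVC - Abnormal": [0.2, 0.3, 0.8, 0.1],
})

# Align predictions to answers by id, then average AUC across the label columns.
merged = answers.merge(submission, on="StudyInstanceUID", suffixes=("_true", "_pred"))
aucs = [roc_auc_score(merged[f"{c}_true"], merged[f"{c}_pred"]) for c in classes]
print(np.mean(aucs))  # 1.0 for this toy data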
mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py
@@ -0,0 +1,53 @@
+import shutil
+from pathlib import Path
+
+from sklearn.model_selection import train_test_split
+
+from mlebench.utils import read_csv
+
+from .classes import CLASSES
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    # Create train, test from train split
+    old_train = read_csv(raw / "train.csv")
+    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+
+    old_train_annotations = read_csv(raw / "train_annotations.csv")
+    old_train_uids = old_train_annotations["StudyInstanceUID"]
+    new_train_uids = new_train["StudyInstanceUID"]
+    is_in_new_train = old_train_uids.isin(new_train_uids)
+
+    new_train_annotations = old_train_annotations[is_in_new_train]
+
+    (public / "train").mkdir(exist_ok=True)
+    (public / "test").mkdir(exist_ok=True)
+
+    for file_id in new_train["StudyInstanceUID"]:
+        shutil.copyfile(
+            src=raw / "train" / f"{file_id}.jpg",
+            dst=public / "train" / f"{file_id}.jpg",
+        )
+
+    for file_id in new_test["StudyInstanceUID"]:
+        shutil.copyfile(
+            src=raw / "train" / f"{file_id}.jpg",
+            dst=public / "test" / f"{file_id}.jpg",
+        )
+
+    assert len(list(public.glob("train/*.jpg"))) == len(
+        new_train
+    ), f"Expected {len(new_train)} files in public train, got {len(list(public.glob('train/*.jpg')))}"
+    assert len(list(public.glob("test/*.jpg"))) == len(
+        new_test
+    ), f"Expected {len(new_test)} files in public test, got {len(list(public.glob('test/*.jpg')))}"
+
+    # Create a sample submission file
+    submission_df = new_test[["StudyInstanceUID"] + CLASSES]
+    submission_df[CLASSES] = 0
+
+    # Copy over files
+    new_train.to_csv(public / "train.csv", index=False)
+    new_train_annotations.to_csv(public / "train_annotations.csv", index=False)
+    new_test.to_csv(private / "test.csv", index=False)
+    submission_df.to_csv(public / "sample_submission.csv", index=False)
mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py
@@ -0,0 +1,113 @@
+import shutil
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from mlebench.utils import read_csv
+
+from .classes import CLASSES
+
+
+def _create_split_files(
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    all_annotations_df: pd.DataFrame,
+    public_dir: Path,
+    private_dir: Path,
+    raw_image_dir: Path,
+):
+    """
+    Helper function to populate public and private directories for a given data split.
+
+    This function handles directory creation, image copying, and CSV file generation,
+    ensuring a consistent output structure.
+    """
+    # Ensure output directories exist
+    public_dir.mkdir(parents=True, exist_ok=True)
+    private_dir.mkdir(parents=True, exist_ok=True)
+    (public_dir / "train").mkdir(exist_ok=True)
+    (public_dir / "test").mkdir(exist_ok=True)
+
+    # Filter annotations to only include those for the current training set
+    train_uids = train_df["StudyInstanceUID"]
+    is_in_train = all_annotations_df["StudyInstanceUID"].isin(train_uids)
+    split_train_annotations = all_annotations_df[is_in_train]
+
+    # Copy image files for the current train and test sets
+    for file_id in train_df["StudyInstanceUID"]:
+        shutil.copyfile(
+            src=raw_image_dir / f"{file_id}.jpg",
+            dst=public_dir / "train" / f"{file_id}.jpg",
+        )
+
+    for file_id in test_df["StudyInstanceUID"]:
+        shutil.copyfile(
+            src=raw_image_dir / f"{file_id}.jpg",
+            dst=public_dir / "test" / f"{file_id}.jpg",
+        )
+
+    # Assert that the correct number of images were copied
+    assert len(list(public_dir.glob("train/*.jpg"))) == len(
+        train_df
+    ), f"Expected {len(train_df)} files in {public_dir}/train, got {len(list(public_dir.glob('train/*.jpg')))}"
+    assert len(list(public_dir.glob("test/*.jpg"))) == len(
+        test_df
+    ), f"Expected {len(test_df)} files in {public_dir}/test, got {len(list(public_dir.glob('test/*.jpg')))}"
+
+    # Create a sample submission file for the current test set
+    submission_df = test_df[["StudyInstanceUID"] + CLASSES].copy()
+    submission_df[CLASSES] = 0
+
+    # Save all required CSV files with the required standard filenames
+    train_df.to_csv(public_dir / "train.csv", index=False)
+    split_train_annotations.to_csv(public_dir / "train_annotations.csv", index=False)
+    submission_df.to_csv(public_dir / "sample_submission.csv", index=False)
+    test_df.to_csv(private_dir / "test.csv", index=False)
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    # Load raw data once
+    old_train = read_csv(raw / "train.csv")
+    old_train_annotations = read_csv(raw / "train_annotations.csv")
+    raw_image_dir = raw / "train"
+
+    # --- 1. Original Split: Create main train and test sets ---
+    # This split creates the primary train/test sets for the competition.
+    # The outputs in `public` and `private` must remain identical to the original script.
+    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+
+    # Use the helper to create the original public/private directory contents
+    _create_split_files(
+        train_df=new_train,
+        test_df=new_test,
+        all_annotations_df=old_train_annotations,
+        public_dir=public,
+        private_dir=private,
+        raw_image_dir=raw_image_dir,
+    )
+
+    # --- 2. New Validation Split: Create a validation set from the main train set ---
+    # Define paths for the new validation split directories.
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # Split the main training set (`new_train`) again to create a smaller
+    # training set and a validation set.
+    # The validation set (`test_val`) size should be ~10% of the *original* data.
+    # new_train size = 0.9 * total. test_val_size = (0.1 * total) / (0.9 * total) = 1/9.
+    test_val_size = 1 / 9
+    train_val, test_val = train_test_split(
+        new_train, test_size=test_val_size, random_state=0
+    )
+
+    # Use the same helper to create the new validation directories (`public_val`, `private_val`)
+    # The helper ensures filenames and structure are identical to the original outputs.
+    _create_split_files(
+        train_df=train_val,
+        test_df=test_val,
+        all_annotations_df=old_train_annotations,
+        public_dir=public_val,
+        private_dir=private_val,
+        raw_image_dir=raw_image_dir,
+    )
mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py
@@ -0,0 +1,124 @@
+import numpy as np
+import pandas as pd
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+column_order = ["patient_overall", "C1", "C2", "C3", "C4", "C5", "C6", "C7"]
+
+
+def unmelt_df(melted_df: pd.DataFrame, id_col: str, target_col: str) -> pd.DataFrame:
+    unmelted_df = melted_df.copy()
+    unmelted_df[["StudyInstanceUID", "variable"]] = unmelted_df[id_col].str.split(
+        "_", n=1, expand=True
+    )
+    unmelted_df = unmelted_df.pivot(
+        index="StudyInstanceUID", columns="variable", values=target_col
+    ).reset_index()
+    return unmelted_df
+
+
+def binary_log_loss(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
+    """
+    Computes binary log loss for each sample and variable (no reduction).
+
+    Args:
+        y_true: True binary labels, shape (n_samples, n_variables)
+        y_pred: Predicted probabilities, shape (n_samples, n_variables)
+
+    Returns:
+        Binary log loss for each sample and variable, shape (n_samples, n_variables)
+    """
+    epsilon = 1e-15
+    # avoid y_preds of 0 or 1, since we'll be taking the log and that is undefined/inf
+    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
+    loss = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
+    return loss
+
+
+def custom_comp_weighted_loss(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+    """
+    Adapted from
+    https://www.kaggle.com/competitions/rsna-2022-cervical-spine-fracture-detection/discussion/341854
+
+    Adaptations:
+    - custom binary_log_loss function in numpy to avoid torch dependency
+    - working in numpy arrays instead of torch tensors
+    - explanatory comments
+    - added comp weights inside fn
+    """
+    # https://www.kaggle.com/competitions/rsna-2022-cervical-spine-fracture-detection/discussion/340392
+    # assuming ordering [patient_overall, C1, C2, C3, C4, C5, C6, C7]
+    competition_weights = {
+        # weights for negative labels
+        "-": np.array([7, 1, 1, 1, 1, 1, 1, 1], dtype=np.float32),
+        # weights for positive labels
+        "+": np.array([14, 2, 2, 2, 2, 2, 2, 2], dtype=np.float32),
+    }
+
+    # (n_samples, n_variables)
+    loss = binary_log_loss(y_true, y_pred)
+    # (n_samples, n_variables) weights matrix, conditionally built based on y_true values
+    weights = y_true * competition_weights["+"] + (1 - y_true) * competition_weights["-"]
+    # (n_samples, n_variables) element-wise multiplication of loss and weights
+    loss = loss * weights
+    # (n_samples,) sum of weighted losses across variables
+    loss = np.sum(loss, axis=1)
+    # (n_samples,) normalize the loss by the sum of weights
+    loss = loss / weights.sum(axis=1)
+    # finally, take the mean across all samples
+    return np.mean(loss)
+
+
+def prepare_for_metric(
+    submission: pd.DataFrame, answers: pd.DataFrame
+) -> tuple[np.ndarray, np.ndarray]:
+    id_col = "row_id"
+    target_col = "fractured"
+
+    if id_col not in submission.columns:
+        raise InvalidSubmissionError(f"Submission is missing the `{id_col}` column.")
+    if target_col not in submission.columns:
+        raise InvalidSubmissionError(f"Submission is missing the `{target_col}` column.")
+    if not submission[target_col].between(0, 1).all():
+        raise InvalidSubmissionError(f"Submission `{target_col}` values must be between 0 and 1.")
+    if submission[target_col].isnull().any():
+        raise InvalidSubmissionError(f"Submission `{target_col}` values must not be missing.")
+
+    assert id_col in answers.columns, f"Answers are missing the `{id_col}` column."
+    assert target_col in answers.columns, f"Answers are missing the `{target_col}` column."
+
+    # all answers should be either 0 or 1
+    assert answers[target_col].isin([0, 1]).all(), f"Answers `{target_col}` values must be 0 or 1."
+    # all answers should be present
+    assert (
+        not answers[target_col].isnull().any()
+    ), f"Answers `{target_col}` values must not be missing."
+
+    # sort both submission and answers by id_col
+    submission = submission.sort_values(by=id_col)
+    answers = answers.sort_values(by=id_col)
+
+    # check that the ids match
+    if (submission[id_col].values != answers[id_col].values).any():
+        raise InvalidSubmissionError("Submission should contain same ids as answers.")
+
+    # checks complete; now we can prepare
+
+    # need to unmelt both submission and answers, back to getting one row per StudyInstanceUID
+    unmelted_submission = unmelt_df(melted_df=submission, id_col=id_col, target_col=target_col)
+    unmelted_answers = unmelt_df(melted_df=answers, id_col=id_col, target_col=target_col)
+
+    # sort both by StudyInstanceUID
+    unmelted_submission = unmelted_submission.sort_values(by="StudyInstanceUID")
+    unmelted_answers = unmelted_answers.sort_values(by="StudyInstanceUID")
+
+    # extract the target columns
+    y_true = unmelted_answers[column_order].to_numpy()
+    y_pred = unmelted_submission[column_order].to_numpy()
+
+    return y_true, y_pred
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    y_true, y_pred = prepare_for_metric(submission, answers)
+    return custom_comp_weighted_loss(y_true, y_pred)
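In the weighted metric above, `patient_overall` carries weight 7 (negative) / 14 (positive) while each vertebra carries 1 / 2, and each row's weighted log loss is normalised by its total weight before averaging over rows. The sketch below repeats that arithmetic on a single made-up row; it is an illustration of the computation, not part of the package, and the variable names and values are hypothetical.

# Toy check of the competition-weighted log loss for one unmelted row,
# following the weights hard-coded in custom_comp_weighted_loss above.
import numpy as np

# One hypothetical row in column_order = [patient_overall, C1, ..., C7].
y_true = np.array([[1, 0, 0, 1, 0, 0, 0, 0]], dtype=np.float32)
y_pred = np.array([[0.8, 0.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1]], dtype=np.float32)

neg_w = np.array([7, 1, 1, 1, 1, 1, 1, 1], dtype=np.float32)   # weights for negative labels
pos_w = np.array([14, 2, 2, 2, 2, 2, 2, 2], dtype=np.float32)  # weights for positive labels

eps = 1e-15
p = np.clip(y_pred, eps, 1 - eps)                              # keep log() finite
loss = -(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))    # per-label binary log loss
weights = y_true * pos_w + (1 - y_true) * neg_w                # pick +/- weight per label
score = np.mean((loss * weights).sum(axis=1) / weights.sum(axis=1))
print(score)  # ≈ 0.22 for this row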