dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
from mlebench.utils import extract, get_logger, read_csv
|
|
9
|
+
|
|
10
|
+
logger = get_logger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _create_split(
    input_df: pd.DataFrame,
    test_size: float,
    random_state: int,
    raw_images_path: Path,
    public_path: Path,
    private_path: Path,
    dev_mode: bool = False,
) -> pd.DataFrame:
    """
    Helper function to perform a data split, create necessary files, and organize directories.

    The split is performed at the *location* level (not per row) so that no
    location ever appears in both the train and the test portion.

    Args:
        input_df: The dataframe to be split. Expected to contain "id",
            "location" and "category_id" columns. It is not mutated.
        test_size: The proportion of unique locations to allocate to the test split.
        random_state: The seed used by the random number generator.
        raw_images_path: Path to the directory containing all source images.
        public_path: The target public directory.
        private_path: The target private directory.
        dev_mode: If True, uses a small sample for faster processing.

    Returns:
        The training portion of the split dataframe.
    """
    # Split the set of unique locations, then assign rows by membership.
    locations = input_df["location"].unique()
    train_locations, test_locations = train_test_split(
        locations, test_size=test_size, random_state=random_state
    )

    # FIX: use a boolean mask instead of adding and later dropping a temporary
    # "split" column on input_df — the original mutated the caller's dataframe.
    is_test = input_df["location"].isin(test_locations)
    train_df = input_df[~is_test].copy()
    answers_df = input_df[is_test].copy()

    logger.debug("Train locations: %s", train_locations)
    logger.debug("Test locations: %s", test_locations)
    logger.debug(
        "Test size for this split: %s",
        len(answers_df) / (len(train_df) + len(answers_df)),
    )

    # Public test set: same rows as the private answers, but without labels.
    test_df = answers_df.drop(columns=["category_id"])
    # Gold submission: the labels in Kaggle submission format (Id, Category).
    gold_submission_df = answers_df[["id", "category_id"]].rename(
        columns={"id": "Id", "category_id": "Category"}
    )

    # Make sample submission: every test id with a dummy label of 0.
    submission_df = test_df[["id"]].copy()
    submission_df["category_id"] = 0
    submission_df.rename(columns={"id": "Id", "category_id": "Category"}, inplace=True)

    # Checks
    assert set(train_df["id"]).isdisjoint(
        set(test_df["id"])
    ), "train_df and test_df are not disjoint"
    assert len(train_df) + len(test_df) == len(
        input_df
    ), "Length of train_df and test_df should be equal to the length of the input dataframe"
    assert len(answers_df) == len(
        test_df
    ), "Length of answers_df should be equal to the length of test_df"
    assert len(submission_df) == len(
        answers_df
    ), "Length of answers_df should be equal to the length of the sample submission"
    assert (
        input_df.columns.tolist() == train_df.columns.tolist()
    ), f"train_df should have the same columns as the input dataframe: input_df: {input_df.columns.tolist()} != train_df: {train_df.columns.tolist()}"
    assert set(train_df["location"]).isdisjoint(
        set(test_df["location"])
    ), "train_df and test_df should not share any locations"

    # Create directories
    public_path.mkdir(exist_ok=True, parents=True)
    private_path.mkdir(exist_ok=True, parents=True)

    # Write CSVs
    answers_df.to_csv(private_path / "test.csv", index=False)
    gold_submission_df.to_csv(private_path / "answers.csv", index=False)
    train_df.to_csv(public_path / "train.csv", index=False)
    test_df.to_csv(public_path / "test.csv", index=False)
    # FIX: was index=True, which added a spurious unnamed index column that is
    # inconsistent with answers.csv and with every other CSV written here.
    submission_df.to_csv(public_path / "sample_submission.csv", index=False)

    # Prepare for file copy
    public_train_images = public_path / "train_images"
    public_test_images = public_path / "test_images"
    public_train_images.mkdir(exist_ok=True)
    public_test_images.mkdir(exist_ok=True)

    # In dev mode only a 100-row sample of each split is copied, to keep runs fast.
    loop_train_df = train_df.sample(n=100) if dev_mode else train_df
    loop_test_df = test_df.sample(n=100) if dev_mode else test_df

    for file_id in tqdm(loop_train_df["id"], desc=f"Copying train images to {public_path.name}"):
        shutil.copyfile(
            src=raw_images_path / f"{file_id}.jpg",
            dst=public_train_images / f"{file_id}.jpg",
        )

    for file_id in tqdm(loop_test_df["id"], desc=f"Copying test images to {public_path.name}"):
        shutil.copyfile(
            src=raw_images_path / f"{file_id}.jpg",
            dst=public_test_images / f"{file_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(list(public_test_images.glob("*.jpg"))) == len(
        loop_test_df["id"].unique()
    ), f"Public test images in {public_path.name} should have the same number of images as the unique ids in the test set"
    assert len(list(public_train_images.glob("*.jpg"))) == len(
        loop_train_df["id"].unique()
    ), f"Public train images in {public_path.name} should have the same number of images as the unique ids in the train set"

    # Zip up image directories and delete non-zipped files
    shutil.make_archive(
        public_path / "train_images", "zip", public_train_images
    )
    shutil.make_archive(public_path / "test_images", "zip", public_test_images)
    shutil.rmtree(public_train_images)
    shutil.rmtree(public_test_images)

    return train_df
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split in public_val/private_val directories.
    """
    dev_mode = False
    test_size = 0.1
    # Seed 8 was chosen empirically so the resulting split lands on the
    # targeted ~44% test-set size.
    random_state = 8

    # Setup: read the original labels and unpack the raw image archive.
    old_train = read_csv(raw / "train.csv")
    raw_images_path = raw / "train_images"
    raw_images_path.mkdir(exist_ok=True)
    logger.info("Extracting raw images...")
    extract(raw / "train_images.zip", raw_images_path)

    extracted_count = len(list(raw_images_path.glob("*.jpg")))
    unique_id_count = len(old_train["id"].unique())
    assert extracted_count == unique_id_count, f"Raw train images should have the same number of images as the unique ids in the old train set, but got {len(list(raw_images_path.glob('*.jpg')))} files and {len(old_train['id'].unique())} ids"

    # First split: reproduces the original competition data. The contents of
    # `public` and `private` are identical to the original script's output.
    logger.info("Creating original train/test split for competition...")
    train_from_first_split = _create_split(
        input_df=old_train,
        test_size=test_size,
        random_state=random_state,
        raw_images_path=raw_images_path,
        public_path=public,
        private_path=private,
        dev_mode=dev_mode,
    )

    # Second split: re-split the training data from the first split into a
    # smaller train set plus a validation set. The output layout under
    # `public_val`/`private_val` mirrors the original.
    logger.info("Creating validation train/test split for local development...")
    val_public = public.parent / "public_val"
    val_private = private.parent / "private_val"
    _create_split(
        input_df=train_from_first_split,
        test_size=test_size,
        random_state=random_state,  # identical parameters so the split logic matches
        raw_images_path=raw_images_path,
        public_path=val_public,
        private_path=val_private,
        dev_mode=dev_mode,
    )

    # Remove the images extracted at the start; both splits have been copied.
    logger.info("Cleaning up extracted raw images...")
    shutil.rmtree(raw_images_path)
    logger.info("Data preparation complete.")
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from sklearn.metrics import accuracy_score
|
|
3
|
+
|
|
4
|
+
from mlebench.competitions.utils import prepare_for_accuracy_metric
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission as plain classification accuracy on the `Category` column."""
    metric_kwargs = prepare_for_accuracy_metric(
        submission=submission,
        answers=answers,
        target_column="Category",
        id_column="Id",
    )
    return accuracy_score(**metric_kwargs)
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from mlebench.competitions.utils import df_to_one_hot
|
|
11
|
+
from mlebench.utils import read_csv
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    """

    dev_mode = False

    # Create train, test from train split

    # Load old train: COCO-style JSON with "annotations", "images",
    # "categories" and "info" sections.
    with open(raw / "iwildcam2020_train_annotations.json", "r") as file:
        old_train_json = json.load(file)
    old_train_annotations = pd.DataFrame(old_train_json["annotations"])
    old_train_images = pd.DataFrame(old_train_json["images"])
    old_train_categories = pd.DataFrame(old_train_json["categories"])
    # old_train_info = pd.DataFrame(old_train_json["info"])

    # Load old test (only its "categories" and "info" are reused below)
    with open(raw / "iwildcam2020_test_information.json", "r") as file:
        old_test_json = json.load(file)
    old_test_categories = pd.DataFrame(old_test_json["categories"])

    # Create splits based on train's images' on 'location': whole camera
    # locations go to either side so train/test never share a location.
    test_size = 0.22  # 62894/(217959+62894) = 0.22
    train_image_locations = old_train_images["location"].unique()
    locations_new_train, locations_new_test = train_test_split(
        train_image_locations, test_size=test_size, random_state=0
    )

    # Filter old train to new train and new test based on location
    new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
    new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]

    # Adjust the split to ensure around test_size of total samples are in the new test set
    # NOTE(review): the denominator adds len(new_test_images) to
    # len(old_train_images) even though new_test_images is a subset of
    # old_train_images, so test rows are counted twice and the achieved test
    # fraction is measured against an inflated total. prepare_val.py's
    # _perform_split uses len(images_to_split) instead — confirm which ratio
    # the published split was actually generated with before changing this.
    # NOTE(review): unlike _perform_split, these loops have no
    # len(locations) > 1 guard; they rely on the target being reachable.
    while len(new_test_images) / (len(old_train_images) + len(new_test_images)) < test_size:
        # Move some locations from train to test
        location_to_move = locations_new_train[-1]
        locations_new_train = locations_new_train[:-1]
        locations_new_test = np.append(locations_new_test, location_to_move)
        new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
        new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]

    while len(new_test_images) / (len(old_train_images) + len(new_test_images)) > test_size:
        # Move some locations from test to train
        location_to_move = locations_new_test[-1]
        locations_new_test = locations_new_test[:-1]
        locations_new_train = np.append(locations_new_train, location_to_move)
        new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
        new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]

    # Get the image ids for new train and new test
    new_train_ids = new_train_images["id"].unique()
    new_test_ids = new_test_images["id"].unique()

    # Filter annotations based on new train and new test image ids
    new_train_annotations = old_train_annotations[
        old_train_annotations["image_id"].isin(new_train_ids)
    ]
    new_test_annotations = old_train_annotations[
        old_train_annotations["image_id"].isin(new_test_ids)
    ]
    new_train_categories = old_train_categories.copy()
    new_test_categories = old_test_categories.copy()

    # Answers: the hidden labels for the new test set, in submission format.
    answer_annotations = new_test_annotations[["image_id", "category_id"]].copy()
    answer_annotations.rename(columns={"image_id": "Id", "category_id": "Category"}, inplace=True)

    # Create a sample submission file with random category guesses
    # (seeded so the sample is reproducible).
    sample_submission = answer_annotations.copy()
    np.random.seed(0)
    sample_submission["Category"] = np.random.randint(
        0, 676, size=len(sample_submission)
    )  # Uniform between 0 and 675

    # Checks: no test leakage, full coverage, and disjoint locations.
    assert set(new_train_annotations["image_id"]).isdisjoint(
        set(new_test_images["id"])
    ), "Train should not contain annotations of test images"
    assert len(new_train_ids) + len(new_test_ids) == len(
        old_train_images["id"]
    ), "The combined length of new_train_ids and new_test_ids should equal the length of old_train_images"
    # Assert that new_train_images and new_test_images have disjoint locations
    assert set(new_train_images["location"]).isdisjoint(
        set(new_test_images["location"])
    ), "Train and test images should not share locations"

    # Reform JSON files in the same shape as the originals
    new_train_json = {
        "annotations": new_train_annotations.to_dict(orient="records"),
        "images": new_train_images.to_dict(orient="records"),
        "categories": new_train_categories.to_dict(orient="records"),
        "info": old_train_json["info"],
    }

    new_test_json = {
        "images": new_test_images.to_dict(orient="records"),
        "categories": new_test_categories.to_dict(orient="records"),
        "info": old_test_json["info"],
    }

    # Checks on JSONs: same top-level structure as the raw files
    assert (
        new_train_json.keys() == old_train_json.keys()
    ), "new_train_json and old_train_json should have the same keys"
    assert (
        new_test_json.keys() == old_test_json.keys()
    ), "new_test_json and old_test_json should have the same keys"

    # Write files: answers go to `private`, everything else to `public`.
    answer_annotations.to_csv(private / "answers.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    with open(public / "iwildcam2020_train_annotations.json", "w") as f:
        json.dump(new_train_json, f)
    with open(public / "iwildcam2020_test_information.json", "w") as f:
        json.dump(new_test_json, f)

    # Copy over megadetector results unchanged
    shutil.copyfile(
        raw / "iwildcam2020_megadetector_results.json",
        public / "iwildcam2020_megadetector_results.json",
    )

    # Reduce the number of images copied over to 100 for dev mode
    if dev_mode:
        new_train_ids = new_train_ids[:100]
        new_test_ids = new_test_ids[:100]

    # Copy over image files (test images also come from raw/train, since the
    # new test set is carved out of the original train set)
    (public / "train").mkdir(exist_ok=True)
    (public / "test").mkdir(exist_ok=True)

    for file_id in tqdm(new_train_ids, desc="Copying train images", unit="file"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "train" / f"{file_id}.jpg",
        )

    for file_id in tqdm(new_test_ids, desc="Copying test images", unit="file"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "test" / f"{file_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(list(public.glob("train/*.jpg"))) == len(
        new_train_ids
    ), "Number of train images should be equal to the number of unique image_id in the train set"
    assert len(list(public.glob("test/*.jpg"))) == len(
        new_test_ids
    ), "Number of test images should be equal to the number of unique image_id in the test set"
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, Tuple
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from sklearn.model_selection import train_test_split
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
from mlebench.competitions.utils import df_to_one_hot
|
|
12
|
+
from mlebench.utils import read_csv
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _perform_split(
    images_to_split: pd.DataFrame, test_size: float, random_state: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Performs a location-based split on the given images DataFrame."""

    def rows_at(locations):
        # Rows of `images_to_split` whose "location" is in `locations`.
        return images_to_split[images_to_split["location"].isin(locations)]

    unique_locations = images_to_split["location"].unique()
    train_locations, test_locations = train_test_split(
        unique_locations, test_size=test_size, random_state=random_state
    )

    # The original script's logic for fine-tuning the split size: move whole
    # locations between the two sides until the test-row fraction is close to
    # `test_size`, never emptying either side.
    total_rows = len(images_to_split)
    test_fraction = len(rows_at(test_locations)) / total_rows

    while test_fraction < test_size and len(train_locations) > 1:
        moved = train_locations[-1]
        train_locations = train_locations[:-1]
        test_locations = np.append(test_locations, moved)
        test_fraction = len(rows_at(test_locations)) / total_rows

    while test_fraction > test_size and len(test_locations) > 1:
        moved = test_locations[-1]
        test_locations = test_locations[:-1]
        train_locations = np.append(train_locations, moved)
        test_fraction = len(rows_at(test_locations)) / total_rows

    return train_locations, test_locations
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _write_dataset_files(
    public_path: Path,
    private_path: Path,
    train_images: pd.DataFrame,
    test_images: pd.DataFrame,
    train_annotations: pd.DataFrame,
    test_annotations: pd.DataFrame,
    categories_df: pd.DataFrame,
    info_json: Dict,
    test_info_json: Dict,
    raw_path: Path,
    dev_mode: bool,
):
    """Writes all the necessary files for a given train/test split to the specified paths.

    Produces, under `public_path`: the train/test annotation JSONs, a sample
    submission CSV, the megadetector results copy, and the train/test image
    directories. Under `private_path`: the answers CSV with the hidden test
    labels. `dev_mode` caps the image copies at 100 per side.
    """
    # Create output directories
    public_path.mkdir(exist_ok=True)
    private_path.mkdir(exist_ok=True)

    # Answers: hidden test labels, renamed into submission format.
    answer_annotations = test_annotations[["image_id", "category_id"]].copy()
    answer_annotations.rename(columns={"image_id": "Id", "category_id": "Category"}, inplace=True)

    # Create a sample submission file with seeded random guesses
    sample_submission = answer_annotations.copy()
    np.random.seed(0)
    sample_submission["Category"] = np.random.randint(
        0, 676, size=len(sample_submission)
    )  # Uniform between 0 and 675

    # Reform JSON files in the same shape as the raw competition files
    new_train_json = {
        "annotations": train_annotations.to_dict(orient="records"),
        "images": train_images.to_dict(orient="records"),
        "categories": categories_df.to_dict(orient="records"),
        "info": info_json,
    }

    new_test_json = {
        "images": test_images.to_dict(orient="records"),
        "categories": pd.DataFrame(test_info_json["categories"]).to_dict(orient="records"),
        "info": test_info_json["info"],
    }

    # Write files: answers are private, everything else is public.
    answer_annotations.to_csv(private_path / "answers.csv", index=False)
    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)
    with open(public_path / "iwildcam2020_train_annotations.json", "w") as f:
        json.dump(new_train_json, f)
    with open(public_path / "iwildcam2020_test_information.json", "w") as f:
        json.dump(new_test_json, f)

    # Copy over megadetector results unchanged
    shutil.copyfile(
        raw_path / "iwildcam2020_megadetector_results.json",
        public_path / "iwildcam2020_megadetector_results.json",
    )

    train_ids_to_copy = train_images["id"].unique()
    test_ids_to_copy = test_images["id"].unique()

    # Reduce the number of images copied over to 100 for dev mode
    if dev_mode:
        train_ids_to_copy = train_ids_to_copy[:100]
        test_ids_to_copy = test_ids_to_copy[:100]

    # Copy over image files. Both sides are sourced from raw/train because
    # the test split is carved out of the original train set.
    (public_path / "train").mkdir(exist_ok=True)
    (public_path / "test").mkdir(exist_ok=True)

    print(f"Copying images to {public_path}...")
    for file_id in tqdm(train_ids_to_copy, desc="Copying train images", unit="file"):
        shutil.copyfile(
            src=raw_path / "train" / f"{file_id}.jpg",
            dst=public_path / "train" / f"{file_id}.jpg",
        )

    for file_id in tqdm(test_ids_to_copy, desc="Copying test images", unit="file"):
        shutil.copyfile(
            src=raw_path / "train" / f"{file_id}.jpg",
            dst=public_path / "test" / f"{file_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(list(public_path.glob("train/*.jpg"))) == len(
        train_ids_to_copy
    ), "Number of train images should be equal to the number of unique image_id in the train set"
    assert len(list(public_path.glob("test/*.jpg"))) == len(
        test_ids_to_copy
    ), "Number of test images should be equal to the number of unique image_id in the test set"
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a second, parallel validation split (public_val, private_val).
    """

    dev_mode = False

    # Define paths for the new validation split, as siblings of public/private
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Load raw data once; both splits below reuse these frames.
    with open(raw / "iwildcam2020_train_annotations.json", "r") as file:
        old_train_json = json.load(file)
    old_train_annotations = pd.DataFrame(old_train_json["annotations"])
    old_train_images = pd.DataFrame(old_train_json["images"])
    old_train_categories = pd.DataFrame(old_train_json["categories"])

    with open(raw / "iwildcam2020_test_information.json", "r") as file:
        old_test_json = json.load(file)

    # ==================================================================
    # 1. Create the original Train / Test split
    # ==================================================================
    print("--- Creating original Train/Test split ---")
    test_size_orig = 0.22  # 62894/(217959+62894) = 0.22
    locations_train, locations_test = _perform_split(
        images_to_split=old_train_images, test_size=test_size_orig, random_state=0
    )

    # Filter original data to create the first train/test sets
    train_images = old_train_images[old_train_images["location"].isin(locations_train)]
    test_images = old_train_images[old_train_images["location"].isin(locations_test)]
    train_ids = train_images["id"].unique()
    test_ids = test_images["id"].unique()
    train_annotations = old_train_annotations[old_train_annotations["image_id"].isin(train_ids)]
    test_annotations = old_train_annotations[old_train_annotations["image_id"].isin(test_ids)]

    # Checks: no label leakage, full coverage, and disjoint locations
    assert set(train_annotations["image_id"]).isdisjoint(
        set(test_images["id"])
    ), "Train should not contain annotations of test images"
    assert len(train_ids) + len(test_ids) == len(
        old_train_images["id"]
    ), "The combined length of new_train_ids and new_test_ids should equal the length of old_train_images"
    assert set(train_images["location"]).isdisjoint(
        set(test_images["location"])
    ), "Train and test images should not share locations"

    # Write files for the original public/private split
    _write_dataset_files(
        public_path=public,
        private_path=private,
        train_images=train_images,
        test_images=test_images,
        train_annotations=train_annotations,
        test_annotations=test_annotations,
        categories_df=old_train_categories,
        info_json=old_train_json["info"],
        test_info_json=old_test_json,
        raw_path=raw,
        dev_mode=dev_mode,
    )

    # ==================================================================
    # 2. Create the new Train / Validation split from the first training set
    # ==================================================================
    print("\n--- Creating new Train/Validation split ---")
    # The new split is performed on the `train_images` from the *first* split.
    # We calculate the test_size to make the new validation set have the same
    # number of images as the original test set.
    test_size_val = len(test_images) / len(train_images)

    locations_train_val, locations_test_val = _perform_split(
        images_to_split=train_images, test_size=test_size_val, random_state=0
    )

    # Filter the first training set to create the second (train_val/test_val) sets
    train_val_images = train_images[train_images["location"].isin(locations_train_val)]
    test_val_images = train_images[train_images["location"].isin(locations_test_val)]
    train_val_ids = train_val_images["id"].unique()
    test_val_ids = test_val_images["id"].unique()
    train_val_annotations = train_annotations[train_annotations["image_id"].isin(train_val_ids)]
    test_val_annotations = train_annotations[train_annotations["image_id"].isin(test_val_ids)]

    # Checks mirroring those of the first split
    assert set(train_val_images["location"]).isdisjoint(
        set(test_val_images["location"])
    ), "Validation train and test images should not share locations"
    assert len(train_val_ids) + len(test_val_ids) == len(
        train_ids
    ), "The combined length of train_val and test_val should equal the length of the original train set"

    # Write files for the new public_val/private_val split
    _write_dataset_files(
        public_path=public_val,
        private_path=private_val,
        train_images=train_val_images,
        test_images=test_val_images,
        train_annotations=train_val_annotations,
        test_annotations=test_val_annotations,
        categories_df=old_train_categories,
        info_json=old_train_json["info"],
        test_info_json=old_test_json,
        raw_path=raw,
        dev_mode=dev_mode,
    )

    print("\nData preparation complete.")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Per-class target columns expected in both submissions and answers; the list
# order defines the column order used when assembling the score matrices.
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from sklearn.metrics import roc_auc_score
|
|
3
|
+
|
|
4
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
5
|
+
|
|
6
|
+
from .classes import CLASSES
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """Validate and align a submission against the answers for ROC AUC scoring.

    Args:
        submission: Must contain an `id` column plus one probability column
            per class in `CLASSES` (extra columns are ignored).
        answers: Ground truth with the same `id` column and class columns;
            rows whose class values sum to a negative number (the -1 marker)
            are excluded from scoring.

    Returns:
        A dict of keyword arguments (`y_score`, `y_true`, `average`) for
        `sklearn.metrics.roc_auc_score`.

    Raises:
        InvalidSubmissionError: If the submission is missing required columns,
            or its ids do not match the answers' ids.
    """
    classes_set = set(CLASSES)
    id_col = "id"

    # submission checks
    if id_col not in submission.columns:
        raise InvalidSubmissionError(f"Submission is missing the column: {id_col}")
    sub_cols_set = set(submission.columns)
    if not classes_set.issubset(sub_cols_set):
        raise InvalidSubmissionError(
            f"Submission is missing the following columns: {classes_set - sub_cols_set}"
        )

    # answers checks (answers come from us, so plain asserts are fine)
    assert id_col in answers.columns, f"Answers is missing `{id_col}` column"
    ans_cols_set = set(answers.columns)
    assert classes_set.issubset(
        ans_cols_set
    ), f"Answers is missing the following columns: {classes_set - ans_cols_set}"
    assert len(submission) == len(answers), "Submission and answers have different lengths"

    submission = submission.set_index("id").sort_index()
    answers = answers.set_index("id").sort_index()

    # The two frames must describe the same ids: the boolean mask and the
    # row-wise score comparison below align by index, so a mismatch would
    # otherwise surface as an obscure pandas alignment error.
    if not submission.index.equals(answers.index):
        raise InvalidSubmissionError("Submission and answers have mismatched `id` values")

    # skip rows marked with -1 in y_true
    # when it happens entire row is marked so we can check negative sum of the row
    keep_mask = answers[CLASSES].sum(axis=1) >= 0
    answers = answers[keep_mask]
    submission = submission[keep_mask]

    # Select the class columns explicitly and in CLASSES order for BOTH
    # frames: a bare .to_numpy() would take every column in each frame's own
    # order, silently misaligning y_score against y_true whenever the
    # submission carries extra columns or a different column order.
    roc_auc_inputs = {
        "y_score": submission[CLASSES].to_numpy(),
        "y_true": answers[CLASSES].to_numpy(),
        # metric for each column, then average across columns
        "average": "macro",
    }

    return roc_auc_inputs
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Computes the column-wise mean ROC AUC score for the submission.
    """
    return roc_auc_score(**prepare_for_metric(submission, answers))
|