PyPI - dslighting - Versions diffs - 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl - Mend

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (352) hide show

mlebench/competitions/instant_gratification/grade.py ADDED Viewed

@@ -0,0 +1,55 @@
+import numpy as np
+import pandas as pd
+from sklearn.metrics import roc_auc_score
+from mlebench.grade_helpers import InvalidSubmissionError
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+    """Prepare submission and answers for AUC-ROC calculation."""
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
+        )
+    if "id" not in submission.columns:
+        raise InvalidSubmissionError("Submission must have an 'id' column")
+    if "target" not in submission.columns:
+        raise InvalidSubmissionError("Submission must have a 'target' column")
+    # Sort by id
+    submission = submission.sort_values("id").reset_index(drop=True)
+    answers = answers.sort_values("id").reset_index(drop=True)
+    # Check id alignment
+    if not (submission["id"] == answers["id"]).all():
+        raise InvalidSubmissionError("Submission and answers id mismatch")
+    # Extract predictions and true values
+    y_pred = submission["target"].values
+    y_true = answers["target"].values
+    # Validate predictions
+    try:
+        y_pred = y_pred.astype(float)
+    except (ValueError, TypeError):
+        raise InvalidSubmissionError("Predictions must be numeric")
+    if np.any(np.isnan(y_pred)):
+        raise InvalidSubmissionError("Predictions cannot contain NaN values")
+    if np.any((y_pred < 0) | (y_pred > 1)):
+        raise InvalidSubmissionError("Predictions must be probabilities between 0 and 1")
+    return y_true, y_pred
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    """Calculate AUC-ROC score."""
+    y_true, y_pred = prepare_for_metric(submission, answers)
+    # AUC-ROC
+    auc = roc_auc_score(y_true, y_pred)
+    return auc

mlebench/competitions/instant_gratification/prepare.py ADDED Viewed

@@ -0,0 +1,25 @@
+from pathlib import Path
+import pandas as pd
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Prepare instant-gratification dataset.
+    """
+    # Read data
+    train = pd.read_csv(raw / "train.csv")
+    test = pd.read_csv(raw / "test.csv")
+    sample_submission = pd.read_csv(raw / "sample_submission.csv")
+    test_answer = pd.read_csv(raw / "test_answer.csv")
+    # Public files (visible to agents)
+    train.to_csv(public / "train.csv", index=False)
+    test.to_csv(public / "test.csv", index=False)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    # Private files (for grading)
+    test_answer.to_csv(private / "test.csv", index=False)
+    # Validation checks
+    assert len(test_answer) == len(sample_submission), \
+        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"

mlebench/competitions/invasive-species-monitoring/grade.py ADDED Viewed

@@ -0,0 +1,11 @@
+import pandas as pd
+from sklearn.metrics import roc_auc_score
+from mlebench.competitions.utils import prepare_for_auroc_metric
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    roc_auc_inputs = prepare_for_auroc_metric(
+        submission=submission, answers=answers, id_col="name", target_col="invasive"
+    )
+    return roc_auc_score(y_true=roc_auc_inputs["y_true"], y_score=roc_auc_inputs["y_score"])

mlebench/competitions/invasive-species-monitoring/prepare.py ADDED Viewed

@@ -0,0 +1,97 @@
+import shutil
+from pathlib import Path
+import py7zr
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+from mlebench.utils import extract, read_csv
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    """
+    # extract only what we need
+    extract(raw / "train.7z", raw)
+    extract(raw / "train_labels.csv.zip", raw)
+    # Create train, test from train split
+    # Original ratio is 1531/(1531+2295) = 0.4
+    test_ratio = 0.2
+    old_train = read_csv(raw / "train_labels.csv")
+    new_train, answers = train_test_split(old_train, test_size=test_ratio, random_state=0)
+    # Sample submission
+    sample_submission = answers.copy()
+    sample_submission["invasive"] = 0.5
+    # Checks
+    assert new_train["name"].is_unique, "new_train should have unique names"
+    assert answers["name"].is_unique, "answers should have unique names"
+    assert set(new_train["name"]).isdisjoint(
+        set(answers["name"])
+    ), "new_train and answers should be disjoint"
+    assert len(new_train) + len(answers) == len(
+        old_train
+    ), "new_train and answers together should have the same number of rows as old_train"
+    assert (
+        new_train.columns.tolist() == old_train.columns.tolist()
+    ), "new_train should have the same columns as old_train"
+    assert (
+        answers.columns.tolist() == old_train.columns.tolist()
+    ), "answers should have the same columns as old_train"
+    assert (
+        sample_submission.columns.tolist() == old_train.columns.tolist()
+    ), "sample_submission should have the same columns as old_train"
+    # Write CSVs
+    answers.to_csv(private / "answers.csv", index=False)
+    new_train.to_csv(public / "train_labels.csv", index=False)
+    sample_submission.to_csv(private / "sample_submission.csv", index=False)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    # Copy files
+    (public / "train").mkdir(exist_ok=True)
+    (public / "test").mkdir(exist_ok=True)
+    for file_id in tqdm(new_train["name"], desc="Copying Train Images"):
+        shutil.copyfile(
+            src=raw / "train" / f"{file_id}.jpg",
+            dst=public / "train" / f"{file_id}.jpg",
+        )
+    for file_id in tqdm(answers["name"], desc="Copying Test Images"):
+        shutil.copyfile(
+            src=raw / "train" / f"{file_id}.jpg",
+            dst=public / "test" / f"{file_id}.jpg",
+        )
+    # Checks
+    assert len(list((public / "train").glob("*.jpg"))) == len(
+        new_train
+    ), "public/train should have the same number of files as new_train"
+    assert len(list((public / "test").glob("*.jpg"))) == len(
+        answers
+    ), "public/test should have the same number of files as answers"
+    # Zip
+    shutil.make_archive(
+        str(public / "sample_submission.csv"),
+        "zip",
+        root_dir=public,
+        base_dir="sample_submission.csv",
+    )
+    shutil.make_archive(
+        str(public / "train_labels.csv"), "zip", root_dir=public, base_dir="train_labels.csv"
+    )
+    with py7zr.SevenZipFile(public / "train.7z", "w") as z:
+        z.write(public / "train")
+    with py7zr.SevenZipFile(public / "test.7z", "w") as z:
+        z.write(public / "test")
+    # Delete
+    shutil.rmtree(public / "train")
+    shutil.rmtree(public / "test")
+    (public / "sample_submission.csv").unlink()
+    (public / "train_labels.csv").unlink()

mlebench/competitions/invasive-species-monitoring/prepare_val.py ADDED Viewed

@@ -0,0 +1,164 @@
+import shutil
+from pathlib import Path
+import pandas as pd
+import py7zr
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+from mlebench.utils import extract, read_csv
+def _process_split(
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    raw_images_dir: Path,
+    public_dir: Path,
+    private_dir: Path,
+):
+    """
+    Helper function to process a single data split (train/test).
+    This function handles:
+    - Creating a sample submission.
+    - Writing all necessary CSV files to public and private directories.
+    - Copying image files to temporary train/test subdirectories.
+    - Zipping the final artifacts.
+    - Cleaning up temporary files.
+    """
+    # Create output directories if they don't exist
+    public_dir.mkdir(exist_ok=True, parents=True)
+    private_dir.mkdir(exist_ok=True, parents=True)
+    # Sample submission
+    sample_submission = test_df.copy()
+    sample_submission["invasive"] = 0.5
+    # Write CSVs
+    test_df.to_csv(private_dir / "answers.csv", index=False)
+    train_df.to_csv(public_dir / "train_labels.csv", index=False)
+    sample_submission.to_csv(private_dir / "sample_submission.csv", index=False)
+    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+    # Create temporary directories for image copying
+    public_train_images_dir = public_dir / "train"
+    public_test_images_dir = public_dir / "test"
+    public_train_images_dir.mkdir(exist_ok=True)
+    public_test_images_dir.mkdir(exist_ok=True)
+    # Copy files
+    for file_id in tqdm(train_df["name"], desc=f"Copying Train Images to {public_dir.name}"):
+        shutil.copyfile(
+            src=raw_images_dir / f"{file_id}.jpg",
+            dst=public_train_images_dir / f"{file_id}.jpg",
+        )
+    for file_id in tqdm(test_df["name"], desc=f"Copying Test Images to {public_dir.name}"):
+        shutil.copyfile(
+            src=raw_images_dir / f"{file_id}.jpg",
+            dst=public_test_images_dir / f"{file_id}.jpg",
+        )
+    # Checks
+    assert len(list(public_train_images_dir.glob("*.jpg"))) == len(
+        train_df
+    ), f"{public_dir.name}/train should have the same number of files as its corresponding train df"
+    assert len(list(public_test_images_dir.glob("*.jpg"))) == len(
+        test_df
+    ), f"{public_dir.name}/test should have the same number of files as its corresponding test df"
+    # Zip
+    shutil.make_archive(
+        str(public_dir / "sample_submission.csv"),
+        "zip",
+        root_dir=public_dir,
+        base_dir="sample_submission.csv",
+    )
+    shutil.make_archive(
+        str(public_dir / "train_labels.csv"),
+        "zip",
+        root_dir=public_dir,
+        base_dir="train_labels.csv",
+    )
+    with py7zr.SevenZipFile(public_dir / "train.7z", "w") as z:
+        z.write(public_train_images_dir, arcname="train")
+    with py7zr.SevenZipFile(public_dir / "test.7z", "w") as z:
+        z.write(public_test_images_dir, arcname="test")
+    # Delete temporary files and directories
+    shutil.rmtree(public_train_images_dir)
+    shutil.rmtree(public_test_images_dir)
+    (public_dir / "sample_submission.csv").unlink()
+    (public_dir / "train_labels.csv").unlink()
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    Additionally, creates a second, parallel validation split (public_val/private_val).
+    """
+    # Define paths for the new validation set
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+    # extract only what we need
+    extract(raw / "train.7z", raw)
+    extract(raw / "train_labels.csv.zip", raw)
+    # ---- FIRST SPLIT (Original Train/Test) ----
+    # Create train, test from train split
+    # Original ratio is 1531/(1531+2295) = 0.4
+    test_ratio = 0.2
+    old_train = read_csv(raw / "train_labels.csv")
+    new_train, answers = train_test_split(old_train, test_size=test_ratio, random_state=0)
+    # Checks
+    assert new_train["name"].is_unique, "new_train should have unique names"
+    assert answers["name"].is_unique, "answers should have unique names"
+    assert set(new_train["name"]).isdisjoint(
+        set(answers["name"])
+    ), "new_train and answers should be disjoint"
+    assert len(new_train) + len(answers) == len(
+        old_train
+    ), "new_train and answers together should have the same number of rows as old_train"
+    assert (
+        new_train.columns.tolist() == old_train.columns.tolist()
+    ), "new_train should have the same columns as old_train"
+    assert (
+        answers.columns.tolist() == old_train.columns.tolist()
+    ), "answers should have the same columns as old_train"
+    # Process and save the original public/private split
+    # This ensures the original outputs are not modified
+    _process_split(
+        train_df=new_train,
+        test_df=answers,
+        raw_images_dir=raw / "train",
+        public_dir=public,
+        private_dir=private,
+    )
+    # ---- SECOND SPLIT (New Train/Validation) ----
+    # Split the `new_train` set again to create a validation set.
+    # The new test set (`test_val`) will have the same size as the original test set (`answers`).
+    val_test_ratio = len(answers) / len(new_train)
+    train_val, test_val = train_test_split(
+        new_train, test_size=val_test_ratio, random_state=0
+    )
+    # Checks for the validation split
+    assert set(train_val["name"]).isdisjoint(
+        set(test_val["name"])
+    ), "train_val and test_val should be disjoint"
+    assert len(train_val) + len(test_val) == len(
+        new_train
+    ), "train_val and test_val together should have the same number of rows as new_train"
+    # Process and save the new validation split into parallel directories
+    _process_split(
+        train_df=train_val,
+        test_df=test_val,
+        raw_images_dir=raw / "train",
+        public_dir=public_val,
+        private_dir=private_val,
+    )

mlebench/competitions/iwildcam-2019-fgvc6/grade.py ADDED Viewed

@@ -0,0 +1,44 @@
+import pandas as pd
+from sklearn.metrics import f1_score
+from mlebench.grade_helpers import InvalidSubmissionError
+def prepare_for_metric(
+    submission: pd.DataFrame,
+    answers: pd.DataFrame,
+    id_col: str = "Id",
+    target_col: str = "Category",
+):
+    """
+    `submission` and `answers` are pd.DataFrame with `id_col` and `target_col` columns
+    """
+    # Answers checks
+    assert id_col in answers.columns, f"Answers must have '{id_col}' column"
+    assert target_col in answers.columns, f"Answers must have '{target_col}' column"
+    # Submission checks
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Submission length {len(submission)} != answers length {len(answers)}"
+        )
+    if id_col not in submission.columns:
+        raise InvalidSubmissionError(f"Submission must have '{id_col}' column")
+    if target_col not in submission.columns:
+        raise InvalidSubmissionError(f"Submission must have '{target_col}' column")
+    # Sort submission and answers by "Id"
+    submission = submission.sort_values(id_col)
+    answers = answers.sort_values(id_col)
+    if (submission[id_col].values != answers[id_col].values).any():
+        raise InvalidSubmissionError(f"Submission and answers have mismatched '{id_col}' columns")
+    y_true = [int(y) for y in answers[target_col]]
+    y_pred = [int(y) for y in submission[target_col]]
+    return y_true, y_pred
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    y_true, y_pred = prepare_for_metric(submission, answers)
+    return f1_score(y_true=y_true, y_pred=y_pred, average="macro")

mlebench/competitions/iwildcam-2019-fgvc6/prepare.py ADDED Viewed

@@ -0,0 +1,118 @@
+import shutil
+from pathlib import Path
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+from mlebench.utils import extract, get_logger, read_csv
+logger = get_logger(__name__)
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    """
+    dev_mode = False
+    # Create train, test from train split
+    # Original split ratio is 153730/(196157+153730) = 0.44
+    # We use 0.1 so as to not take out too many samples from train
+    test_size = 0.1
+    old_train = read_csv(raw / "train.csv")
+    # Create a new column 'split' and assign it randomly to 'test' or 'train' based on the value of the 'location' column
+    locations = old_train["location"].unique()
+    train_locations, test_locations = train_test_split(
+        locations, test_size=test_size, random_state=8
+    )  # We target a 44% test set size, we have empirically trialed seeds and landed on 8 to achieve this
+    old_train["split"] = old_train["location"].apply(
+        lambda loc: "test" if loc in test_locations else "train"
+    )
+    new_train = old_train[old_train["split"] == "train"].drop(columns=["split"])
+    answers = old_train[old_train["split"] == "test"].drop(columns=["split"])
+    logger.debug("Train locations: %s", train_locations)
+    logger.debug("Test locations: %s", test_locations)
+    logger.debug("Test size: %s", len(answers) / (len(new_train) + len(answers)))
+    old_train = old_train.drop(columns=["split"])  # Drop helper column
+    new_test = answers.copy().drop(columns=["category_id"])
+    gold_submission = answers.copy()[["id", "category_id"]]
+    gold_submission.rename(columns={"id": "Id", "category_id": "Category"}, inplace=True)
+    # Extract only what we need
+    (raw / "train_images").mkdir(exist_ok=True)
+    extract(raw / "train_images.zip", raw / "train_images")
+    assert len(list(raw.glob("train_images/*.jpg"))) == len(
+        old_train["id"].unique()
+    ), f"Raw train images should have the same number of images as the unique ids in the old train set, but got {len(list(raw.glob('train_images/*.jpg')))} files and {len(old_train['id'].unique())} ids"
+    # Make sample submission
+    submission_df = new_test.copy()[["id"]]
+    submission_df["category_id"] = 0
+    submission_df.rename(columns={"id": "Id", "category_id": "Category"}, inplace=True)
+    # Checks
+    assert set(new_train["id"]).isdisjoint(
+        set(new_test["id"])
+    ), "new_train and new_test are not disjoint"
+    assert len(new_train) + len(new_test) == len(
+        old_train
+    ), "Length of new_train and new_test should be equal to the length of the original train set"
+    assert len(answers) == len(
+        new_test
+    ), "Length of answers should be equal to the length of new_test"
+    assert len(submission_df) == len(
+        answers
+    ), "Length of answers should be equal to the length of the sample submission"
+    assert (
+        old_train.columns.tolist() == new_train.columns.tolist()
+    ), f"new_train should have the same columns as the original train set: old_train: {old_train.columns.tolist()} != new_train: {new_train.columns.tolist()}"
+    assert set(new_train["location"]).isdisjoint(
+        set(new_test["location"])
+    ), "new_train and new_test should not share any locations"
+    # Write CSVs
+    answers.to_csv(private / "test.csv", index=False)
+    gold_submission.to_csv(private / "answers.csv", index=False)
+    new_train.to_csv(public / "train.csv", index=False)
+    new_test.to_csv(public / "test.csv", index=False)
+    submission_df.to_csv(public / "sample_submission.csv", index=True)
+    # Copy files
+    (public / "train_images").mkdir(exist_ok=True)
+    (public / "test_images").mkdir(exist_ok=True)
+    if dev_mode:
+        new_train = new_train.sample(n=100)
+        new_test = new_test.sample(n=100)
+    for file_id in tqdm(new_train["id"], desc="Copying train images"):
+        shutil.copyfile(
+            src=raw / "train_images" / f"{file_id}.jpg",
+            dst=public / "train_images" / f"{file_id}.jpg",
+        )
+    for file_id in tqdm(new_test["id"], desc="Copying test images"):
+        shutil.copyfile(
+            src=raw / "train_images" / f"{file_id}.jpg",
+            dst=public / "test_images" / f"{file_id}.jpg",
+        )
+    # Check integrity of the files copied
+    assert len(list(public.glob("test_images/*.jpg"))) == len(
+        new_test["id"].unique()
+    ), f"Public test images should have the same number of images as the unique ids in the test set, but got {len(list(public.glob('test_images/*.jpg')))} files and {len(new_test['id'].unique())} ids"
+    assert len(list(public.glob("train_images/*.jpg"))) == len(
+        new_train["id"].unique()
+    ), f"Public train images should have the same number of images as the unique ids in the train set, but got {len(list(public.glob('train_images/*.jpg')))} files and {len(new_train['id'].unique())} ids"
+    # Zip up image directories and delete non-zipped files
+    shutil.make_archive(public / "train_images", "zip", public / "train_images")
+    shutil.make_archive(public / "test_images", "zip", public / "test_images")
+    shutil.rmtree(public / "train_images")
+    shutil.rmtree(public / "test_images")

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl