dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# Canonical vocabulary of the 120 dog breeds used by the
# dog-breed-identification competition. Kept as a newline-separated string
# literal so each breed name matches the Kaggle label file exactly.
_dogs_str = """
affenpinscher
afghan_hound
african_hunting_dog
airedale
american_staffordshire_terrier
appenzeller
australian_terrier
basenji
basset
beagle
bedlington_terrier
bernese_mountain_dog
black-and-tan_coonhound
blenheim_spaniel
bloodhound
bluetick
border_collie
border_terrier
borzoi
boston_bull
bouvier_des_flandres
boxer
brabancon_griffon
briard
brittany_spaniel
bull_mastiff
cairn
cardigan
chesapeake_bay_retriever
chihuahua
chow
clumber
cocker_spaniel
collie
curly-coated_retriever
dandie_dinmont
dhole
dingo
doberman
english_foxhound
english_setter
english_springer
entlebucher
eskimo_dog
flat-coated_retriever
french_bulldog
german_shepherd
german_short-haired_pointer
giant_schnauzer
golden_retriever
gordon_setter
great_dane
great_pyrenees
greater_swiss_mountain_dog
groenendael
ibizan_hound
irish_setter
irish_terrier
irish_water_spaniel
irish_wolfhound
italian_greyhound
japanese_spaniel
keeshond
kelpie
kerry_blue_terrier
komondor
kuvasz
labrador_retriever
lakeland_terrier
leonberg
lhasa
malamute
malinois
maltese_dog
mexican_hairless
miniature_pinscher
miniature_poodle
miniature_schnauzer
newfoundland
norfolk_terrier
norwegian_elkhound
norwich_terrier
old_english_sheepdog
otterhound
papillon
pekinese
pembroke
pomeranian
pug
redbone
rhodesian_ridgeback
rottweiler
saint_bernard
saluki
samoyed
schipperke
scotch_terrier
scottish_deerhound
sealyham_terrier
shetland_sheepdog
shih-tzu
siberian_husky
silky_terrier
soft-coated_wheaten_terrier
staffordshire_bullterrier
standard_poodle
standard_schnauzer
sussex_spaniel
tibetan_mastiff
tibetan_terrier
toy_poodle
toy_terrier
vizsla
walker_hound
weimaraner
welsh_springer_spaniel
west_highland_white_terrier
whippet
wire-haired_fox_terrier
yorkshire_terrier
"""

# One breed per line; the strip() drops the literal's surrounding blank lines.
DOGS_LIST = _dogs_str.strip().splitlines()
assert len(DOGS_LIST) == 120
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from pandas import DataFrame
|
|
3
|
+
from sklearn.metrics import log_loss
|
|
4
|
+
|
|
5
|
+
from mlebench.competitions.utils import one_hot_dfs_to_log_loss_inputs
|
|
6
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
7
|
+
|
|
8
|
+
from .dogs import DOGS_LIST
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def prepare_for_metric(submission: DataFrame, answers: DataFrame) -> dict:
    """Validate a submission against the answers and build log-loss inputs.

    Submission-side problems (user input) raise InvalidSubmissionError;
    answer-side problems are programming errors and fail via assert.

    Returns the keyword-argument dict expected by sklearn's `log_loss`.
    """
    submission_cols = set(submission.columns)
    if any(dog not in submission_cols for dog in DOGS_LIST):
        raise InvalidSubmissionError(f"Submission must have columns for all dogs: {DOGS_LIST}")
    if "id" not in submission_cols:
        raise InvalidSubmissionError("Submission must have an `id` column")
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission should be the same length as the answers")

    answer_cols = set(answers.columns)
    assert "id" in answer_cols, "Answers must have an `id` column"
    assert all(
        dog in answer_cols for dog in DOGS_LIST
    ), f"Answers must have columns for all dogs: {DOGS_LIST}"

    # Each row must be a probability distribution over the breeds.
    probs = submission[DOGS_LIST]
    tolerance = 1e-6
    row_sums = probs.sum(axis=1)
    if not np.isclose(row_sums, 1, atol=tolerance).all():
        raise InvalidSubmissionError(
            "Dog probabilities in each row in submission should sum to one, as probabilities."
        )
    in_unit_interval = (probs >= 0) & (probs <= 1)
    if not in_unit_interval.all().all():
        raise InvalidSubmissionError(
            "All probabilities in submission DataFrame must be between 0 and 1."
        )

    # Rows are already probabilities, so no softmax is applied.
    return one_hot_dfs_to_log_loss_inputs(
        submission, answers, id_column="id", apply_softmax=False
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def grade(submission: DataFrame, answers: DataFrame) -> float:
    """Score a submission with multiclass log loss (lower is better)."""
    return log_loss(**prepare_for_metric(submission, answers))
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from sklearn.model_selection import train_test_split
|
|
5
|
+
|
|
6
|
+
from mlebench.competitions.utils import df_to_one_hot
|
|
7
|
+
from mlebench.utils import read_csv
|
|
8
|
+
|
|
9
|
+
from .dogs import DOGS_LIST
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def to_one_hot(df):
    """One-hot encode the `breed` column against the full breed vocabulary."""
    return df_to_one_hot(
        df,
        id_column="id",
        target_column="breed",
        classes=DOGS_LIST,
    )
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw dog-breed data into public train and private test artifacts.

    Writes:
      - ``public/labels.csv``: training labels.
      - ``private/test.csv``: one-hot test labels (the answer key).
      - ``public/train/`` and ``public/test/``: copied jpg images.
      - ``public/sample_submission.csv``: uniform-probability sample submission.
    """
    # Create train, test from train split
    old_train = read_csv(raw / "labels.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
    # one hot the private test because we will one-hot the submission, as per kaggle.com
    new_test = to_one_hot(new_test)

    # Copy over files
    new_train.to_csv(public / "labels.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    (public / "train").mkdir(exist_ok=True)

    for file_id in new_train["id"]:
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "train" / f"{file_id}.jpg",
        )

    (public / "test").mkdir(exist_ok=True)

    # Test images also come from the raw train directory, since this split is
    # carved out of the original labelled training data.
    for file_id in new_test["id"]:
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "test" / f"{file_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(list(public.glob("train/*.jpg"))) == len(new_train)
    assert len(list(public.glob("test/*.jpg"))) == len(new_test)

    # Create a sample submission file with a uniform distribution over breeds.
    # Derive the breed count from DOGS_LIST instead of hard-coding 120.
    n_breeds = len(DOGS_LIST)
    submission_df = new_test.copy()
    for col in submission_df.columns[1:]:
        submission_df[col] = submission_df[col].astype("float64")
    submission_df.iloc[:, 1:] = 1 / n_breeds
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    assert submission_df.shape == (len(new_test), n_breeds + 1)  # 1 id column + breed columns
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
|
|
7
|
+
from mlebench.competitions.utils import df_to_one_hot
|
|
8
|
+
from mlebench.utils import read_csv
|
|
9
|
+
|
|
10
|
+
from .dogs import DOGS_LIST
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def to_one_hot(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* one-hot encoded on the `breed` column over all known breeds."""
    return df_to_one_hot(
        df, id_column="id", target_column="breed", classes=DOGS_LIST
    )
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _save_split(
    raw_data_path: Path,
    public_dir: Path,
    private_dir: Path,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Saves a given train/test split to the specified public and private directories.

    Writes:
      - ``public_dir/labels.csv``: training labels.
      - ``private_dir/test.csv``: one-hot test labels (the answer key).
      - ``public_dir/train/`` and ``public_dir/test/``: copied jpg images.
      - ``public_dir/sample_submission.csv``: uniform-probability sample submission.
    """
    # Ensure target directories exist (parents=True so fresh *_val dirs also work)
    public_dir.mkdir(exist_ok=True, parents=True)
    private_dir.mkdir(exist_ok=True, parents=True)

    # one hot the private test because we will one-hot the submission, as per kaggle.com
    test_labels_private = to_one_hot(test_df.copy())

    # Copy over files
    train_df.to_csv(public_dir / "labels.csv", index=False)
    test_labels_private.to_csv(private_dir / "test.csv", index=False)

    (public_dir / "train").mkdir(exist_ok=True)
    for file_id in train_df["id"]:
        shutil.copyfile(
            src=raw_data_path / "train" / f"{file_id}.jpg",
            dst=public_dir / "train" / f"{file_id}.jpg",
        )

    # Test images also come from the raw train directory: both sides of this
    # split are drawn from the original labelled training data.
    (public_dir / "test").mkdir(exist_ok=True)
    for file_id in test_df["id"]:
        shutil.copyfile(
            src=raw_data_path / "train" / f"{file_id}.jpg",
            dst=public_dir / "test" / f"{file_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(list(public_dir.glob("train/*.jpg"))) == len(train_df)
    assert len(list(public_dir.glob("test/*.jpg"))) == len(test_df)

    # Create a sample submission file with a uniform distribution over breeds.
    # Derive the breed count from DOGS_LIST instead of hard-coding 120.
    n_breeds = len(DOGS_LIST)
    submission_df = test_labels_private.copy()
    for col in submission_df.columns[1:]:
        submission_df[col] = submission_df[col].astype("float64")
    submission_df.iloc[:, 1:] = 1 / n_breeds
    submission_df.to_csv(public_dir / "sample_submission.csv", index=False)

    assert submission_df.shape == (len(test_df), n_breeds + 1)  # 1 id column + breed columns
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Create the original competition split plus a parallel validation split."""
    labels = read_csv(raw / "labels.csv")

    # --- 1. Original split: identical inputs/outputs to the original script. ---
    train_df, test_df = train_test_split(labels, test_size=0.1, random_state=0)
    _save_split(
        raw_data_path=raw,
        public_dir=public,
        private_dir=private,
        train_df=train_df,
        test_df=test_df,
    )

    # --- 2. Validation split: carve a test_val set out of the main train set. ---
    # The new validation directories sit parallel to the original ones.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Choose the second split's test fraction so the validation test set has
    # the same size as the original test set.
    fraction = len(test_df) / len(train_df)

    # Same random_state as above for reproducibility.
    train_val_df, test_val_df = train_test_split(
        train_df, test_size=fraction, random_state=0
    )

    # Reuse the helper so the validation split has identical structure/filenames.
    _save_split(
        raw_data_path=raw,
        public_dir=public_val,
        private_dir=private_val,
        train_df=train_val_df,
        test_df=test_val_df,
    )
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.metrics import log_loss
|
|
6
|
+
|
|
7
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def prepare_for_metric(
    submission: pd.DataFrame, answers: pd.DataFrame
) -> Tuple[pd.Series, pd.Series]:
    """Validate a submission against the answers and return (y_true, y_pred).

    Both frames must have `id` and `label` columns covering the same set of
    ids; rows are aligned by sorting on `id`. Returns the aligned label
    Series (true labels first, predicted probabilities second).

    Raises:
        InvalidSubmissionError: if the submission is malformed.
    """
    # answers checks (internal invariant, so plain asserts are acceptable here)
    assert "id" in answers.columns, "Answers is missing `id` column"
    assert "label" in answers.columns, "Answers is missing `label` column"

    # submission checks
    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission is missing `id` column")
    if "label" not in submission.columns:
        raise InvalidSubmissionError("Submission is missing `label` column")
    if set(submission["id"]) != set(answers["id"]):
        raise InvalidSubmissionError("Submission and answers have different id's")
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission and answers have different lengths")
    if not ((submission["label"] >= 0) & (submission["label"] <= 1)).all():
        raise InvalidSubmissionError(
            "All values in submission `label` column must be between 0 and 1."
        )

    # sort by id to ensure correct order
    submission = submission.sort_values("id")
    answers = answers.sort_values("id")

    y_true = answers["label"]
    y_pred = submission["label"]
    return y_true, y_pred
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Compute the binary log loss of the submission's probabilities."""
    y_true, y_pred = prepare_for_metric(submission, answers)
    return log_loss(y_true=y_true, y_pred=y_pred)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
from mlebench.utils import compress, extract, read_csv
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Build the public/private dogs-vs-cats competition data from the raw archives.

    Extracts the raw zips, re-splits the labelled training images 90/10,
    copies and renames them into ``public/train`` and ``public/test``,
    compresses both image directories, and writes the private answer key
    plus a public sample submission.
    """
    extract(raw / "train.zip", raw)
    extract(raw / "test.zip", raw)

    all_train_images = sorted(list((raw / "train").glob("*.jpg")))
    # Original test ratio has Train set - 25,000 samples; Test set - 12,500 samples (33% ratio)
    # We use 0.1 ratio to avoid removing too many samples from train
    train_images, test_images = train_test_split(all_train_images, test_size=0.1, random_state=0)

    # Copy over train images. Rename cat files to cat.0.jpg, cat.1.jpg, etc.
    # Rename dog files to dog.0.jpg, dog.1.jpg, etc.
    cat_ctr = 0
    dog_ctr = 0
    (public / "train").mkdir(exist_ok=True)
    for img in tqdm(train_images):
        if "cat" in img.name:
            shutil.copy(img, public / "train" / f"cat.{cat_ctr}.jpg")
            cat_ctr += 1
        else:
            shutil.copy(img, public / "train" / f"dog.{dog_ctr}.jpg")
            dog_ctr += 1
    assert cat_ctr + dog_ctr == len(
        train_images
    ), f"Expected {len(train_images)} train images but got {cat_ctr + dog_ctr} images."

    # Copy over test images. Rename files to 1.jpg, 2.jpg, etc.
    # Initialize i so the assert below cannot NameError on an empty test split.
    i = 0
    (public / "test").mkdir(exist_ok=True)
    for i, img in enumerate(tqdm(test_images), start=1):
        shutil.copy(img, public / "test" / f"{i}.jpg")
    assert i == len(test_images), f"Expected {len(test_images)} test images but got {i} images."

    # Compress train and test images, then remove the raw images
    compress(public / "train", public / "train.zip", exist_ok=True)
    compress(public / "test", public / "test.zip", exist_ok=True)
    shutil.rmtree(raw / "train")
    shutil.rmtree(raw / "test")

    # Make answers: label 1 for dog, 0 for cat, ids aligned with the renamed test files
    test_ids = list(range(1, len(test_images) + 1))
    answers = pd.DataFrame(
        {
            "id": test_ids,
            "label": [int("dog" in img.name) for img in test_images],
        }
    )
    answers.to_csv(private / "answers.csv", index=False)
    assert len(answers) == len(
        test_images
    ), f"Expected {len(test_images)} answers but got {len(answers)} answers."

    # Make sample submission with a uniform 0.5 probability for every test image
    sample_submission = pd.DataFrame({"id": test_ids, "label": [0.5] * len(test_images)})
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    assert len(sample_submission) == len(
        test_images
    ), f"Expected {len(test_images)} sample submission rows but got {len(sample_submission)} rows."
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.model_selection import train_test_split
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
from mlebench.utils import compress, extract
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _process_split(
    image_paths: List[Path],
    public_dir: Path,
    private_dir: Path,
    test_size: float,
    random_state: int,
) -> List[Path]:
    """
    Splits a list of image paths into train/test sets and generates all required files.

    This function encapsulates the logic for:
    1. Splitting data.
    2. Copying and renaming images to public train/test subdirectories.
    3. Compressing the public train/test image directories.
    4. Creating the private answer key for the test set.
    5. Creating a public sample submission file.

    Args:
        image_paths: A list of Path objects for the images to be split.
        public_dir: The public output directory (e.g., 'data/public').
        private_dir: The private output directory (e.g., 'data/private').
        test_size: The proportion of the dataset to allocate to the test set.
        random_state: The seed for the random number generator.

    Returns:
        A list of Path objects corresponding to the training set of this split.
    """
    public_dir.mkdir(exist_ok=True, parents=True)
    private_dir.mkdir(exist_ok=True, parents=True)

    # Perform the split
    train_images, test_images = train_test_split(
        image_paths, test_size=test_size, random_state=random_state
    )

    # Copy over train images. Rename cat files to cat.0.jpg, cat.1.jpg, etc.
    # Rename dog files to dog.0.jpg, dog.1.jpg, etc.
    cat_ctr = 0
    dog_ctr = 0
    (public_dir / "train").mkdir(exist_ok=True)
    for img in tqdm(train_images, desc=f"Processing train set for {public_dir.name}"):
        if "cat" in img.name:
            shutil.copy(img, public_dir / "train" / f"cat.{cat_ctr}.jpg")
            cat_ctr += 1
        else:
            shutil.copy(img, public_dir / "train" / f"dog.{dog_ctr}.jpg")
            dog_ctr += 1
    assert cat_ctr + dog_ctr == len(
        train_images
    ), f"Expected {len(train_images)} train images but got {cat_ctr + dog_ctr} images."

    # Copy over test images. Rename files to 1.jpg, 2.jpg, etc.
    # Initialize i so the assert below cannot NameError on an empty test split.
    i = 0
    (public_dir / "test").mkdir(exist_ok=True)
    for i, img in enumerate(
        tqdm(test_images, desc=f"Processing test set for {public_dir.name}"), start=1
    ):
        shutil.copy(img, public_dir / "test" / f"{i}.jpg")
    assert i == len(test_images), f"Expected {len(test_images)} test images but got {i} images."

    # Compress train and test images
    compress(public_dir / "train", public_dir / "train.zip", exist_ok=True)
    compress(public_dir / "test", public_dir / "test.zip", exist_ok=True)

    # Make answers: label 1 for dog, 0 for cat, ids aligned with the renamed test files
    test_ids = list(range(1, len(test_images) + 1))
    answers = pd.DataFrame(
        {
            "id": test_ids,
            "label": [int("dog" in img.name) for img in test_images],
        }
    )
    answers.to_csv(private_dir / "answers.csv", index=False)
    assert len(answers) == len(
        test_images
    ), f"Expected {len(test_images)} answers but got {len(answers)} answers."

    # Make sample submission with a uniform 0.5 probability for every test image
    sample_submission = pd.DataFrame({"id": test_ids, "label": [0.5] * len(test_images)})
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
    assert len(sample_submission) == len(
        test_images
    ), f"Expected {len(test_images)} sample submission rows but got {len(sample_submission)} rows."

    return train_images
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Generate the original competition split and a parallel validation split."""
    # Unpack the raw archives (unchanged from the original script).
    extract(raw / "train.zip", raw)
    extract(raw / "test.zip", raw)

    all_train_images = sorted((raw / "train").glob("*.jpg"))

    # --- 1. Original split (train -> train + test); outputs unchanged. ---
    # Original test ratio has Train set - 25,000 samples; Test set - 12,500
    # samples (33% ratio). We use 0.1 ratio to avoid removing too many
    # samples from train.
    base_test_size = 0.1
    main_train_set = _process_split(
        image_paths=all_train_images,
        public_dir=public,
        private_dir=private,
        test_size=base_test_size,
        random_state=0,
    )

    # --- 2. Validation split over the first split's training data only. ---
    # Scale the ratio so test_val matches the original test set's size:
    # 0.1 / (1 - 0.1) = 1/9. Output dirs sit parallel to the originals.
    _process_split(
        image_paths=main_train_set,
        public_dir=public.parent / "public_val",
        private_dir=private.parent / "private_val",
        test_size=base_test_size / (1 - base_test_size),
        random_state=0,  # same seed for consistency
    )

    # Final cleanup once both splits are written.
    shutil.rmtree(raw / "train")
    shutil.rmtree(raw / "test")
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.metrics import accuracy_score
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Grade the submission using accuracy metric.

    Args:
        submission: DataFrame with columns ['id', 'label']
        answers: DataFrame with columns ['id', 'label']

    Returns:
        Accuracy score (float between 0 and 1)

    Raises:
        ValueError: if the submission does not supply exactly one prediction
            for every answer id.
    """
    # Merge on id to ensure alignment
    merged = pd.merge(answers, submission, on='id', suffixes=('_true', '_pred'))

    # An inner merge silently drops answer rows whose id is missing from the
    # submission (and duplicates rows for repeated ids), which would grade
    # only the covered subset. Fail loudly instead of returning a misleading score.
    if len(merged) != len(answers):
        raise ValueError(
            f"Submission must contain exactly one prediction per id: expected "
            f"{len(answers)} rows after merging, got {len(merged)}."
        )

    # Calculate accuracy
    accuracy = accuracy_score(merged['label_true'], merged['label_pred'])

    return accuracy
|