dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,88 @@
+import random
+import shutil
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    """
+    # List of all train image IDs
+    cover_images_dir = raw / "Cover"
+    cover_images = sorted(list(cover_images_dir.glob("*.jpg")))
+
+    # Create train, test from train split
+    test_size = 5000
+    new_train, new_test = train_test_split(cover_images, test_size=test_size, random_state=42)
+
+    # Prepare dirs
+    steganography_algs = ["JMiPOD", "JUNIWARD", "UERD"]
+    for subdir in ["Cover", "Test"] + steganography_algs:
+        (public / subdir).mkdir(parents=True, exist_ok=True)
+
+    # Copy over new train set, giving them new ids
+    for idx, fp in tqdm(enumerate(new_train), total=len(new_train), desc="Copying train images"):
+        image_id = idx + 1  # 1-indexed
+        shutil.copyfile(src=fp, dst=public / "Cover" / f"{image_id:05d}.jpg")
+        for alg in steganography_algs:
+            shutil.copyfile(src=raw / alg / fp.name, dst=public / alg / f"{image_id:05d}.jpg")
+
+    # Populate test set
+    answers_rows = []
+    random.seed(0)
+    random.shuffle(new_test)
+    for idx, fp in tqdm(enumerate(new_test), total=len(new_test), desc="Copying test images"):
+        image_id = idx + 1  # 1-indexed
+        test_id = f"{image_id:04d}.jpg"
+        dest = public / "Test" / test_id
+
+        # For the test set, we randomly select between the "Cover" (unedited image, negative class)
+        # and one of the 3 steganography algorithms (positive class)
+        # 1:1 ratio of positive:negative examples, and even distribution of steganography algorithms
+        if random.choice([True, False]):
+            # Negative class
+            shutil.copyfile(
+                src=fp,
+                dst=dest,
+            )
+            answers_rows.append({"Id": test_id, "Label": 0})
+        else:
+            # Positive class
+            alg = random.choice(steganography_algs)
+            shutil.copyfile(src=raw / alg / fp.name, dst=dest)
+            answers_rows.append({"Id": test_id, "Label": 1})
+
+    # Write answers to file
+    answers_df = pd.DataFrame(answers_rows)
+    answers_df.to_csv(private / "test.csv", index=False)
+
+    # Create sample submission
+    sample_submission = answers_df.copy()
+    sample_submission["Label"] = 0
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+    # Checks
+    assert "Id" in answers_df.columns, "Answers must have 'Id' column"
+    assert "Label" in answers_df.columns, "Answers must have 'Label' column"
+    assert "Id" in sample_submission.columns, "Sample submission must have 'Id' column"
+    assert "Label" in sample_submission.columns, "Sample submission must have 'Label' column"
+    assert (
+        len(answers_df) == test_size
+    ), f"Expected {test_size} test images, but got {len(answers_df)}"
+    assert len(sample_submission) == len(
+        answers_df
+    ), f"Sample submission ({len(sample_submission)}) and answers ({len(answers_df)}) must have the same length"
+    assert (
+        len(list(public.glob("Test/*.jpg"))) == test_size
+    ), f"Expected {test_size} test images in public/Test, but got {len(list(public.glob('Test/*.jpg')))}"
+    assert len(list(public.glob("Cover/*.jpg"))) == len(
+        new_train
+    ), f"Expected {len(new_train)} train images in public/Cover, but got {len(list(public.glob('Cover/*.jpg')))}"
+    for alg in steganography_algs:
+        assert len(list(public.glob(f"{alg}/*.jpg"))) == len(
+            new_train
+        ), f"Expected {len(new_train)} train images in public/{alg}, but got {len(list(public.glob(f'{alg}/*.jpg')))}"
@@ -0,0 +1,148 @@
+import random
+import shutil
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+
+def _create_dataset_split(
+    train_image_paths: list,
+    test_image_paths: list,
+    raw_dir: Path,
+    public_dir: Path,
+    private_dir: Path,
+    steganography_algs: list,
+):
+    """
+    Helper function to create a dataset split (e.g., train/test or train_val/test_val).
+
+    This function populates the public and private directories with the respective
+    training images, test images, and test set ground-truth labels.
+    """
+    # Prepare dirs
+    public_dir.mkdir(parents=True, exist_ok=True)
+    private_dir.mkdir(parents=True, exist_ok=True)
+    for subdir in ["Cover", "Test"] + steganography_algs:
+        (public_dir / subdir).mkdir(exist_ok=True)
+
+    # Copy over the train set for this split, giving them new ids
+    for idx, fp in tqdm(
+        enumerate(train_image_paths), total=len(train_image_paths), desc=f"Copying train images to {public_dir.name}"
+    ):
+        image_id = idx + 1  # 1-indexed
+        shutil.copyfile(src=fp, dst=public_dir / "Cover" / f"{image_id:05d}.jpg")
+        for alg in steganography_algs:
+            shutil.copyfile(src=raw_dir / alg / fp.name, dst=public_dir / alg / f"{image_id:05d}.jpg")
+
+    # Populate the test set for this split
+    answers_rows = []
+    random.seed(0)  # Reset seed for deterministic test set creation
+    random.shuffle(test_image_paths)
+    for idx, fp in tqdm(
+        enumerate(test_image_paths), total=len(test_image_paths), desc=f"Copying test images to {public_dir.name}"
+    ):
+        image_id = idx + 1  # 1-indexed
+        test_id = f"{image_id:04d}.jpg"
+        dest = public_dir / "Test" / test_id
+
+        # For the test set, we randomly select between the "Cover" (unedited image, negative class)
+        # and one of the 3 steganography algorithms (positive class)
+        # 1:1 ratio of positive:negative examples, and even distribution of steganography algorithms
+        if random.choice([True, False]):
+            # Negative class
+            shutil.copyfile(
+                src=fp,
+                dst=dest,
+            )
+            answers_rows.append({"Id": test_id, "Label": 0})
+        else:
+            # Positive class
+            alg = random.choice(steganography_algs)
+            shutil.copyfile(src=raw_dir / alg / fp.name, dst=dest)
+            answers_rows.append({"Id": test_id, "Label": 1})
+
+    # Write answers to file
+    answers_df = pd.DataFrame(answers_rows)
+    answers_df.to_csv(private_dir / "test.csv", index=False)
+
+    # Create sample submission
+    sample_submission = answers_df.copy()
+    sample_submission["Label"] = 0
+    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+
+    # Checks
+    test_size = len(test_image_paths)
+    assert "Id" in answers_df.columns, "Answers must have 'Id' column"
+    assert "Label" in answers_df.columns, "Answers must have 'Label' column"
+    assert "Id" in sample_submission.columns, "Sample submission must have 'Id' column"
+    assert "Label" in sample_submission.columns, "Sample submission must have 'Label' column"
+    assert (
+        len(answers_df) == test_size
+    ), f"Expected {test_size} test images, but got {len(answers_df)}"
+    assert len(sample_submission) == len(
+        answers_df
+    ), f"Sample submission ({len(sample_submission)}) and answers ({len(answers_df)}) must have the same length"
+    assert (
+        len(list(public_dir.glob("Test/*.jpg"))) == test_size
+    ), f"Expected {test_size} test images in {public_dir.name}/Test, but got {len(list(public_dir.glob('Test/*.jpg')))}"
+    assert len(list(public_dir.glob("Cover/*.jpg"))) == len(
+        train_image_paths
+    ), f"Expected {len(train_image_paths)} train images in {public_dir.name}/Cover, but got {len(list(public_dir.glob('Cover/*.jpg')))}"
+    for alg in steganography_algs:
+        assert len(list(public_dir.glob(f"{alg}/*.jpg"))) == len(
+            train_image_paths
+        ), f"Expected {len(train_image_paths)} train images in {public_dir.name}/{alg}, but got {len(list(public_dir.glob(f'{alg}/*.jpg')))}"
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    Also creates a secondary validation split (train_val/test_val) in parallel directories.
+    """
+    # List of all train image IDs
+    cover_images_dir = raw / "Cover"
+    cover_images = sorted(list(cover_images_dir.glob("*.jpg")))
+    steganography_algs = ["JMiPOD", "JUNIWARD", "UERD"]
+    test_size = 5000
+
+    # --- Stage 1: Create the main competition train/test split ---
+    # This split creates the final test set used for scoring.
+    # The outputs in `public` and `private` are left untouched by subsequent steps.
+    train_main, test_main = train_test_split(
+        cover_images, test_size=test_size, random_state=42
+    )
+
+    _create_dataset_split(
+        train_image_paths=train_main,
+        test_image_paths=test_main,
+        raw_dir=raw,
+        public_dir=public,
+        private_dir=private,
+        steganography_algs=steganography_algs,
+    )
+
+    # --- Stage 2: Create the validation train/test split ---
+    # This performs a second split on the main training data (`train_main`)
+    # to create a new, smaller training set and a validation set.
+    # Outputs are saved to `public_val` and `private_val` to avoid conflicts.
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # Split `train_main` again, using the same logic as the first split
+    # to create a validation set of the same size as the main test set.
+    train_val, test_val = train_test_split(
+        train_main, test_size=test_size, random_state=42
+    )
+
+    # The new split is processed using the same helper function to ensure
+    # identical directory structure, filenames, and creation logic.
+    _create_dataset_split(
+        train_image_paths=train_val,
+        test_image_paths=test_val,
+        raw_dir=raw,
+        public_dir=public_val,
+        private_dir=private_val,
+        steganography_algs=steganography_algs,
+    )
@@ -0,0 +1,35 @@
+from pandas import DataFrame
+from sklearn.metrics import cohen_kappa_score
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def prepare_for_grading(submission: DataFrame, answers: DataFrame, target_column: str) -> tuple:
+
+    if target_column not in submission.columns:
+        raise InvalidSubmissionError(f"Submission must contain the target column '{target_column}'")
+    assert (
+        target_column in answers.columns
+    ), f"Answers must contain the target column '{target_column}'"
+    if submission.empty:
+        raise InvalidSubmissionError("Submission DataFrame should not be empty")
+    assert not answers.empty, "Answers DataFrame should not be empty"
+    if set(submission["id_code"]) != set(answers["id_code"]):
+        raise InvalidSubmissionError("Submission must have the same id_codes as answers")
+
+    # Ensure the submissions and answers are sorted by id_code
+    submission_sorted = submission.sort_values(by="id_code")
+    answers_sorted = answers.sort_values(by="id_code")
+
+    # Extract the target columns
+    y_pred = submission_sorted[target_column].values
+    y_true = answers_sorted[target_column].values
+
+    return y_pred, y_true
+
+
+def grade(submission: DataFrame, answers: DataFrame) -> float:
+    y_pred, y_true = prepare_for_grading(
+        submission=submission, answers=answers, target_column="diagnosis"
+    )
+    return cohen_kappa_score(y_pred, y_true, weights="quadratic")
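For reference, this grader sorts both frames by id_code and scores the 'diagnosis' columns with quadratic-weighted Cohen's kappa. A minimal self-contained sketch of the equivalent computation on a hypothetical toy submission (the id_codes and labels below are made up):

import pandas as pd
from sklearn.metrics import cohen_kappa_score

# Hypothetical toy data in the format the grader above expects:
# an 'id_code' column plus an ordinal integer 'diagnosis' column.
submission = pd.DataFrame({"id_code": ["c3", "a1", "b2"], "diagnosis": [4, 0, 2]})
answers = pd.DataFrame({"id_code": ["a1", "b2", "c3"], "diagnosis": [0, 3, 4]})

# grade() sorts both frames by id_code, extracts 'diagnosis', and returns
# quadratic-weighted Cohen's kappa, i.e. the equivalent of:
score = cohen_kappa_score(
    submission.sort_values("id_code")["diagnosis"].values,
    answers.sort_values("id_code")["diagnosis"].values,
    weights="quadratic",
)
print(score)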
@@ -0,0 +1,75 @@
+import shutil
+from pathlib import Path
+
+from sklearn.model_selection import train_test_split
+
+from mlebench.utils import read_csv
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    """
+
+    # Create train, test from train split
+    old_train = read_csv(raw / "train.csv")
+    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+    new_test_without_labels = new_test.drop(columns=["diagnosis"])
+
+    (public / "test_images").mkdir(exist_ok=True)
+    (public / "train_images").mkdir(exist_ok=True)
+
+    # Copy data
+    for file_id in new_train["id_code"]:
+        shutil.copyfile(
+            src=raw / "train_images" / f"{file_id}.png",
+            dst=public / "train_images" / f"{file_id}.png",
+        )
+
+    for file_id in new_test_without_labels["id_code"]:
+        shutil.copyfile(
+            src=raw / "train_images" / f"{file_id}.png",
+            dst=public / "test_images" / f"{file_id}.png",
+        )
+
+    # Check integrity of the files copied
+    assert set(new_train["id_code"]).isdisjoint(
+        set(new_test["id_code"])
+    ), "Train and test sets should have no shared ids"
+
+    assert len(new_test_without_labels) == len(
+        new_test
+    ), "Public and Private tests should have equal length"
+
+    assert len(list(public.glob("train_images/*.png"))) == len(
+        new_train
+    ), "Public train images should have the same number of images as the length of train set"
+
+    assert len(list(public.glob("test_images/*.png"))) == len(
+        new_test_without_labels
+    ), "Public test images should have the same number of images as the length of test set"
+
+    train_image_files = set(public.glob("train_images/*.png"))
+    test_image_files = set(public.glob("test_images/*.png"))
+    common_files = train_image_files.intersection(test_image_files)
+    assert not common_files, f"Images found in both train_images and test_images: {common_files}"
+
+    for file_id in new_test["id_code"]:
+        assert (
+            public / "test_images" / f"{file_id}.png"
+        ).exists(), f"Image file for {file_id} not found in test_images"
+
+    for file_id in new_train["id_code"]:
+        assert (
+            public / "train_images" / f"{file_id}.png"
+        ).exists(), f"Image file for {file_id} not found in train_images"
+
+    # Create a sample submission file
+    submission_df = new_test.copy()
+    submission_df["diagnosis"] = 0
+
+    # Write CSVs
+    new_train.to_csv(public / "train.csv", index=False)
+    new_test.to_csv(private / "test.csv", index=False)
+    new_test_without_labels.to_csv(public / "test.csv", index=False)
+    submission_df.to_csv(public / "sample_submission.csv", index=False)
@@ -0,0 +1,123 @@
+import shutil
+from pathlib import Path
+import pandas as pd
+
+from sklearn.model_selection import train_test_split
+
+from mlebench.utils import read_csv
+
+
+def _process_and_save_split(
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    source_images_path: Path,
+    public_path: Path,
+    private_path: Path,
+):
+    """
+    A helper function to process a single split. It handles directory creation,
+    file copying, integrity checks, and writing output CSVs.
+    """
+    # Create output directories
+    public_path.mkdir(exist_ok=True, parents=True)
+    private_path.mkdir(exist_ok=True, parents=True)
+    (public_path / "test_images").mkdir(exist_ok=True)
+    (public_path / "train_images").mkdir(exist_ok=True)
+
+    test_df_without_labels = test_df.drop(columns=["diagnosis"])
+
+    # Copy data for the current split
+    for file_id in train_df["id_code"]:
+        shutil.copyfile(
+            src=source_images_path / f"{file_id}.png",
+            dst=public_path / "train_images" / f"{file_id}.png",
+        )
+
+    for file_id in test_df_without_labels["id_code"]:
+        shutil.copyfile(
+            src=source_images_path / f"{file_id}.png",
+            dst=public_path / "test_images" / f"{file_id}.png",
+        )
+
+    # Check integrity of the files copied for the current split
+    assert set(train_df["id_code"]).isdisjoint(
+        set(test_df["id_code"])
+    ), "Train and test sets should have no shared ids"
+
+    assert len(test_df_without_labels) == len(
+        test_df
+    ), "Public and Private tests should have equal length"
+
+    assert len(list(public_path.glob("train_images/*.png"))) == len(
+        train_df
+    ), "Public train images should have the same number of images as the length of train set"
+
+    assert len(list(public_path.glob("test_images/*.png"))) == len(
+        test_df_without_labels
+    ), "Public test images should have the same number of images as the length of test set"
+
+    train_image_files = set(public_path.glob("train_images/*.png"))
+    test_image_files = set(public_path.glob("test_images/*.png"))
+    common_files = train_image_files.intersection(test_image_files)
+    assert not common_files, f"Images found in both train_images and test_images: {common_files}"
+
+    for file_id in test_df["id_code"]:
+        assert (
+            public_path / "test_images" / f"{file_id}.png"
+        ).exists(), f"Image file for {file_id} not found in test_images"
+
+    for file_id in train_df["id_code"]:
+        assert (
+            public_path / "train_images" / f"{file_id}.png"
+        ).exists(), f"Image file for {file_id} not found in train_images"
+
+    # Create a sample submission file
+    submission_df = test_df.copy()
+    submission_df["diagnosis"] = 0
+
+    # Write CSVs for the current split
+    train_df.to_csv(public_path / "train.csv", index=False)
+    test_df.to_csv(private_path / "test.csv", index=False)
+    test_df_without_labels.to_csv(public_path / "test.csv", index=False)
+    submission_df.to_csv(public_path / "sample_submission.csv", index=False)
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    It also creates a secondary validation split in public_val/private_val directories.
+    """
+
+    # --- Stage 1: Create the original train/test split ---
+    # This section remains functionally identical to the original script
+    # to ensure the contents of `public` and `private` are unchanged.
+    old_train = read_csv(raw / "train.csv")
+    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+
+    # Process and save the original split using the helper function
+    _process_and_save_split(
+        train_df=new_train,
+        test_df=new_test,
+        source_images_path=raw / "train_images",
+        public_path=public,
+        private_path=private,
+    )
+
+    # --- Stage 2: Create the new train/validation split ---
+    # This split takes the training set from Stage 1 (`new_train`) and splits it again.
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # The test size is 1/9 of the `new_train` data, which is equivalent to 10% of the
+    # original total data. This makes the new validation set (`test_val`) have the
+    # same size as the original test set (`new_test`).
+    train_val, test_val = train_test_split(new_train, test_size=1 / 9, random_state=0)
+
+    # Process and save the new validation split into the _val directories
+    _process_and_save_split(
+        train_df=train_val,
+        test_df=test_val,
+        source_images_path=raw / "train_images",
+        public_path=public_val,
+        private_path=private_val,
+    )
File without changes
@@ -0,0 +1,55 @@
+import numpy as np
+import pandas as pd
+from sklearn.metrics import mean_squared_log_error
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+    """Prepare submission and answers for RMSLE calculation."""
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
+        )
+
+    if "datetime" not in submission.columns:
+        raise InvalidSubmissionError("Submission must have a 'datetime' column")
+
+    if "count" not in submission.columns:
+        raise InvalidSubmissionError("Submission must have a 'count' column")
+
+    # Sort by datetime
+    submission = submission.sort_values("datetime").reset_index(drop=True)
+    answers = answers.sort_values("datetime").reset_index(drop=True)
+
+    # Check datetime alignment
+    if not (submission["datetime"] == answers["datetime"]).all():
+        raise InvalidSubmissionError("Submission and answers datetime mismatch")
+
+    # Extract predictions and true values
+    y_pred = submission["count"].values
+    y_true = answers["count"].values
+
+    # Validate predictions
+    try:
+        y_pred = y_pred.astype(float)
+    except (ValueError, TypeError):
+        raise InvalidSubmissionError("Predictions must be numeric")
+
+    if np.any(np.isnan(y_pred)):
+        raise InvalidSubmissionError("Predictions cannot contain NaN values")
+
+    if np.any(y_pred < 0):
+        raise InvalidSubmissionError("Predictions cannot be negative")
+
+    return y_true, y_pred
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    """Calculate RMSLE score."""
+    y_true, y_pred = prepare_for_metric(submission, answers)
+
+    # RMSLE = sqrt(MSLE)
+    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
+
+    return rmsle
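For reference, a minimal sketch of the metric this grader returns, on a hypothetical toy submission (the timestamps and counts below are made up) in the validated format: a 'datetime' column plus a non-negative numeric 'count' column.

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error

# Hypothetical toy data; both frames must cover the same set of datetimes.
answers = pd.DataFrame({"datetime": ["2012-12-19 23:00:00", "2012-12-20 00:00:00"], "count": [120, 80]})
submission = pd.DataFrame({"datetime": ["2012-12-20 00:00:00", "2012-12-19 23:00:00"], "count": [90.0, 100.0]})

# grade() sorts both frames by datetime, checks alignment and non-negativity,
# then returns RMSLE = sqrt(mean squared log error):
sub = submission.sort_values("datetime").reset_index(drop=True)
ans = answers.sort_values("datetime").reset_index(drop=True)
rmsle = np.sqrt(mean_squared_log_error(ans["count"], sub["count"].astype(float)))
print(rmsle)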
@@ -0,0 +1,37 @@
+from pathlib import Path
+import pandas as pd
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Prepare bike-sharing-demand dataset.
+
+    Raw data already contains:
+    - train.csv: training data with labels
+    - test.csv: test data without labels
+    - sampleSubmission.csv: sample submission format
+    - test_answer.csv: test labels (private)
+    """
+    # Read data
+    train = pd.read_csv(raw / "train.csv")
+    test = pd.read_csv(raw / "test.csv")
+    sample_submission = pd.read_csv(raw / "sampleSubmission.csv")
+    test_answer = pd.read_csv(raw / "test_answer.csv")
+
+    # Public files (visible to agents)
+    train.to_csv(public / "train.csv", index=False)
+    test.to_csv(public / "test.csv", index=False)
+    sample_submission.to_csv(public / "sampleSubmission.csv", index=False)
+
+    # Private files (for grading)
+    test_answer.to_csv(private / "test.csv", index=False)
+
+    # Validation checks
+    assert len(test_answer) == len(test), \
+        f"Test answer ({len(test_answer)}) and test ({len(test)}) must have same length"
+    assert len(sample_submission) == len(test), \
+        f"Sample submission ({len(sample_submission)}) and test ({len(test)}) must have same length"
+    assert "datetime" in test_answer.columns, "Test answer must have 'datetime' column"
+    assert "count" in test_answer.columns, "Test answer must have 'count' column"
+    assert "datetime" in sample_submission.columns, "Sample submission must have 'datetime' column"
+    assert "count" in sample_submission.columns, "Sample submission must have 'count' column"
@@ -0,0 +1,37 @@
+import numpy as np
+import pandas as pd
+from Levenshtein import distance
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> None:
+    id_col = "id"
+    target_col = "sentence"
+
+    assert id_col in answers.columns, f"Missing {id_col} column in answers"
+    assert target_col in answers.columns, f"Missing {target_col} column in answers"
+
+    if id_col not in submission.columns:
+        raise InvalidSubmissionError(f"Missing {id_col} column in submission")
+    if target_col not in submission.columns:
+        raise InvalidSubmissionError(f"Missing {target_col} column in submission")
+
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Submission and answers have different lengths: {len(submission)} vs {len(answers)}"
+        )
+
+    submission = submission.sort_values(id_col).reset_index(drop=True)
+    answers = answers.sort_values(id_col).reset_index(drop=True)
+
+    if (submission[id_col].values != answers[id_col].values).any():
+        raise InvalidSubmissionError("Submission and answers should contain the same IDs")
+
+    return submission[target_col], answers[target_col]
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    y_pred, y_true = prepare_for_metric(submission, answers)
+    mean_levenshtein = np.mean([distance(a, b) for a, b in zip(y_true, y_pred)])
+    return mean_levenshtein
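For reference, a minimal sketch of the metric this grader returns, using the same python-Levenshtein `distance` call on a hypothetical toy submission (the ids and sentences below are made up):

import numpy as np
import pandas as pd
from Levenshtein import distance

# Hypothetical toy data in the expected format: an 'id' column plus a 'sentence' column.
answers = pd.DataFrame({"id": [1, 2], "sentence": ["the cat sat on the mat", "hello world"]})
submission = pd.DataFrame({"id": [1, 2], "sentence": ["the cat sat on a mat", "hello there world"]})

# grade() sorts both frames by id, verifies the ids line up, then averages the per-row
# Levenshtein edit distance between true and predicted sentences (lower is better):
score = np.mean([distance(t, p) for t, p in zip(answers["sentence"], submission["sentence"])])
print(score)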