dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare the demand-forecasting-kernels-only dataset.

    Reads the raw competition CSVs, validates the held-out answers, then
    writes the agent-visible files to `public` and the grading answers to
    `private`.

    Args:
        raw: Directory containing `train.csv`, `test.csv`,
            `sample_submission.csv` and `test_answer.csv`.
        public: Output directory for files visible to agents.
        private: Output directory for grading-only files.
    """
    # Read data
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sample_submission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Validate BEFORE writing anything, so a malformed raw dataset does not
    # leave partially-written public/private files behind. (The original ran
    # these checks only after all files had already been written.)
    assert len(test_answer) == len(sample_submission), \
        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
    assert "id" in test_answer.columns, "Test answer must have 'id' column"
    assert "sales" in test_answer.columns, "Test answer must have 'sales' column"

    # Public files (visible to agents)
    train.to_csv(public / "train.csv", index=False)
    test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Private files (for grading)
    test_answer.to_csv(private / "test.csv", index=False)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from pandas import DataFrame
|
|
2
|
+
from sklearn.metrics import root_mean_squared_error
|
|
3
|
+
|
|
4
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def grade(submission: DataFrame, answers: DataFrame) -> float:
    """Score a submission against the answers with RMSE over the 'value' column.

    Args:
        submission: Competitor predictions; must have 'id' and float 'value' columns.
        answers: Ground truth; must have 'id' and float 'value' columns.

    Returns:
        The root-mean-squared error between submitted and true values.

    Raises:
        InvalidSubmissionError: If the submission is malformed (missing columns,
            non-float values, wrong row count, or mismatched ids).
        AssertionError: If the answers themselves are malformed.
    """
    assert "id" in answers.columns, "Answers must have an 'id' column."
    assert "value" in answers.columns, "Answers must have a 'value' column."

    try:
        answers["value"].astype(float)
    except ValueError as e:
        # `raise` instead of `assert False` so the check survives `python -O`.
        raise AssertionError("Answers must have float values in the 'value' column.") from e

    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'id' column.")

    if "value" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'value' column.")

    try:
        submission["value"].astype(float)
    except ValueError as e:
        raise InvalidSubmissionError("Submission must have float values in the 'value' column.") from e

    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Expected the submission to have {len(answers)} rows, but got {len(submission)}."
        )

    # Sort both frames the same way so rows line up for the comparison below.
    submission_sorted = submission.sort_values(by="id").sort_index(axis=1)
    answers_sorted = answers.sort_values(by="id").sort_index(axis=1)

    if (submission_sorted["id"].values != answers_sorted["id"].values).any():
        raise InvalidSubmissionError(
            "Expected the submission to have the same 'id' values as the answers, but they differ."
        )

    # Ground truth comes from the answers; predictions from the submission.
    # (The original had these swapped — RMSE is symmetric, so the score is
    # unchanged, but the naming is now correct.)
    y_true = answers_sorted["value"]
    y_pred = submission_sorted["value"]
    score = root_mean_squared_error(y_true=y_true, y_pred=y_pred)

    return score
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pandas import DataFrame, read_csv
|
|
7
|
+
from PIL import Image
|
|
8
|
+
from sklearn.model_selection import train_test_split
|
|
9
|
+
|
|
10
|
+
from mlebench.utils import extract
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Split the denoising-dirty-documents training data into new train/test sets.

    Moves a 20% holdout of the original training images into a public `test`
    directory, keeps the matching cleaned images private, and writes a
    per-pixel `answers.csv` (private) plus a `sampleSubmission.csv` (public).

    Args:
        raw: Directory containing `train.zip` and `train_cleaned.zip`.
        public: Output directory for agent-visible files.
        private: Output directory for grading-only files.
    """
    # Create new train and test splits
    extract(raw / "train.zip", public)

    # Original test ratio has Train set - 144 samples; Test set - 72 samples (33% ratio)
    # We set the new test ratio to 0.2 to keep the number of train samples at the
    # same order of magnitude while having enough samples for the new test set.
    old_train_img_fpaths = sorted((public / "train").glob("*.png"))
    new_train_img_fpaths, new_test_img_fpaths = train_test_split(
        old_train_img_fpaths,
        test_size=0.2,
        random_state=0,
    )

    new_test_imgs = {fpath.name for fpath in new_test_img_fpaths}

    # Create `test` directory and move the held-out noisy images into it
    (public / "test").mkdir(exist_ok=True, parents=True)

    for fpath in (public / "train").glob("*.png"):
        if fpath.name in new_test_imgs:
            shutil.move(fpath, public / "test" / fpath.name)

    # Create public `train_cleaned` directories; held-out targets go private
    extract(raw / "train_cleaned.zip", public)

    (private / "train_cleaned").mkdir(exist_ok=True, parents=True)

    for fpath in (public / "train_cleaned").glob("*.png"):
        if fpath.name in new_test_imgs:
            shutil.move(fpath, private / "train_cleaned" / fpath.name)

    # Write `answers.csv`: one row per pixel of each held-out cleaned image
    dfs = [to_df(fpath) for fpath in sorted((private / "train_cleaned").glob("*.png"))]
    answers = DataFrame(pd.concat(dfs))
    answers.to_csv(private / "answers.csv", index=False)

    shutil.rmtree(private / "train_cleaned")

    # Write `sampleSubmission.csv` with a constant placeholder value
    sample_submission = answers.copy()
    sample_submission["value"] = 1
    sample_submission.to_csv(public / "sampleSubmission.csv", index=False)

    # Sanity checks. Count the files and re-read each written CSV ONCE, then
    # assert on the locals — the original re-read the CSVs from disk for every
    # single assertion (~8 redundant reads of the large answers file).
    n_train = len(list((public / "train").glob("*.png")))
    n_test = len(list((public / "test").glob("*.png")))
    expected_train = len(old_train_img_fpaths) - len(new_test_img_fpaths)

    assert n_train == expected_train, (
        f"Expected the number of images in `train` to be "
        f"{expected_train}, but got "
        f"{n_train}."
    )

    assert n_test == len(new_test_img_fpaths), (
        f"Expected {len(new_test_img_fpaths)} in the `test` directory, but got "
        f"{n_test}."
    )

    # Validate exactly what landed on disk, not the in-memory frames.
    written_answers = read_csv(private / "answers.csv")
    written_submission = read_csv(public / "sampleSubmission.csv")

    assert written_answers.drop_duplicates().shape[0] == len(written_answers), (
        f"Expected `answers.csv` to have unique rows, but got "
        f"{written_answers.drop_duplicates().shape[0]} unique rows and "
        f"{len(written_answers)} rows in total."
    )

    assert len(written_answers) == len(written_submission), (
        f"Expected `answers.csv` and `sampleSubmission.csv` to have the same number of rows, but "
        f"got {len(written_answers)} rows in `answers.csv` and "
        f"{len(written_submission)} rows in `sampleSubmission.csv`."
    )

    assert "id" in written_answers.columns, (
        f"Expected `answers.csv` to have an 'id' column, but got "
        f"{written_answers.columns}."
    )

    assert "value" in written_answers.columns, (
        f"Expected `answers.csv` to have a 'value' column, but got "
        f"{written_answers.columns}."
    )

    assert "id" in written_submission.columns, (
        f"Expected `sampleSubmission.csv` to have an 'id' column, but got "
        f"{written_submission.columns}."
    )

    assert "value" in written_submission.columns, (
        f"Expected `sampleSubmission.csv` to have a 'value' column, but got "
        f"{written_submission.columns}."
    )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def to_df(img: Path) -> DataFrame:
    """Convert a grayscale image to a long-format DataFrame, one row per pixel.

    Each row has an `id` of the form "<stem>_<row>_<col>" (1-indexed, row-major
    order) and a `value` holding the pixel intensity rescaled to [0, 1].
    """
    image = Image.open(img).convert("L")  # force single-channel grayscale
    image_array = np.array(image) / 255.0

    rows, cols = image_array.shape

    # Build the ids with a flat comprehension and take the values straight
    # from the array in C (row-major) order — same ordering as a nested
    # row/col loop, without per-pixel Python-level appends.
    ids = [
        f"{img.stem}_{row + 1}_{col + 1}"
        for row in range(rows)
        for col in range(cols)
    ]
    df = DataFrame({"id": ids, "value": image_array.ravel()})

    assert (
        len(df) == rows * cols
    ), f"Expected the DataFrame to have {rows * cols} rows, but got {len(df)} rows."

    # Bug fix: return the DataFrame that was just built and validated instead
    # of constructing a second, identical DataFrame from scratch.
    return df
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pandas import DataFrame, read_csv
|
|
7
|
+
from PIL import Image
|
|
8
|
+
from sklearn.model_selection import train_test_split
|
|
9
|
+
|
|
10
|
+
from mlebench.utils import extract
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _generate_split_artifacts(
    train_fpaths: list[Path],
    test_fpaths: list[Path],
    source_data_dir: Path,
    public_dest: Path,
    private_dest: Path,
) -> None:
    """
    Helper function to generate all required files for a given train/test split.

    This function copies the necessary image files, creates the ground-truth
    `answers.csv` for the test set, and a `sampleSubmission.csv` for the public.
    """
    public_dest.mkdir(exist_ok=True, parents=True)
    private_dest.mkdir(exist_ok=True, parents=True)

    test_img_names = {fpath.name for fpath in test_fpaths}

    # Create public train/test directories and copy images
    (public_dest / "train").mkdir(exist_ok=True)
    (public_dest / "test").mkdir(exist_ok=True)

    for fpath in train_fpaths:
        shutil.copy(fpath, public_dest / "train" / fpath.name)

    for fpath in test_fpaths:
        shutil.copy(fpath, public_dest / "test" / fpath.name)

    # Prepare ground-truth labels: collect the cleaned counterparts of the
    # test images into a temporary directory on the private side.
    temp_private_cleaned_dir = private_dest / "train_cleaned"
    temp_private_cleaned_dir.mkdir(exist_ok=True)

    source_cleaned_dir = source_data_dir / "train_cleaned"
    for fpath in source_cleaned_dir.glob("*.png"):
        if fpath.name in test_img_names:
            shutil.copy(fpath, temp_private_cleaned_dir / fpath.name)

    # Write `answers.csv` for the test set (one row per pixel, via `to_df`)
    dfs = []
    for fpath in sorted(temp_private_cleaned_dir.glob("*.png")):
        df = to_df(fpath)
        dfs.append(df)

    answers = DataFrame(pd.concat(dfs))
    answers.to_csv(private_dest / "answers.csv", index=False)

    shutil.rmtree(temp_private_cleaned_dir)

    # Write `sampleSubmission.csv`: same ids as the answers, dummy value
    sample_submission = answers.copy()
    sample_submission["value"] = 1
    sample_submission.to_csv(public_dest / "sampleSubmission.csv", index=False)

    # Sanity checks. Read each CSV and count each directory once up front
    # instead of re-parsing the same file for every assertion (the original
    # re-read `answers.csv` up to eight times).
    n_train_pngs = len(list((public_dest / "train").glob("*.png")))
    n_test_pngs = len(list((public_dest / "test").glob("*.png")))
    answers_df = read_csv(private_dest / "answers.csv")
    sample_df = read_csv(public_dest / "sampleSubmission.csv")

    assert n_train_pngs == len(train_fpaths), (
        f"Expected the number of images in `{public_dest}/train` to be "
        f"{len(train_fpaths)}, but got "
        f"{n_train_pngs}."
    )

    assert n_test_pngs == len(test_fpaths), (
        f"Expected {len(test_fpaths)} in the `{public_dest}/test` directory, but got "
        f"{n_test_pngs}."
    )

    assert answers_df.drop_duplicates().shape[0] == len(answers_df), (
        f"Expected `{private_dest}/answers.csv` to have unique rows, but got "
        f"{answers_df.drop_duplicates().shape[0]} unique rows and "
        f"{len(answers_df)} rows in total."
    )

    assert len(answers_df) == len(sample_df), (
        f"Expected `answers.csv` and `sampleSubmission.csv` to have the same number of rows, but "
        f"got {len(answers_df)} rows in `answers.csv` and "
        f"{len(sample_df)} rows in `sampleSubmission.csv`."
    )

    assert "id" in answers_df.columns, (
        f"Expected `answers.csv` to have an 'id' column, but got "
        f"{answers_df.columns}."
    )

    assert "value" in answers_df.columns, (
        f"Expected `answers.csv` to have a 'value' column, but got "
        f"{answers_df.columns}."
    )

    assert "id" in sample_df.columns, (
        f"Expected `sampleSubmission.csv` to have an 'id' column, but got "
        f"{sample_df.columns}."
    )

    assert "value" in sample_df.columns, (
        f"Expected `sampleSubmission.csv` to have a 'value' column, but got "
        f"{sample_df.columns}."
    )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Build the original train/test split plus a parallel train/validation split."""
    # Parallel output directories for the extra validation split
    val_public_dir = public.parent / "public_val"
    val_private_dir = private.parent / "private_val"

    # Extract the raw archives into a scratch directory so `raw` is never touched
    workdir = public.parent / "temp_data_source"
    if workdir.exists():
        shutil.rmtree(workdir)
    workdir.mkdir(parents=True)

    extract(raw / "train.zip", workdir)
    extract(raw / "train_cleaned.zip", workdir)
    image_paths = sorted((workdir / "train").glob("*.png"))

    # Split 1: reproduce the original train/test partition. A 0.2 test ratio
    # keeps the train-set size at the same order of magnitude while leaving
    # enough samples for the new test set.
    base_train, base_test = train_test_split(
        image_paths,
        test_size=0.2,
        random_state=0,
    )
    _generate_split_artifacts(base_train, base_test, workdir, public, private)

    # Split 2: carve a validation set out of the training portion. Using
    # test_size=0.25 on the 80% training share yields 0.25 * 0.8 = 0.2 of all
    # images — the same size as the original test set.
    val_train, val_holdout = train_test_split(
        base_train,
        test_size=0.25,
        random_state=0,
    )
    _generate_split_artifacts(val_train, val_holdout, workdir, val_public_dir, val_private_dir)

    # Remove the scratch directory
    shutil.rmtree(workdir)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def to_df(img: Path) -> DataFrame:
    """Convert a grayscale image to a long-format DataFrame, one row per pixel.

    Each row has an `id` of the form "<stem>_<row>_<col>" (1-indexed, row-major
    order) and a `value` holding the pixel intensity rescaled to [0, 1].
    """
    image = Image.open(img).convert("L")  # force single-channel grayscale
    image_array = np.array(image) / 255.0

    rows, cols = image_array.shape

    # Build the ids with a flat comprehension and take the values straight
    # from the array in C (row-major) order — same ordering as a nested
    # row/col loop, without per-pixel Python-level appends.
    ids = [
        f"{img.stem}_{row + 1}_{col + 1}"
        for row in range(rows)
        for col in range(cols)
    ]
    df = DataFrame({"id": ids, "value": image_array.ravel()})

    assert (
        len(df) == rows * cols
    ), f"Expected the DataFrame to have {rows * cols} rows, but got {len(df)} rows."

    # Bug fix: return the DataFrame that was just built and validated instead
    # of constructing a second, identical DataFrame from scratch.
    return df
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from sklearn.metrics import roc_auc_score
|
|
3
|
+
|
|
4
|
+
from mlebench.competitions.utils import prepare_for_auroc_metric
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with ROC AUC on the 'Insult' column, keyed by 'Comment'."""
    inputs = prepare_for_auroc_metric(
        submission=submission,
        answers=answers,
        id_col="Comment",
        target_col="Insult",
    )
    y_true = inputs["y_true"]
    y_score = inputs["y_score"]
    return roc_auc_score(y_true=y_true, y_score=y_score)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from mlebench.utils import read_csv
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Lay out the competition's public/private files from the raw data.

    Copies the train set into `public` and the solution-bearing test set into
    `private`, derives an unlabeled public test set plus gold and null sample
    submissions, then asserts the resulting files are mutually consistent.
    """
    # the test set labels are provided so we can just copy things without doing the split ourselves
    shutil.copy(raw / "train.csv", public / "train.csv")
    shutil.copy(raw / "test_with_solutions.csv", private / "test.csv")

    # the public test currently online is for the verification stage, which we are ignoring
    # (we are ignoring because there is some train/test leakage and unclear how this factors in leaderboard)
    # we can recover the original public test set by removing the labels from the private test set
    # can make the gold submission while we're at it
    gold_submission = read_csv(private / "test.csv")
    # Keep only the submission columns (drops e.g. the `Usage` column)
    gold_submission = gold_submission[["Insult", "Date", "Comment"]]
    gold_submission.to_csv(private / "gold_submission.csv", index=False)

    # Public test set: gold submission minus the label column
    public_test = gold_submission.drop(columns=["Insult"]).copy()
    public_test.to_csv(public / "test.csv", index=False)

    # finally, we also make our own sample_submission, same reasons as public test
    # but match the format of what's online
    sample_submission = gold_submission.copy()
    sample_submission["Insult"] = 0
    sample_submission.to_csv(public / "sample_submission_null.csv", index=False)

    # checks — re-read the written files so the assertions exercise what is
    # actually on disk, not the in-memory frames
    public_train = read_csv(public / "train.csv")
    public_test = read_csv(public / "test.csv")
    private_test = read_csv(private / "test.csv")

    # no `Id` column in train, so we check comment content instead
    assert public_train.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Train columns should be Insult, Date, Comment"
    assert public_test.columns.to_list() == [
        "Date",
        "Comment",
    ], "Test columns should be Date, Comment"
    assert sample_submission.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Sample submission columns should be Insult, Date, Comment"
    assert gold_submission.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Gold submission columns should be Insult, Date, Comment"
    assert private_test.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
        "Usage",
    ], "Private test columns should be Insult, Date, Comment, Usage"

    assert set(public_train["Comment"]).isdisjoint(
        set(public_test["Comment"])
    ), "None of the test comments should be in the train comments"
    assert public_test.equals(
        private_test.drop(columns=["Insult", "Usage"])
    ), "Public test should be identical to private test, modulo the Insult and Usage columns"
    assert set(public_test["Comment"]) == set(
        sample_submission["Comment"]
    ), "Public test and sample submission should have the same Comments"
    assert set(public_test["Comment"]) == set(
        gold_submission["Comment"]
    ), "Public test and gold submission should have the same Comments"
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
|
|
7
|
+
from mlebench.utils import read_csv
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _create_split_files(
|
|
11
|
+
train_df: pd.DataFrame,
|
|
12
|
+
test_df_with_solutions: pd.DataFrame,
|
|
13
|
+
public_dir: Path,
|
|
14
|
+
private_dir: Path,
|
|
15
|
+
):
|
|
16
|
+
"""
|
|
17
|
+
Helper function to create the standard file structure for a given data split.
|
|
18
|
+
|
|
19
|
+
This function encapsulates the logic for generating:
|
|
20
|
+
- public/train.csv
|
|
21
|
+
- public/test.csv (unlabeled)
|
|
22
|
+
- public/sample_submission_null.csv
|
|
23
|
+
- private/test.csv (labeled, ground truth)
|
|
24
|
+
- private/gold_submission.csv
|
|
25
|
+
"""
|
|
26
|
+
# Create directories if they don't exist
|
|
27
|
+
public_dir.mkdir(exist_ok=True)
|
|
28
|
+
private_dir.mkdir(exist_ok=True)
|
|
29
|
+
|
|
30
|
+
# Save the training data
|
|
31
|
+
train_df.to_csv(public_dir / "train.csv", index=False)
|
|
32
|
+
|
|
33
|
+
# Save the full test set with solutions to the private directory
|
|
34
|
+
test_df_with_solutions.to_csv(private_dir / "test.csv", index=False)
|
|
35
|
+
|
|
36
|
+
# Create the gold submission from the private test set
|
|
37
|
+
gold_submission = test_df_with_solutions[["Insult", "Date", "Comment"]].copy()
|
|
38
|
+
gold_submission.to_csv(private_dir / "gold_submission.csv", index=False)
|
|
39
|
+
|
|
40
|
+
# Create the public test set by dropping the label
|
|
41
|
+
public_test = gold_submission.drop(columns=["Insult"]).copy()
|
|
42
|
+
public_test.to_csv(public_dir / "test.csv", index=False)
|
|
43
|
+
|
|
44
|
+
# Create a sample submission with null labels
|
|
45
|
+
sample_submission = gold_submission.copy()
|
|
46
|
+
sample_submission["Insult"] = 0
|
|
47
|
+
sample_submission.to_csv(public_dir / "sample_submission_null.csv", index=False)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Generate the original public/private split plus a parallel validation split.

    Part 1 reproduces the competition's original train/test layout in
    `public`/`private`; Part 2 re-splits the original training data into
    `public_val`/`private_val` for local validation.
    """
    # Load the original, pre-split data from the raw directory
    original_train_df = read_csv(raw / "train.csv")
    original_test_df = read_csv(raw / "test_with_solutions.csv")

    # --- Part 1: Generate the original public/private split ---
    # This block uses the original data to create the competition's primary
    # train/test split, ensuring the output is identical to the original script.
    _create_split_files(original_train_df, original_test_df, public, private)

    # --- Part 2: Generate the new validation split ---
    # This block creates a new split for local validation. It takes the original
    # training data and splits it again, creating a new, smaller training set
    # and a validation set. The outputs are saved to parallel directories.

    # Define paths for the new validation set directories
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Split the original training data to create a new train and validation set.
    # The size of the validation set is chosen to be the same as the original test set.
    # We stratify on the 'Insult' column to maintain label distribution.
    train_val_df, test_val_df = train_test_split(
        original_train_df,
        test_size=len(original_test_df),
        random_state=42,
        stratify=original_train_df["Insult"],
    )

    # Use the same helper function to create the files for the validation split,
    # ensuring an identical directory structure and naming convention.
    # NOTE(review): `test_val_df` comes from train.csv, so private_val/test.csv
    # will not have the `Usage` column that the original private test has —
    # confirm downstream consumers tolerate that difference.
    _create_split_files(train_val_df, test_val_df, public_val, private_val)

    # --- Final Checks ---
    # checks for the original split
    # NOTE(review): only the original split is asserted below; the validation
    # split's outputs are written but never checked.
    public_train = read_csv(public / "train.csv")
    public_test = read_csv(public / "test.csv")
    private_test = read_csv(private / "test.csv")
    sample_submission = read_csv(public / "sample_submission_null.csv")
    gold_submission = read_csv(private / "gold_submission.csv")

    assert public_train.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Train columns should be Insult, Date, Comment"
    assert public_test.columns.to_list() == [
        "Date",
        "Comment",
    ], "Test columns should be Date, Comment"
    assert sample_submission.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Sample submission columns should be Insult, Date, Comment"
    assert gold_submission.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Gold submission columns should be Insult, Date, Comment"
    assert private_test.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
        "Usage",
    ], "Private test columns should be Insult, Date, Comment, Usage"

    assert set(public_train["Comment"]).isdisjoint(
        set(public_test["Comment"])
    ), "None of the test comments should be in the train comments"
    assert public_test.equals(
        private_test.drop(columns=["Insult", "Usage"])
    ), "Public test should be identical to private test, modulo the Insult and Usage columns"
    assert set(public_test["Comment"]) == set(
        sample_submission["Comment"]
    ), "Public test and sample submission should have the same Comments"
    assert set(public_test["Comment"]) == set(
        gold_submission["Comment"]
    ), "Public test and gold submission should have the same Comments"
|