dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,89 @@
+ from pathlib import Path
+ import pandas as pd
+
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import read_csv
+
+
+ def _save_split_files(train_df: pd.DataFrame, test_df: pd.DataFrame, public_dir: Path, private_dir: Path):
+     """
+     Saves the train/test splits to the specified public and private directories,
+     ensuring a consistent file structure and content format.
+
+     - Creates a sample submission file in public_dir.
+     - Saves the full test set (with labels) to private_dir.
+     - Saves the train set to public_dir.
+     - Saves the test set (without labels) to public_dir.
+     """
+     # Ensure output directories exist
+     public_dir.mkdir(parents=True, exist_ok=True)
+     private_dir.mkdir(parents=True, exist_ok=True)
+
+     # Create sample submission from a copy of the full test set
+     sample_submission = test_df.copy()
+     sample_submission["EC1"] = 0.5
+     sample_submission["EC2"] = 0.5
+     sample_submission.drop(
+         sample_submission.columns.difference(["id", "EC1", "EC2"]), axis=1, inplace=True
+     )
+     sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+
+     # Create private files (full test set with labels)
+     test_df.to_csv(private_dir / "test.csv", index=False)
+
+     # Create public files visible to agents
+     train_df.to_csv(public_dir / "train.csv", index=False)
+
+     # Create public test set (without labels) from a new copy
+     public_test_df = test_df.copy()
+     public_test_df.drop(["EC1", "EC2", "EC3", "EC4", "EC5", "EC6"], axis=1, inplace=True)
+     public_test_df.to_csv(public_dir / "test.csv", index=False)
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     # Read the raw data
+     old_train = read_csv(raw / "train.csv")
+
+     # --- Step 1: Create the original train/test split ---
+     # This split is used for the main competition leaderboard. Its outputs
+     # in `public/` and `private/` must remain identical to the original script.
+     original_test_size = 0.1
+     new_train, new_test = train_test_split(
+         old_train, test_size=original_test_size, random_state=0
+     )
+
+     # Save the files for the original split, ensuring original outputs are unchanged
+     _save_split_files(new_train, new_test, public, private)
+
+     # --- Step 2: Create the new validation split ---
+     # This second split is derived from the *training data* of the first split.
+     # It creates a smaller training set and a validation set for users,
+     # saved to parallel `public_val/` and `private_val/` directories.
+
+     # Define paths for the new validation split outputs
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # To keep the new validation set size (`test_val`) the same as the original
+     # test set size (`new_test`), we adjust the split ratio:
+     # test_val_size = original_test_size / (1 - original_test_size)
+     val_split_test_size = original_test_size / (1.0 - original_test_size)
+
+     # Create the new training and validation sets from the first-pass training data
+     train_val, test_val = train_test_split(
+         new_train, test_size=val_split_test_size, random_state=0
+     )
+
+     # Save the files for the new validation split using the same helper.
+     # This guarantees the file/directory structure is mirrored.
+     _save_split_files(train_val, test_val, public_val, private_val)
+
+     # Original checks (adapted slightly for clarity after refactoring)
+     public_test_df = read_csv(public / "test.csv")
+     public_train_df = read_csv(public / "train.csv")
+     assert public_test_df.shape[1] == 32, "Public test set should have 32 columns"
+     assert public_train_df.shape[1] == 38, "Public train set should have 38 columns"
+     assert len(public_train_df) + len(public_test_df) == len(
+         old_train
+     ), "Length of new_train and new_test should equal length of old_train"
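The ratio arithmetic in Step 2 can be checked numerically: splitting off a fraction p first and then a fraction p / (1 - p) of the remainder yields a validation set the same size as the original test set. A minimal sketch (the row counts are illustrative, not taken from the dataset):

    N = 10_000                    # illustrative row count
    p = 0.1                       # original_test_size
    first_train = (1 - p) * N     # 9,000 rows survive the first split
    val_ratio = p / (1 - p)       # ~0.1111, as computed above
    assert round(val_ratio * first_train) == round(p * N)  # both are 1,000 rows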
@@ -0,0 +1,52 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics import mean_squared_error
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+     """Prepare submission and answers for RMSE calculation."""
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             f"Submission length ({len(submission)}) != answers length ({len(answers)})"
+         )
+
+     if "id" not in submission.columns:
+         raise InvalidSubmissionError("Submission must have an 'id' column")
+
+     if "MedHouseVal" not in submission.columns:
+         raise InvalidSubmissionError("Submission must have a 'MedHouseVal' column")
+
+     # Sort by id
+     submission = submission.sort_values("id").reset_index(drop=True)
+     answers = answers.sort_values("id").reset_index(drop=True)
+
+     # Check id alignment
+     if not (submission["id"] == answers["id"]).all():
+         raise InvalidSubmissionError("Submission and answers id mismatch")
+
+     # Extract predictions and true values
+     y_pred = submission["MedHouseVal"].values
+     y_true = answers["MedHouseVal"].values
+
+     # Validate predictions
+     try:
+         y_pred = y_pred.astype(float)
+     except (ValueError, TypeError):
+         raise InvalidSubmissionError("Predictions must be numeric")
+
+     if np.any(np.isnan(y_pred)):
+         raise InvalidSubmissionError("Predictions cannot contain NaN values")
+
+     return y_true, y_pred
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     """Calculate RMSE score."""
+     y_true, y_pred = prepare_for_metric(submission, answers)
+
+     # RMSE
+     rmse = np.sqrt(mean_squared_error(y_true, y_pred))
+
+     return rmse
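A usage sketch for this grader, assuming grade is imported from this module (the two-row DataFrames are made up for illustration):

    import pandas as pd

    submission = pd.DataFrame({"id": [0, 1], "MedHouseVal": [2.5, 1.8]})
    answers = pd.DataFrame({"id": [0, 1], "MedHouseVal": [2.0, 2.0]})
    # RMSE = sqrt((0.5**2 + 0.2**2) / 2) ≈ 0.3808; lower is better
    print(grade(submission, answers))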
@@ -0,0 +1,25 @@
+ from pathlib import Path
+ import pandas as pd
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Prepare playground-series-s3e1 dataset.
+     """
+     # Read data
+     train = pd.read_csv(raw / "train.csv")
+     test = pd.read_csv(raw / "test.csv")
+     sample_submission = pd.read_csv(raw / "sample_submission.csv")
+     test_answer = pd.read_csv(raw / "test_answer.csv")
+
+     # Public files (visible to agents)
+     train.to_csv(public / "train.csv", index=False)
+     test.to_csv(public / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     # Private files (for grading)
+     test_answer.to_csv(private / "test.csv", index=False)
+
+     # Validation checks
+     assert len(test_answer) == len(sample_submission), \
+         f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
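An illustrative end-to-end call of this prepare function, with hypothetical toy fixtures standing in for the raw Kaggle files (the column names are made up; only the four file names match the code):

    from pathlib import Path
    import pandas as pd

    raw, public, private = Path("raw"), Path("public"), Path("private")
    for d in (raw, public, private):
        d.mkdir(exist_ok=True)
    pd.DataFrame({"id": [0], "x": [1.0], "y": [0.5]}).to_csv(raw / "train.csv", index=False)
    pd.DataFrame({"id": [1], "x": [2.0]}).to_csv(raw / "test.csv", index=False)
    pd.DataFrame({"id": [1], "y": [0.0]}).to_csv(raw / "sample_submission.csv", index=False)
    pd.DataFrame({"id": [1], "y": [0.7]}).to_csv(raw / "test_answer.csv", index=False)
    prepare(raw, public, private)  # writes the three public CSVs and private/test.csv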
@@ -0,0 +1,55 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics import mean_squared_log_error
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+     """Prepare submission and answers for RMSLE calculation."""
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             f"Submission length ({len(submission)}) != answers length ({len(answers)})"
+         )
+
+     if "id" not in submission.columns:
+         raise InvalidSubmissionError("Submission must have an 'id' column")
+
+     if "cost" not in submission.columns:
+         raise InvalidSubmissionError("Submission must have a 'cost' column")
+
+     # Sort by id
+     submission = submission.sort_values("id").reset_index(drop=True)
+     answers = answers.sort_values("id").reset_index(drop=True)
+
+     # Check id alignment
+     if not (submission["id"] == answers["id"]).all():
+         raise InvalidSubmissionError("Submission and answers id mismatch")
+
+     # Extract predictions and true values
+     y_pred = submission["cost"].values
+     y_true = answers["cost"].values
+
+     # Validate predictions
+     try:
+         y_pred = y_pred.astype(float)
+     except (ValueError, TypeError):
+         raise InvalidSubmissionError("Predictions must be numeric")
+
+     if np.any(np.isnan(y_pred)):
+         raise InvalidSubmissionError("Predictions cannot contain NaN values")
+
+     if np.any(y_pred < 0):
+         raise InvalidSubmissionError("Predictions cannot be negative")
+
+     return y_true, y_pred
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     """Calculate RMSLE score."""
+     y_true, y_pred = prepare_for_metric(submission, answers)
+
+     # RMSLE
+     rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
+
+     return rmsle
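Unlike the RMSE grader, this one also rejects negative predictions, because scikit-learn's mean_squared_log_error applies log1p and raises on negative inputs. An illustrative call (values are made up):

    import pandas as pd

    submission = pd.DataFrame({"id": [0, 1], "cost": [100.0, 50.0]})
    answers = pd.DataFrame({"id": [0, 1], "cost": [110.0, 45.0]})
    print(grade(submission, answers))  # RMSLE over log1p-transformed values; lower is better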
@@ -0,0 +1,25 @@
+ from pathlib import Path
+ import pandas as pd
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Prepare playground-series-s3e11 dataset.
+     """
+     # Read data
+     train = pd.read_csv(raw / "train.csv")
+     test = pd.read_csv(raw / "test.csv")
+     sample_submission = pd.read_csv(raw / "sample_submission.csv")
+     test_answer = pd.read_csv(raw / "test_answer.csv")
+
+     # Public files (visible to agents)
+     train.to_csv(public / "train.csv", index=False)
+     test.to_csv(public / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     # Private files (for grading)
+     test_answer.to_csv(private / "test.csv", index=False)
+
+     # Validation checks
+     assert len(test_answer) == len(sample_submission), \
+         f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
@@ -0,0 +1,44 @@
+ from pandas import DataFrame
+ from sklearn.metrics import mean_absolute_error
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def prepare_for_grading(submission: DataFrame, answers: DataFrame, target_column: str) -> DataFrame:
+     """
+     Merge the submission's target_column into the answers DataFrame, matching on 'segment_id'.
+     target_column from the submission DataFrame will have the suffix '_pred',
+     and target_column from the answers DataFrame will have the suffix '_true'.
+     """
+
+     # Answers checks
+     assert (
+         target_column in answers.columns
+     ), f"Target column {target_column} not found in answers DataFrame."
+     assert "segment_id" in answers.columns, "Segment ID column not found in answers DataFrame."
+
+     # Submission checks
+     if target_column not in submission.columns:
+         raise InvalidSubmissionError(
+             f"Target column {target_column} not found in submission DataFrame."
+         )
+     if "segment_id" not in submission.columns:
+         raise InvalidSubmissionError("Segment ID column not found in submission DataFrame.")
+     if not set(submission["segment_id"]) == set(answers["segment_id"]):
+         raise InvalidSubmissionError(
+             f"Submission is missing the following segment_ids: {set(answers['segment_id']) - set(submission['segment_id'])}"
+         )
+
+     merged = answers.merge(
+         submission[["segment_id", target_column]], on="segment_id", suffixes=("_true", "_pred")
+     )
+     return merged
+
+
+ def grade(submission: DataFrame, answers: DataFrame) -> float:
+     merged = prepare_for_grading(submission, answers, "time_to_eruption")
+     score = mean_absolute_error(
+         y_true=merged["time_to_eruption_true"], y_pred=merged["time_to_eruption_pred"]
+     )
+     rounded_score = int(round(score))
+     return rounded_score
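Note that this grader rounds the mean absolute error to the nearest integer before returning it. A minimal illustrative call (segment ids and values are made up):

    import pandas as pd

    answers = pd.DataFrame({"segment_id": [1, 2], "time_to_eruption": [1000, 2000]})
    submission = pd.DataFrame({"segment_id": [1, 2], "time_to_eruption": [900, 2200]})
    print(grade(submission, answers))  # int(round((100 + 200) / 2)) == 150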
@@ -0,0 +1,68 @@
+ import shutil
+ from pathlib import Path
+
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     # Create train, test from train split
+     old_train = read_csv(raw / "train.csv")
+     new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
+
+     # Create sample submission
+     submission_df = answers.copy()
+     submission_df["time_to_eruption"] = 0
+
+     # Checks
+     assert len(answers) == len(submission_df), "Answers and submission should have the same length"
+     assert not any(
+         new_train["segment_id"].isin(answers["segment_id"])
+     ), "No segment_id should be in both train and answers"
+     assert list(new_train.columns) == [
+         "segment_id",
+         "time_to_eruption",
+     ], "new_train should have columns 'segment_id' and 'time_to_eruption'"
+     assert list(submission_df.columns) == [
+         "segment_id",
+         "time_to_eruption",
+     ], "submission_df should have columns 'segment_id' and 'time_to_eruption'"
+     assert list(answers.columns) == [
+         "segment_id",
+         "time_to_eruption",
+     ], "answers should have columns 'segment_id' and 'time_to_eruption'"
+     assert len(new_train) + len(answers) == len(
+         old_train
+     ), "The sum of the length of new_train and answers should be equal to the length of old_train"
+
+     # Write CSVs
+     answers.to_csv(private / "test.csv", index=False)
+     new_train.to_csv(public / "train.csv", index=False)
+     submission_df.to_csv(public / "sample_submission.csv", index=False)
+
+     # Copy over files
+     (public / "test").mkdir(exist_ok=True)
+     (public / "train").mkdir(exist_ok=True)
+
+     for file_id in tqdm(new_train["segment_id"], desc="Copying train files"):
+         shutil.copyfile(
+             src=raw / "train" / f"{file_id}.csv",
+             dst=public / "train" / f"{file_id}.csv",
+         )
+
+     for file_id in tqdm(answers["segment_id"], desc="Copying test files"):
+         shutil.copyfile(
+             src=raw / "train" / f"{file_id}.csv",
+             dst=public / "test" / f"{file_id}.csv",
+         )
+
+     # Checks on files
+     assert len(list(public.glob("train/*.csv"))) == len(
+         new_train
+     ), "Public train should have the same number of files as new_train"
+     assert len(list(public.glob("test/*.csv"))) == len(
+         answers
+     ), "Public test should have the same number of files as answers"
@@ -0,0 +1,146 @@
+ import shutil
+ from pathlib import Path
+ from typing import Tuple
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def _create_split(
+     source_df: pd.DataFrame,
+     public_path: Path,
+     private_path: Path,
+     raw_files_dir: Path,
+     test_size: float,
+     random_state: int,
+ ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+     """
+     Splits a dataframe into train and test sets, and creates the corresponding
+     directory structure with data files and labels.
+
+     Args:
+         source_df: The dataframe to split.
+         public_path: The path to the public output directory.
+         private_path: The path to the private output directory (for test labels).
+         raw_files_dir: The path to the directory containing the raw data files.
+         test_size: The proportion of the dataset to allocate to the test split.
+         random_state: The seed for the random number generator.
+
+     Returns:
+         A tuple containing the new training dataframe and the answers dataframe.
+     """
+     # Ensure output directories exist
+     public_path.mkdir(parents=True, exist_ok=True)
+     private_path.mkdir(parents=True, exist_ok=True)
+
+     # Create train, test from the source dataframe
+     new_train, answers = train_test_split(
+         source_df, test_size=test_size, random_state=random_state
+     )
+
+     # Create sample submission
+     submission_df = answers.copy()
+     submission_df["time_to_eruption"] = 0
+
+     # Checks
+     assert len(answers) == len(submission_df), "Answers and submission should have the same length"
+     assert not any(
+         new_train["segment_id"].isin(answers["segment_id"])
+     ), "No segment_id should be in both train and answers"
+     assert list(new_train.columns) == [
+         "segment_id",
+         "time_to_eruption",
+     ], "new_train should have columns 'segment_id' and 'time_to_eruption'"
+     assert list(submission_df.columns) == [
+         "segment_id",
+         "time_to_eruption",
+     ], "submission_df should have columns 'segment_id' and 'time_to_eruption'"
+     assert list(answers.columns) == [
+         "segment_id",
+         "time_to_eruption",
+     ], "answers should have columns 'segment_id' and 'time_to_eruption'"
+     assert len(new_train) + len(answers) == len(
+         source_df
+     ), "The sum of the length of new_train and answers should be equal to the length of source_df"
+
+     # Write CSVs
+     answers.to_csv(private_path / "test.csv", index=False)
+     new_train.to_csv(public_path / "train.csv", index=False)
+     submission_df.to_csv(public_path / "sample_submission.csv", index=False)
+
+     # Copy over files
+     (public_path / "test").mkdir(exist_ok=True)
+     (public_path / "train").mkdir(exist_ok=True)
+
+     for file_id in tqdm(new_train["segment_id"], desc=f"Copying train files to {public_path.name}"):
+         shutil.copyfile(
+             src=raw_files_dir / f"{file_id}.csv",
+             dst=public_path / "train" / f"{file_id}.csv",
+         )
+
+     for file_id in tqdm(answers["segment_id"], desc=f"Copying test files to {public_path.name}"):
+         shutil.copyfile(
+             src=raw_files_dir / f"{file_id}.csv",
+             dst=public_path / "test" / f"{file_id}.csv",
+         )
+
+     # Checks on files
+     assert len(list(public_path.glob("train/*.csv"))) == len(
+         new_train
+     ), f"Public train in {public_path.name} should have the same number of files as its train split"
+     assert len(list(public_path.glob("test/*.csv"))) == len(
+         answers
+     ), f"Public test in {public_path.name} should have the same number of files as its answer key"
+
+     return new_train, answers
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Prepares the dataset by creating two splits:
+     1. A standard train/test split for the main competition.
+     2. A subsequent train/validation split for model development, created from
+        the training set of the first split.
+     """
+     RANDOM_STATE = 0
+     TEST_SIZE_ORIGINAL = 0.1
+
+     # Load the full raw training data manifest
+     original_train_df = read_csv(raw / "train.csv")
+     raw_files_dir = raw / "train"
+
+     # --- 1. Create the original train/test split for `public` and `private` ---
+     # This call produces the primary competition data. Its outputs must remain
+     # identical to those of the original script.
+     competition_train_df, competition_test_answers = _create_split(
+         source_df=original_train_df,
+         public_path=public,
+         private_path=private,
+         raw_files_dir=raw_files_dir,
+         test_size=TEST_SIZE_ORIGINAL,
+         random_state=RANDOM_STATE,
+     )
+
+     # --- 2. Create the new train/validation split for `public_val` and `private_val` ---
+     # This call creates a new split from the training data generated above.
+     # The new directories will mirror the structure of the originals.
+     public_val_path = public.parent / "public_val"
+     private_val_path = private.parent / "private_val"
+
+     # Calculate the test size for the second split to ensure the number of samples
+     # in the new validation set is the same as in the original test set.
+     validation_test_size = len(competition_test_answers) / len(competition_train_df)
+
+     # Create the new split using the same logic and random state.
+     # The source data is `competition_train_df`, the training set from the first split.
+     _create_split(
+         source_df=competition_train_df,
+         public_path=public_val_path,
+         private_path=private_val_path,
+         raw_files_dir=raw_files_dir,
+         test_size=validation_test_size,
+         random_state=RANDOM_STATE,
+     )
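Since the first split holds out a fraction p = 0.1 of the rows, validation_test_size works out to approximately p / (1 - p) (about 1/9 here), matching the ratio formula used in the other prepare_val scripts. An illustrative invocation (paths are hypothetical; the function expects the raw Kaggle layout):

    from pathlib import Path

    raw, public, private = Path("raw"), Path("public"), Path("private")
    # Expects raw/train.csv (the manifest) plus one raw/train/<segment_id>.csv per segment;
    # writes public/ and private/, then the mirrored public_val/ and private_val/ trees.
    prepare(raw, public, private)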
@@ -0,0 +1,14 @@
+ import pandas as pd
+ from sklearn.metrics import roc_auc_score
+
+ from mlebench.competitions.utils import prepare_for_auroc_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     roc_auc_inputs = prepare_for_auroc_metric(
+         submission=submission,
+         answers=answers,
+         id_col="request_id",
+         target_col="requester_received_pizza",
+     )
+     return roc_auc_score(y_true=roc_auc_inputs["y_true"], y_score=roc_auc_inputs["y_score"])
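prepare_for_auroc_metric comes from mlebench/competitions/utils.py (listed in the file index above); given that the code indexes its result with "y_true" and "y_score", a call looks like this (rows made up for illustration):

    import pandas as pd

    answers = pd.DataFrame({"request_id": ["a", "b"], "requester_received_pizza": [1, 0]})
    submission = pd.DataFrame({"request_id": ["a", "b"], "requester_received_pizza": [0.9, 0.2]})
    print(grade(submission, answers))  # 1.0 here, since the positive row is ranked first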
@@ -0,0 +1,80 @@
+ import json
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     """
+
+     # Load data
+     with open(raw / "train.json") as f:
+         old_train = json.load(f)
+     with open(raw / "test.json") as f:
+         old_test = json.load(f)
+
+     test_ratio = len(old_test) / (len(old_train) + len(old_test))
+
+     all_fields = list([key for key in old_train[0].keys()])
+     assert all(set(all_fields) == set([key for key in sample.keys()]) for sample in old_train)
+     test_fields = list([key for key in old_test[0].keys()])
+     assert all(set(test_fields) == set([key for key in sample.keys()]) for sample in old_test)
+
+     # Create train, test from train split
+     new_train, new_test = train_test_split(old_train, test_size=test_ratio, random_state=0)
+
+     new_test_without_labels = []
+     for sample in new_test:
+         # Keep only the fields in the test set
+         sample = {key: sample[key] for key in test_fields}
+         new_test_without_labels.append(sample)
+
+     # Save the new train and test splits
+     with open(public / "train.json", "w") as f:
+         json.dump(new_train, f, indent=4)
+     with open(public / "test.json", "w") as f:
+         json.dump(new_test_without_labels, f, indent=4)
+     # Also save zipped versions ("train.zip" and "test.zip")
+     shutil.make_archive(public / "train", "zip", public, "train.json")
+     shutil.make_archive(public / "test", "zip", public, "test.json")
+
+     # Create answers
+     answers_rows = []
+     for sample in new_test:
+         answers_rows.append(
+             {
+                 "request_id": sample["request_id"],
+                 "requester_received_pizza": int(sample["requester_received_pizza"]),
+             }
+         )
+     answers = pd.DataFrame(answers_rows)
+     answers.to_csv(private / "test.csv", index=False)
+
+     # Create sample submission
+     sample_submission = answers.copy()
+     sample_submission["requester_received_pizza"] = 0
+     sample_submission.to_csv(public / "sampleSubmission.csv", index=False)
+
+     # Checks
+     assert len(new_train) + len(new_test) == len(
+         old_train
+     ), f"Expected {len(old_train)} total samples in new_train ({len(new_train)}) and new_test ({len(new_test)})"
+     assert len(new_test) == len(
+         new_test_without_labels
+     ), f"Expected new_test ({len(new_test)}) to have the same length as new_test_without_labels ({len(new_test_without_labels)})"
+     assert len(answers) == len(
+         new_test
+     ), f"Expected answers ({len(answers)}) to have the same length as new_test ({len(new_test)})"
+     assert len(sample_submission) == len(
+         new_test
+     ), f"Expected sample_submission ({len(sample_submission)}) to have the same length as new_test ({len(new_test)})"
+     assert set(answers.columns) == set(
+         ["request_id", "requester_received_pizza"]
+     ), "Answers must have 'request_id' and 'requester_received_pizza' columns"
+     assert set(sample_submission.columns) == set(
+         ["request_id", "requester_received_pizza"]
+     ), "Sample submission must have 'request_id' and 'requester_received_pizza' columns"
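For reference on the zip step above: shutil.make_archive(base_name, "zip", root_dir, base_dir) writes <base_name>.zip containing <base_dir> taken from <root_dir>, so the two calls produce train.zip and test.zip alongside the plain JSON files. A standalone sketch (paths hypothetical):

    import json
    import shutil
    from pathlib import Path

    public = Path("public")
    public.mkdir(exist_ok=True)
    (public / "train.json").write_text(json.dumps([{"request_id": "a"}]))
    shutil.make_archive(str(public / "train"), "zip", public, "train.json")
    assert (public / "train.zip").exists()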