dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py
@@ -0,0 +1,139 @@
+import shutil
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+
+from mlebench.utils import read_csv
+
+
+def verify_directory_sync(df: pd.DataFrame, expected_dir: Path, unexpected_dir: Path):
+    """
+    Checks that the dataframe contents match the directory structure.
+    """
+    for _, row in tqdm(
+        df.iterrows(), desc=f"Verifying directory sync for {expected_dir.name}", total=len(df)
+    ):
+        case_day_path = expected_dir / row["case"] / f"{row['case']}_{row['day']}"
+        assert (
+            case_day_path.exists()
+        ), f"Directory {case_day_path} does not exist but is listed in the dataframe."
+        non_existent_path = unexpected_dir / row["case"] / f"{row['case']}_{row['day']}"
+        assert (
+            not non_existent_path.exists()
+        ), f"Directory {non_existent_path} exists but is not listed in the dataframe."
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    old_train = read_csv(raw / "train.csv")
+
+    # ----------------------- Splitting
+    # Extract case and day from 'id'
+    old_train["case"] = old_train["id"].apply(lambda x: x.split("_")[0])
+    old_train["day"] = old_train["id"].apply(lambda x: x.split("_")[1])
+    old_train["slice"] = old_train["id"].apply(lambda x: x.split("_")[-1])
+
+    # Split cases into train and test
+    unique_cases = old_train["case"].unique()
+    train_cases, test_cases = train_test_split(unique_cases, test_size=0.1, random_state=42)
+
+    # Initially assign entire cases to train or test set
+    old_train["set"] = old_train["case"].apply(lambda x: "test" if x in test_cases else "train")
+
+    # Then mark some days from train to be test, to match competition test description
+    days_df = old_train[old_train["set"] == "train"].groupby("case")["day"].apply(set).reset_index()
+    for _, row in days_df.iterrows():
+        # if theres more than 4 days, we will move any days past the 4th to the test set
+        days = row["day"]
+        if len(days) > 4:
+            days = sorted(days, key=lambda x: int(x[len("day") :]))
+            days_to_move = days[4:]
+            # change their set to "test"
+            old_train.loc[
+                old_train["case"].eq(row["case"]) & old_train["day"].isin(days_to_move), "set"
+            ] = "test"
+
+    # ----------------------- Move the files to the correct new locations
+    old_train_dir = raw / "train"
+    new_train_dir = public / "train"
+    new_test_dir = public / "test"
+
+    # Create new directories if they don't exist
+    new_train_dir.mkdir(parents=True, exist_ok=True)
+    new_test_dir.mkdir(parents=True, exist_ok=True)
+
+    # Move directories based on the set assignment
+    for case in tqdm(unique_cases, desc="Splitting by case"):
+        original_path = old_train_dir / case
+        if case in train_cases:
+            new_path = new_train_dir / case
+        else:
+            new_path = new_test_dir / case
+        # new_path.mkdir(parents=True, exist_ok=True)
+        shutil.copytree(original_path, new_path, dirs_exist_ok=True)
+
+    # Move specific days from public/train/ to public/test/ for marked case-days
+    for _, row in tqdm(
+        old_train.iterrows(), desc="Handling additional day-based splits", total=len(old_train)
+    ):
+        if row["set"] == "test":
+            source_day_path = new_train_dir / row["case"] / f"{row['case']}_{row['day']}"
+            target_day_path = new_test_dir / row["case"] / f"{row['case']}_{row['day']}"
+            if source_day_path.exists():
+                target_day_path.parent.mkdir(parents=True, exist_ok=True)
+                shutil.move(source_day_path.as_posix(), target_day_path.as_posix())
+
+    # ------------------------ Saving splits
+    new_train = old_train[old_train["set"] == "train"].copy()
+    new_test = old_train[old_train["set"] == "test"].copy()
+    # some asserts before we drop columns
+    verify_directory_sync(new_train, expected_dir=new_train_dir, unexpected_dir=new_test_dir)
+    verify_directory_sync(new_test, expected_dir=new_test_dir, unexpected_dir=new_train_dir)
+
+    # get image height and image width for the test set, since this is needed for the metric
+    for _, row in tqdm(
+        new_test.iterrows(), desc="Getting image dimensions for test set", total=len(new_test)
+    ):
+        case, day, day_slice = row["case"], row["day"], row["slice"]
+        image_paths = list(
+            (old_train_dir / case / f"{case}_{day}" / "scans").glob(f"slice_{day_slice}_*.png")
+        )
+        assert len(image_paths) == 1, f"Expected 1 image, found {len(image_paths)}"
+        image_path = image_paths[0]
+        width, height = (int(length) for length in image_path.stem.split("_")[2:4])
+        new_test.loc[row.name, "image_width"] = width
+        new_test.loc[row.name, "image_height"] = height
+
+    # dont need these anymore, and werent part of the original data
+    new_train.drop(columns=["set", "case", "day", "slice"], inplace=True)
+    new_test.drop(columns=["set", "case", "day", "slice"], inplace=True)
+
+    # create sample submission
+    sample_submission = new_test.copy()
+    sample_submission["segmentation"] = "1 1 5 2"
+    # these are just metadata for the private test set necessary for the metric
+    sample_submission.drop(columns=["image_height", "image_width"], inplace=True)
+    # rename 'segmentation' to 'predicted' to match kaggle.com
+    sample_submission.rename(columns={"segmentation": "predicted"}, inplace=True)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False, na_rep="")
+
+    # create private files
+    # rename 'segmentation' to 'predicted' to match sample_submission format
+    new_test.rename(columns={"segmentation": "predicted"}, inplace=True)
+    new_test.to_csv(private / "test.csv", index=False, na_rep="")
+
+    # create public files
+    new_train.to_csv(public / "train.csv", index=False, na_rep="")
+    # including this because we are converting this from code to csv competition
+    # and we need to point the model to the ids it needs to produce labels for
+    new_test_without_labels = new_test.drop(columns=["predicted", "image_width", "image_height"])
+    new_test_without_labels.to_csv(public / "test.csv", index=False, na_rep="")
+
+    # ------------------------ checks
+
+    assert new_test_without_labels.shape[1] == 2, "Public test should have 2 columns."
+    assert new_train.shape[1] == 3, "Public train should have 3 columns."
+    assert len(new_train) + len(new_test) == len(
+        old_train
+    ), "Train and test should sum up to the original data."
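
The one non-obvious step in this script is how the test-set image dimensions are recovered: they are parsed from the scan filenames rather than read from the PNGs (the stem's third and fourth underscore-separated tokens). A minimal, self-contained sketch of that rule, using a hypothetical filename:

```python
from pathlib import Path

# Hypothetical scan filename; real scans follow a comparable
# slice_<id>_<width>_<height>_<...>.png naming pattern.
image_path = Path("scans/slice_0001_266_266_1.50_1.50.png")

# Same rule as prepare(): stem tokens [2:4] are pixel width and height.
width, height = (int(tok) for tok in image_path.stem.split("_")[2:4])
print(width, height)  # -> 266 266
```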
mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py
@@ -0,0 +1,193 @@
+import shutil
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm.auto import tqdm
+
+from mlebench.utils import read_csv
+
+
+def verify_directory_sync(df: pd.DataFrame, expected_dir: Path, unexpected_dir: Path):
+    """
+    Checks that the dataframe contents match the directory structure.
+    """
+    for _, row in tqdm(
+        df.iterrows(), desc=f"Verifying directory sync for {expected_dir.name}", total=len(df)
+    ):
+        case_day_path = expected_dir / row["case"] / f"{row['case']}_{row['day']}"
+        assert (
+            case_day_path.exists()
+        ), f"Directory {case_day_path} does not exist but is listed in the dataframe."
+        non_existent_path = unexpected_dir / row["case"] / f"{row['case']}_{row['day']}"
+        assert (
+            not non_existent_path.exists()
+        ), f"Directory {non_existent_path} exists but is not listed in the dataframe."
+
+
+def _create_split(
+    input_df: pd.DataFrame,
+    raw_images_dir: Path,
+    output_public_path: Path,
+    output_private_path: Path,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Helper function to perform a train/test split on a dataframe, move image files accordingly,
+    and save the resulting CSVs and submission files.
+
+    Args:
+        input_df: The dataframe to be split.
+        raw_images_dir: The source directory of all raw image data.
+        output_public_path: The destination directory for public files (e.g., public/ or public_val/).
+        output_private_path: The destination directory for private files (e.g., private/ or private_val/).
+
+    Returns:
+        A tuple containing the created training and testing dataframes.
+    """
+    # ----------------------- Splitting
+    # Extract case and day from 'id'
+    df_to_split = input_df.copy()
+    df_to_split["case"] = df_to_split["id"].apply(lambda x: x.split("_")[0])
+    df_to_split["day"] = df_to_split["id"].apply(lambda x: x.split("_")[1])
+    df_to_split["slice"] = df_to_split["id"].apply(lambda x: x.split("_")[-1])
+
+    # Split cases into train and test
+    unique_cases = df_to_split["case"].unique()
+    train_cases, test_cases = train_test_split(unique_cases, test_size=0.1, random_state=42)
+
+    # Initially assign entire cases to train or test set
+    df_to_split["set"] = df_to_split["case"].apply(lambda x: "test" if x in test_cases else "train")
+
+    # Then mark some days from train to be test, to match competition test description
+    days_df = df_to_split[df_to_split["set"] == "train"].groupby("case")["day"].apply(set).reset_index()
+    for _, row in days_df.iterrows():
+        # if theres more than 4 days, we will move any days past the 4th to the test set
+        days = row["day"]
+        if len(days) > 4:
+            days = sorted(days, key=lambda x: int(x[len("day") :]))
+            days_to_move = days[4:]
+            # change their set to "test"
+            df_to_split.loc[
+                df_to_split["case"].eq(row["case"]) & df_to_split["day"].isin(days_to_move), "set"
+            ] = "test"
+
+    # ----------------------- Move the files to the correct new locations
+    new_train_dir = output_public_path / "train"
+    new_test_dir = output_public_path / "test"
+
+    # Create new directories if they don't exist
+    new_train_dir.mkdir(parents=True, exist_ok=True)
+    new_test_dir.mkdir(parents=True, exist_ok=True)
+    output_private_path.mkdir(parents=True, exist_ok=True)
+
+    # Move directories based on the set assignment
+    for case in tqdm(unique_cases, desc=f"Splitting by case for {output_public_path.name}"):
+        original_path = raw_images_dir / case
+        if case in train_cases:
+            new_path = new_train_dir / case
+        else:
+            new_path = new_test_dir / case
+        shutil.copytree(original_path, new_path, dirs_exist_ok=True)
+
+    # Move specific days from public/train/ to public/test/ for marked case-days
+    for _, row in tqdm(
+        df_to_split.iterrows(),
+        desc=f"Handling day-based splits for {output_public_path.name}",
+        total=len(df_to_split),
+    ):
+        if row["set"] == "test":
+            source_day_path = new_train_dir / row["case"] / f"{row['case']}_{row['day']}"
+            target_day_path = new_test_dir / row["case"] / f"{row['case']}_{row['day']}"
+            if source_day_path.exists():
+                target_day_path.parent.mkdir(parents=True, exist_ok=True)
+                shutil.move(source_day_path.as_posix(), target_day_path.as_posix())
+
+    # ------------------------ Saving splits
+    new_train = df_to_split[df_to_split["set"] == "train"].copy()
+    new_test = df_to_split[df_to_split["set"] == "test"].copy()
+    # some asserts before we drop columns
+    verify_directory_sync(new_train, expected_dir=new_train_dir, unexpected_dir=new_test_dir)
+    verify_directory_sync(new_test, expected_dir=new_test_dir, unexpected_dir=new_train_dir)
+
+    # get image height and image width for the test set, since this is needed for the metric
+    for _, row in tqdm(
+        new_test.iterrows(),
+        desc=f"Getting image dimensions for {output_public_path.name} test set",
+        total=len(new_test),
+    ):
+        case, day, day_slice = row["case"], row["day"], row["slice"]
+        image_paths = list(
+            (raw_images_dir / case / f"{case}_{day}" / "scans").glob(f"slice_{day_slice}_*.png")
+        )
+        assert len(image_paths) == 1, f"Expected 1 image, found {len(image_paths)}"
+        image_path = image_paths[0]
+        width, height = (int(length) for length in image_path.stem.split("_")[2:4])
+        new_test.loc[row.name, "image_width"] = width
+        new_test.loc[row.name, "image_height"] = height
+
+    # dont need these anymore, and werent part of the original data
+    new_train.drop(columns=["set", "case", "day", "slice"], inplace=True)
+    new_test.drop(columns=["set", "case", "day", "slice"], inplace=True)
+
+    # create sample submission
+    sample_submission = new_test.copy()
+    sample_submission["segmentation"] = "1 1 5 2"
+    sample_submission.drop(columns=["image_height", "image_width"], inplace=True)
+    sample_submission.rename(columns={"segmentation": "predicted"}, inplace=True)
+    sample_submission.to_csv(output_public_path / "sample_submission.csv", index=False, na_rep="")
+
+    # create private files
+    new_test.rename(columns={"segmentation": "predicted"}, inplace=True)
+    new_test.to_csv(output_private_path / "test.csv", index=False, na_rep="")
+
+    # create public files
+    new_train.to_csv(output_public_path / "train.csv", index=False, na_rep="")
+    new_test_without_labels = new_test.drop(columns=["predicted", "image_width", "image_height"])
+    new_test_without_labels.to_csv(output_public_path / "test.csv", index=False, na_rep="")
+
+    # ------------------------ checks
+    assert new_test_without_labels.shape[1] == 2, "Public test should have 2 columns."
+    assert new_train.shape[1] == 3, "Public train should have 3 columns."
+    assert len(new_train) + len(new_test) == len(
+        input_df
+    ), "Train and test should sum up to the original data."
+
+    return new_train, new_test
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Prepares the raw data by creating two sets of splits:
+    1. A main train/test split for the final competition (`public`/`private`).
+    2. A validation split from the main training data (`public_val`/`private_val`).
+    """
+    initial_train_df = read_csv(raw / "train.csv")
+    raw_images_dir = raw / "train"
+
+    # --- 1. Create the original train/test split ---
+    # This generates the primary competition files in public/ and private/.
+    # The output of this step will remain identical to the original script.
+    print("--- Generating main train/test split for 'public' and 'private' directories ---")
+    main_train_df, _ = _create_split(
+        input_df=initial_train_df,
+        raw_images_dir=raw_images_dir,
+        output_public_path=public,
+        output_private_path=private,
+    )
+
+    # --- 2. Create the validation split from the main training set ---
+    # This takes the training data from the first split and splits it again,
+    # creating a smaller training set and a validation set.
+    # The outputs are saved in parallel directories to avoid conflicts.
+    print("\n--- Generating validation split for 'public_val' and 'private_val' directories ---")
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    _create_split(
+        input_df=main_train_df,  # Use the training set from the first split as input
+        raw_images_dir=raw_images_dir,  # Image sources are the same
+        output_public_path=public_val,
+        output_private_path=private_val,
+    )
+
+    print("\nData preparation complete.")
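
Both `_create_split` calls write the same set of CSVs, so the validation copy mirrors the main split one directory over. A small sketch of the files each run is expected to produce, assuming `public` and `private` sit under a common root (the root path is illustrative):

```python
from pathlib import Path

root = Path("/data/uw-madison-gi-tract")  # illustrative root directory

# Public-facing CSVs for the main split and its validation mirror.
for split in ("public", "public_val"):
    for name in ("train.csv", "test.csv", "sample_submission.csv"):
        print(root / split / name)

# Held-out answers used for grading.
for split in ("private", "private_val"):
    print(root / split / "test.csv")
```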
mlebench/competitions/ventilator-pressure-prediction/__init__.py: File without changes
mlebench/competitions/ventilator-pressure-prediction/grade.py
@@ -0,0 +1,52 @@
+import numpy as np
+import pandas as pd
+from sklearn.metrics import mean_absolute_error
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+    """Prepare submission and answers for MAE calculation."""
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
+        )
+
+    if "id" not in submission.columns:
+        raise InvalidSubmissionError("Submission must have an 'id' column")
+
+    if "pressure" not in submission.columns:
+        raise InvalidSubmissionError("Submission must have a 'pressure' column")
+
+    # Sort by id
+    submission = submission.sort_values("id").reset_index(drop=True)
+    answers = answers.sort_values("id").reset_index(drop=True)
+
+    # Check id alignment
+    if not (submission["id"] == answers["id"]).all():
+        raise InvalidSubmissionError("Submission and answers id mismatch")
+
+    # Extract predictions and true values
+    y_pred = submission["pressure"].values
+    y_true = answers["pressure"].values
+
+    # Validate predictions
+    try:
+        y_pred = y_pred.astype(float)
+    except (ValueError, TypeError):
+        raise InvalidSubmissionError("Predictions must be numeric")
+
+    if np.any(np.isnan(y_pred)):
+        raise InvalidSubmissionError("Predictions cannot contain NaN values")
+
+    return y_true, y_pred
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    """Calculate MAE score."""
+    y_true, y_pred = prepare_for_metric(submission, answers)
+
+    # MAE
+    mae = mean_absolute_error(y_true, y_pred)
+
+    return mae
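
In short, the grader aligns submission and answers on `id` and scores with plain mean absolute error. A standalone sketch of that computation on toy data (values are illustrative; the snippet deliberately avoids importing mlebench):

```python
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Toy frames with the required 'id' and 'pressure' columns.
answers = pd.DataFrame({"id": [1, 2, 3], "pressure": [5.0, 7.5, 6.0]})
submission = pd.DataFrame({"id": [3, 1, 2], "pressure": [6.2, 5.1, 7.4]})

# Mirror grade(): sort both by id, confirm alignment, then take the MAE.
submission = submission.sort_values("id").reset_index(drop=True)
answers = answers.sort_values("id").reset_index(drop=True)
assert (submission["id"] == answers["id"]).all()

score = mean_absolute_error(answers["pressure"], submission["pressure"])
print(score)  # ~0.133
```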
mlebench/competitions/ventilator-pressure-prediction/prepare.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+import pandas as pd
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Prepare ventilator-pressure-prediction dataset.
+    """
+    # Read data
+    train = pd.read_csv(raw / "train.csv")
+    test = pd.read_csv(raw / "test.csv")
+    sample_submission = pd.read_csv(raw / "sample_submission.csv")
+    test_answer = pd.read_csv(raw / "test_answer.csv")
+
+    # Public files (visible to agents)
+    train.to_csv(public / "train.csv", index=False)
+    test.to_csv(public / "test.csv", index=False)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+    # Private files (for grading)
+    test_answer.to_csv(private / "test.csv", index=False)
+
+    # Validation checks
+    assert len(test_answer) == len(sample_submission), \
+        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
+    assert "id" in test_answer.columns, "Test answer must have 'id' column"
+    assert "pressure" in test_answer.columns, "Test answer must have 'pressure' column"
mlebench/competitions/ventilator-pressure-prediction/prepare_val.py
@@ -0,0 +1,142 @@
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from mlebench.utils import read_csv
+
+
+def _save_split_data(
+    train_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    public_dir: Path,
+    private_dir: Path,
+):
+    """
+    Saves a train/test split to the specified public and private directories,
+    maintaining a consistent file structure and naming convention.
+
+    Args:
+        train_df (pd.DataFrame): The training dataframe.
+        test_df (pd.DataFrame): The testing dataframe with labels.
+        public_dir (Path): The directory to save public-facing files.
+        private_dir (Path): The directory to save private/ground-truth files.
+    """
+    # Ensure directories exist
+    public_dir.mkdir(exist_ok=True, parents=True)
+    private_dir.mkdir(exist_ok=True, parents=True)
+
+    # Make copies to avoid side-effects from ID resetting on the original dataframes
+    train_df = train_df.copy()
+    test_df = test_df.copy()
+
+    # Reset the 'id' column of train and test, starting at 1
+    train_df["id"] = range(1, len(train_df) + 1)
+    test_df["id"] = range(1, len(test_df) + 1)
+
+    assert set(train_df["breath_id"]).isdisjoint(
+        set(test_df["breath_id"])
+    ), "Test set contains breath_ids that are in the train set"
+
+    # Create public test
+    test_without_labels = test_df.drop(columns=["pressure"])
+
+    # Create sample submission
+    sample_submission = test_without_labels.copy()[["id"]]
+    sample_submission["pressure"] = 0
+
+    # Write CSVs with identical filenames for both original and validation splits
+    train_df.to_csv(public_dir / "train.csv", index=False, float_format="%.10g")
+    test_without_labels.to_csv(public_dir / "test.csv", index=False, float_format="%.10g")
+    sample_submission.to_csv(
+        public_dir / "sample_submission.csv", index=False, float_format="%.10g"
+    )
+    test_df.to_csv(private_dir / "test.csv", index=False, float_format="%.10g")
+
+    # Checks
+    assert (
+        sample_submission.shape[0] == test_without_labels.shape[0]
+    ), "Sample submission and new_test should have the same number of rows"
+    assert sample_submission.shape[1] == 2, "Sample submission should have 2 columns"
+    assert (
+        test_without_labels.shape[1] == 7
+    ), f"Expected 7 columns in test_without_labels, but got {test_without_labels.shape[1]}"
+    assert (
+        train_df.shape[1] == 8
+    ), f"Expected 8 columns in new_train, but got {train_df.shape[1]}"
+
+
+def prepare(raw: Path, public: Path, private: Path):
+
+    # Create train, test from train split
+    dtypes = {
+        "id": "int32",
+        "breath_id": "int32",
+        "R": "int8",
+        "C": "int8",
+        "time_step": "float64",
+        "u_in": "float64",
+        "u_out": "int8",
+        "pressure": "float64",
+    }
+
+    old_train = read_csv(raw / "train.csv", dtype=dtypes)
+
+    # Group by 'breath_id' and maintain the groups as lists of indices
+    groups = [df.index.tolist() for _, df in old_train.groupby("breath_id")]
+
+    # Split the groups into train and test sets such that train and test sets
+    # do not contain the same 'breath_id's
+    train_groups, test_groups = train_test_split(groups, test_size=0.1, random_state=0)
+
+    # Flatten the list of indices to get indices for train and test sets
+    train_idx = [idx for sublist in train_groups for idx in sublist]
+    test_idx = [idx for sublist in test_groups for idx in sublist]
+
+    # Create train and test DataFrames using the indices
+    new_train = old_train.loc[train_idx]
+    new_test = old_train.loc[test_idx]
+
+    # --- Original Output Generation ---
+    # This part remains unchanged in its output. The original script's file
+    # creation logic is now encapsulated and called here to produce the
+    # final competition assets.
+    _save_split_data(new_train, new_test, public, private)
+
+    # Check that original total size is preserved
+    assert len(old_train) == len(new_train) + len(
+        new_test
+    ), "New train and test should sum up to the old train size"
+
+    # --- New Validation Set Generation ---
+
+    # Define paths for the new validation split
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+
+    # To make the validation set size match the original test set size,
+    # we use the number of groups from the original test split as the test_size.
+    val_test_group_size = len(test_groups)
+
+    # Split the train_groups again to create a new, smaller training set and a validation set.
+    # We use the same random_state for reproducibility.
+    train_val_groups, test_val_groups = train_test_split(
+        train_groups, test_size=val_test_group_size, random_state=0
+    )
+
+    # Flatten the list of indices for the new split
+    train_val_idx = [idx for sublist in train_val_groups for idx in sublist]
+    test_val_idx = [idx for sublist in test_val_groups for idx in sublist]
+
+    # Create the new train_val and test_val DataFrames from the original data
+    train_val = old_train.loc[train_val_idx]
+    test_val = old_train.loc[test_val_idx]
+
+    # Save the new validation split using the same helper function to ensure
+    # identical file structure and naming in the new `_val` directories.
+    _save_split_data(train_val, test_val, public_val, private_val)
+
+    # Check that the validation split correctly partitioned the new_train set
+    assert len(new_train) == len(train_val) + len(
+        test_val
+    ), "train_val and test_val should sum up to the new_train size"
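
The key property of this split is that it operates on whole `breath_id` groups, so no breath appears in both halves. A small self-contained sketch of that pattern on toy data (column values are illustrative):

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy frame with three rows per breath, mimicking the train.csv layout.
df = pd.DataFrame({
    "breath_id": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    "pressure": [5.0] * 12,
})

# Same pattern as prepare(): split whole breath_id groups, then flatten
# the per-group index lists back into row indices.
groups = [g.index.tolist() for _, g in df.groupby("breath_id")]
train_groups, test_groups = train_test_split(groups, test_size=0.25, random_state=0)
train_idx = [i for grp in train_groups for i in grp]
test_idx = [i for grp in test_groups for i in grp]

train, test = df.loc[train_idx], df.loc[test_idx]
assert set(train["breath_id"]).isdisjoint(set(test["breath_id"]))
```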
mlebench/competitions/ventilator_pressure_prediction/__init__.py: File without changes
mlebench/competitions/ventilator_pressure_prediction/grade.py
@@ -0,0 +1,52 @@
+import numpy as np
+import pandas as pd
+from sklearn.metrics import mean_absolute_error
+
+from mlebench.grade_helpers import InvalidSubmissionError
+
+
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+    """Prepare submission and answers for MAE calculation."""
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
+        )
+
+    if "id" not in submission.columns:
+        raise InvalidSubmissionError("Submission must have an 'id' column")
+
+    if "pressure" not in submission.columns:
+        raise InvalidSubmissionError("Submission must have a 'pressure' column")
+
+    # Sort by id
+    submission = submission.sort_values("id").reset_index(drop=True)
+    answers = answers.sort_values("id").reset_index(drop=True)
+
+    # Check id alignment
+    if not (submission["id"] == answers["id"]).all():
+        raise InvalidSubmissionError("Submission and answers id mismatch")
+
+    # Extract predictions and true values
+    y_pred = submission["pressure"].values
+    y_true = answers["pressure"].values
+
+    # Validate predictions
+    try:
+        y_pred = y_pred.astype(float)
+    except (ValueError, TypeError):
+        raise InvalidSubmissionError("Predictions must be numeric")
+
+    if np.any(np.isnan(y_pred)):
+        raise InvalidSubmissionError("Predictions cannot contain NaN values")
+
+    return y_true, y_pred
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    """Calculate MAE score."""
+    y_true, y_pred = prepare_for_metric(submission, answers)
+
+    # MAE
+    mae = mean_absolute_error(y_true, y_pred)
+
+    return mae
mlebench/competitions/ventilator_pressure_prediction/prepare.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+import pandas as pd
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Prepare ventilator-pressure-prediction dataset.
+    """
+    # Read data
+    train = pd.read_csv(raw / "train.csv")
+    test = pd.read_csv(raw / "test.csv")
+    sample_submission = pd.read_csv(raw / "sample_submission.csv")
+    test_answer = pd.read_csv(raw / "test_answer.csv")
+
+    # Public files (visible to agents)
+    train.to_csv(public / "train.csv", index=False)
+    test.to_csv(public / "test.csv", index=False)
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+    # Private files (for grading)
+    test_answer.to_csv(private / "test.csv", index=False)
+
+    # Validation checks
+    assert len(test_answer) == len(sample_submission), \
+        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
+    assert "id" in test_answer.columns, "Test answer must have 'id' column"
+    assert "pressure" in test_answer.columns, "Test answer must have 'pressure' column"
|