dslighting 1.7.1-py3-none-any.whl → 1.7.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,155 @@
+import json
+import random
+import shutil
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from mlebench.competitions.utils import get_logger, rle_encode
+
+logger = get_logger(__name__)
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    We make a train/test split from the old train set, using the same train/test proportion as
+    the original competition. Concretely, the new split has 18673 train samples and 1856 test
+    samples. We also copy over the validation set as-is.
+
+    `sample_submission` is created with random predictions, either "1 3 10 5" or "-" (empty)
+    """
+    DEV = False
+
+    with open(raw / "train_metadata.json", "r") as f:
+        train_metadata = json.load(f)
+    train_metadata = train_metadata[:100] if DEV else train_metadata
+    with open(raw / "validation_metadata.json", "r") as f:
+        validation_metadata = json.load(f)
+
+    if DEV:
+        new_train, new_test = train_metadata[:90], train_metadata[90:]
+    else:
+        new_train, new_test = train_test_split(
+            train_metadata, test_size=len(validation_metadata), random_state=0
+        )
+    logger.info(
+        f"Created new split with {len(new_train)} train samples and {len(new_test)} test samples"
+    )
+
+    # Create answers csv containing ground truth masks, heights, widths
+    test_answers = []
+    for sample in tqdm(new_test):
+        record_id = sample["record_id"]
+        mask = np.load(raw / "train" / record_id / "human_pixel_masks.npy")
+        rle = rle_encode(mask)
+        rle = " ".join([str(i) for i in rle]) if rle else "-"
+
+        band = np.load(raw / "train" / record_id / "band_08.npy")
+        height, width, _ = band.shape
+        test_answers.append(
+            {
+                "record_id": record_id,
+                "encoded_pixels": rle,
+                "height": height,
+                "width": width,
+            }
+        )
+    test_answers = pd.DataFrame(test_answers)
+
+    for sample in tqdm(new_train):
+        record_id = sample["record_id"]
+        (public / "train" / record_id).mkdir(exist_ok=True, parents=True)
+        # Copy bands
+        for band_idx in range(8, 17):
+            file_name = f"band_{band_idx:02}.npy"
+            shutil.copyfile(
+                src=raw / "train" / record_id / file_name,
+                dst=public / "train" / record_id / file_name,
+            )
+        # Copy human individual masks
+        shutil.copyfile(
+            src=raw / "train" / record_id / "human_individual_masks.npy",
+            dst=public / "train" / record_id / "human_individual_masks.npy",
+        )
+        # Copy human pixel masks
+        shutil.copyfile(
+            src=raw / "train" / record_id / "human_pixel_masks.npy",
+            dst=public / "train" / record_id / "human_pixel_masks.npy",
+        )
+
+    for sample in tqdm(new_test):
+        record_id = sample["record_id"]
+        (public / "test" / record_id).mkdir(exist_ok=True, parents=True)
+        # Copy bands
+        for band_idx in range(8, 17):
+            file_name = f"band_{band_idx:02}.npy"
+            shutil.copyfile(
+                src=raw / "train" / record_id / file_name,
+                dst=public / "test" / record_id / file_name,
+            )
+
+    # Copy over existing validation data
+    (raw / "validation").mkdir(exist_ok=True, parents=True)
+    shutil.copytree(raw / "validation", public / "validation", dirs_exist_ok=True)
+    shutil.copyfile(raw / "validation_metadata.json", public / "validation_metadata.json")
+
+    # Write other files
+    with open(public / "train_metadata.json", "w") as f:
+        f.write(json.dumps(new_train))
+    test_answers.to_csv(private / "answers.csv", index=False)
+
+    submission_df = test_answers.copy()
+    random.seed(0)
+    submission_df["encoded_pixels"] = [
+        random.choice(["1 3 10 5", "-"]) for _ in range(len(submission_df))
+    ]
+    submission_df.to_csv(public / "sample_submission.csv", index=False)
+
+    # Sanity checks
+    assert (public / "train_metadata.json").exists(), "`train_metadata.json` doesn't exist!"
+    assert (public / "sample_submission.csv").exists(), "`sample_submission.csv` doesn't exist!"
+    assert (
+        public / "validation_metadata.json"
+    ).exists(), "`validation_metadata.json` doesn't exist!"
+    assert (public / "train").exists(), "`train` directory doesn't exist!"
+    assert (public / "test").exists(), "`test` directory doesn't exist!"
+    assert (public / "validation").exists(), "`validation` directory doesn't exist!"
+    assert (private / "answers.csv").exists(), "`answers.csv` doesn't exist!"
+
+    new_train_bands = list(img.stem for img in (public / "train").rglob("band*.npy"))
+    assert (
+        len(new_train_bands) == len(new_train) * 9
+    ), f"Expected {len(new_train) * 9} bands in the train set, but got {len(new_train_bands)}!"
+    new_test_bands = list(img.stem for img in (public / "test").rglob("band*.npy"))
+    assert (
+        len(new_test_bands) == len(new_test) * 9
+    ), f"Expected {len(new_test) * 9} bands in the test set, but got {len(new_test_bands)}!"
+
+    new_train_individual_masks = list(
+        img.stem for img in (public / "train").rglob("human_individual_masks.npy")
+    )
+    assert len(new_train_individual_masks) == len(
+        new_train
+    ), f"Expected 1 human individual mask per sample in the train set, but got {len(new_train_individual_masks)}!"
+    new_test_individual_masks = list(
+        img.stem for img in (public / "test").rglob("human_individual_masks.npy")
+    )
+    assert (
+        len(new_test_individual_masks) == 0
+    ), f"Expected 0 human individual masks per sample in the test set, but got {len(new_test_individual_masks)}!"
+
+    new_train_pixel_masks = list(
+        img.stem for img in (public / "train").rglob("human_pixel_masks.npy")
+    )
+    assert len(new_train_pixel_masks) == len(
+        new_train
+    ), f"Expected 1 human pixel mask per sample in the train set, but got {len(new_train_pixel_masks)}!"
+    new_test_pixel_masks = list(
+        img.stem for img in (public / "test").rglob("human_pixel_masks.npy")
+    )
+    assert (
+        len(new_test_pixel_masks) == 0
+    ), f"Expected 0 human pixel masks per sample in the test set, but got {len(new_test_pixel_masks)}!"
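Note: `rle_encode` is imported from `mlebench/competitions/utils.py`, whose body is not expanded in this diff. For orientation only, here is a minimal sketch consistent with how it is called above — it must return a flat list of alternating 1-indexed (start, length) run values, empty for an all-zero mask so the caller falls back to "-"; the packaged helper may differ, for example in whether it flattens the mask row- or column-major:

import numpy as np


def rle_encode(mask: np.ndarray) -> list:
    """Run-length encode a binary mask as alternating (start, length) values."""
    pixels = mask.flatten()  # assumption: the real helper's flattening order is not shown in this diff
    padded = np.concatenate([[0], pixels, [0]])  # pad so every run has a well-defined start and end
    boundaries = np.where(padded[1:] != padded[:-1])[0] + 1  # 1-indexed run boundaries
    starts, ends = boundaries[::2], boundaries[1::2]
    runs = []
    for start, end in zip(starts, ends):
        runs.extend([int(start), int(end - start)])
    return runs  # empty list for an all-zero mask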
@@ -0,0 +1,211 @@
+import json
+import random
+import shutil
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+
+from mlebench.competitions.utils import get_logger, rle_encode
+
+logger = get_logger(__name__)
+
+
+def _create_answers_df(samples: list, raw_path: Path) -> pd.DataFrame:
+    """Creates a DataFrame with ground truth answers for a given set of samples."""
+    answers = []
+    for sample in tqdm(samples, desc="Creating answers CSV"):
+        record_id = sample["record_id"]
+        mask = np.load(raw_path / "train" / record_id / "human_pixel_masks.npy")
+        rle = rle_encode(mask)
+        rle = " ".join([str(i) for i in rle]) if rle else "-"
+
+        band = np.load(raw_path / "train" / record_id / "band_08.npy")
+        height, width, _ = band.shape
+        answers.append(
+            {
+                "record_id": record_id,
+                "encoded_pixels": rle,
+                "height": height,
+                "width": width,
+            }
+        )
+    return pd.DataFrame(answers)
+
+
+def _copy_data_files(samples: list, raw_path: Path, dest_path: Path, include_masks: bool):
+    """Copies data files (bands and optionally masks) for a given set of samples."""
+    desc = f"Copying {'train' if include_masks else 'test'} files"
+    for sample in tqdm(samples, desc=desc):
+        record_id = sample["record_id"]
+        (dest_path / record_id).mkdir(exist_ok=True, parents=True)
+        # Copy bands
+        for band_idx in range(8, 17):
+            file_name = f"band_{band_idx:02}.npy"
+            shutil.copyfile(
+                src=raw_path / "train" / record_id / file_name,
+                dst=dest_path / record_id / file_name,
+            )
+        if include_masks:
+            # Copy human individual masks
+            shutil.copyfile(
+                src=raw_path / "train" / record_id / "human_individual_masks.npy",
+                dst=dest_path / record_id / "human_individual_masks.npy",
+            )
+            # Copy human pixel masks
+            shutil.copyfile(
+                src=raw_path / "train" / record_id / "human_pixel_masks.npy",
+                dst=dest_path / record_id / "human_pixel_masks.npy",
+            )
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    We make a train/test split from the old train set, using the same train/test proportion as
+    the original competition. Concretely, the new split has 18673 train samples and 1856 test
+    samples. We also copy over the validation set as-is.
+
+    `sample_submission` is created with random predictions, either "1 3 10 5" or "-" (empty)
+
+    A second split is performed on the train set to create a new validation set in parallel
+    `public_val` and `private_val` directories.
+    """
+    DEV = False
+
+    with open(raw / "train_metadata.json", "r") as f:
+        train_metadata = json.load(f)
+    train_metadata = train_metadata[:100] if DEV else train_metadata
+    with open(raw / "validation_metadata.json", "r") as f:
+        validation_metadata = json.load(f)
+
+    # ==================================================================================
+    # 1. Original Split: Create `new_train` and `new_test`
+    # ==================================================================================
+    if DEV:
+        new_train, new_test = train_metadata[:90], train_metadata[90:]
+    else:
+        new_train, new_test = train_test_split(
+            train_metadata, test_size=len(validation_metadata), random_state=0
+        )
+    logger.info(
+        f"Created original split with {len(new_train)} train samples and {len(new_test)} test samples"
+    )
+
+    # ==================================================================================
+    # 2. Process and Save Original Split to `public` and `private`
+    # ==================================================================================
+    public.mkdir(exist_ok=True)
+    private.mkdir(exist_ok=True)
+
+    # Copy train and test files
+    _copy_data_files(new_train, raw, public / "train", include_masks=True)
+    _copy_data_files(new_test, raw, public / "test", include_masks=False)
+
+    # Create and save ground truth answers for the test set
+    test_answers = _create_answers_df(new_test, raw)
+    test_answers.to_csv(private / "answers.csv", index=False)
+
+    # Save train metadata
+    with open(public / "train_metadata.json", "w") as f:
+        f.write(json.dumps(new_train))
+
+    # Create and save a sample submission
+    submission_df = test_answers.copy()
+    random.seed(0)
+    submission_df["encoded_pixels"] = [
+        random.choice(["1 3 10 5", "-"]) for _ in range(len(submission_df))
+    ]
+    submission_df.to_csv(public / "sample_submission.csv", index=False)
+
+    # Copy over existing validation data (this is unique to the original set)
+    (raw / "validation").mkdir(exist_ok=True, parents=True)
+    shutil.copytree(raw / "validation", public / "validation", dirs_exist_ok=True)
+    shutil.copyfile(raw / "validation_metadata.json", public / "validation_metadata.json")
+
+    # ==================================================================================
+    # 3. New Validation Split: Split `new_train` into `train_val` and `test_val`
+    # ==================================================================================
+    train_val, test_val = train_test_split(
+        new_train, test_size=len(new_test), random_state=0
+    )
+    logger.info(
+        f"Created validation split with {len(train_val)} train_val samples and {len(test_val)} test_val samples"
+    )
+
+    # ==================================================================================
+    # 4. Process and Save Validation Split to `public_val` and `private_val`
+    # ==================================================================================
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+    public_val.mkdir(exist_ok=True)
+    private_val.mkdir(exist_ok=True)
+
+    # Copy train_val and test_val files
+    _copy_data_files(train_val, raw, public_val / "train", include_masks=True)
+    _copy_data_files(test_val, raw, public_val / "test", include_masks=False)
+
+    # Create and save ground truth answers for the test_val set
+    test_val_answers = _create_answers_df(test_val, raw)
+    # The filename must be "answers.csv" to mirror the private directory structure
+    test_val_answers.to_csv(private_val / "answers.csv", index=False)
+
+    # Save train_val metadata
+    # The filename must be "train_metadata.json" to mirror the public directory structure
+    with open(public_val / "train_metadata.json", "w") as f:
+        f.write(json.dumps(train_val))
+
+    # Create and save a sample submission for the validation set
+    submission_val_df = test_val_answers.copy()
+    random.seed(0)
+    submission_val_df["encoded_pixels"] = [
+        random.choice(["1 3 10 5", "-"]) for _ in range(len(submission_val_df))
+    ]
+    # The filename must be "sample_submission.csv" to mirror the public directory structure
+    submission_val_df.to_csv(public_val / "sample_submission.csv", index=False)
+
+    # ==================================================================================
+    # 5. Sanity Checks
+    # ==================================================================================
+    logger.info("Performing sanity checks for original directories...")
+    # Sanity checks for original directories
+    assert (public / "train_metadata.json").exists(), "`train_metadata.json` doesn't exist!"
+    assert (public / "sample_submission.csv").exists(), "`sample_submission.csv` doesn't exist!"
+    assert (
+        public / "validation_metadata.json"
+    ).exists(), "`validation_metadata.json` doesn't exist!"
+    assert (public / "train").exists(), "`train` directory doesn't exist!"
+    assert (public / "test").exists(), "`test` directory doesn't exist!"
+    assert (public / "validation").exists(), "`validation` directory doesn't exist!"
+    assert (private / "answers.csv").exists(), "`answers.csv` doesn't exist!"
+
+    new_train_bands = list(img.stem for img in (public / "train").rglob("band*.npy"))
+    assert (
+        len(new_train_bands) == len(new_train) * 9
+    ), f"Expected {len(new_train) * 9} bands in the train set, but got {len(new_train_bands)}!"
+    new_test_bands = list(img.stem for img in (public / "test").rglob("band*.npy"))
+    assert (
+        len(new_test_bands) == len(new_test) * 9
+    ), f"Expected {len(new_test) * 9} bands in the test set, but got {len(new_test_bands)}!"
+
+    new_train_individual_masks = list(
+        img.stem for img in (public / "train").rglob("human_individual_masks.npy")
+    )
+    assert len(new_train_individual_masks) == len(
+        new_train
+    ), f"Expected 1 human individual mask per sample in the train set, but got {len(new_train_individual_masks)}!"
+    new_train_pixel_masks = list(
+        img.stem for img in (public / "train").rglob("human_pixel_masks.npy")
+    )
+    assert len(new_train_pixel_masks) == len(
+        new_train
+    ), f"Expected 1 human pixel mask per sample in the train set, but got {len(new_train_pixel_masks)}!"
+
+    logger.info("Performing sanity checks for validation directories...")
+    # Sanity checks for new validation directories
+    assert (public_val / "train_metadata.json").exists(), "`public_val/train_metadata.json` doesn't exist!"
+    assert (public_val / "sample_submission.csv").exists(), "`public_val/sample_submission.csv` doesn't exist!"
+    assert (public_val / "train").exists(), "`public_val/train` directory doesn't exist!"
+    assert (public_val / "test").exists(), "`public_val/test` directory doesn't exist!"
+    assert (private_val / "answers.csv").exists(), "`private_val/answers.csv` doesn't exist!"
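For orientation, the layout this `prepare` produces — assuming, as the `public.parent / "public_val"` logic implies, that `public` and `private` share a parent directory:

<parent>/
├── public/
│   ├── train/<record_id>/band_08.npy … band_16.npy, human_individual_masks.npy, human_pixel_masks.npy
│   ├── test/<record_id>/band_08.npy … band_16.npy
│   ├── validation/                  (copied from raw as-is)
│   ├── train_metadata.json
│   ├── validation_metadata.json
│   └── sample_submission.csv
├── private/
│   └── answers.csv
├── public_val/                      (mirrors public/, minus the copied validation data)
└── private_val/
    └── answers.csv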
@@ -0,0 +1,42 @@
+import pandas as pd
+
+from mlebench.grade_helpers import InvalidSubmissionError
+from mlebench.metrics import mean_average_precision_at_k
+
+
+def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
+    # Answers checks
+    assert set(answers.columns) == {
+        "customer_id",
+        "prediction",
+    }, "Answers DataFrame must have 'customer_id' and 'prediction' columns."
+
+    # Submission checks
+    if set(submission.columns) != {"customer_id", "prediction"}:
+        raise InvalidSubmissionError(
+            "Submission DataFrame must have 'customer_id' and 'prediction' columns."
+        )
+    if not (set(submission["customer_id"]) >= set(answers["customer_id"])):
+        raise InvalidSubmissionError(
+            "Submission customer_id must be a superset of answers customer_id"
+        )
+
+    # Filter the submission to only consider the customer_ids that exist in answers
+    submission = submission[submission["customer_id"].isin(answers["customer_id"])]
+
+    # Sort to ensure we're grading the right rows
+    submission = submission.sort_values("customer_id").reset_index(drop=True)
+    answers = answers.sort_values("customer_id").reset_index(drop=True)
+
+    y_true = answers["prediction"].astype(str).str.split(" ").apply(set).tolist()
+    y_pred = submission["prediction"].astype(str).str.split(" ").tolist()
+
+    return {"actual": y_true, "predicted": y_pred}
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    # Prepare the data for metric calculation
+    prepped = prepare_for_metric(submission, answers)
+    return mean_average_precision_at_k(
+        actual=prepped["actual"], predicted=prepped["predicted"], k=12
+    )
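`mean_average_precision_at_k` lives in `mlebench/metrics.py`, which this diff does not expand. For reference, a sketch of the conventional MAP@K computation matching the inputs built above (sets of true article ids, ordered lists of predicted ids, k=12); the packaged implementation may differ in details:

def average_precision_at_k(actual: set, predicted: list, k: int = 12) -> float:
    """Average precision at k for a single customer."""
    predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:  # count each correct article once
            num_hits += 1
            score += num_hits / (i + 1)  # precision at this cut-off
    return score / min(len(actual), k) if actual else 0.0


def mean_average_precision_at_k(actual: list, predicted: list, k: int = 12) -> float:
    """Mean of the per-customer average precisions."""
    return sum(average_precision_at_k(a, p, k) for a, p in zip(actual, predicted)) / len(actual)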
@@ -0,0 +1,102 @@
+import os
+import shutil
+from pathlib import Path
+
+import pandas as pd
+
+from mlebench.utils import read_csv
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    """
+
+    # Create train, test from train split
+    old_train = read_csv(raw / "transactions_train.csv")
+    old_train["purchase_id"] = (
+        old_train["customer_id"].astype(str)
+        + "_"
+        + old_train["article_id"].astype(str)
+        + "_"
+        + old_train["t_dat"].astype(str)
+    )
+
+    # The task is to predict what purchases will be made in the next 7 days.
+    # To create our test set, we will take the purchases made in the last 7 days of the training set.
+    old_train["t_dat_parsed"] = pd.to_datetime(
+        old_train["t_dat"]
+    )  # Parse t_dat to datetime in a new column
+    max_date = old_train["t_dat_parsed"].max()  # Find the maximum date in the t_dat_parsed column
+    old_train["in_last_7_days"] = old_train["t_dat_parsed"] >= (max_date - pd.Timedelta(days=7))
+    new_train = old_train[
+        old_train["in_last_7_days"] == False
+    ].copy()  # Filter rows where t_dat_parsed is more than 7 days from the maximum date
+    new_test = old_train[
+        old_train["in_last_7_days"] == True
+    ].copy()  # Filter rows where t_dat_parsed is within the last 7 days of the time series
+
+    # Train/test checks
+    assert (
+        not new_test["purchase_id"].isin(new_train["purchase_id"]).any()
+    ), "No purchase_ids should be shared between new_test and new_train"
+    new_train = new_train.drop(columns=["purchase_id", "t_dat_parsed", "in_last_7_days"])
+    new_test = new_test.drop(columns=["purchase_id", "t_dat_parsed"])
+
+    # Sample submission and answers differ because the task is predicting what articles each
+    # customer will purchase in the 7-day period immediately after the training data ends. Customers
+    # who did not make any purchase during that time are excluded from the scoring.
+
+    # As such we can't put the exact customer ids from the test set into the sample submission, as this
+    # would leak which customers made purchases in the test set. Instead, we put _all_ the customer
+    # ids in the sample submission, ask the user to predict for all of them, and then we will filter
+    # out in grade.py the customers who did not make any purchases in the test set.
+
+    # Answers, contains only customers that actually made purchases in the test period.
+    answers = (
+        new_test.groupby("customer_id")["article_id"]
+        .apply(lambda x: " ".join(x.astype(str)))
+        .reset_index()
+    )
+    # rename 'article_id' to 'prediction'
+    answers = answers.rename(columns={"article_id": "prediction"})
+
+    # Sample submission, which contains all customer ids.
+    shutil.copyfile(
+        src=raw / "sample_submission.csv",
+        dst=public / "sample_submission.csv",
+    )
+
+    # Write CSVs
+    # new_test.to_csv(private / "test.csv", index=False)
+    answers.to_csv(private / "answers.csv", index=False)
+    new_train.to_csv(public / "transactions_train.csv", index=False)
+
+    # Copy files and images directory
+    shutil.copyfile(
+        src=raw / "articles.csv",
+        dst=public / "articles.csv",
+    )
+    shutil.copyfile(
+        src=raw / "customers.csv",
+        dst=public / "customers.csv",
+    )
+    shutil.copytree(
+        src=raw / "images",
+        dst=public / "images",
+        dirs_exist_ok=True,
+    )
+
+    # checks
+    expected_train_columns = ["t_dat", "customer_id", "article_id", "price", "sales_channel_id"]
+    assert (
+        new_train.columns.tolist() == expected_train_columns
+    ), f"Unexpected columns in new_train, expected {expected_train_columns}, got {new_train.columns.tolist()}"
+
+    expected_answer_columns = ["customer_id", "prediction"]
+    assert (
+        answers.columns.tolist() == expected_answer_columns
+    ), f"Unexpected columns in answers, expected {expected_answer_columns}, got {answers.columns.tolist()}"
+    assert answers["customer_id"].nunique() == len(
+        answers
+    ), "There should be no duplicate customer_ids in answers"
@@ -0,0 +1,132 @@
+import os
+import shutil
+from pathlib import Path
+
+import pandas as pd
+
+from mlebench.utils import read_csv
+
+
+def _split_and_process_data(transactions_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Splits a dataframe into train and test sets based on the last 7 days of transactions.
+
+    Args:
+        transactions_df: The input dataframe with transaction data.
+
+    Returns:
+        A tuple containing:
+        - new_train_df: The training data (all data except the last 7 days).
+        - answers_df: The ground truth answers for the test set (last 7 days).
+    """
+    df = transactions_df.copy()
+    if "purchase_id" not in df.columns:
+        df["purchase_id"] = (
+            df["customer_id"].astype(str)
+            + "_"
+            + df["article_id"].astype(str)
+            + "_"
+            + df["t_dat"].astype(str)
+        )
+
+    # The task is to predict what purchases will be made in the next 7 days.
+    # To create our test set, we will take the purchases made in the last 7 days of the training set.
+    df["t_dat_parsed"] = pd.to_datetime(df["t_dat"])  # Parse t_dat to datetime in a new column
+    max_date = df["t_dat_parsed"].max()  # Find the maximum date in the t_dat_parsed column
+    df["in_last_7_days"] = df["t_dat_parsed"] >= (max_date - pd.Timedelta(days=7))
+
+    new_train_df = df[df["in_last_7_days"] == False].copy()
+    new_test_df = df[df["in_last_7_days"] == True].copy()
+
+    # Train/test checks
+    assert (
+        not new_test_df["purchase_id"].isin(new_train_df["purchase_id"]).any()
+    ), "No purchase_ids should be shared between a test and train split"
+
+    new_train_df = new_train_df.drop(columns=["purchase_id", "t_dat_parsed", "in_last_7_days"])
+
+    # Answers, contains only customers that actually made purchases in the test period.
+    answers_df = (
+        new_test_df.groupby("customer_id")["article_id"]
+        .apply(lambda x: " ".join(x.astype(str)))
+        .reset_index()
+    )
+    # rename 'article_id' to 'prediction'
+    answers_df = answers_df.rename(columns={"article_id": "prediction"})
+
+    return new_train_df, answers_df
+
+
+def _copy_static_files(raw_path: Path, public_path: Path):
+    """Copies static competition files (articles, customers, images) to a public directory."""
+    # Sample submission, which contains all customer ids.
+    shutil.copyfile(
+        src=raw_path / "sample_submission.csv",
+        dst=public_path / "sample_submission.csv",
+    )
+    # Copy files and images directory
+    shutil.copyfile(
+        src=raw_path / "articles.csv",
+        dst=public_path / "articles.csv",
+    )
+    shutil.copyfile(
+        src=raw_path / "customers.csv",
+        dst=public_path / "customers.csv",
+    )
+    shutil.copytree(
+        src=raw_path / "images",
+        dst=public_path / "images",
+        dirs_exist_ok=True,
+    )
+
+
+def _run_output_checks(train_df: pd.DataFrame, answers_df: pd.DataFrame):
+    """Runs assertions to check the format of final output dataframes."""
+    expected_train_columns = ["t_dat", "customer_id", "article_id", "price", "sales_channel_id"]
+    assert (
+        train_df.columns.tolist() == expected_train_columns
+    ), f"Unexpected columns in new_train, expected {expected_train_columns}, got {train_df.columns.tolist()}"
+
+    expected_answer_columns = ["customer_id", "prediction"]
+    assert (
+        answers_df.columns.tolist() == expected_answer_columns
+    ), f"Unexpected columns in answers, expected {expected_answer_columns}, got {answers_df.columns.tolist()}"
+    assert answers_df["customer_id"].nunique() == len(
+        answers_df
+    ), "There should be no duplicate customer_ids in answers"
+
+
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the data in raw into public and private datasets with appropriate test/train splits.
+    Also creates a second, parallel validation split (in public_val/private_val).
+    """
+    # Create train, test from train split
+    raw_transactions = read_csv(raw / "transactions_train.csv")
+
+    # --- Original Data Split (Train/Test) ---
+    # This split generates the main competition files.
+    train_orig, answers_orig = _split_and_process_data(raw_transactions)
+
+    # Write original public and private files
+    answers_orig.to_csv(private / "answers.csv", index=False)
+    train_orig.to_csv(public / "transactions_train.csv", index=False)
+    _copy_static_files(raw, public)
+    _run_output_checks(train_orig, answers_orig)
+
+    # --- New Validation Data Split (Train/Val) ---
+    # Define and create the new directories for the validation set.
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+    public_val.mkdir(parents=True, exist_ok=True)
+    private_val.mkdir(parents=True, exist_ok=True)
+
+    # This second split takes the *output training set* from the first split
+    # and splits it again to create a new, smaller training set and a validation set.
+    train_val, answers_val = _split_and_process_data(train_orig)
+
+    # Write validation set files to the new directories, mirroring the original structure.
+    answers_val.to_csv(private_val / "answers.csv", index=False)
+    train_val.to_csv(public_val / "transactions_train.csv", index=False)
+    _copy_static_files(raw, public_val)
+    _run_output_checks(train_val, answers_val)
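A small usage sketch of the `_split_and_process_data` helper above on toy data (all ids and dates below are made up for illustration):

import pandas as pd

toy = pd.DataFrame(
    {
        "t_dat": ["2020-09-01", "2020-09-10", "2020-09-20", "2020-09-21"],
        "customer_id": ["c1", "c1", "c2", "c2"],
        "article_id": [111, 222, 333, 444],
        "price": [0.01, 0.02, 0.03, 0.04],
        "sales_channel_id": [1, 2, 1, 2],
    }
)
train_df, answers_df = _split_and_process_data(toy)
# The max date is 2020-09-21, so rows on or after 2020-09-14 form the test window:
# train_df keeps the 2020-09-01 and 2020-09-10 rows, and
# answers_df has one row: customer_id "c2", prediction "333 444"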
@@ -0,0 +1,23 @@
+import pandas as pd
+import numpy as np
+from sklearn.metrics import accuracy_score
+
+
+def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+    """
+    Grade the submission using the accuracy metric.
+
+    Args:
+        submission: DataFrame with columns ['id', 'label']
+        answers: DataFrame with columns ['id', 'label']
+
+    Returns:
+        Accuracy score (float between 0 and 1)
+    """
+    # Merge on id to ensure alignment
+    merged = pd.merge(answers, submission, on='id', suffixes=('_true', '_pred'))
+
+    # Calculate accuracy
+    accuracy = accuracy_score(merged['label_true'], merged['label_pred'])
+
+    return accuracy
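A usage sketch of this `grade` function on made-up data:

import pandas as pd

answers = pd.DataFrame({"id": [1, 2, 3], "label": ["a", "b", "c"]})
submission = pd.DataFrame({"id": [3, 1, 2], "label": ["c", "a", "a"]})
print(grade(submission, answers))  # ids 1 and 3 match after the merge -> 0.666...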