dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352):
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,201 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm.auto import tqdm
7
+
8
+ from mlebench.utils import read_csv
9
+
10
# Schema of the raw train.csv shipped with the competition data.
train_columns = (
    "site_id patient_id image_id laterality view age cancer biopsy "
    "invasive BIRADS implant density machine_id difficult_negative_case"
).split()

# Schema of the public test.csv produced by this script (prediction_id last,
# since it is built from patient_id + laterality rather than read from train).
test_columns = (
    "site_id patient_id image_id laterality view age implant "
    "machine_id prediction_id"
).split()

# Schema shared by answers.csv and sample_submission.csv.
submission_columns = ["prediction_id", "cancer"]


# Development switch: when True, `prepare` works on a 5k-row sample.
DEV = False
45
+
46
+
47
def _process_split(
    source_df: pd.DataFrame,
    train_pids: set,
    test_pids: set,
    public_path: Path,
    private_path: Path,
    raw_images_path: Path,
):
    """
    Processes a data split, creating all required files and directories.

    Writes to ``public_path``: train.csv, test.csv (labels dropped),
    sample_submission.csv, and per-patient DICOM folders under
    train_images/ and test_images/. Writes to ``private_path``:
    answers.csv with one row per prediction_id.

    Args:
        source_df: The DataFrame to split (e.g., the full dataset or a training subset).
        train_pids: A set of patient IDs for the training set.
        test_pids: A set of patient IDs for the test set.
        public_path: The public output directory.
        private_path: The private output directory.
        raw_images_path: Path to the original raw images.
    """
    public_path.mkdir(exist_ok=True, parents=True)
    private_path.mkdir(exist_ok=True, parents=True)

    # Create train and test dataframes from the source data
    new_train = source_df[source_df["patient_id"].isin(train_pids)].copy()
    # don't index the `prediction_id` (last col) since it's not in train and we need to build it
    # index also the `cancer` column, which we'll drop later for the without_labels version
    answers = source_df[source_df["patient_id"].isin(test_pids)][
        test_columns[:-1] + ["cancer"]
    ].copy()

    new_train.to_csv(public_path / "train.csv", index=False)

    # prediction_id is per patient+laterality, matching the competition format
    answers["prediction_id"] = answers["patient_id"].astype(str) + "_" + answers["laterality"]
    new_test_without_labels = answers.drop(columns=["cancer"])
    new_test_without_labels.to_csv(public_path / "test.csv", index=False)

    # merge multiple prediction_ids for the same patient into one for sample_submission and private test
    answers = answers[submission_columns].copy()
    # just take the first label for each prediction id -- the rest will be identical duplicates
    # (assumes all images of one patient+laterality share one cancer label -- TODO confirm)
    answers = answers.groupby("prediction_id").first().reset_index()
    answers.to_csv(private_path / "answers.csv", index=False)

    # Baseline submission: constant prediction at the train-set prevalence.
    sample_submission = answers.copy()
    sample_submission["cancer"] = new_train.cancer.mean()  # mean cancer rate in train set
    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

    assert len(sample_submission) == len(
        answers
    ), "sample_submission and test.csv should have the same number of rows"
    assert len(new_test_without_labels) + len(new_train) == len(
        source_df
    ), "The sum of the rows in new_test_without_labels and new_train should be equal to the number of rows in the source_df"
    # because of the merging
    # NOTE(review): this holds only if at least one test patient has multiple
    # images per prediction_id -- true for the full dataset, fragile for tiny samples
    assert len(answers) != len(
        new_test_without_labels
    ), "new_test and new_test_without_labels should have different number of rows"

    assert (
        answers.columns.tolist() == submission_columns
    ), f"answers should have columns {submission_columns}"
    assert (
        sample_submission.columns.tolist() == submission_columns
    ), f"sample_submission should have columns {submission_columns}"

    assert (
        new_train.columns.tolist() == source_df.columns.tolist()
    ), f"new_train should have columns {source_df.columns.tolist()}, got {new_train.columns.tolist()}"
    assert (
        new_test_without_labels.columns.tolist() == test_columns
    ), f"new_test_without_labels should have columns {test_columns}, got {new_test_without_labels.columns.tolist()}"

    assert set(new_test_without_labels["patient_id"]).isdisjoint(
        set(new_train["patient_id"])
    ), "new_test_without_labels and new_train should have disjoint patient_ids"

    # finally, split the images
    # raw layout is assumed to be raw_images_path/<patient_id>/<image_id>.dcm
    (public_path / "train_images").mkdir(exist_ok=True)
    for patient_id in tqdm(train_pids, desc=f"Copying train images to {public_path.name}"):
        patient_id_str = str(patient_id)
        patient_dir = public_path / "train_images" / patient_id_str
        patient_dir.mkdir(exist_ok=True)
        image_ids = new_train[new_train["patient_id"] == patient_id]["image_id"].to_list()
        for image_id in image_ids:
            shutil.copy(raw_images_path / patient_id_str / f"{image_id}.dcm", patient_dir)

    (public_path / "test_images").mkdir(exist_ok=True)
    for patient_id in tqdm(test_pids, desc=f"Copying test images to {public_path.name}"):
        patient_id_str = str(patient_id)
        patient_dir = public_path / "test_images" / patient_id_str
        patient_dir.mkdir(exist_ok=True)
        image_ids = new_test_without_labels[new_test_without_labels["patient_id"] == patient_id][
            "image_id"
        ].to_list()
        for image_id in image_ids:
            shutil.copy(raw_images_path / patient_id_str / f"{image_id}.dcm", patient_dir)

    # final checks
    assert len(list((public_path / "train_images").rglob("*.dcm"))) == len(
        new_train
    ), "Number of images in train_images should be equal to the number of rows in new_train"
    assert len(list((public_path / "test_images").rglob("*.dcm"))) == len(
        new_test_without_labels
    ), "Number of images in test_images should be equal to the number of rows in new_test_without_labels"
150
+
151
+
152
def prepare(raw: Path, public: Path, private: Path):
    """Build the competition split plus a parallel validation split.

    Step 1 reproduces the original public/private competition data in
    ``public``/``private``; its outputs must not change. Step 2 re-splits
    the competition training patients with the same methodology into
    sibling ``public_val``/``private_val`` directories.
    """
    full_df = read_csv(raw / "train.csv")
    if DEV:
        # Development shortcut: work on 5k samples instead of ~54k.
        full_df = full_df.sample(5000, random_state=42)

    # Kaggle says "You can expect roughly 8,000 patients" in the test set,
    # versus 11,913 patients in train (ratio ~0.4). We split on patient_id
    # but use only 0.1 to avoid removing too many samples from train.
    patient_ids = full_df["patient_id"].unique()
    main_train_pids, main_test_pids = train_test_split(
        patient_ids, test_size=0.1, random_state=42
    )

    # --- 1. Create the original public/private split ---
    _process_split(
        source_df=full_df,
        train_pids=set(main_train_pids),
        test_pids=set(main_test_pids),
        public_path=public,
        private_path=private,
        raw_images_path=raw / "train_images",
    )

    # --- 2. Create the new validation split ---
    # Parallel directories next to the originals.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Re-split the *original training patients* with identical test_size and
    # random_state to replicate the first split's methodology.
    val_train_pids, val_test_pids = train_test_split(
        main_train_pids, test_size=0.1, random_state=42
    )

    # The second split draws only from the original training data.
    train_only_df = full_df[full_df["patient_id"].isin(main_train_pids)].copy()

    _process_split(
        source_df=train_only_df,
        train_pids=set(val_train_pids),
        test_pids=set(val_test_pids),
        public_path=public_val,
        private_path=private_val,
        raw_images_path=raw / "train_images",
    )
@@ -0,0 +1,13 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import roc_auc_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_auroc_metric
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with ROC AUC on the ``MGMT_value`` column.

    Rows are paired on the ``BraTS21ID`` identifier by the shared
    ``prepare_for_auroc_metric`` helper before scoring.
    """
    inputs = prepare_for_auroc_metric(
        answers=answers,
        submission=submission,
        id_col="BraTS21ID",
        target_col="MGMT_value",
    )
    return roc_auc_score(y_true=inputs["y_true"], y_score=inputs["y_score"])
@@ -0,0 +1,47 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
def prepare(raw: Path, public: Path, private: Path):
    """Create public/private competition splits from the raw training data.

    Splits the labelled training cases 90/10 into new train/test sets, copies
    the per-case image folders into ``public``, writes the held-out labels to
    ``private``, and emits a constant-probability sample submission.

    Args:
        raw: Directory holding ``train_labels.csv`` and ``train/<case id>``.
        public: Destination for participant-visible files.
        private: Destination for grading files.
    """
    # Create train and test splits from the original train set.
    old_train = read_csv(raw / "train_labels.csv", dtype={"BraTS21ID": str, "MGMT_value": int})
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)

    # Ensure destination roots exist before creating subfolders inside them
    # (the original assumed they were pre-created and would fail otherwise).
    public.mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)

    # Copy over the image folders for the new train split.
    (public / "train").mkdir(exist_ok=True)
    for file_id in new_train["BraTS21ID"]:
        (public / "train" / file_id).mkdir(exist_ok=True)
        shutil.copytree(
            src=raw / "train" / file_id,
            dst=public / "train" / file_id,
            dirs_exist_ok=True,
        )
    assert len(list(public.glob("train/*"))) == len(
        new_train
    ), "Public train should have the same number of images as the train set"

    # Copy over the image folders for the new test split (labels withheld).
    (public / "test").mkdir(exist_ok=True)
    for file_id in new_test["BraTS21ID"]:
        (public / "test" / file_id).mkdir(exist_ok=True)
        shutil.copytree(
            src=raw / "train" / file_id,
            dst=public / "test" / file_id,
            dirs_exist_ok=True,
        )
    # Fixed copy-pasted assertion message: this guards the *test* split.
    assert len(list(public.glob("test/*"))) == len(
        new_test
    ), "Public test should have the same number of images as the test set"

    # Sample submission: constant 0.5 probability for every test case.
    submission_df = new_test.copy()
    submission_df["MGMT_value"] = 0.5

    # Write the CSV outputs.
    new_train.to_csv(public / "train_labels.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    submission_df.to_csv(public / "sample_submission.csv", index=False)
@@ -0,0 +1,97 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
+ def _create_split_files(
11
+ train_df: pd.DataFrame,
12
+ test_df: pd.DataFrame,
13
+ public_dest: Path,
14
+ private_dest: Path,
15
+ raw_data_path: Path,
16
+ ):
17
+ """
18
+ Helper function to populate public and private directories for a given split.
19
+
20
+ This function handles copying images, creating label files, and generating
21
+ a sample submission, ensuring a consistent structure across different data splits.
22
+ """
23
+ public_dest.mkdir(exist_ok=True)
24
+ private_dest.mkdir(exist_ok=True)
25
+
26
+ # Copy over images for the training set
27
+ (public_dest / "train").mkdir(exist_ok=True)
28
+ for file_id in train_df["BraTS21ID"]:
29
+ (public_dest / "train" / file_id).mkdir(exist_ok=True)
30
+ shutil.copytree(
31
+ src=raw_data_path / "train" / file_id,
32
+ dst=public_dest / "train" / file_id,
33
+ dirs_exist_ok=True,
34
+ )
35
+ assert len(list(public_dest.glob("train/*"))) == len(
36
+ train_df
37
+ ), "Public train should have the same number of images as the train set"
38
+
39
+ # Copy over images for the test set (without labels)
40
+ (public_dest / "test").mkdir(exist_ok=True)
41
+ for file_id in test_df["BraTS21ID"]:
42
+ (public_dest / "test" / file_id).mkdir(exist_ok=True)
43
+ shutil.copytree(
44
+ src=raw_data_path / "train" / file_id,
45
+ dst=public_dest / "test" / file_id,
46
+ dirs_exist_ok=True,
47
+ )
48
+ assert len(list(public_dest.glob("test/*"))) == len(
49
+ test_df
50
+ ), "Public test should have the same number of images as the test set"
51
+
52
+ # Create a sample submission file for the public directory
53
+ submission_df = test_df.copy()
54
+ submission_df["MGMT_value"] = 0.5
55
+
56
+ # Copy over the final CSV files to their destinations
57
+ train_df.to_csv(public_dest / "train_labels.csv", index=False)
58
+ test_df.to_csv(private_dest / "test.csv", index=False)
59
+ submission_df.to_csv(public_dest / "sample_submission.csv", index=False)
60
+
61
+
62
def prepare(raw: Path, public: Path, private: Path):
    """Build the original public/private split plus a parallel validation split.

    First reproduces the primary competition split in ``public``/``private``
    (byte-identical to the original script's output), then carves a second
    train/validation split out of the first split's training portion and
    writes it to sibling ``public_val``/``private_val`` directories.
    """
    # Full labelled pool from the raw dump.
    labeled = read_csv(raw / "train_labels.csv", dtype={"BraTS21ID": str, "MGMT_value": int})

    # --- Primary split: must match the original public/private outputs. ---
    base_train, base_test = train_test_split(labeled, test_size=0.1, random_state=0)
    _create_split_files(
        train_df=base_train,
        test_df=base_test,
        public_dest=public,
        private_dest=private,
        raw_data_path=raw,
    )

    # --- Validation split: re-split the primary training portion. ---
    # Same test_size and random_state so the methodology is replicated,
    # and the same helper so the directory layout is identical.
    val_public = public.parent / "public_val"
    val_private = private.parent / "private_val"
    inner_train, inner_test = train_test_split(base_train, test_size=0.1, random_state=0)
    _create_split_files(
        train_df=inner_train,
        test_df=inner_test,
        public_dest=val_public,
        private_dest=val_private,
        raw_data_path=raw,
    )
@@ -0,0 +1,10 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import roc_auc_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_auroc_metric
5
+
6
+
7
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Compute AUC-ROC between submitted probabilities and ground truth.

    The helper is called with keyword arguments so the submission/answers
    pairing cannot silently swap — the sibling graders in this package call
    ``prepare_for_auroc_metric`` with keywords (and in the opposite order),
    so a positional call here risked transposing the two frames.
    """
    roc_auc_inputs = prepare_for_auroc_metric(
        submission=submission, answers=answers, id_col="ID", target_col="TARGET"
    )
    return roc_auc_score(y_true=roc_auc_inputs["y_true"], y_score=roc_auc_inputs["y_score"])
@@ -0,0 +1,41 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+
5
+ from mlebench.utils import read_csv
6
+
7
+
8
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Materialize Santander Customer Satisfaction CSV splits.

    Reads the train/test features and held-out answers from ``raw``, then
    writes participant-visible files to ``public`` and grading files to
    ``private``. All validation now happens *before* any file is written,
    so a failed run cannot leave partial/inconsistent output behind (the
    original asserted after writing; ``assert`` is also stripped under
    ``python -O``, so explicit ``ValueError``s are raised instead).
    """
    public.mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)

    train = read_csv(raw / "train.csv")
    test_public = read_csv(raw / "test.csv")
    answers = read_csv(raw / "test_answer.csv")

    if (raw / "sample_submission.csv").exists():
        sample_submission = read_csv(raw / "sample_submission.csv")
    else:
        # No official sample: derive one from the answers with a constant score.
        sample_submission = answers.copy()
        sample_submission["TARGET"] = 0.0

    # validate="one_to_one" guards against duplicated IDs on either side.
    merged_private = test_public.merge(answers, on="ID", how="left", validate="one_to_one")
    if merged_private["TARGET"].isna().any():
        missing_ids = merged_private.loc[merged_private["TARGET"].isna(), "ID"].tolist()[:5]
        raise ValueError(f"Missing TARGET labels for IDs (showing up to 5): {missing_ids}")

    # Invariant checks run before any output is written.
    if not (len(test_public) == len(answers) == len(merged_private)):
        raise ValueError("Test features, answers, and private test must have the same length.")
    if set(train.columns) - {"TARGET"} != set(test_public.columns):
        raise ValueError("Train and public test feature columns should match aside from TARGET.")
    if sample_submission.columns.tolist() != ["ID", "TARGET"]:
        raise ValueError("Sample submission must be ID,TARGET.")

    train.to_csv(public / "train.csv", index=False)
    test_public.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    merged_private.to_csv(private / "test.csv", index=False)
    answers.to_csv(private / "gold_submission.csv", index=False)
@@ -0,0 +1,56 @@
1
+ """
2
+ Grading function for ScienceBench task 1
3
+ """
4
+
5
+ import pandas as pd
6
+ from sklearn.metrics import roc_auc_score
7
+
8
+
9
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> "float | None":
    """
    Grade submission using ROC-AUC (as configured in `config.yaml`).

    Args:
        submission: DataFrame with predictions
        answers: DataFrame with ground truth

    Returns:
        Mean ROC-AUC over the shared target columns (higher is better), or
        ``None`` when grading is impossible: either frame missing, no common
        target columns, or every target column single-class. (Annotation
        widened to ``float | None`` — the original claimed ``float`` but
        returned ``None`` on several paths.)
    """
    if submission is None or answers is None:
        return None

    # Use "smiles" as the join key when both frames carry it.
    id_col = "smiles" if "smiles" in submission.columns and "smiles" in answers.columns else None

    # Align rows on identifier if available; otherwise assume same order.
    if id_col:
        merged = pd.merge(
            answers,
            submission,
            on=id_col,
            suffixes=("_true", "_pred"),
            how="inner",
        )
    else:
        # Reset both indexes so concat pairs rows by *position*. Without this,
        # pd.concat(axis=1) aligns on index labels and silently mispairs (or
        # NaN-pads) rows when the two frames carry different indexes.
        merged = pd.concat(
            [
                answers.reset_index(drop=True).add_suffix("_true"),
                submission.reset_index(drop=True).add_suffix("_pred"),
            ],
            axis=1,
        )

    # Score all common target columns (exclude identifier column).
    target_cols = [c for c in answers.columns if c != id_col and c in submission.columns]
    if not target_cols:
        return None

    aucs: list[float] = []
    for col in target_cols:
        y_true = merged[f"{col}_true"]
        y_pred = merged[f"{col}_pred"]
        # ROC-AUC is undefined if only one class is present.
        if y_true.nunique(dropna=True) < 2:
            continue
        aucs.append(float(roc_auc_score(y_true, y_pred)))

    if not aucs:
        return None
    return float(sum(aucs) / len(aucs))
@@ -0,0 +1,75 @@
1
+ """
2
+ Data preparation for ScienceBench Task 1: clintox_nn
3
+ Dataset: clintox
4
+ """
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import shutil
10
+
11
+
12
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare the clintox task data.

    Copies the clintox train/test CSVs into ``public``, builds a constant
    0.5-probability sample submission, and writes the gold answers (or a
    placeholder) into ``private``.

    Args:
        raw: Path to raw data directory holding the clintox CSVs
        public: Path to public directory (visible to participants)
        private: Path to private directory (used for grading)
    """
    print("=" * 60)
    print("Preparing ScienceBench Task 1: clintox_nn")
    print("=" * 60)
    print(f"Raw directory: {raw}")
    print(f"Public directory: {public}")
    print(f"Private directory: {private}")

    # Ensure destinations exist before copying anything into them
    # (the original assumed they were pre-created).
    public.mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)

    # Prefer the ``raw`` directory we were actually handed (the original
    # ignored it); fall back to the historical hard-coded placeholder path
    # only when ``raw`` does not contain the dataset.
    source_dir = raw
    if not (source_dir / "clintox_train.csv").exists():
        source_dir = Path("/path/to/ScienceAgent-bench/benchmark/datasets/clintox")

    if not source_dir.exists():
        raise FileNotFoundError(f"Source dataset not found: {source_dir}")

    train_file = source_dir / "clintox_train.csv"
    test_file = source_dir / "clintox_test.csv"

    if not train_file.exists() or not test_file.exists():
        raise FileNotFoundError(f"Required data files not found in {source_dir}")

    print("\nCopying data files to public directory...")
    shutil.copy2(train_file, public / "clintox_train.csv")
    shutil.copy2(test_file, public / "clintox_test.csv")
    print(" ✓ Copied: clintox_train.csv")
    print(" ✓ Copied: clintox_test.csv")

    # Read test file to get structure
    test_df = pd.read_csv(test_file)

    # Create sample_submission with expected format:
    # smiles, FDA_APPROVED, CT_TOX (probability placeholders).
    sample_submission = pd.DataFrame({
        "smiles": test_df["smiles"],
        "FDA_APPROVED": 0.5,
        "CT_TOX": 0.5,
    })
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    print(f"\n✓ Created sample_submission.csv with {len(sample_submission)} rows")

    # Load gold results for the grading answer when available.
    gold_file = Path("/path/to/ScienceAgent-bench/benchmark/eval_programs/gold_results/clintox_gold.csv")
    if gold_file.exists():
        gold_df = pd.read_csv(gold_file)
        gold_df.to_csv(private / "answer.csv", index=False)
        print(f"✓ Created answer.csv with {len(gold_df)} rows from gold results")
    else:
        # If gold file doesn't exist, fall back to a placeholder answer.
        print(f"⚠ Warning: Gold results not found at {gold_file}")
        answer = sample_submission.copy()
        answer.to_csv(private / "answer.csv", index=False)
        print("✓ Created placeholder answer.csv")

    print("\nData preparation completed!")
    print(f" Public files: {sorted([f.name for f in public.glob('*')])}")
    print(f" Private files: {sorted([f.name for f in private.glob('*')])}")
@@ -0,0 +1,37 @@
1
+ """Grading function for ScienceBench task 15 (admet_ai)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+ from sklearn.metrics import roc_auc_score
7
+
8
REQUIRED_COLUMNS = {"Drug", "Y"}
AUC_THRESHOLD = 0.84


def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Binary pass/fail grade for ScienceBench task 15 (admet_ai).

    A submission earns 1.0 only when it is non-empty, carries the required
    columns, lists drugs in exactly the answer order, and its predictions
    reach a ROC AUC of at least ``AUC_THRESHOLD``; every other outcome,
    including an uncomputable AUC, yields 0.0.
    """
    if submission.empty:
        print("Submission is empty.")
        return 0.0

    missing = REQUIRED_COLUMNS - set(submission.columns)
    if missing:
        print(f"Submission missing required columns: {missing}")
        return 0.0

    if not REQUIRED_COLUMNS.issubset(answers.columns):
        print("Answer file is missing required columns.")
        return 0.0

    # Row order must match exactly — predictions are compared positionally.
    if submission["Drug"].tolist() != answers["Drug"].tolist():
        print("Drug ordering mismatch.")
        return 0.0

    try:
        auc = roc_auc_score(answers["Y"].values, submission["Y"].values)
    except ValueError as exc:
        # e.g. a single class in the answers makes ROC AUC undefined.
        print(f"Unable to compute ROC AUC: {exc}")
        return 0.0

    print(f"ROC AUC: {auc}")
    return 1.0 if auc >= AUC_THRESHOLD else 0.0