dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
mlebench/competitions/sciencebench-015-aai/prepare.py
@@ -0,0 +1,102 @@
+ """Prepare data for ScienceBench task 15 (admet_ai)."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ import shutil
+ import pandas as pd
+
+
+ DATASET_ROOT = Path("/path/to/ScienceAgent-bench/benchmark/datasets/ames")
+ GOLD_FILE = Path("/path/to/ScienceAgent-bench/benchmark/eval_programs/gold_results/admet_ai_gold.csv")
+ EXPECTED_OUTPUT = "aai_preds.csv"
+
+
+ def _ensure_dir(path: Path) -> None:
+     path.mkdir(parents=True, exist_ok=True)
+
+
+ def _copy_dataset(dataset_root: Path, target_root: Path) -> int:
+     """Copy the Ames dataset into the public directory."""
+     if not dataset_root.exists():
+         raise FileNotFoundError(f"Source dataset not found: {dataset_root}")
+
+     copied = 0
+     for item in dataset_root.rglob("*"):
+         if not item.is_file() or item.name.startswith("."):
+             continue
+         relative = item.relative_to(dataset_root)
+         destination = target_root / relative
+         destination.parent.mkdir(parents=True, exist_ok=True)
+         shutil.copy2(item, destination)
+         copied += 1
+         if copied <= 10:
+             print(f" ✓ Copied: {relative}")
+     if copied > 10:
+         print(f" ... and {copied - 10} more files")
+     return copied
+
+
+ def _write_sample_submission(dataset_root: Path, public_dir: Path) -> None:
+     """Build a sample submission aligned with the gold file schema."""
+     test_path = dataset_root / "test.csv"
+     if not test_path.exists():
+         raise FileNotFoundError(f"Test file not found at {test_path}")
+
+     test_df = pd.read_csv(test_path, index_col=0)
+     sample_df = test_df[["Drug_ID", "Drug"]].copy()
+     sample_df["Y"] = 0.5  # Placeholder probability
+     sample_path = public_dir / "sample_submission.csv"
+     sample_df.to_csv(sample_path, index=False)
+     print(f"✓ Created {sample_path.name} with {len(sample_df)} rows")
+
+
+ def _write_answers(private_dir: Path) -> None:
+     """Copy the gold answers used by the grader."""
+     if not GOLD_FILE.exists():
+         raise FileNotFoundError(f"Gold file missing: {GOLD_FILE}")
+     gold_df = pd.read_csv(GOLD_FILE)
+     first_col = gold_df.columns[0]
+     if first_col.startswith("Unnamed"):
+         gold_df = gold_df.drop(columns=[first_col])
+     answer_path = private_dir / "answer.csv"
+     gold_df.to_csv(answer_path, index=False)
+     print(f"✓ Copied answer.csv with {len(gold_df)} rows")
+
+
+ def prepare(raw: Path, public: Path, private: Path) -> None:
+     """
+     Stage the Ames mutagenicity dataset for ScienceBench task 15.
+
+     Args:
+         raw: Optional pre-staged dataset directory. If empty, the canonical dataset is used.
+         public: Directory exposed to participants.
+         private: Directory used internally for grading.
+     """
+     print("=" * 60)
+     print("Preparing ScienceBench Task 15: admet_ai")
+     print("=" * 60)
+     print(f"Raw directory: {raw}")
+     print(f"Public directory: {public}")
+     print(f"Private directory: {private}")
+
+     _ensure_dir(public)
+     _ensure_dir(private)
+
+     dataset_root = raw if raw.exists() and any(raw.iterdir()) else DATASET_ROOT
+     if dataset_root is not DATASET_ROOT:
+         print("✓ Using provided raw dataset directory.")
+     else:
+         print(f"⚠ Raw directory missing or empty. Using canonical dataset: {DATASET_ROOT}")
+
+     total_copied = _copy_dataset(dataset_root, public)
+     print(f" Total files copied: {total_copied}")
+
+     _write_sample_submission(dataset_root, public)
+     _write_answers(private)
+
+     print(f"\nData preparation completed. Expected submission file: pred_results/{EXPECTED_OUTPUT}")
+
+
+ if __name__ == "__main__":
+     raise SystemExit("Use via the benchmark preparation tooling.")

mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py
@@ -0,0 +1,58 @@
+ """Grading function for ScienceBench task 51 (brain-blood QSAR)."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+ from sklearn.metrics import balanced_accuracy_score
+
+ EXPECTED_FILE = "pred_results/brain_blood_qsar.csv"
+ GOLD_FILE = Path("benchmark/eval_programs/gold_results/brain_blood_qsar_gold.csv")
+ THRESHOLD = 0.70
+
+
+ def _extract_labels(data: Any) -> pd.Series:
+     if isinstance(data, pd.DataFrame):
+         if "label" not in data.columns:
+             raise ValueError("Data must contain a 'label' column.")
+         return data["label"]
+     if isinstance(data, (str, Path)):
+         df = pd.read_csv(data)
+         if "label" not in df.columns:
+             raise ValueError(f"CSV at {data} must contain a 'label' column.")
+         return df["label"]
+     raise TypeError(f"Unsupported data type: {type(data)}")
+
+
+ def _load_submission(submission: Any) -> pd.Series:
+     try:
+         return _extract_labels(submission)
+     except TypeError:
+         path = Path(EXPECTED_FILE)
+         if not path.exists():
+             raise FileNotFoundError(f"Expected prediction file missing: {EXPECTED_FILE}")
+         return _extract_labels(path)
+
+
+ def _load_answers(answers: Any) -> pd.Series:
+     try:
+         return _extract_labels(answers)
+     except TypeError:
+         if not GOLD_FILE.exists():
+             raise FileNotFoundError(f"Gold file not found: {GOLD_FILE}")
+         return _extract_labels(GOLD_FILE)
+
+
+ def grade(submission: Any, answers: Any) -> float:
+     pred_labels = _load_submission(submission)
+     gold_labels = _load_answers(answers)
+
+     if len(pred_labels) != len(gold_labels):
+         print(f"Row count mismatch: {len(pred_labels)} vs {len(gold_labels)}")
+         return 0.0
+
+     score = balanced_accuracy_score(gold_labels, pred_labels)
+     print(f"Balanced accuracy: {score}")
+     return 1.0 if score >= THRESHOLD else 0.0
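
A tiny illustration of the pass/fail contract above: grade() returns 1.0 only when the balanced accuracy reaches THRESHOLD (0.70), otherwise 0.0. The frames below are toy data, not the real gold labels.

    import pandas as pd

    gold = pd.DataFrame({"label": [1, 0, 1, 1]})
    pred = pd.DataFrame({"label": [1, 0, 1, 0]})
    # Balanced accuracy = (recall_1 + recall_0) / 2 = (2/3 + 1.0) / 2 ≈ 0.83 >= 0.70
    print(grade(pred, gold))  # 1.0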

mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py
@@ -0,0 +1,69 @@
+ """
+ Data preparation for ScienceBench task 51
+ Dataset: brain-blood
+ """
+
+ from __future__ import annotations
+
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+
+ EXPECTED_FILE = "brain_blood_qsar.csv"
+ DATASET_DIR = Path("/path/to/ScienceAgent-bench/benchmark/datasets/brain-blood")
+ GOLD_FILE = Path("/path/to/ScienceAgent-bench/benchmark/eval_programs/gold_results/brain_blood_qsar_gold.csv")
+
+
+ def _ensure_dir(path: Path) -> None:
+     path.mkdir(parents=True, exist_ok=True)
+
+
+ def prepare(raw: Path, public: Path, private: Path) -> None:
+     print("=" * 60)
+     print("Preparing ScienceBench Task 51")
+     print("Dataset: brain-blood")
+     print("=" * 60)
+     print("Raw directory:", raw)
+     print("Public directory:", public)
+     print("Private directory:", private)
+
+     source = raw if raw.exists() else DATASET_DIR
+     if not source.exists():
+         raise FileNotFoundError(f"Dataset directory not found: {source}")
+
+     _ensure_dir(public)
+     _ensure_dir(private)
+
+     required = [
+         source / "logBB.sdf",
+         source / "logBB_test.sdf",
+     ]
+     missing = [str(path) for path in required if not path.exists()]
+     if missing:
+         raise FileNotFoundError("Missing dataset files: " + ", ".join(missing))
+
+     for path in required:
+         target = public / path.relative_to(source.parent)
+         target.parent.mkdir(parents=True, exist_ok=True)
+         shutil.copy2(path, target)
+     print("✓ Copied logBB SDF files")
+
+     sample_df = pd.DataFrame(
+         {
+             "label": [1, 0],
+         }
+     )
+     sample_df.to_csv(public / "sample_submission.csv", index=False)
+     print("✓ Created sample_submission.csv")
+
+     if not GOLD_FILE.exists():
+         raise FileNotFoundError(f"Gold CSV not found: {GOLD_FILE}")
+     shutil.copy2(GOLD_FILE, private / "answer.csv")
+     print("✓ Copied answer.csv")
+
+     print("Preparation complete. Expected submission file: pred_results/brain_blood_qsar.csv")
+
+
+ if __name__ == "__main__":
+     raise SystemExit("Use this module via the benchmark preparation tooling.")

mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py
@@ -0,0 +1,55 @@
+ """Grader for ScienceBench task 101 (experimental band-gap prediction)."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import pandas as pd
+
+
+ PRED_FILENAME = "experimental_band_gap_prediction_pred.csv"
+ GOLD_FILENAME = "experimental_band_gap_prediction_gold.csv"
+ TARGET_COLUMN = "gap_expt_eV"
+ THRESHOLD = 0.6
+
+
+ def _repo_root() -> Path:
+     return Path(__file__).resolve().parents[2]
+
+
+ def _pred_path() -> Path:
+     return Path("pred_results") / PRED_FILENAME
+
+
+ def _gold_path() -> Path:
+     return (
+         _repo_root()
+         / "benchmark"
+         / "eval_programs"
+         / "gold_results"
+         / GOLD_FILENAME
+     )
+
+
+ def _load_csv(path: Path) -> pd.DataFrame:
+     if not path.exists():
+         raise FileNotFoundError(f"Required file missing: {path}")
+     return pd.read_csv(path)
+
+
+ def grade(submission, answers) -> float:
+     pred_df = _load_csv(_pred_path())
+     gold_df = _load_csv(_gold_path())
+
+     if TARGET_COLUMN not in pred_df.columns:
+         print(f"Missing '{TARGET_COLUMN}' column in prediction.")
+         return 0.0
+
+     merged = gold_df[[TARGET_COLUMN]].join(pred_df[[TARGET_COLUMN]], how="inner", rsuffix="_pred")
+     if merged.empty:
+         print("No overlapping rows between prediction and gold.")
+         return 0.0
+
+     mae = (merged[TARGET_COLUMN] - merged[f"{TARGET_COLUMN}_pred"]).abs().mean()
+     print(f"MAE: {mae}")
+     return 1.0 if mae < THRESHOLD else 0.0
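
The grader above reduces to a single MAE threshold check. A worked toy example of that arithmetic (values are illustrative, not taken from the real gold file):

    import pandas as pd

    gold = pd.DataFrame({"gap_expt_eV": [1.2, 2.5, 0.0]})
    pred = pd.DataFrame({"gap_expt_eV": [1.0, 2.8, 0.1]})
    mae = (gold["gap_expt_eV"] - pred["gap_expt_eV"]).abs().mean()
    print(mae)        # (0.2 + 0.3 + 0.1) / 3 = 0.2
    print(mae < 0.6)  # True, so grade() would return 1.0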

mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py
@@ -0,0 +1,88 @@
+ """Data preparation for ScienceBench task 101 (experimental band-gap prediction)."""
+
+ from __future__ import annotations
+
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+
+
+ DATASET_NAME = "experimental_band_gap"
+ PRED_FILENAME = "experimental_band_gap_prediction_pred.csv"
+ GOLD_FILENAME = "experimental_band_gap_prediction_gold.csv"
+ SAMPLE_FILENAME = "sample_submission.csv"
+
+
+ def _repo_root() -> Path:
+     return Path(__file__).resolve().parents[2]
+
+
+ def _dataset_dir() -> Path:
+     return _repo_root() / "benchmark" / "datasets" / DATASET_NAME
+
+
+ def _gold_path() -> Path:
+     return (
+         _repo_root()
+         / "benchmark"
+         / "eval_programs"
+         / "gold_results"
+         / GOLD_FILENAME
+     )
+
+
+ def _ensure_dir(path: Path) -> None:
+     path.mkdir(parents=True, exist_ok=True)
+
+
+ def _copy_dataset(src: Path, public: Path) -> None:
+     dest_root = public / DATASET_NAME
+     dest_root.mkdir(parents=True, exist_ok=True)
+     copied = 0
+     for item in src.iterdir():
+         if not item.is_file():
+             continue
+         shutil.copy2(item, dest_root / item.name)
+         copied += 1
+     print(f"✓ Copied {copied} dataset file(s) to {dest_root}")
+
+
+ def prepare(raw: Path, public: Path, private: Path) -> None:
+     print("=" * 60)
+     print("Preparing ScienceBench Task 101")
+     print("Dataset:", DATASET_NAME)
+     print("=" * 60)
+     print("Raw directory:", raw)
+     print("Public directory:", public)
+     print("Private directory:", private)
+
+     source_dir = raw if raw.exists() else _dataset_dir()
+     if not source_dir.exists():
+         raise FileNotFoundError(f"Dataset directory not found: {source_dir}")
+
+     gold_path = _gold_path()
+     if not gold_path.exists():
+         raise FileNotFoundError(f"Gold CSV not found: {gold_path}")
+
+     _ensure_dir(public)
+     _ensure_dir(private)
+
+     _copy_dataset(source_dir, public)
+
+     gold_df = pd.read_csv(gold_path)
+     sample = gold_df.head(3).copy()
+     sample["gap_expt_eV"] = 0.0
+     sample.to_csv(public / SAMPLE_FILENAME, index=False)
+     print("✓ Created sample_submission.csv placeholder")
+
+     gold_df.to_csv(private / "answer.csv", index=False)
+     print("✓ Copied gold CSV to private directory")
+
+     (private / "notes.txt").write_text(
+         f"Expected submission: pred_results/{PRED_FILENAME}\nMAE threshold: 0.6 eV\n",
+         encoding="utf-8",
+     )
+     print("✓ Wrote notes.txt")
+
+     print("Data preparation completed.")
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import mean_squared_log_error
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
+ TARGET_COLUMNS = ['num_views', 'num_votes', 'num_comments']
9
+
10
+
11
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
12
+ """Prepare submission and answers for RMSLE calculation."""
13
+ if len(submission) != len(answers):
14
+ raise InvalidSubmissionError(
15
+ f"Submission length ({len(submission)}) != answers length ({len(answers)})"
16
+ )
17
+
18
+ if "id" not in submission.columns:
19
+ raise InvalidSubmissionError("Submission must have an 'id' column")
20
+
21
+ for col in TARGET_COLUMNS:
22
+ if col not in submission.columns:
23
+ raise InvalidSubmissionError(f"Submission must have a '{col}' column")
24
+
25
+ # Sort by id
26
+ submission = submission.sort_values("id").reset_index(drop=True)
27
+ answers = answers.sort_values("id").reset_index(drop=True)
28
+
29
+ # Check id alignment
30
+ if not (submission["id"] == answers["id"]).all():
31
+ raise InvalidSubmissionError("Submission and answers id mismatch")
32
+
33
+ # Validate predictions
34
+ for col in TARGET_COLUMNS:
35
+ try:
36
+ submission[col] = submission[col].astype(float)
37
+ except (ValueError, TypeError):
38
+ raise InvalidSubmissionError(f"'{col}' predictions must be numeric")
39
+
40
+ if submission[col].isnull().any():
41
+ raise InvalidSubmissionError(f"'{col}' predictions cannot contain NaN values")
42
+
43
+ if (submission[col] < 0).any():
44
+ raise InvalidSubmissionError(f"'{col}' predictions cannot be negative")
45
+
46
+ return submission, answers
47
+
48
+
49
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
50
+ """Calculate RMSLE score across all target columns."""
51
+ submission, answers = prepare_for_metric(submission, answers)
52
+
53
+ # Concatenate all predictions and true values
54
+ all_pred = []
55
+ all_true = []
56
+ for col in TARGET_COLUMNS:
57
+ all_pred.extend(submission[col].values)
58
+ all_true.extend(answers[col].values)
59
+
60
+ all_pred = np.array(all_pred)
61
+ all_true = np.array(all_true)
62
+
63
+ # RMSLE
64
+ rmsle = np.sqrt(mean_squared_log_error(all_true, all_pred))
65
+
66
+ return rmsle
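
The returned score is RMSLE, i.e. sqrt(mean((log1p(pred) - log1p(true))^2)), pooled over all three target columns. A quick sanity check (toy values) that sklearn's mean_squared_log_error matches the hand-rolled formula:

    import numpy as np
    from sklearn.metrics import mean_squared_log_error

    y_true = np.array([3.0, 5.0, 2.5])
    y_pred = np.array([2.5, 5.0, 3.0])
    rmsle_sklearn = np.sqrt(mean_squared_log_error(y_true, y_pred))
    rmsle_manual = np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))
    assert np.isclose(rmsle_sklearn, rmsle_manual)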

mlebench/competitions/see-click-predict-fix/prepare.py
@@ -0,0 +1,25 @@
+ from pathlib import Path
+ import pandas as pd
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Prepare see-click-predict-fix dataset.
+     """
+     # Read data
+     train = pd.read_csv(raw / "train.csv")
+     test = pd.read_csv(raw / "test.csv")
+     sample_submission = pd.read_csv(raw / "sample_submission.csv")
+     test_answer = pd.read_csv(raw / "test_answer.csv")
+
+     # Public files (visible to agents)
+     train.to_csv(public / "train.csv", index=False)
+     test.to_csv(public / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     # Private files (for grading)
+     test_answer.to_csv(private / "test.csv", index=False)
+
+     # Validation checks
+     assert len(test_answer) == len(sample_submission), \
+         f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"

mlebench/competitions/see_click_predict_fix/grade.py (identical in content to the see-click-predict-fix grader above)
@@ -0,0 +1,66 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics import mean_squared_log_error
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ TARGET_COLUMNS = ['num_views', 'num_votes', 'num_comments']
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+     """Prepare submission and answers for RMSLE calculation."""
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             f"Submission length ({len(submission)}) != answers length ({len(answers)})"
+         )
+
+     if "id" not in submission.columns:
+         raise InvalidSubmissionError("Submission must have an 'id' column")
+
+     for col in TARGET_COLUMNS:
+         if col not in submission.columns:
+             raise InvalidSubmissionError(f"Submission must have a '{col}' column")
+
+     # Sort by id
+     submission = submission.sort_values("id").reset_index(drop=True)
+     answers = answers.sort_values("id").reset_index(drop=True)
+
+     # Check id alignment
+     if not (submission["id"] == answers["id"]).all():
+         raise InvalidSubmissionError("Submission and answers id mismatch")
+
+     # Validate predictions
+     for col in TARGET_COLUMNS:
+         try:
+             submission[col] = submission[col].astype(float)
+         except (ValueError, TypeError):
+             raise InvalidSubmissionError(f"'{col}' predictions must be numeric")
+
+         if submission[col].isnull().any():
+             raise InvalidSubmissionError(f"'{col}' predictions cannot contain NaN values")
+
+         if (submission[col] < 0).any():
+             raise InvalidSubmissionError(f"'{col}' predictions cannot be negative")
+
+     return submission, answers
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     """Calculate RMSLE score across all target columns."""
+     submission, answers = prepare_for_metric(submission, answers)
+
+     # Concatenate all predictions and true values
+     all_pred = []
+     all_true = []
+     for col in TARGET_COLUMNS:
+         all_pred.extend(submission[col].values)
+         all_true.extend(answers[col].values)
+
+     all_pred = np.array(all_pred)
+     all_true = np.array(all_true)
+
+     # RMSLE
+     rmsle = np.sqrt(mean_squared_log_error(all_true, all_pred))
+
+     return rmsle

mlebench/competitions/see_click_predict_fix/prepare.py (identical in content to the see-click-predict-fix prepare module above)
@@ -0,0 +1,25 @@
+ from pathlib import Path
+ import pandas as pd
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Prepare see-click-predict-fix dataset.
+     """
+     # Read data
+     train = pd.read_csv(raw / "train.csv")
+     test = pd.read_csv(raw / "test.csv")
+     sample_submission = pd.read_csv(raw / "sample_submission.csv")
+     test_answer = pd.read_csv(raw / "test_answer.csv")
+
+     # Public files (visible to agents)
+     train.to_csv(public / "train.csv", index=False)
+     test.to_csv(public / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     # Private files (for grading)
+     test_answer.to_csv(private / "test.csv", index=False)
+
+     # Validation checks
+     assert len(test_answer) == len(sample_submission), \
+         f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"

mlebench/competitions/seti-breakthrough-listen/grade.py
@@ -0,0 +1,11 @@
+ import pandas as pd
+ from sklearn.metrics import roc_auc_score
+
+ from mlebench.competitions.utils import prepare_for_auroc_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     roc_auc_inputs = prepare_for_auroc_metric(
+         submission=submission, answers=answers, id_col="id", target_col="target"
+     )
+     return roc_auc_score(y_true=roc_auc_inputs["y_true"], y_score=roc_auc_inputs["y_score"])
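
Assuming prepare_for_auroc_metric aligns the two frames on "id" and returns the true labels and predicted scores (its implementation lives in mlebench/competitions/utils.py and is not shown in this diff), a call might look like this, with toy data:

    import pandas as pd

    answers = pd.DataFrame({"id": [1, 2, 3, 4], "target": [0, 1, 0, 1]})
    submission = pd.DataFrame({"id": [1, 2, 3, 4], "target": [0.1, 0.9, 0.4, 0.7]})
    print(grade(submission, answers))  # 1.0: every positive outscores every negative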

mlebench/competitions/seti-breakthrough-listen/prepare.py
@@ -0,0 +1,71 @@
+ import shutil
+ from pathlib import Path
+
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     # Create train, test from train split
+     old_train = read_csv(raw / "train_labels.csv")
+     new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+
+     # Copy over files
+     new_train.to_csv(public / "train_labels.csv", index=False)
+     new_test.to_csv(private / "test.csv", index=False)
+
+     shutil.copytree(raw / "old_leaky_data", public / "old_leaky_data", dirs_exist_ok=True)
+
+     for file_id in tqdm(new_train["id"], desc="Copying train files"):
+         subdir = file_id[0]
+         src = raw / "train" / subdir / f"{file_id}.npy"
+         dst = public / "train" / subdir / f"{file_id}.npy"
+         dst.parent.mkdir(parents=True, exist_ok=True)
+         shutil.copy(src, dst)
+
+     for file_id in tqdm(new_test["id"], desc="Copying test files"):
+         subdir = file_id[0]
+         src = raw / "train" / subdir / f"{file_id}.npy"
+         dst = public / "test" / subdir / f"{file_id}.npy"
+         dst.parent.mkdir(parents=True, exist_ok=True)
+         shutil.copy(src, dst)
+
+     # Create sample submission
+     sample_submission = new_test.copy()
+     sample_submission["target"] = 0.5  # Overwrite with dummy values
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     # Checks
+     assert len(sample_submission) == len(
+         new_test
+     ), "Sample submission length does not match test length."
+     assert not set(new_train["id"]).intersection(
+         set(new_test["id"])
+     ), "There are overlapping IDs in train and test sets."
+
+     train_files = {
+         file_path.name: file_path
+         for file_path in (public / "train").rglob("*")
+         if file_path.is_file()
+     }
+     test_files = {
+         file_path.name: file_path
+         for file_path in (public / "test").rglob("*")
+         if file_path.is_file()
+     }
+
+     assert len(train_files) == len(
+         new_train
+     ), "Number of train files does not match the number of train records."
+     assert len(test_files) == len(
+         new_test
+     ), "Number of test files does not match the number of test records."
+     assert train_files.keys().isdisjoint(
+         test_files.keys()
+     ), "There are overlapping files in train and test directories."
+     assert len(sample_submission) == len(
+         new_test
+     ), "Sample submission length does not match new test length."