dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
mlebench/data.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import hashlib
|
|
3
|
+
import inspect
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
6
|
+
import webbrowser
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Callable, Optional
|
|
10
|
+
|
|
11
|
+
import diskcache as dc
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import yaml
|
|
14
|
+
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_fixed
|
|
15
|
+
from tqdm.auto import tqdm
|
|
16
|
+
|
|
17
|
+
from mlebench.registry import Competition
|
|
18
|
+
from mlebench.utils import (
|
|
19
|
+
authenticate_kaggle_api,
|
|
20
|
+
extract,
|
|
21
|
+
get_diff,
|
|
22
|
+
get_logger,
|
|
23
|
+
get_path_to_callable,
|
|
24
|
+
is_empty,
|
|
25
|
+
load_yaml,
|
|
26
|
+
)
|
|
27
|
+
|
|
logger = get_logger(__name__)
#cache = dc.Cache("cache", size_limit=2**26) # 64 MB
# Changed to: use a per-process cache directory under the system temp dir, keyed
# by PID, so concurrent processes do not share (and contend over) one on-disk cache.
import tempfile
cache_dir = os.path.join(tempfile.gettempdir(), f"mlebench_cache_{os.getpid()}")
cache = dc.Cache(cache_dir, size_limit=2**26)  # 64 MB
34
|
+
|
|
def create_prepared_dir(competition: Competition) -> None:
    """Ensure the competition's `public` and `private` output directories exist.

    Both directories (and any missing parents) are created; existing
    directories are left untouched, so the call is idempotent.
    """
    for target_dir in (competition.public_dir, competition.private_dir):
        target_dir.mkdir(parents=True, exist_ok=True)
38
|
+
|
|
39
|
+
|
|
def download_and_prepare_dataset(
    competition: Competition,
    keep_raw: bool = True,
    overwrite_checksums: bool = False,
    overwrite_leaderboard: bool = False,
    skip_verification: bool = False,
) -> None:
    """
    Creates a `public` and `private` directory for the competition using the `prepare_fn`,
    downloading the competition's dataset zip file and extracting it into `raw` if needed.

    Args:
        competition: The competition whose dataset should be downloaded and prepared.
        keep_raw: If False, the extracted `raw` directory is deleted after preparation.
        overwrite_checksums: If True, regenerate and overwrite the stored checksum file
            (also forces the prepared data directory to be rebuilt).
        overwrite_leaderboard: Forwarded to `ensure_leaderboard_exists` as `force`.
        skip_verification: If True, skip checksum generation/verification entirely
            (unless `overwrite_checksums` is set, which re-enables it).

    Raises:
        ValueError: If a computed checksum does not match the stored expected checksum.
    """

    # `prepare_fn` is called below with keyword arguments `raw`, `public`, `private`,
    # so its signature must accept exactly those names.
    assert is_valid_prepare_fn(
        competition.prepare_fn
    ), f"Provided `prepare_fn` doesn't take arguments `raw`, `private` and `public`!"

    ensure_leaderboard_exists(competition, force=overwrite_leaderboard)

    # The zip is downloaded next to (not inside) the raw dir.
    competition_dir = competition.raw_dir.parent

    competition.raw_dir.mkdir(exist_ok=True, parents=True)
    create_prepared_dir(competition)

    zipfile = download_dataset(
        competition_id=competition.id,
        download_dir=competition_dir,
        force=False,
    )

    # Phase 1: verify the downloaded zip against the stored checksum (if any).
    if overwrite_checksums or not skip_verification:
        logger.info(f"Generating checksum for `{zipfile}`...")
        actual_zip_checksum = get_checksum(zipfile)

        if competition.checksums.is_file() and not overwrite_checksums:
            expected_checksums = load_yaml(competition.checksums)
            expected_zip_checksum = expected_checksums["zip"]

            if actual_zip_checksum != expected_zip_checksum:
                raise ValueError(
                    f"Checksum for `{zipfile}` does not match the expected checksum! "
                    f"Expected `{expected_zip_checksum}` but got `{actual_zip_checksum}`."
                )

            logger.info(f"Checksum for `{zipfile}` matches the expected checksum.")

    # Only extract when the raw dir is empty, so re-runs skip the expensive unzip.
    if is_empty(competition.raw_dir):
        logger.info(f"Extracting `{zipfile}` to `{competition.raw_dir}`...")
        extract(zipfile, competition.raw_dir, recursive=False)
        logger.info(f"Extracted `{zipfile}` to `{competition.raw_dir}` successfully.")

    # Phase 2: run the competition-specific prepare step when the prepared data is
    # missing, or rebuild it from scratch when checksums are being overwritten.
    if not is_dataset_prepared(competition) or overwrite_checksums:
        if competition.public_dir.parent.exists() and overwrite_checksums:
            logger.info(
                f"Removing the existing prepared data directory for `{competition.id}` since "
                "`overwrite_checksums` is set to `True`..."
            )
            shutil.rmtree(competition.public_dir.parent)
            create_prepared_dir(competition)

        logger.info(
            f"Preparing the dataset using `{competition.prepare_fn.__name__}` from "
            f"`{get_path_to_callable(competition.prepare_fn)}`..."
        )

        competition.prepare_fn(
            raw=competition.raw_dir,
            public=competition.public_dir,
            private=competition.private_dir,
        )

        logger.info(f"Data for competition `{competition.id}` prepared successfully.")

        with open(competition.public_dir / "description.md", "w") as f:
            f.write(competition.description)

        # Also save to public_val directory
        # NOTE(review): `<public>_val` is presumably created by a prepare_val
        # script; the description is only mirrored there if it already exists.
        public_val_dir = competition.public_dir.parent / (competition.public_dir.name + "_val")
        if public_val_dir.exists():
            with open(public_val_dir / "description.md", "w") as f:
                f.write(competition.description)

    # Phase 3: verify (or write) checksums for the prepared public/private trees.
    if overwrite_checksums or not skip_verification:
        logger.info(f"Generating checksums for files in `{competition_dir}`...")

        # `actual_zip_checksum` was computed in phase 1, which runs under the
        # same condition as this block.
        actual_checksums = {
            "zip": actual_zip_checksum,
            "public": generate_checksums(competition.public_dir),
            "private": generate_checksums(competition.private_dir),
        }

        if not competition.checksums.is_file() or overwrite_checksums:
            with open(competition.checksums, "w") as file:
                yaml.dump(actual_checksums, file, default_flow_style=False)

            logger.info(f"Checksums for `{competition.id}` saved to `{competition.checksums}`.")

        # Re-load from disk so freshly written checksums trivially match.
        expected_checksums = load_yaml(competition.checksums)

        if actual_checksums != expected_checksums:
            logger.error(f"Checksums do not match for `{competition.id}`!")

            diff = get_diff(
                actual_checksums,
                expected_checksums,
                fromfile="actual_checksums",
                tofile="expected_checksums",
            )

            raise ValueError(f"Checksums do not match for `{competition.id}`!\n{diff}")

        logger.info(f"Checksums for files in `{competition_dir}` match the expected checksums.")

    if not keep_raw:
        logger.info(f"Removing the raw data directory for `{competition.id}`...")
        shutil.rmtree(competition.raw_dir)

    # Final sanity checks: preparation must have produced non-empty directories.
    assert competition.public_dir.is_dir(), f"Public data directory doesn't exist."
    assert competition.private_dir.is_dir(), f"Private data directory doesn't exist."
    assert not is_empty(competition.public_dir), f"Public data directory is empty!"
    assert not is_empty(competition.private_dir), f"Private data directory is empty!"
160
|
+
|
|
161
|
+
|
|
162
|
+
def is_dataset_prepared(competition: Competition, grading_only: bool = False) -> bool:
    """Check whether a competition's data has been prepared on disk.

    A prepared competition has a non-empty `private` directory and an answers
    file; unless `grading_only` is True, it must also have a non-empty
    `public` directory and a sample-submission file.

    Args:
        competition: The competition whose data directories are checked.
        grading_only: When True, only the artifacts needed for grading
            (private data and answers) are required.

    Returns:
        True if all required artifacts exist; False otherwise (a warning is
        logged naming the first missing artifact).

    Raises:
        TypeError: If `competition` is not a `Competition` instance.
    """
    # Raise instead of `assert` so the type check survives `python -O`.
    if not isinstance(competition, Competition):
        raise TypeError(
            f"Expected input to be of type `Competition` but got {type(competition)}."
        )

    public = competition.public_dir
    private = competition.private_dir

    if not grading_only:
        if not public.is_dir():
            logger.warning("Public directory does not exist.")
            return False
        if is_empty(public):
            logger.warning("Public directory is empty.")
            return False

    if not private.is_dir():
        logger.warning("Private directory does not exist.")
        return False
    if is_empty(private):
        logger.warning("Private directory is empty.")
        return False

    if not competition.answers.is_file():
        logger.warning("Answers file does not exist.")
        return False

    if not competition.sample_submission.is_file() and not grading_only:
        logger.warning("Sample submission file does not exist.")
        return False

    return True
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def is_api_exception(exception: Exception) -> bool:
    """Return True if `exception` is a Kaggle `ApiException`.

    The import is deferred because importing `kaggle` at module load time
    prompts for API credentials. If the `kaggle` package is unavailable, no
    exception can be a Kaggle API exception, so a local stand-in class
    (which nothing ever raises) makes the `isinstance` check return False.
    """
    try:
        from kaggle.rest import ApiException
    except ImportError:
        # `kaggle` not installed: placeholder that matches no real exception.
        class ApiException(Exception):
            pass

    return isinstance(exception, ApiException)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
@retry(
    retry=retry_if_exception(is_api_exception),
    stop=stop_after_attempt(3),  # stop after 3 attempts
    wait=wait_fixed(5),  # wait 5 seconds between attempts
    reraise=True,
)
def download_dataset(
    competition_id: str,
    download_dir: Path,
    quiet: bool = False,
    force: bool = False,
) -> Path:
    """Downloads the competition data as a zip file using the Kaggle API and returns the path to the zip file.

    Args:
        competition_id: Kaggle competition slug to download.
        download_dir: Directory to save the zip into; created (with parents) if missing.
        quiet: Passed through to the Kaggle API to suppress progress output.
        force: Passed through to the Kaggle API to force a re-download.

    Returns:
        Path to the single downloaded zip file inside `download_dir`.

    The whole function is retried up to 3 times (5 s apart) when the raised
    error is a Kaggle API exception (see the `@retry` decorator above);
    other errors propagate immediately.
    """

    if not download_dir.exists():
        download_dir.mkdir(parents=True)

    logger.info(f"Downloading the dataset for `{competition_id}` to `{download_dir}`...")

    api = authenticate_kaggle_api()

    # only import when necessary; otherwise kaggle asks for API key on import
    from kaggle.rest import ApiException

    try:
        api.competition_download_files(
            competition=competition_id,
            path=download_dir,
            quiet=quiet,
            force=force,
        )
    except ApiException as e:
        if _need_to_accept_rules(str(e)):
            logger.warning("You must accept the competition rules before downloading the dataset.")
            _prompt_user_to_accept_rules(competition_id)
            # Retry recursively once the user has accepted the rules; the
            # return value is discarded because the glob below finds the zip
            # in `download_dir` either way.
            download_dataset(competition_id, download_dir, quiet, force)
        else:
            raise e

    zip_files = list(download_dir.glob("*.zip"))

    # NOTE(review): `assert` is stripped under `python -O`; this relies on the
    # API having produced exactly one zip in `download_dir` — confirm that
    # the directory is not shared between competitions.
    assert (
        len(zip_files) == 1
    ), f"Expected to download a single zip file, but found {len(zip_files)} zip files."

    zip_file = zip_files[0]

    return zip_file
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _need_to_accept_rules(error_msg: str) -> bool:
|
|
260
|
+
return "You must accept this competition" in error_msg
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _prompt_user_to_accept_rules(competition_id: str) -> None:
    """Interactively walk the user through accepting the competition rules.

    Opens the competition's rules page in the default browser after the user
    confirms, then blocks until they acknowledge having accepted the rules.

    Raises:
        RuntimeError: If the user declines to open the rules page.
    """
    answer = input("Would you like to open the competition page in your browser now? (y/n): ")

    if answer.lower() == "y":
        webbrowser.open(f"https://www.kaggle.com/c/{competition_id}/rules")
        input("Press Enter to continue after you have accepted the rules...")
    else:
        raise RuntimeError("You must accept the competition rules before downloading the dataset.")
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def is_valid_prepare_fn(preparer_fn: Any) -> bool:
    """Return True iff `preparer_fn` accepts exactly (`raw`, `public`, `private`), in that order."""
    try:
        signature = inspect.signature(preparer_fn)
    except (TypeError, ValueError):
        # Not callable, or no introspectable signature (e.g. some builtins).
        return False

    return list(signature.parameters) == ["raw", "public", "private"]
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def generate_checksums(
    target_dir: Path,
    exts: Optional[list[str]] = None,
    exclude: Optional[list[Path]] = None,
) -> dict:
    """
    Generate checksums for the files directly under the target directory with the specified extensions.

    Args:
        target_dir: directory to generate checksums for.
        exts: List of file extensions to generate checksums for.
        exclude: List of file paths to exclude from checksum generation.

    Returns:
        A dictionary of form file: checksum.
    """
    extensions = ["csv", "json", "jsonl", "parquet", "bson"] if exts is None else exts
    excluded = [] if exclude is None else exclude

    result = {}
    for extension in extensions:
        for candidate in target_dir.glob(f"*.{extension}"):
            # Directories can be named like `my/dir.csv/`; only hash real files.
            if candidate.is_file() and candidate not in excluded:
                result[candidate.name] = get_checksum(candidate)

    return result
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def get_last_modified(fpath: Path) -> datetime:
    """Return the file's last-modification time as a local-timezone naive `datetime`."""
    mtime_seconds = fpath.stat().st_mtime
    return datetime.fromtimestamp(mtime_seconds)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def file_cache(fn: Callable) -> Callable:
    """A decorator that caches results of a function with a Path argument, invalidating the cache when the file is modified.

    The decorated function must take exactly one parameter annotated as
    `Path`. Results are memoized per (path string, mtime): modifying the file
    changes its modification time and thus invalidates the cached entry.

    Raises:
        NotImplementedError: If `fn` does not have a single `Path`-annotated
            parameter.
    """
    params = list(inspect.signature(fn).parameters.values())

    if not (len(params) == 1 and params[0].annotation is Path):
        raise NotImplementedError("Only functions with a single `Path` argument are supported.")

    # A per-function cache avoids key collisions between same-named functions
    # and removes the dependency on module-level mutable state.
    cache: dict = {}

    # `functools.wraps` preserves the wrapped function's name and docstring.
    @functools.wraps(fn)
    def wrapper(fpath: Path) -> Any:
        # `st_mtime_ns` gives nanosecond resolution, so rapid successive
        # writes are less likely to be missed than with coarser timestamps.
        key = (str(fpath), fpath.stat().st_mtime_ns)

        if key not in cache:
            cache[key] = fn(fpath)

        return cache[key]

    return wrapper
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
@file_cache
def get_checksum(fpath: Path) -> str:
    """Compute the MD5 checksum (hex digest) of a file.

    Results are cached by `file_cache`, keyed on the file's path and
    modification time.

    Raises:
        AssertionError: If `fpath` is not an existing file.
    """

    assert fpath.is_file(), f"Expected a file at `{fpath}`, but it doesn't exist."

    hash_md5 = hashlib.md5()
    file_size = os.path.getsize(fpath)

    # only show progress bar for large files (> ~5 GB)
    show_progress = file_size > 5_000_000_000

    # 1 MiB chunks keep memory bounded while avoiding the per-chunk overhead
    # of 4 KiB reads (millions of iterations on multi-GB files).
    chunk_size = 1_048_576

    with open(fpath, "rb") as f, tqdm(
        total=file_size,  # total in bytes, matching `unit="B"`
        unit="B",
        unit_scale=True,
        disable=not show_progress,
    ) as pbar:
        while chunk := f.read(chunk_size):
            hash_md5.update(chunk)
            # Advance by bytes read so the bar reflects actual progress
            # (the old code counted chunks against a byte-labelled total).
            pbar.update(len(chunk))

    return hash_md5.hexdigest()
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def ensure_leaderboard_exists(competition: Competition, force: bool = False) -> Path:
    """
    Ensures the leaderboard for a given competition exists in the competition's
    directory, returning the path to it.
    If `force` is True, the leaderboard is downloaded using the Kaggle API.
    If `force` is False and the leaderboard does not exist, an error is raised.

    Raises:
        FileNotFoundError: If `force` is False and no local leaderboard exists.
        RuntimeError: If the Kaggle API returns no leaderboard rows.
    """
    leaderboard_path = competition.leaderboard

    if not force:
        if leaderboard_path.exists():
            return leaderboard_path
        raise FileNotFoundError(
            f"Leaderboard not found locally for competition `{competition.id}`. Please flag this to the developers."
        )

    api = authenticate_kaggle_api()
    leaderboard = api.competition_leaderboard_view(competition=competition.id)

    if not leaderboard:
        raise RuntimeError(f"Failed to download leaderboard for competition `{competition.id}`.")

    leaderboard_df = pd.DataFrame([row.__dict__ for row in leaderboard])
    # Team-name columns are not kept; `errors="ignore"` makes the drop robust
    # if the API response stops including them.
    leaderboard_df.drop(columns=["teamNameNullable", "teamName"], inplace=True, errors="ignore")
    leaderboard_df.to_csv(leaderboard_path, index=False)

    # Log the absolute path directly: the previous `relative_to(Path.cwd())`
    # raised ValueError whenever the data directory was outside the cwd.
    logger.info(
        f"Downloaded leaderboard for competition `{competition.id}` to `{leaderboard_path}`."
    )

    return leaderboard_path
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def get_leaderboard(competition: Competition) -> pd.DataFrame:
    """Load a competition's locally stored leaderboard CSV as a DataFrame.

    Raises:
        FileNotFoundError: If the leaderboard CSV has not been downloaded yet
            (consistent with `ensure_leaderboard_exists`).
    """
    leaderboard_path = competition.leaderboard
    # Raise a real exception rather than `assert`, which is stripped under
    # `python -O`; FileNotFoundError matches `ensure_leaderboard_exists`.
    if not leaderboard_path.exists():
        raise FileNotFoundError(
            f"Leaderboard not found locally for competition `{competition.id}`."
        )
    return pd.read_csv(leaderboard_path)
|
mlebench/grade.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""High-level grading functionality"""
|
|
2
|
+
import json
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
from mlebench.data import get_leaderboard, is_dataset_prepared
|
|
10
|
+
from mlebench.grade_helpers import CompetitionReport
|
|
11
|
+
from mlebench.registry import Competition, Registry
|
|
12
|
+
from mlebench.registry import registry as DEFAULT_REGISTRY
|
|
13
|
+
from mlebench.utils import get_logger, get_timestamp, load_answers, purple, read_csv, read_jsonl
|
|
14
|
+
|
|
15
|
+
logger = get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def grade_jsonl(
    path_to_submissions: Path,
    output_dir: Path,
    registry: Registry = DEFAULT_REGISTRY,
):
    """
    Grades multiple submissions stored in a JSONL file.
    Saves the aggregated report as a JSON file.

    Args:
        path_to_submissions: JSONL file where each line carries a
            `submission_path` and a `competition_id`.
        output_dir: Directory the timestamped grading report is written to
            (created, including parents, if missing).
        registry: Competition registry used to resolve competition ids.
    """

    submissions = read_jsonl(path_to_submissions, skip_commented_out_lines=True)
    competitions_reports = []

    for submission in tqdm(submissions, desc="Grading submissions", unit="submission"):
        submission_path = Path(str(submission["submission_path"]))
        competition_id = submission["competition_id"]
        competition = registry.get_competition(competition_id)
        single_report = grade_csv(submission_path, competition)
        competitions_reports.append(single_report)

    aggregated_report = aggregate_reports(competitions_reports)
    timestamp = get_timestamp()
    save_path = output_dir / f"{timestamp}_grading_report.json"

    # Log the summary without the (potentially very long) per-competition details.
    logger.info(
        json.dumps(
            {k: v for k, v in aggregated_report.items() if k != "competition_reports"}, indent=4
        )
    )

    # `parents=True` so a nested output directory doesn't crash the save.
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(save_path, "w") as f:
        json.dump(aggregated_report, f, indent=2)
    logger.info(purple(f"Saved summary report to {save_path}"))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def grade_csv(path_to_submission: Path, competition: Competition) -> CompetitionReport:
    """Grade a single submission CSV against a competition's answers and build its report."""

    if not is_dataset_prepared(competition, grading_only=True):
        raise ValueError(
            f"Dataset for competition `{competition.id}` is not prepared! "
            f"Please run `mlebench prepare -c {competition.id}` to prepare the dataset."
        )

    submission_exists = path_to_submission.is_file() and path_to_submission.suffix.lower() == ".csv"

    score = None
    if submission_exists:
        submission_df = read_csv(path_to_submission)
        logger.info(purple(f"Load answers from {competition.answers}"))
        answers = load_answers(competition.answers)
        score = competition.grader(submission_df, answers)
    else:
        logger.warning(
            f"Invalid submission file: {path_to_submission}. Please check that the file exists and it is a CSV."
        )

    competition_leaderboard = get_leaderboard(competition)
    rank_info = competition.grader.rank_score(score, competition_leaderboard)
    medal_won = rank_info["gold_medal"] or rank_info["silver_medal"] or rank_info["bronze_medal"]

    return CompetitionReport(
        competition_id=competition.id,
        score=score,
        gold_threshold=rank_info["gold_threshold"],
        silver_threshold=rank_info["silver_threshold"],
        bronze_threshold=rank_info["bronze_threshold"],
        median_threshold=rank_info["median_threshold"],
        any_medal=medal_won,
        gold_medal=rank_info["gold_medal"],
        silver_medal=rank_info["silver_medal"],
        bronze_medal=rank_info["bronze_medal"],
        above_median=rank_info["above_median"],
        submission_exists=submission_exists,
        valid_submission=score is not None,
        is_lower_better=competition.grader.is_lower_better(competition_leaderboard),
        created_at=datetime.now(),
        submission_path=str(path_to_submission),
    )
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def validate_submission(submission: Path, competition: Competition) -> tuple[bool, str]:
    """
    Validates a submission for the given competition by actually running the competition grader.
    This is designed for end users, not developers (we assume that the competition grader is
    correctly implemented and use that for validating the submission, not the other way around).

    Returns:
        A `(is_valid, message)` pair.
    """
    if not submission.is_file():
        return False, f"Submission invalid! Submission file {submission} does not exist."
    if submission.suffix.lower() != ".csv":
        return False, "Submission invalid! Submission file must be a CSV file."

    if not is_dataset_prepared(competition, grading_only=True):
        raise ValueError(
            f"Dataset for competition `{competition.id}` is not prepared! "
            f"Please run `mlebench prepare -c {competition.id}` to prepare the dataset."
        )

    try:
        # Run the real grader: if grading succeeds, the submission is valid.
        competition.grader.grade_fn(read_csv(submission), read_csv(competition.answers))
    except Exception as e:
        failure = (
            f"Submission invalid! The attempt to grade the submission has resulted in the following error message:\n{e}"
        )
        return False, failure

    return True, "Submission is valid."
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def aggregate_reports(competition_reports: list[CompetitionReport]) -> dict:
    """
    Build the summary report from a list of competition reports.

    Boolean per-competition flags are summed into integer totals; the full
    per-competition reports are embedded under `competition_reports`.
    """

    def count(flag: str) -> int:
        # Booleans sum as 0/1, giving the number of reports with the flag set.
        return int(sum(getattr(report, flag) for report in competition_reports))

    gold = count("gold_medal")
    silver = count("silver_medal")
    bronze = count("bronze_medal")

    return {
        "total_runs": int(len(competition_reports)),
        "total_runs_with_submissions": count("submission_exists"),
        "total_valid_submissions": count("valid_submission"),
        "total_medals": gold + silver + bronze,
        "total_gold_medals": gold,
        "total_silver_medals": silver,
        "total_bronze_medals": bronze,
        "total_above_median": count("above_median"),
        "competition_reports": [cr.to_dict() for cr in competition_reports],
    }
|
|
156
|
+
|
|
157
|
+
def simple_accuracy_grader(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Computes simple accuracy between submission and answers.

    Rows are aligned on a recognized ID column when one is shared by both
    frames; otherwise the comparison is positional (assumes matching order).

    Returns:
        The fraction of matching values in [0.0, 1.0]; 0.0 when no comparable
        columns are found or grading fails (failure is logged, never raised).
    """
    try:
        common_cols = set(submission.columns).intersection(set(answers.columns))

        # 1. Try to find an ID column to merge on.
        id_col = None
        for col in common_cols:
            if col.lower() in ['id', 'key', 'index', 'sample_id', 'image_id', 'patient_id']:
                id_col = col
                break

        if id_col:
            merged = submission.merge(answers, on=id_col, suffixes=('_pred', '_true'))
            pred_cols = [c for c in submission.columns if c != id_col]
            true_cols = [c for c in answers.columns if c != id_col]

            # If each side has exactly one prediction column, compare those.
            if len(pred_cols) == 1 and len(true_cols) == 1:
                pred_name, true_name = pred_cols[0], true_cols[0]
                # When both frames use the same column name, merge renames the
                # pair with the `_pred`/`_true` suffixes; indexing the
                # unsuffixed name raised KeyError (silently scoring 0.0) in
                # the common id+label case before this fix.
                if pred_name == true_name:
                    pred_name = f"{pred_name}_pred"
                    true_name = f"{true_name}_true"
                return float((merged[pred_name] == merged[true_name]).mean())

            # Otherwise average accuracy across all shared non-ID columns
            # (these were renamed with suffixes by the merge above).
            shared_content_cols = [c for c in common_cols if c != id_col]
            if shared_content_cols:
                matches = [
                    (merged[f"{c}_pred"] == merged[f"{c}_true"]).mean()
                    for c in shared_content_cols
                ]
                return float(sum(matches) / len(matches))

        # 2. Fallback: position-based comparison (assumes rows are sorted
        # identically in both frames).
        if len(submission) == len(answers):
            # Compare the last column when the frames have the same shape.
            if submission.shape[1] == answers.shape[1]:
                return float((submission.iloc[:, -1] == answers.iloc[:, -1]).mean())

            # Or average accuracy over whatever columns the frames share.
            if common_cols:
                matches = [(submission[c] == answers[c]).mean() for c in common_cols]
                return float(sum(matches) / len(matches))
    except Exception as e:
        # Best-effort grader: never raise out of grading, just log and score 0.
        logger.error(f"simple_accuracy_grader failed: {e}")
        return 0.0

    return 0.0
|