dslighting 1.7.1-py3-none-any.whl → 1.7.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
mlebench/competitions/ethanol-concentration/prepare.py
@@ -0,0 +1,90 @@
+ import pandas as pd
+ import numpy as np
+ import sys
+ from pathlib import Path
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Prepare the ethanol-concentration dataset for the benchmark.
+
+     This function converts the .ts time series files to numpy arrays,
+     so that the public directory only contains data, not data-loading code.
+
+     Args:
+         raw: Path to raw data directory (contains EthanolConcentration_TRAIN.ts and EthanolConcentration_TEST.ts)
+         public: Path to public directory (visible to participants)
+         private: Path to private directory (hidden from participants, used for grading)
+     """
+     # Load test data to extract labels for grading
+     try:
+         # Add raw directory to path temporarily to import dataset module
+         sys.path.insert(0, str(raw))
+
+         # Use the local dataset module to load data
+         from dataset import get_dataset
+
+         # Change to raw directory to load data
+         import os
+         original_dir = os.getcwd()
+         os.chdir(str(raw))
+
+         try:
+             # Load train and test datasets
+             X_train, y_train = get_dataset('TRAIN')
+             X_test, y_test = get_dataset('TEST')
+
+             print(f"Loaded training data: X_train.shape = {X_train.shape}, y_train.shape = {y_train.shape}")
+             print(f"Loaded test data: X_test.shape = {X_test.shape}, y_test.shape = {y_test.shape}")
+             print(f"Number of unique labels: {len(np.unique(y_train))}")
+
+             # Save training data as numpy arrays in public directory
+             np.save(public / "train_data.npy", X_train)
+             np.save(public / "train_labels.npy", y_train.flatten())
+
+             # Save test data (without labels) in public directory
+             np.save(public / "test_data.npy", X_test)
+
+             # Create test labels dataframe for grading (private)
+             test_labels_df = pd.DataFrame(
+                 {"id": range(len(y_test)), "label": y_test.flatten()}
+             )
+             test_labels_df.to_csv(private / "test_labels.csv", index=False)
+
+             # Create sample submission file
+             sample_submission = pd.DataFrame(
+                 {"id": range(len(y_test)), "label": 0}  # Default to class 0
+             )
+             sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+             print("Data preparation completed:")
+             print(f" - Training: {len(X_train)} samples")
+             print(f" - Test: {len(X_test)} samples")
+             print(f" - Sequence length: {X_train.shape[1]}")
+             print(f" - Feature dimension: {X_train.shape[2]}")
+             print(f" - Number of classes: {len(np.unique(y_train))}")
+
+         finally:
+             os.chdir(original_dir)
+             sys.path.remove(str(raw))
+
+     except Exception as e:
+         print(f"Error loading test labels: {e}")
+         import traceback
+         traceback.print_exc()
+         # Fallback: create dummy files if loading fails
+         print("Creating dummy submission files...")
+         dummy_df = pd.DataFrame({"id": [0], "label": [0]})
+         dummy_df.to_csv(private / "test_labels.csv", index=False)
+         dummy_df.to_csv(public / "sample_submission.csv", index=False)
+
+     # Validation checks
+     assert (public / "train_data.npy").exists(), "Training data should exist"
+     assert (public / "train_labels.npy").exists(), "Training labels should exist"
+     assert (public / "test_data.npy").exists(), "Test data should exist"
+     assert (private / "test_labels.csv").exists(), "Test labels should exist"
+     assert (public / "sample_submission.csv").exists(), "Sample submission should exist"
+
+     print("\nPrepared ethanol-concentration dataset:")
+     print(f" - Public files: {list(public.glob('*'))}")
+     print(f" - Private files: {list(private.glob('*'))}")
mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py
@@ -0,0 +1,60 @@
+ import pandas as pd
+ from pandas import DataFrame
+ from scipy.sparse import csr_matrix
+ from sklearn.metrics import f1_score
+ from sklearn.preprocessing import MultiLabelBinarizer
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+ from mlebench.utils import get_logger
+
+ logger = get_logger(__name__)
+
+
+ def grade(submission: DataFrame, answers: DataFrame) -> float:
+     """Grades the submission against the test set."""
+
+     y_true, y_pred = prepare_for_metric(submission, answers)
+     return f1_score(y_true=y_true, y_pred=y_pred, average="micro")
+
+
+ def prepare_for_metric(
+     submission: pd.DataFrame, answers: pd.DataFrame
+ ) -> tuple[csr_matrix, csr_matrix]:
+
+     # Answer checks
+     assert "Id" in answers.columns, "Answers must have an 'Id' column."
+     assert "Tags" in answers.columns, "Answers must have a 'Tags' column."
+
+     # Submission checks
+     if "Id" not in submission.columns:
+         raise InvalidSubmissionError("Submission DataFrame must have an 'Id' column.")
+     if "Tags" not in submission.columns:
+         raise InvalidSubmissionError("Submission DataFrame must have a 'Tags' column.")
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             "Submission DataFrame must have the same number of rows as the answers DataFrame."
+         )
+
+     # Match order
+     submission = submission.sort_values(by="Id").reset_index(drop=True)
+     answers = answers.sort_values(by="Id").reset_index(drop=True)
+     if (submission["Id"].values != answers["Id"].values).any():
+         raise InvalidSubmissionError("Submission and answers must have matching 'Id's.")
+
+     # Get classes
+     classes = set()
+
+     for tags in answers["Tags"]:
+         if not isinstance(tags, str):
+             logger.warning(f"Tags from answers '{tags}' not of type str! Skipping.")
+             continue
+
+         tags_split = tags.split()
+         classes.update(tags_split)
+
+     # Convert to sparse matrices using MultiLabelBinarizer
+     mlb = MultiLabelBinarizer(classes=sorted(classes), sparse_output=True)
+     y_true = mlb.fit_transform(answers["Tags"].fillna("").str.split())
+     y_pred = mlb.transform(submission["Tags"].fillna("").str.split())
+
+     return y_true, y_pred
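
For reference, the micro-averaged F1 computed by this grader can be reproduced on toy data; the DataFrames below are hypothetical and only mirror the "Id"/"Tags" schema used above:

import pandas as pd
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# Hypothetical answers and submission in the grader's expected schema
answers = pd.DataFrame({"Id": ["1", "2"], "Tags": ["python pandas", "java"]})
submission = pd.DataFrame({"Id": ["1", "2"], "Tags": ["python", "java c#"]})

# Classes come from the answers only, as in prepare_for_metric above
classes = sorted({tag for tags in answers["Tags"] for tag in tags.split()})
mlb = MultiLabelBinarizer(classes=classes, sparse_output=True)
y_true = mlb.fit_transform(answers["Tags"].str.split())
y_pred = mlb.transform(submission["Tags"].str.split())  # warns on and drops the unseen "c#"

# Micro-F1 pools true/false positives over all (row, tag) pairs:
# TP=2 ("python", "java"), FP=0, FN=1 ("pandas") -> 2*2 / (2*2 + 0 + 1) = 0.8
print(f1_score(y_true=y_true, y_pred=y_pred, average="micro"))  # 0.8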
mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py
@@ -0,0 +1,41 @@
+ from pathlib import Path
+
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import extract, get_logger, read_csv
+
+ logger = get_logger(__name__)
+
+
+ def prepare(raw: Path, public: Path, private: Path) -> None:
+     extract(raw / "Train.zip", raw)
+
+     # Create new train and test from original train set
+     logger.info(f"Creating new train and test sets from {raw}.")
+
+     old_train = read_csv(raw / "Train.csv", dtype={"Id": str, "Tags": str})
+     new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+     new_test_without_labels = new_test.drop(columns=["Tags"])
+     sample_submission = new_test_without_labels.copy()
+     sample_submission["Tags"] = "javascript c# python php java"
+
+     # Copy over files to private and public directories
+     logger.info(f"Copying files to {private} and {public}.")
+
+     new_train.to_csv(public / "train.csv", index=False)
+     new_test_without_labels.to_csv(public / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+     new_test.to_csv(private / "test.csv", index=False)
+
+     # Sanity checks
+     logger.info("Performing sanity checks.")
+
+     assert len(new_test_without_labels) == len(
+         new_test
+     ), f"Expected {len(new_test)} public test samples, got {len(new_test_without_labels)}."
+     assert len(old_train) == len(new_train) + len(
+         new_test
+     ), f"Mismatch in number of samples in new train and test split! Expected {len(old_train)} samples, got {len(new_train) + len(new_test)}."
+     assert len(sample_submission) == len(
+         new_test
+     ), f"Expected {len(new_test)} public test samples, got {len(sample_submission)}."
mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py
@@ -0,0 +1,92 @@
+ from pathlib import Path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import extract, get_logger, read_csv
+
+ logger = get_logger(__name__)
+
+
+ def _create_dataset_split(
+     source_df: pd.DataFrame, public_dir: Path, private_dir: Path
+ ) -> None:
+     """
+     Splits a source dataframe and saves the resulting files to public and private dirs.
+
+     This helper function encapsulates the logic for:
+     1. Splitting data into train/test sets.
+     2. Creating public (unlabeled) and private (labeled) test sets.
+     3. Saving all files (train.csv, test.csv, sample_submission.csv) to the
+        specified directories with a consistent naming scheme.
+     """
+     # Create directories if they don't exist
+     public_dir.mkdir(parents=True, exist_ok=True)
+     private_dir.mkdir(parents=True, exist_ok=True)
+
+     # Split the source dataframe
+     train_split, test_split = train_test_split(
+         source_df, test_size=0.1, random_state=0
+     )
+     test_split_without_labels = test_split.drop(columns=["Tags"])
+     sample_submission = test_split_without_labels.copy()
+     sample_submission["Tags"] = "javascript c# python php java"
+
+     # Copy over files to private and public directories
+     logger.info(f"Copying files to {private_dir} and {public_dir}.")
+
+     train_split.to_csv(public_dir / "train.csv", index=False)
+     test_split_without_labels.to_csv(public_dir / "test.csv", index=False)
+     sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+     test_split.to_csv(private_dir / "test.csv", index=False)
+
+     # Sanity checks
+     logger.info(f"Performing sanity checks for {public_dir.name}.")
+
+     assert len(test_split_without_labels) == len(
+         test_split
+     ), f"Expected {len(test_split)} public test samples, got {len(test_split_without_labels)}."
+     assert len(source_df) == len(train_split) + len(
+         test_split
+     ), f"Mismatch in number of samples! Expected {len(source_df)} samples, got {len(train_split) + len(test_split)}."
+     assert len(sample_submission) == len(
+         test_split
+     ), f"Expected {len(test_split)} public test samples, got {len(sample_submission)}."
+
+
+ def prepare(raw: Path, public: Path, private: Path) -> None:
+     extract(raw / "Train.zip", raw)
+
+     # Read the full original training dataset
+     old_train = read_csv(raw / "Train.csv", dtype={"Id": str, "Tags": str})
+
+     # --- Original Data Split (public/private) ---
+     # This block creates the main train/test split. Its outputs in the `public`
+     # and `private` directories will remain identical to the original script.
+     logger.info(
+         "Creating original train/test split for public and private directories."
+     )
+     _create_dataset_split(source_df=old_train, public_dir=public, private_dir=private)
+
+     # --- New Validation Data Split (public_val/private_val) ---
+     # This block creates a new, independent validation split. It takes the
+     # training set from the *first* split and splits it again, creating a
+     # smaller training set and a validation set.
+     logger.info(
+         "Creating new train/validation split for public_val and private_val directories."
+     )
+     # Define the new output directories, parallel to the original ones.
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # We must use the training set from the first split as the source for the second.
+     train_from_first_split, _ = train_test_split(
+         old_train, test_size=0.1, random_state=0
+     )
+
+     # Create the second split and save to the new _val directories.
+     _create_dataset_split(
+         source_df=train_from_first_split,
+         public_dir=public_val,
+         private_dir=private_val,
+     )
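
The second split above only recovers the first split's training set because train_test_split is deterministic for a fixed random_state; a minimal sketch of that property, using a hypothetical stand-in for Train.csv:

import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical stand-in for the real Train.csv
df = pd.DataFrame({"Id": [str(i) for i in range(100)], "Tags": "python"})

# Same input, test_size, and random_state -> identical split on every call
train_a, test_a = train_test_split(df, test_size=0.1, random_state=0)
train_b, test_b = train_test_split(df, test_size=0.1, random_state=0)

assert train_a.index.equals(train_b.index)
assert test_a.index.equals(test_b.index)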
mlebench/competitions/feedback-prize-english-language-learning/grade.py
@@ -0,0 +1,60 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics import mean_squared_error
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ TARGET_COLUMNS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+     """Prepare submission and answers for MCRMSE calculation."""
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             f"Submission length ({len(submission)}) != answers length ({len(answers)})"
+         )
+
+     if "text_id" not in submission.columns:
+         raise InvalidSubmissionError("Submission must have a 'text_id' column")
+
+     for col in TARGET_COLUMNS:
+         if col not in submission.columns:
+             raise InvalidSubmissionError(f"Submission must have a '{col}' column")
+
+     # Sort by text_id
+     submission = submission.sort_values("text_id").reset_index(drop=True)
+     answers = answers.sort_values("text_id").reset_index(drop=True)
+
+     # Check text_id alignment
+     if not (submission["text_id"] == answers["text_id"]).all():
+         raise InvalidSubmissionError("Submission and answers text_id mismatch")
+
+     # Validate predictions
+     for col in TARGET_COLUMNS:
+         try:
+             submission[col] = submission[col].astype(float)
+         except (ValueError, TypeError):
+             raise InvalidSubmissionError(f"'{col}' predictions must be numeric")
+
+         if submission[col].isnull().any():
+             raise InvalidSubmissionError(f"'{col}' predictions cannot contain NaN values")
+
+     return submission, answers
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     """
+     Calculate MCRMSE (Mean Columnwise Root Mean Squared Error).
+
+     MCRMSE = mean of RMSE across all target columns
+     """
+     submission, answers = prepare_for_metric(submission, answers)
+
+     rmse_scores = []
+     for col in TARGET_COLUMNS:
+         rmse = np.sqrt(mean_squared_error(answers[col], submission[col]))
+         rmse_scores.append(rmse)
+
+     mcrmse = np.mean(rmse_scores)
+     return mcrmse
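
As a worked example of the metric, here is MCRMSE on hypothetical scores for two essays, restricted to two of the six target columns for brevity:

import numpy as np
from sklearn.metrics import mean_squared_error

# Hypothetical answers and predictions for two essays, two targets
answers = {"cohesion": [3.0, 4.0], "syntax": [2.5, 3.5]}
preds = {"cohesion": [3.0, 3.0], "syntax": [3.5, 3.5]}

# Per-column RMSE: cohesion = sqrt((0^2 + 1^2) / 2), syntax = sqrt((1^2 + 0^2) / 2)
rmse_scores = [np.sqrt(mean_squared_error(answers[col], preds[col])) for col in answers]
print(np.mean(rmse_scores))  # ~0.7071, the mean of the per-column RMSEs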
mlebench/competitions/feedback-prize-english-language-learning/prepare.py
@@ -0,0 +1,39 @@
+ from pathlib import Path
+ import pandas as pd
+
+
+ TARGET_COLUMNS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Prepare feedback-prize-english-language-learning dataset.
+
+     Raw data contains:
+     - train.csv: training data with text and scores
+     - test.csv: test data with text only
+     - sample_submission.csv: sample submission format
+     - test_answer.csv: test labels (private)
+     """
+     # Read data
+     train = pd.read_csv(raw / "train.csv")
+     test = pd.read_csv(raw / "test.csv")
+     sample_submission = pd.read_csv(raw / "sample_submission.csv")
+     test_answer = pd.read_csv(raw / "test_answer.csv")
+
+     # Public files (visible to agents)
+     train.to_csv(public / "train.csv", index=False)
+     test.to_csv(public / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     # Private files (for grading)
+     test_answer.to_csv(private / "test.csv", index=False)
+
+     # Validation checks
+     assert len(test_answer) == len(sample_submission), \
+         f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
+     assert "text_id" in test_answer.columns, "Test answer must have 'text_id' column"
+     assert "text_id" in sample_submission.columns, "Sample submission must have 'text_id' column"
+     for col in TARGET_COLUMNS:
+         assert col in test_answer.columns, f"Test answer must have '{col}' column"
+         assert col in sample_submission.columns, f"Sample submission must have '{col}' column"
mlebench/competitions/freesound-audio-tagging-2019/grade.py
@@ -0,0 +1,64 @@
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ import sklearn.metrics
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+ from .vocabulary import CLASS_NAMES
+
+
+ # Using the metric implementation given to participants. Source:
+ # https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8
+ def calculate_overall_lwlrap_sklearn(truth: np.ndarray, scores: np.ndarray) -> float:
+     """Calculate the overall lwlrap using sklearn.metrics.lrap."""
+     # sklearn doesn't correctly apply weighting to samples with no labels, so just skip them.
+     sample_weight = np.sum(truth > 0, axis=1)
+     nonzero_weight_sample_indices = np.flatnonzero(sample_weight > 0)
+     overall_lwlrap = sklearn.metrics.label_ranking_average_precision_score(
+         truth[nonzero_weight_sample_indices, :] > 0,
+         scores[nonzero_weight_sample_indices, :],
+         sample_weight=sample_weight[nonzero_weight_sample_indices],
+     )
+     return overall_lwlrap
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> tuple:
+     """
+     `submission` and `answers` are both DataFrames with N rows and M + 1 columns.
+     The first column is "fname" and the remaining M columns are the predictions for each class.
+     The values are floats between 0 and 1. For `answers`, the values are binary.
+     Returns a `(truth, scores)` pair of numpy arrays.
+     """
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError("Submission and answers must have the same length")
+
+     if submission.columns[0] != "fname":
+         raise InvalidSubmissionError("Submission must have an 'fname' column")
+
+     if not all(submission.columns[1:] == CLASS_NAMES):
+         raise InvalidSubmissionError("Submission classes do not match expected classes")
+
+     assert answers.columns[0] == "fname", "Answers must have an 'fname' column"
+     assert all(answers.columns[1:] == CLASS_NAMES), "Answers classes do not match expected classes"
+
+     submission = submission.sort_values("fname").reset_index(drop=True)
+     answers = answers.sort_values("fname").reset_index(drop=True)
+
+     if (submission["fname"].values != answers["fname"].values).any():
+         raise InvalidSubmissionError("Submission and answers must have the same ids")
+
+     # Convert to numbers
+     try:
+         submission.iloc[:, 1:] = submission.iloc[:, 1:].apply(pd.to_numeric)
+     except (ValueError, TypeError) as e:
+         raise InvalidSubmissionError(f"Error converting submission values to float: {str(e)}")
+     answers.iloc[:, 1:] = answers.iloc[:, 1:].apply(pd.to_numeric)
+
+     # Return as numpy arrays: (truth, scores)
+     return answers.iloc[:, 1:].values, submission.iloc[:, 1:].values
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     truth, scores = prepare_for_metric(submission, answers)
+     return calculate_overall_lwlrap_sklearn(truth, scores)
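
To illustrate the lwlrap computation (and the zero-label skip) on hypothetical arrays:

import numpy as np
import sklearn.metrics

# Hypothetical truth/scores: 3 samples, 3 classes; the last sample has no labels
truth = np.array([[1, 0, 0],
                  [0, 1, 1],
                  [0, 0, 0]])
scores = np.array([[0.9, 0.1, 0.2],
                   [0.3, 0.8, 0.6],
                   [0.5, 0.5, 0.5]])

# Weight each sample by its label count and drop label-free samples,
# exactly as calculate_overall_lwlrap_sklearn does above
sample_weight = np.sum(truth > 0, axis=1)
nonzero = np.flatnonzero(sample_weight > 0)
lwlrap = sklearn.metrics.label_ranking_average_precision_score(
    truth[nonzero, :] > 0, scores[nonzero, :], sample_weight=sample_weight[nonzero]
)
print(lwlrap)  # 1.0: every true label outranks every false label here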
mlebench/competitions/freesound-audio-tagging-2019/prepare.py
@@ -0,0 +1,94 @@
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+ import requests
+
+ from mlebench.utils import get_logger
+
+ from .vocabulary import CLASS_NAMES
+
+ logger = get_logger(__name__)
+
+
+ def download_file(url, local_filename):
+     with requests.get(url, stream=True) as r:
+         r.raise_for_status()
+         with open(local_filename, "wb") as f:
+             for chunk in r.iter_content(chunk_size=8192):
+                 if chunk:  # filter out keep-alive new chunks
+                     f.write(chunk)
+     return local_filename
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Straightforward: we have access to the post-competition released test labels, so we don't need
+     to create our own split here. We just copy over the raw data provided by the competition and
+     download the test labels.
+
+     Otherwise, the only work here is to convert the test data into the right format for grading:
+     the original form of `test.csv` is a DataFrame with N rows and 2 columns. The first column is
+     "fname" and the second column is the labels as comma-separated strings (class names). We
+     convert the test labels into a binary many-hot matrix matching the shape of the submission,
+     [N rows, M + 1 columns]: the first column is "fname" and the remaining M columns are the
+     predictions for each class.
+     """
+
+     # Copy over everything in the raw directory
+     logger.info("Copying raw data to public directory")
+     # Don't copy the metadata file if it exists
+     items_to_copy = [item for item in raw.iterdir() if "FSDKaggle2019.meta" not in item.name]
+     for item in items_to_copy:
+         dest = public / item.name
+         if item.is_dir():
+             shutil.copytree(item, dest)
+         else:
+             shutil.copy(item, dest)
+     assert len(list(public.iterdir())) == len(
+         items_to_copy
+     ), "Expected all files in raw to be copied to public"
+
+     # Download the test labels and metadata that were released after the competition
+     test_url = "https://zenodo.org/records/3612637/files/FSDKaggle2019.meta.zip?download=1"
+     dest_path = raw / "FSDKaggle2019.meta.zip"
+     download_file(test_url, dest_path)
+     logger.info(f"Downloaded file saved as {dest_path}")
+     # Unzip
+     shutil.unpack_archive(dest_path, raw)
+     unzipped_path = raw / "FSDKaggle2019.meta"
+     logger.info(f"Unzipped file to {unzipped_path}")
+
+     # Read test labels
+     test_post_competition = pd.read_csv(unzipped_path / "test_post_competition.csv")
+     private_test = test_post_competition[test_post_competition["usage"] == "Private"]
+     # Create a binary many-hot matrix
+     new_test_rows = []
+     for _, row in private_test.iterrows():
+         fname = row["fname"]
+         label_names = set(row["labels"].split(","))
+         labels = [1 if label in label_names else 0 for label in CLASS_NAMES]
+         new_test_rows.append([fname] + labels)
+     new_test = pd.DataFrame(new_test_rows, columns=["fname"] + CLASS_NAMES)
+     new_test.to_csv(private / "test.csv", index=False)
+
+     # Check that test and submission match
+     submission = pd.read_csv(public / "sample_submission.csv")
+     assert len(submission) == len(
+         new_test
+     ), f"Expected {len(new_test)} rows in test.csv, but got {len(submission)}"
+     assert (
+         submission.columns[1:].tolist() == CLASS_NAMES
+     ), "Expected class names to match between test.csv and sample_submission.csv"
+     assert all(
+         submission.columns == new_test.columns
+     ), "Expected columns to match between test.csv and sample_submission.csv"
+     new_test.sort_values("fname", inplace=True)
+     submission.sort_values("fname", inplace=True)
+     assert (
+         submission["fname"].tolist() == new_test["fname"].tolist()
+     ), "Expected 'fname' to match between test.csv and sample_submission.csv"
+
+     # Remove the downloaded metadata
+     dest_path.unlink()
+     shutil.rmtree(unzipped_path)