PyPI - dslighting - Versions diffs - 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl - Mend

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (352) hide show

mlebench/competitions/text-normalization-challenge-russian-language/prepare.py ADDED Viewed

@@ -0,0 +1,113 @@
+import csv
+import zipfile
+from pathlib import Path
+from sklearn.model_selection import train_test_split
+from mlebench.utils import compress, extract, read_csv
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Build the train/test split for the Russian text normalization competition.
+    The raw training set is split by `sentence_id` (10% of the sentences go to the
+    test side, seed 0), sentence ids are renumbered from 0 on each side, and the
+    results are written out as:
+      - `private`: `answers.csv` and `sample_submission.csv`
+      - `public`: `ru_train.csv.zip`, `ru_test_2.csv.zip` and
+        `ru_sample_submission_2.csv.zip`
+    Args:
+        raw: Directory containing the downloaded competition archives.
+        public: Directory that receives the zipped train, test and sample submission files.
+        private: Directory that receives the answers and a sample submission.
+    """
+    # Extract
+    extract(raw / "ru_test_2.csv.zip", raw)  # We only use the 2nd stage test set
+    extract(raw / "ru_train.csv.zip", raw)
+    extract(raw / "ru_sample_submission_2.csv.zip", raw)
+    # Create train and test splits from train set
+    old_train = read_csv(raw / "ru_train.csv")
+    # We split so that we don't share any sentence_ids between train and test
+    # This gives us len(new_train) = 9515325 and len(answers) = 1059191
+    unique_sentence_ids = old_train["sentence_id"].unique()
+    train_sentence_ids, test_sentence_ids = train_test_split(
+        unique_sentence_ids, test_size=0.1, random_state=0
+    )
+    # Take explicit copies: both frames get a column reassigned below, and assigning into a
+    # bare selection of `old_train` is what triggers pandas' SettingWithCopyWarning.
+    new_train = old_train[old_train["sentence_id"].isin(train_sentence_ids)].copy()
+    answers = old_train[old_train["sentence_id"].isin(test_sentence_ids)].copy()
+    assert set(new_train["sentence_id"]).isdisjoint(
+        set(answers["sentence_id"])
+    ), "sentence_id is not disjoint between train and test sets"
+    # "sentence_id" counts need to be reset for new_train and answers
+    new_train_id_mapping = {
+        old_id: new_id for new_id, old_id in enumerate(new_train["sentence_id"].unique())
+    }
+    new_train["sentence_id"] = new_train["sentence_id"].map(new_train_id_mapping)
+    answers_id_mapping = {
+        old_id: new_id for new_id, old_id in enumerate(answers["sentence_id"].unique())
+    }
+    answers["sentence_id"] = answers["sentence_id"].map(answers_id_mapping)
+    # Create new test set
+    new_test = answers.drop(["after", "class"], axis=1).copy()
+    # Reformat answers to match sample submission format
+    answers = answers[["sentence_id", "token_id", "after"]].copy()
+    answers["id"] = answers["sentence_id"].astype(str) + "_" + answers["token_id"].astype(str)
+    answers = answers[["id", "after"]]
+    # Create sample submission (the untouched `before` token is used as the prediction)
+    sample_submission = new_test[["sentence_id", "token_id", "before"]].copy()
+    sample_submission["id"] = (
+        sample_submission["sentence_id"].astype(str)
+        + "_"
+        + sample_submission["token_id"].astype(str)
+    )
+    sample_submission["after"] = sample_submission["before"]
+    sample_submission = sample_submission[["id", "after"]]
+    # Checks
+    assert new_train.columns.tolist() == [
+        "sentence_id",
+        "token_id",
+        "class",
+        "before",
+        "after",
+    ], f"new_train.columns.tolist() == {new_train.columns.tolist()}"
+    assert new_test.columns.tolist() == [
+        "sentence_id",
+        "token_id",
+        "before",
+    ], f"new_test.columns.tolist() == {new_test.columns.tolist()}"
+    assert sample_submission.columns.tolist() == [
+        "id",
+        "after",
+    ], f"sample_submission.columns.tolist() == {sample_submission.columns.tolist()}"
+    assert answers.columns.tolist() == [
+        "id",
+        "after",
+    ], f"answers.columns.tolist() == {answers.columns.tolist()}"
+    assert len(new_test) + len(new_train) == len(
+        old_train
+    ), f"New train and test sets do not sum to old train set, got {len(new_test) + len(new_train)} and {len(old_train)}"
+    # Write CSVs (every file uses the same quoting settings)
+    csv_kwargs = dict(index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    answers.to_csv(private / "answers.csv", **csv_kwargs)
+    sample_submission.to_csv(private / "sample_submission.csv", **csv_kwargs)
+    new_train.to_csv(public / "ru_train.csv", **csv_kwargs)
+    new_test.to_csv(public / "ru_test_2.csv", **csv_kwargs)
+    sample_submission.to_csv(public / "ru_sample_submission_2.csv", **csv_kwargs)
+    # Zip up the public CSVs, then remove the unzipped copies
+    public_csv_names = ("ru_train.csv", "ru_test_2.csv", "ru_sample_submission_2.csv")
+    for csv_name in public_csv_names:
+        with zipfile.ZipFile(public / f"{csv_name}.zip", "w") as zipf:
+            zipf.write(public / csv_name, arcname=csv_name)
+    for csv_name in public_csv_names:
+        (public / csv_name).unlink()

mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py ADDED Viewed

@@ -0,0 +1,165 @@
+import csv
+import zipfile
+from pathlib import Path
+from sklearn.model_selection import train_test_split
+from mlebench.utils import compress, extract, read_csv
+def _process_split(input_df, public_path, private_path, test_size, random_state):
+    """
+    Splits an input DataFrame into train and test sets, processes them,
+    and writes the final files to the specified public and private paths.
+    This function encapsulates the entire data preparation logic for one split,
+    ensuring it can be reused for creating both the main test set and a
+    subsequent validation set.
+    Files written:
+      - `private_path`: `answers.csv` and `sample_submission.csv`
+      - `public_path`: `ru_train.csv.zip`, `ru_test_2.csv.zip` and
+        `ru_sample_submission_2.csv.zip`
+    Args:
+        input_df: The DataFrame to be split. It is not modified.
+        public_path: The directory to save public artifacts (e.g., train data, test features).
+        private_path: The directory to save private artifacts (e.g., test labels).
+        test_size: The proportion of the sentence ids to allocate to the test split.
+        random_state: The seed used by the random number generator.
+    Returns:
+        The newly created training DataFrame (with `sentence_id` renumbered from 0),
+        which can be used for a subsequent split.
+    """
+    # Create train and test splits from the provided dataframe
+    # We split so that we don't share any sentence_ids between train and test
+    unique_sentence_ids = input_df["sentence_id"].unique()
+    train_sentence_ids, test_sentence_ids = train_test_split(
+        unique_sentence_ids, test_size=test_size, random_state=random_state
+    )
+    # Take explicit copies: both frames get a column reassigned below, and assigning into a
+    # bare selection of `input_df` is what triggers pandas' SettingWithCopyWarning.
+    new_train = input_df[input_df["sentence_id"].isin(train_sentence_ids)].copy()
+    answers = input_df[input_df["sentence_id"].isin(test_sentence_ids)].copy()
+    assert set(new_train["sentence_id"]).isdisjoint(
+        set(answers["sentence_id"])
+    ), "sentence_id is not disjoint between train and test sets"
+    # "sentence_id" counts need to be reset for new_train and answers
+    new_train_id_mapping = {
+        old_id: new_id for new_id, old_id in enumerate(new_train["sentence_id"].unique())
+    }
+    new_train["sentence_id"] = new_train["sentence_id"].map(new_train_id_mapping)
+    answers_id_mapping = {
+        old_id: new_id for new_id, old_id in enumerate(answers["sentence_id"].unique())
+    }
+    answers["sentence_id"] = answers["sentence_id"].map(answers_id_mapping)
+    # Create new test set
+    new_test = answers.drop(["after", "class"], axis=1).copy()
+    # Reformat answers to match sample submission format
+    answers = answers[["sentence_id", "token_id", "after"]].copy()
+    answers["id"] = answers["sentence_id"].astype(str) + "_" + answers["token_id"].astype(str)
+    answers = answers[["id", "after"]]
+    # Create sample submission (the untouched `before` token is used as the prediction)
+    sample_submission = new_test[["sentence_id", "token_id", "before"]].copy()
+    sample_submission["id"] = (
+        sample_submission["sentence_id"].astype(str)
+        + "_"
+        + sample_submission["token_id"].astype(str)
+    )
+    sample_submission["after"] = sample_submission["before"]
+    sample_submission = sample_submission[["id", "after"]]
+    # Checks
+    assert new_train.columns.tolist() == [
+        "sentence_id",
+        "token_id",
+        "class",
+        "before",
+        "after",
+    ], f"new_train.columns.tolist() == {new_train.columns.tolist()}"
+    assert new_test.columns.tolist() == [
+        "sentence_id",
+        "token_id",
+        "before",
+    ], f"new_test.columns.tolist() == {new_test.columns.tolist()}"
+    assert sample_submission.columns.tolist() == [
+        "id",
+        "after",
+    ], f"sample_submission.columns.tolist() == {sample_submission.columns.tolist()}"
+    assert answers.columns.tolist() == [
+        "id",
+        "after",
+    ], f"answers.columns.tolist() == {answers.columns.tolist()}"
+    assert len(new_test) + len(new_train) == len(
+        input_df
+    ), f"New train and test sets do not sum to old train set, got {len(new_test) + len(new_train)} and {len(input_df)}"
+    # Write CSVs (every file uses the same quoting settings)
+    csv_kwargs = dict(index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    answers.to_csv(private_path / "answers.csv", **csv_kwargs)
+    sample_submission.to_csv(private_path / "sample_submission.csv", **csv_kwargs)
+    new_train.to_csv(public_path / "ru_train.csv", **csv_kwargs)
+    new_test.to_csv(public_path / "ru_test_2.csv", **csv_kwargs)
+    sample_submission.to_csv(public_path / "ru_sample_submission_2.csv", **csv_kwargs)
+    # Zip up the public CSVs, then remove the unzipped copies
+    public_csv_names = ("ru_train.csv", "ru_test_2.csv", "ru_sample_submission_2.csv")
+    for csv_name in public_csv_names:
+        with zipfile.ZipFile(public_path / f"{csv_name}.zip", "w") as zipf:
+            zipf.write(public_path / csv_name, arcname=csv_name)
+    for csv_name in public_csv_names:
+        (public_path / csv_name).unlink()
+    return new_train
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Produce the original train/test split plus a nested train/validation split.
+    The first split is written to `public` / `private`. The training part it returns is
+    split again with the same settings into sibling `public_val` / `private_val` directories.
+    """
+    # The validation outputs live next to the original output directories
+    val_public = public.parent / "public_val"
+    val_private = private.parent / "private_val"
+    # Make sure every destination exists before anything is written
+    for directory in [public, private, val_public, val_private]:
+        directory.mkdir(parents=True, exist_ok=True)
+    # Unpack the raw archives (we only use the 2nd stage test set)
+    for archive_name in (
+        "ru_test_2.csv.zip",
+        "ru_train.csv.zip",
+        "ru_sample_submission_2.csv.zip",
+    ):
+        extract(raw / archive_name, raw)
+    remaining = read_csv(raw / "ru_train.csv")
+    # Pass 1 writes the original public/private outputs from the raw training data; these
+    # must not be changed. Pass 2 re-splits the training part returned by pass 1 into the
+    # *_val directories. Both passes use the same proportion and the same random seed.
+    destinations = [(public, private), (val_public, val_private)]
+    for public_dir, private_dir in destinations:
+        remaining = _process_split(
+            input_df=remaining,
+            public_path=public_dir,
+            private_path=private_dir,
+            test_size=0.1,
+            random_state=0,
+        )

mlebench/competitions/tgs-salt-identification-challenge/grade.py ADDED Viewed

@@ -0,0 +1,144 @@
+import numpy as np
+from numpy import ndarray
+from pandas import DataFrame
+from mlebench.competitions.utils import rle_decode
+from mlebench.grade_helpers import InvalidSubmissionError
+IMG_HEIGHT = 101
+IMG_WIDTH = 101
+def grade(submission: DataFrame, answers: DataFrame) -> float:
+    """
+    Score a submission as the mean per-image `iou_metric` over RLE-decoded masks.
+    Rows of `submission` and `answers` are paired by sorting both on `id`; missing
+    values are treated as empty strings before decoding.
+    Args:
+        submission: Frame with columns `id` and `rle_mask`.
+        answers: Frame with columns `id` and `rle_mask`.
+    Returns:
+        The mean of the per-image scores.
+    Raises:
+        InvalidSubmissionError: If the submission has the wrong columns, a different number
+            of rows than `answers`, non-string cells, different ids, or a mask for which
+            `rle_decode` raises an AssertionError.
+    """
+    expected_columns = ["id", "rle_mask"]
+    assert set(answers.columns) == set(
+        expected_columns
+    ), f"Expected answers to have columns {expected_columns}, but got {answers.columns} instead!"
+    if set(submission.columns) != set(expected_columns):
+        raise InvalidSubmissionError(
+            f"Expected submission to have columns {expected_columns}, but got {submission.columns} instead!"
+        )
+    if len(submission) != len(answers):
+        raise InvalidSubmissionError(
+            f"Expected submission to have {len(answers)} rows, but got {len(submission)} instead!"
+        )
+    transformed_submission = submission.fillna("")
+    transformed_answers = answers.fillna("")
+    assert _all_strings(
+        transformed_answers
+    ), "Expected all items in `answers` to be strings, but found non-string items!"
+    if not _all_strings(transformed_submission):
+        raise InvalidSubmissionError(
+            "Expected all items in `submission` to be strings, but found non-string items!"
+        )
+    sorted_submission = transformed_submission.sort_values(by="id")
+    sorted_answers = transformed_answers.sort_values(by="id")
+    if (sorted_submission["id"].values != sorted_answers["id"].values).any():
+        raise InvalidSubmissionError(
+            "Expected submission to have the same IDs as answers, but got a different set of IDs!"
+        )
+    y_preds_rle = sorted_submission["rle_mask"].values
+    y_trues_rle = sorted_answers["rle_mask"].values
+    try:
+        y_preds = np.array(
+            [rle_decode(rle, height=IMG_HEIGHT, width=IMG_WIDTH) for rle in y_preds_rle]
+        )
+    except AssertionError as e:
+        # Keep the decoder's failure attached as the cause for easier debugging
+        raise InvalidSubmissionError(f"Error decoding RLE masks: {e}") from e
+    y_trues = np.array([rle_decode(rle, height=IMG_HEIGHT, width=IMG_WIDTH) for rle in y_trues_rle])
+    scores = [iou_metric(y_true, y_pred) for y_true, y_pred in zip(y_trues, y_preds)]
+    return float(np.mean(scores))
+def _all_strings(frame: DataFrame) -> bool:
+    """
+    Return True when every cell of `frame` is a `str`.
+    Used instead of `DataFrame.applymap`, which is deprecated in recent pandas releases.
+    """
+    return all(isinstance(value, str) for value in frame.to_numpy().ravel())
+def iou_metric(y_true_in: ndarray, y_pred_in: ndarray) -> float:
+    """
+    Score one predicted mask against its ground truth: the fraction of the IoU thresholds
+    0.5, 0.55, ..., 0.95 that the intersection-over-union of the two masks exceeds.
+    Adapted from https://www.kaggle.com/code/phoenigs/u-net-dropout-augmentation-stratification.
+    Each mask is treated as a single binary object; any value greater than zero counts
+    as foreground.
+    Args:
+        y_true_in: Ground-truth mask.
+        y_pred_in: Predicted mask, same shape as `y_true_in`.
+    Returns:
+        1.0 when both masks are empty, 0.0 when only the ground truth is empty, otherwise
+        the fraction of thresholds for which the IoU is strictly above the threshold.
+    """
+    truth = np.asarray(y_true_in) > 0
+    pred = np.asarray(y_pred_in) > 0
+    if not truth.any():
+        return 0.0 if pred.any() else 1.0
+    # The adapted kernel obtained these counts from `np.histogram2d(..., bins=2)`. The bin
+    # edges were then derived from the data, so an all-zero prediction had its zeros counted
+    # in the upper (foreground) bin and could be scored as a match against a large ground
+    # truth mask. Counting pixels directly has no such degenerate case.
+    intersection = np.logical_and(truth, pred).sum()
+    union = np.logical_or(truth, pred).sum()  # > 0 because `truth` is non-empty here
+    iou = intersection / union
+    # With one true and one predicted object, the precision TP / (TP + FP + FN) at a threshold
+    # is 1 when the IoU exceeds it (TP=1, FP=FN=0) and 0 otherwise (TP=0, FP=FN=1).
+    thresholds = np.arange(0.5, 1.0, 0.05)
+    return float(np.mean(iou > thresholds))
+def iou_metric_batch(y_true_in, y_pred_in):
+    """
+    Average `iou_metric` over the first axis of a batch of masks.
+    Adapted from https://www.kaggle.com/code/phoenigs/u-net-dropout-augmentation-stratification.
+    """
+    # One score per sample; the number of samples is taken from the ground-truth batch
+    per_sample_scores = [
+        iou_metric(y_true_in[idx], y_pred_in[idx]) for idx in range(y_true_in.shape[0])
+    ]
+    return np.mean(per_sample_scores)

mlebench/competitions/tgs-salt-identification-challenge/prepare.py ADDED Viewed

@@ -0,0 +1,158 @@
+import shutil
+from pathlib import Path
+from sklearn.model_selection import train_test_split
+from mlebench.utils import extract, read_csv
+def prepare(raw: Path, public: Path, private: Path) -> None:
+    """
+    Build a new train/test split for the TGS salt identification competition from the
+    original training data, then verify the written files.
+    Outputs:
+      - `public`: `train.csv`, `train/images`, `train/masks`, `test/images`,
+        `sample_submission.csv` and `depths.csv`
+      - `private`: `test.csv` (ids and masks of the new test set)
+    Args:
+        raw: Directory containing `competition_data.zip` and `depths.csv`.
+        public: Output directory for the files listed above.
+        private: Output directory for the held-out test answers.
+    """
+    extract(raw / "competition_data.zip", raw)
+    old_train = read_csv(raw / "competition_data" / "train.csv")
+    old_train = old_train.fillna("")
+    # Original ratio is Train set - 4,000 samples; Test set - ~18,000 samples (82% ratio)
+    # We use a 0.25 ratio to get number of test samples into thousand OOM
+    new_train, new_test = train_test_split(old_train, test_size=0.25, random_state=0)
+    assert len(new_train) + len(new_test) == len(
+        old_train
+    ), "Some samples were lost when creating the new train and test sets!"
+    new_train_ids = set(new_train["id"])
+    new_test_ids = set(new_test["id"])
+    assert new_train_ids.isdisjoint(
+        new_test_ids
+    ), "`id` is not disjoint between train and test sets"
+    new_train.sort_values(by="id").to_csv(public / "train.csv", index=False)
+    new_test.sort_values(by="id").to_csv(private / "test.csv", index=False)
+    old_train_imgs = set((raw / "competition_data" / "train" / "images").glob("*.png"))
+    assert len(old_train_imgs) == len(
+        old_train
+    ), "The number of images in the old train set doesn't match the number of training images!"
+    new_train_imgs = set(img for img in old_train_imgs if img.stem in new_train_ids)
+    new_test_imgs = set(img for img in old_train_imgs if img.stem in new_test_ids)
+    assert new_train_imgs.isdisjoint(
+        new_test_imgs
+    ), "Images are not disjoint between train and test sets"
+    assert len(new_train_imgs) + len(new_test_imgs) == len(
+        old_train_imgs
+    ), "Some images were lost when creating the new train and test sets!"
+    (public / "train" / "images").mkdir(parents=True, exist_ok=True)
+    (public / "train" / "masks").mkdir(parents=True, exist_ok=True)
+    # Training samples get both the image and its mask
+    for fpath in new_train_imgs:
+        shutil.copyfile(
+            src=fpath,
+            dst=public / "train" / "images" / fpath.name,
+        )
+        shutil.copyfile(
+            src=raw / "competition_data" / "train" / "masks" / fpath.name,
+            dst=public / "train" / "masks" / fpath.name,
+        )
+    # Test samples get the image only; their masks stay out of `public`
+    (public / "test" / "images").mkdir(parents=True, exist_ok=True)
+    for fpath in new_test_imgs:
+        shutil.copyfile(
+            src=fpath,
+            dst=public / "test" / "images" / fpath.name,
+        )
+    sample_submission = new_test.drop(columns=["rle_mask"]).copy()
+    sample_submission["rle_mask"] = "1 1"
+    sample_submission.sort_values(by="id").to_csv(public / "sample_submission.csv", index=False)
+    # NOTE(review): only depths of the new training ids are published; confirm that
+    # withholding the depths of the new test images is intended.
+    old_depths = read_csv(raw / "depths.csv")
+    new_depths_mask = old_depths["id"].isin(new_train_ids)
+    new_depths = old_depths[new_depths_mask]
+    new_depths.sort_values(by="id").to_csv(public / "depths.csv", index=False)
+    # Sanity checks
+    assert (public / "train.csv").exists(), "`train.csv` doesn't exist!"
+    assert (public / "sample_submission.csv").exists(), "`sample_submission.csv` doesn't exist!"
+    assert (public / "depths.csv").exists(), "`depths.csv` doesn't exist!"
+    assert (public / "train").exists(), "`train` directory doesn't exist!"
+    assert (public / "test").exists(), "`test` directory doesn't exist!"
+    assert (private / "test.csv").exists(), "`test.csv` doesn't exist!"
+    actual_new_train_imgs = set(img.stem for img in (public / "train" / "images").glob("*.png"))
+    actual_new_train_masks = set(img.stem for img in (public / "train" / "masks").glob("*.png"))
+    assert len(actual_new_train_imgs) == len(
+        new_train
+    ), "The number of images in the train set doesn't match the number of training images!"
+    assert len(actual_new_train_masks) == len(
+        new_train
+    ), "The number of masks in the train set doesn't match the number of training masks!"
+    for new_train_id in new_train["id"]:
+        assert (
+            public / "train" / "images" / f"{new_train_id}.png"
+        ).exists(), f"Expected `{new_train_id}.png` to exist in train images, but it doesn't!"
+        assert (
+            public / "train" / "masks" / f"{new_train_id}.png"
+        ).exists(), f"Expected `{new_train_id}.png` to exist in train masks, but it doesn't!"
+    actual_new_test_imgs = set(img.stem for img in (public / "test" / "images").glob("*.png"))
+    assert not (
+        public / "test" / "masks"
+    ).exists(), "Expected `public / test / masks` to not exist, but it does!"
+    assert len(actual_new_test_imgs) == len(
+        new_test
+    ), "The number of images in the test set doesn't match the number of test images!"
+    for new_test_id in new_test["id"]:
+        assert (
+            public / "test" / "images" / f"{new_test_id}.png"
+        ).exists(), f"Expected `{new_test_id}.png` to exist in test images, but it doesn't!"
+        # This check fails when a test mask leaked into `public`, so the message says so
+        assert not (
+            public / "test" / "masks" / f"{new_test_id}.png"
+        ).exists(), f"Expected `{new_test_id}.png` to not exist in test masks, but it does!"
+    assert actual_new_train_imgs.isdisjoint(
+        actual_new_test_imgs
+    ), "Expected no overlap in images between the new train and test sets, but there is!"
+    actual_sample_submission = read_csv(public / "sample_submission.csv")
+    actual_new_test = read_csv(private / "test.csv")
+    assert len(actual_sample_submission) == len(
+        actual_new_test
+    ), "The number of samples in the sample submission doesn't match the number of samples in the test set!"
+    assert set(actual_sample_submission["id"]) == set(
+        actual_new_test["id"]
+    ), "The ids in the sample submission don't match the ids in the test set!"
+    assert len(actual_new_test_imgs) == len(
+        actual_new_test
+    ), "The number of images in the test set doesn't match the number of test images!"
+    assert (
+        set(actual_new_test["id"]) == actual_new_test_imgs
+    ), "The ids in the test set don't match the test images!"
+    assert _all_strings(new_train), "Not all elements in the DataFrame are strings!"
+    assert _all_strings(new_test), "Not all elements in the DataFrame are strings!"
+def _all_strings(frame) -> bool:
+    """
+    Return True when every cell of the DataFrame `frame` is a `str`.
+    Used instead of `DataFrame.applymap`, which is deprecated in recent pandas releases.
+    """
+    return all(isinstance(value, str) for value in frame.to_numpy().ravel())

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl