dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _label_sort_key(label: str):
|
|
7
|
+
try:
|
|
8
|
+
return float(label)
|
|
9
|
+
except (TypeError, ValueError):
|
|
10
|
+
return label
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _build_label_mapping(raw_labels):
    """Map each distinct non-None raw label to a dense integer code.

    Labels are ordered with ``_label_sort_key`` so the encoding is
    deterministic across runs.
    """
    distinct = {item for item in raw_labels if item is not None}
    mapping = {}
    for code, label in enumerate(sorted(distinct, key=_label_sort_key)):
        mapping[label] = code
    return mapping
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _load_ts_split(ts_path: Path, label_mapping=None):
    """
    Load a .ts file into a dense NumPy tensor and label vector.

    The file is parsed in two phases: header lines (``@dimensions``,
    ``@serieslength``, ...) until an ``@data`` marker, then one sample per
    line where colon-separated fields hold comma-separated values for each
    dimension, with the class label in the field after the last dimension.
    ``?`` tokens become NaN. Blank lines and ``#`` comment lines are skipped.

    Args:
        ts_path: Path to the ``.ts`` file to parse.
        label_mapping: Optional pre-built label -> int mapping (e.g. reuse the
            train split's mapping for the test split). When omitted, a mapping
            is built from this file's labels.
            NOTE(review): the ``label_mapping or ...`` check below treats an
            explicitly passed *empty* dict as "absent" — confirm callers never
            pass ``{}``.

    Returns:
        data: np.ndarray of shape (num_samples, seq_len, num_features)
        labels: np.ndarray of int labels with shape (num_samples,)
        mapping: dict mapping original labels to encoded integers

    Raises:
        ValueError: on a missing/unparseable header, malformed data rows,
            inconsistent sequence lengths, or an empty file.
    """
    num_dimensions = None      # from the @dimensions header; required before data rows
    seq_length_hint = None     # from @serieslength, if present and numeric
    in_data_section = False    # flips True once the @data marker is seen

    samples = []     # per sample: list of per-dimension value lists
    raw_labels = []  # per sample: raw label string (or None when missing)

    with ts_path.open("r", encoding="utf-8") as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue

            if not in_data_section:
                # Header phase: look for @data, otherwise parse @key value pairs.
                if stripped.lower().startswith("@data"):
                    in_data_section = True
                    continue

                if stripped.startswith("@"):
                    parts = stripped[1:].split(None, 1)
                    key = parts[0].lower()
                    value = parts[1].strip() if len(parts) > 1 else ""
                    if key == "dimensions":
                        num_dimensions = int(value)
                    elif key == "serieslength":
                        try:
                            seq_length_hint = int(value)
                        except ValueError:
                            # Non-numeric serieslength (e.g. "false") — no hint.
                            seq_length_hint = None
                    continue

                # Non-@ text before @data is ignored.
                continue

            # Data row
            if num_dimensions is None:
                raise ValueError(f"Unable to parse dimensions from header in {ts_path}")

            parts = stripped.split(":")
            if len(parts) < num_dimensions:
                raise ValueError(f"Unexpected data row format in {ts_path}: {stripped[:50]}...")

            dimension_series = []
            for dim_idx in range(num_dimensions):
                seq_str = parts[dim_idx].strip()
                if not seq_str:
                    # Empty dimension field — keep an empty series placeholder.
                    dimension_series.append([])
                    continue

                values = []
                for token in seq_str.split(","):
                    token = token.strip()
                    if not token:
                        continue
                    if token == "?":
                        # Missing observation marker in the .ts format.
                        values.append(np.nan)
                    else:
                        values.append(float(token))

                dimension_series.append(values)

            samples.append(dimension_series)

            # The field after the last dimension, if present, is the class label.
            label_str = parts[num_dimensions].strip() if len(parts) > num_dimensions else None
            # NOTE(review): an empty label field becomes None here, and None is
            # excluded from the mapping below — encoding would then raise
            # KeyError. Assumes every data row carries a label; confirm.
            raw_labels.append(label_str or None)

    if not samples:
        raise ValueError(f"No samples parsed from {ts_path}")

    # Ensure label mapping
    mapping = label_mapping or _build_label_mapping(raw_labels)
    # NOTE(review): labels unseen by the supplied mapping raise KeyError here.
    labels = np.array([mapping[label] for label in raw_labels], dtype=np.int64)

    # Convert dimension lists into dense arrays
    tensor_samples = []
    global_seq_len = None  # fixed by the first sample; all others must match

    for dims in samples:
        series_arrays = []
        sample_len = None  # length shared by all dimensions of this sample

        for dim_values in dims:
            arr = np.asarray(dim_values, dtype=np.float32)
            if sample_len is None:
                sample_len = arr.shape[0]
            elif arr.shape[0] != sample_len:
                raise ValueError("Inconsistent dimension lengths within sample.")

            if seq_length_hint is not None and arr.shape[0] != seq_length_hint:
                raise ValueError("Sequence length mismatch relative to header declaration.")

            series_arrays.append(arr)

        sample_tensor = np.stack(series_arrays, axis=-1)  # (seq_len, num_features)
        global_seq_len = sample_tensor.shape[0] if global_seq_len is None else global_seq_len
        if sample_tensor.shape[0] != global_seq_len:
            raise ValueError("Inconsistent sequence lengths across samples.")

        tensor_samples.append(sample_tensor)

    data = np.stack(tensor_samples, axis=0)  # (num_samples, seq_len, num_features)
    return data, labels, mapping
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare the handwriting dataset for the benchmark.

    Args:
        raw: Path to raw data directory (contains Handwriting_TRAIN.ts and Handwriting_TEST.ts)
        public: Path to public directory (visible to participants)
        private: Path to private directory (hidden from participants, used for grading)

    Raises:
        FileNotFoundError: if a raw split file is missing.
        RuntimeError: if an expected output artifact was not written.
    """
    # Materialize dense tensors from the raw .ts files
    train_path = raw / "Handwriting_TRAIN.ts"
    test_path = raw / "Handwriting_TEST.ts"
    for split_path in (train_path, test_path):
        if not split_path.exists():
            raise FileNotFoundError(f"Missing raw split file: {split_path}")

    # Output directories may not exist on a fresh run.
    public.mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)

    X_train, y_train, label_mapping = _load_ts_split(train_path)
    # Reuse the train-split mapping so test labels share the same encoding.
    X_test, y_test, _ = _load_ts_split(test_path, label_mapping=label_mapping)

    # Convert labels to 1D arrays
    y_train = np.asarray(y_train, dtype=np.int64).reshape(-1)
    y_test = np.asarray(y_test, dtype=np.int64).reshape(-1)

    # Persist prepared arrays for participants
    np.save(public / "train_data.npy", X_train)
    np.save(public / "train_labels.npy", y_train)
    np.save(public / "test_data.npy", X_test)

    # Sample submission (default all zeros)
    sample_submission = pd.DataFrame({"id": range(len(y_test)), "label": 0})
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Private ground-truth labels for grading
    test_labels_df = pd.DataFrame({"id": range(len(y_test)), "label": y_test})
    test_labels_df.to_csv(private / "test_labels.csv", index=False)

    # Basic validation checks — raise (not assert) so they survive `python -O`.
    expected_public = [
        public / "train_data.npy",
        public / "train_labels.npy",
        public / "test_data.npy",
        public / "sample_submission.csv",
    ]
    for path in expected_public:
        if not path.exists():
            raise RuntimeError(f"Missing public artifact: {path.name}")

    if not (private / "test_labels.csv").exists():
        raise RuntimeError("Test labels should exist")

    print(
        "Prepared handwriting dataset:\n"
        f" - Train split: {X_train.shape}, labels: {y_train.shape}\n"
        f" - Test split: {X_test.shape}, labels: {y_test.shape}"
    )
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from sklearn.metrics import f1_score
|
|
3
|
+
|
|
4
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """
    Validate a submission against the ground-truth answers and align the labels.

    `submission` and `answers` are pd.DataFrame with "Id" and "Predicted" columns.

    Returns:
        (y_true, y_pred): two lists of int labels, row-aligned by "Id".

    Raises:
        InvalidSubmissionError: if the submission has the wrong length, is missing
            required columns, has mismatched ids, or contains non-integer predictions.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length {len(submission)} != answers length {len(answers)}"
        )
    if "Id" not in submission.columns or "Predicted" not in submission.columns:
        raise InvalidSubmissionError("Submission must have 'Id' and 'Predicted' columns")

    # A malformed answers file is our bug, not the participant's -> assert, not raise.
    assert "Id" in answers.columns, "Answers must have 'Id' column"
    assert "Predicted" in answers.columns, "Answers must have 'Predicted' column"

    # Sort submission and answers by "Id" so rows can be compared pairwise.
    submission = submission.sort_values("Id")
    answers = answers.sort_values("Id")
    if (submission["Id"].values != answers["Id"].values).any():
        raise InvalidSubmissionError("Submission and answers have mismatched 'Id' columns")

    y_true = [int(y) for y in answers["Predicted"]]
    try:
        # NaN / non-numeric predictions are a malformed submission, not an
        # internal error: surface them as InvalidSubmissionError.
        y_pred = [int(y) for y in submission["Predicted"]]
    except (ValueError, TypeError) as e:
        raise InvalidSubmissionError(f"Predictions must be integers: {e}") from e
    return y_true, y_pred
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with macro-averaged F1 over the Id-aligned labels."""
    labels_true, labels_pred = prepare_for_metric(submission, answers)
    return f1_score(y_true=labels_true, y_pred=labels_pred, average="macro")
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import random
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from mlebench.utils import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the raw data into public and private datasets with appropriate test/train splits.

    `train/metadata.json` is the "table of contents" for our data, with the following structure:
    (More details at https://www.kaggle.com/competitions/herbarium-2020-fgvc7/data)
    ```
    {
        "annotations" : [annotation],
        "categories" : [category],
        "images" : [image],
        "info" : info,
        "licenses" : [license],
        "regions" : [region]
    }
    ```
    - `images` and `annotations` are both N-length lists corresponding to the N samples.
      We'll need to split each of these lists into train and test.
    - The other fields are dataset-wide metadata that we don't need to touch.

    - test/metadata.json is the same structure as train/metadata.json, but without "annotations", "categories", "regions"

    Other notes:
    - train/test splits need to occur per category (each category should be in both train and test).
    - The `test/images` and `train/images` folders have nested subdirs to make it easier to browse
        - `train/images` is structured as `{category_id[:3]}/{category_id[3:]}/{image_id}.jpg`
        - `test/images` is structured as `{image_idx[:3]}/{image_idx}.jpg` (to not reveal the category)
    - When we create the new splits, we re-assign image indices so that we don't give away labels based on the index
        - train images are indexed within their own category
        - test images follow a flat index after shuffling the categories
    """

    # Set dev_mode=True to copy only dev_count images per category — cheap local runs.
    dev_mode = False
    dev_count = 2  # Copy over n images per category when in dev mode

    # Create train, test from train split
    json_path = raw / "nybg2020/train/metadata.json"
    with open(json_path, "r", encoding="latin-1") as f:  # utf-8 fails
        old_train_metadata = json.load(f)

    # Organize data by category so that we can split per-category later
    annotations_images_by_category = {}  # We'll collect both `annotations` and `images` here
    for annotation, image in list(
        zip(old_train_metadata["annotations"], old_train_metadata["images"])
    ):
        # annotations[i] and images[i] are expected to describe the same sample.
        assert (
            annotation["image_id"] == image["id"]
        ), f"Mismatching image_id in annotation and image: {annotation['image_id']} vs {image['id']}"
        category_id = annotation["category_id"]
        if category_id not in annotations_images_by_category:
            annotations_images_by_category[category_id] = []
        annotations_images_by_category[category_id].append(
            {
                "annotation": annotation,
                "image": image,
            }
        )

    # Split train/test
    train_sample_count = 0  # Useful for tqdm later
    train_annotations_images_by_category = {}
    test_annotations_images_by_category = {}

    for category_id, annotations_images in tqdm(
        annotations_images_by_category.items(), desc="Assigning train/test splits"
    ):
        # Create split by "category" (class): Each category needs to be in both train and test (80:20) as per original ratio
        test_size = 0.2
        n_samples = len(annotations_images)
        if n_samples == 1:
            # If only one sample, put it in train
            train_annotations_images = annotations_images
            test_annotations_images = []
        elif n_samples < 5:  # Minimum 5 samples to ensure at least 1 in test
            # Ensure at least 1 sample in test
            # test_size is reinterpreted as an integer count here (deterministic
            # tail slice, no shuffle), unlike the fractional test_size below.
            test_size = max(1, int(n_samples * test_size))
            train_annotations_images = annotations_images[:-test_size]
            test_annotations_images = annotations_images[-test_size:]
        else:
            train_annotations_images, test_annotations_images = train_test_split(
                annotations_images, test_size=test_size, random_state=0
            )

        train_annotations_images_by_category[category_id] = train_annotations_images
        test_annotations_images_by_category[category_id] = test_annotations_images
        train_sample_count += len(train_annotations_images)

    # Add to train set
    # Shallow copy: top-level keys are replaced below; untouched keys still
    # alias old_train_metadata's values.
    new_train_metadata = (
        old_train_metadata.copy()
    )  # Keep 'categories', 'info', 'licenses', 'regions'
    new_train_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    with tqdm(
        desc="Creating new train dataset",
        total=train_sample_count,
    ) as pbar:
        for category_id, annotations_images in train_annotations_images_by_category.items():
            # Create a nested directory from category_id, e.g. 15504 -> "155/04" or 3 -> "000/03"
            category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
            (public / "nybg2020/train/images" / category_subdir).mkdir(exist_ok=True, parents=True)
            for idx, annotation_image in enumerate(annotations_images):
                # Train entries keep their original ids/file_names unchanged.
                new_annotation = annotation_image["annotation"].copy()
                new_train_metadata["annotations"].append(new_annotation)

                new_image = annotation_image["image"].copy()
                new_train_metadata["images"].append(new_image)

                # Copy file from raw to public
                # NOTE(review): dst_path relies on image["file_name"] already
                # encoding the same category subdir created above — confirm
                # against the raw metadata layout.
                if (
                    not dev_mode or idx < dev_count
                ):  # if dev_mode, only copy the first dev_count images
                    src_path = raw / "nybg2020/train" / annotation_image["image"]["file_name"]
                    dst_path = public / "nybg2020/train" / annotation_image["image"]["file_name"]
                    shutil.copyfile(src=src_path, dst=dst_path)

                pbar.update(1)

    with open(public / "nybg2020/train/metadata.json", "w") as f:
        json.dump(new_train_metadata, f, indent=4, sort_keys=True)

    # Sanity checks only make sense when every file was actually copied.
    if not dev_mode:
        assert len(list((public / "nybg2020/train/images").glob("**/*.jpg"))) == len(
            new_train_metadata["images"]
        ), f"Mismatching number of images in train_images, got {len(list((public / 'nybg2020/train/images').glob('**/*.jpg')))}"
        assert len(new_train_metadata["annotations"]) == len(
            new_train_metadata["images"]
        ), f"Mismatching number of annotations in train_metadata, got {len(new_train_metadata['annotations'])}"

    # Add to test set
    # Another shallow copy of the raw metadata; deleting keys here does not
    # affect old_train_metadata or new_train_metadata.
    new_test_metadata = old_train_metadata.copy()
    del new_test_metadata["categories"]
    del new_test_metadata["regions"]
    new_test_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    # Flatten and shuffle test set so that we don't have all the same categories in a row
    test_annotations_images = [
        item for sublist in test_annotations_images_by_category.values() for item in sublist
    ]
    # Seeded shuffle keeps the re-indexing reproducible across runs.
    random.Random(0).shuffle(test_annotations_images)
    for idx, annotation_image in tqdm(
        enumerate(test_annotations_images),
        desc="Creating new test dataset",
        total=len(test_annotations_images),
    ):

        # Make new image id, for test set this is just the index
        new_image_id = str(idx)
        # Make new filename from image id e.g. "000/0.jpg"
        new_file_name = f"images/{idx // 1000:03d}/{idx}.jpg"

        new_annotation = annotation_image["annotation"].copy()
        new_annotation["image_id"] = new_image_id
        new_test_metadata["annotations"].append(new_annotation)

        new_image = annotation_image["image"].copy()
        new_image["id"] = new_image_id
        new_image["file_name"] = new_file_name
        new_test_metadata["images"].append(new_image)

        # Copy file from raw to public
        # Source path uses the ORIGINAL file_name; destination uses the
        # re-indexed one so the category cannot be inferred from the path.
        if not dev_mode or idx < dev_count:  # if dev_mode, only copy the first dev_count images
            src_path = raw / "nybg2020/train" / annotation_image["image"]["file_name"]
            dst_path = public / "nybg2020/test" / new_file_name
            dst_path.parent.mkdir(exist_ok=True, parents=True)
            shutil.copyfile(src=src_path, dst=dst_path)

    # Save new test metadata
    with open(public / "nybg2020/test/metadata.json", "w") as f:
        # The public test data, of course, doesn't have annotations
        public_new_test = new_test_metadata.copy()
        del public_new_test["annotations"]
        assert public_new_test.keys() == {
            "images",
            "info",
            "licenses",
        }, f"Public test metadata keys should be 'images', 'info', 'licenses', but found {public_new_test.keys()}"
        json.dump(public_new_test, f, indent=4, sort_keys=True)

    if not dev_mode:
        assert len(list((public / "nybg2020/test/images").glob("**/*.jpg"))) == len(
            new_test_metadata["images"]
        ), f"Mismatching number of images in test_images, got {len(list((public / 'nybg2020/test/images').glob('**/*.jpg')))}"
        assert len(new_test_metadata["annotations"]) == len(
            new_test_metadata["images"]
        ), f"Mismatching number of annotations in test_metadata, got {len(new_test_metadata['annotations'])}"
        # Every raw annotation must land in exactly one of the two splits.
        assert len(new_train_metadata["annotations"]) + len(
            new_test_metadata["annotations"]
        ) == len(old_train_metadata["annotations"]), (
            f"Expected {len(old_train_metadata['annotations'])} annotations in total, but found"
            f"{len(new_train_metadata['annotations'])} in train and {len(new_test_metadata['annotations'])} in test"
        )

    # Save private test answers
    answers_rows = []
    for image, annotation in zip(new_test_metadata["images"], new_test_metadata["annotations"]):
        assert (
            image["id"] == annotation["image_id"]
        ), f"Mismatching image_id in image and annotation: {image['id']} vs {annotation['image_id']}"
        answers_rows.append(
            {
                "Id": image["id"],
                "Predicted": annotation["category_id"],
            }
        )
    answers_df = pd.DataFrame(answers_rows)
    answers_df.to_csv(private / "answers.csv", index=False)

    # Create new sample submission that matches raw/sample_submission.csv, but for the new test set
    sample_rows = []
    for image in new_test_metadata["images"]:
        sample_rows.append(
            {
                "Id": image["id"],
                "Predicted": 0,
            }
        )
    sample_df = pd.DataFrame(sample_rows)
    sample_df.to_csv(public / "sample_submission.csv", index=False)

    assert len(answers_df) == len(
        new_test_metadata["images"]
    ), f"Expected {len(new_test_metadata['images'])} rows in answers, but found {len(answers_df)}"
    assert len(sample_df) == len(
        answers_df
    ), f"Expected {len(answers_df)} rows in sample submission, but found {len(sample_df)}"
    assert answers_df["Id"].equals(
        sample_df["Id"]
    ), "Mismatched 'Id' columns between answers and sample submission"
|