dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,242 @@
+ import json
+ import random
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from mlebench.utils import get_logger
+
+ logger = get_logger(__name__)
+
+
+ def _create_dataset_files(
+     train_data_by_cat: dict,
+     test_data_by_cat: dict,
+     base_metadata: dict,
+     public_dir: Path,
+     private_dir: Path,
+     raw_dir: Path,
+     dev_mode: bool,
+     dev_count: int,
+ ):
+     """
+     Helper function to generate the complete set of dataset files for a given split.
+
+     This function is responsible for:
+     1. Creating the `train` set (metadata.json and image files).
+     2. Creating the `test` set (public metadata.json and image files).
+     3. Creating the private `answers.csv` for the `test` set.
+     4. Creating the public `sample_submission.csv`.
+     """
+     public_dir.mkdir(exist_ok=True, parents=True)
+     private_dir.mkdir(exist_ok=True, parents=True)
+
+     # --- Process Train Set ---
+     new_train_metadata = base_metadata.copy()
+     new_train_metadata.update({"annotations": [], "images": []})
+     train_sample_count = sum(len(v) for v in train_data_by_cat.values())
+
+     with tqdm(
+         desc=f"Creating train set for {public_dir.name}",
+         total=train_sample_count,
+     ) as pbar:
+         for category_id, annotations_images in train_data_by_cat.items():
+             category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
+             (public_dir / "nybg2020/train/images" / category_subdir).mkdir(
+                 exist_ok=True, parents=True
+             )
+             for idx, annotation_image in enumerate(annotations_images):
+                 new_train_metadata["annotations"].append(annotation_image["annotation"].copy())
+                 new_train_metadata["images"].append(annotation_image["image"].copy())
+
+                 if not dev_mode or idx < dev_count:
+                     src_path = raw_dir / "nybg2020/train" / annotation_image["image"]["file_name"]
+                     dst_path = public_dir / "nybg2020/train" / annotation_image["image"]["file_name"]
+                     shutil.copyfile(src=src_path, dst=dst_path)
+
+                 pbar.update(1)
+
+     with open(public_dir / "nybg2020/train/metadata.json", "w") as f:
+         json.dump(new_train_metadata, f, indent=4, sort_keys=True)
+
+     if not dev_mode:
+         assert len(list((public_dir / "nybg2020/train/images").glob("**/*.jpg"))) == len(
+             new_train_metadata["images"]
+         )
+         assert len(new_train_metadata["annotations"]) == len(new_train_metadata["images"])
+
+     # --- Process Test Set ---
+     new_test_metadata = base_metadata.copy()
+     del new_test_metadata["categories"]
+     del new_test_metadata["regions"]
+     new_test_metadata.update({"annotations": [], "images": []})
+
+     test_annotations_images = [item for sublist in test_data_by_cat.values() for item in sublist]
+     random.Random(0).shuffle(test_annotations_images)
+
+     for idx, annotation_image in tqdm(
+         enumerate(test_annotations_images),
+         desc=f"Creating test set for {public_dir.name}",
+         total=len(test_annotations_images),
+     ):
+         new_image_id = str(idx)
+         new_file_name = f"images/{idx // 1000:03d}/{idx}.jpg"
+
+         new_annotation = annotation_image["annotation"].copy()
+         new_annotation["image_id"] = new_image_id
+         new_test_metadata["annotations"].append(new_annotation)
+
+         new_image = annotation_image["image"].copy()
+         new_image["id"] = new_image_id
+         new_image["file_name"] = new_file_name
+         new_test_metadata["images"].append(new_image)
+
+         if not dev_mode or idx < dev_count:
+             src_path = raw_dir / "nybg2020/train" / annotation_image["image"]["file_name"]
+             dst_path = public_dir / "nybg2020/test" / new_file_name
+             dst_path.parent.mkdir(exist_ok=True, parents=True)
+             shutil.copyfile(src=src_path, dst=dst_path)
+
+     # Save public test metadata (without annotations)
+     with open(public_dir / "nybg2020/test/metadata.json", "w") as f:
+         public_new_test = new_test_metadata.copy()
+         del public_new_test["annotations"]
+         assert public_new_test.keys() == {"images", "info", "licenses"}
+         json.dump(public_new_test, f, indent=4, sort_keys=True)
+
+     if not dev_mode:
+         assert len(list((public_dir / "nybg2020/test/images").glob("**/*.jpg"))) == len(
+             new_test_metadata["images"]
+         )
+         assert len(new_test_metadata["annotations"]) == len(new_test_metadata["images"])
+
+     # --- Save Private Test Answers and Public Sample Submission ---
+     answers_rows = [
+         {"Id": img["id"], "Predicted": ann["category_id"]}
+         for img, ann in zip(new_test_metadata["images"], new_test_metadata["annotations"])
+     ]
+     answers_df = pd.DataFrame(answers_rows)
+     answers_df.to_csv(private_dir / "answers.csv", index=False)
+
+     sample_df = pd.DataFrame({"Id": answers_df["Id"], "Predicted": 0})
+     sample_df.to_csv(public_dir / "sample_submission.csv", index=False)
+
+     assert len(answers_df) == len(new_test_metadata["images"])
+     assert len(sample_df) == len(answers_df)
+     assert answers_df["Id"].equals(sample_df["Id"])
+
+
+ def _split_data_by_category(
+     data_by_cat: dict, test_size: float, random_state: int
+ ) -> tuple[dict, dict]:
+     """Splits data within each category into train and test sets."""
+     train_split = {}
+     test_split = {}
+     for category_id, annotations_images in data_by_cat.items():
+         n_samples = len(annotations_images)
+         if n_samples == 1:
+             train_annotations_images = annotations_images
+             test_annotations_images = []
+         elif n_samples < 5:
+             # Ensure at least 1 sample in test for small categories
+             current_test_size = max(1, int(n_samples * test_size))
+             train_annotations_images = annotations_images[:-current_test_size]
+             test_annotations_images = annotations_images[-current_test_size:]
+         else:
+             train_annotations_images, test_annotations_images = train_test_split(
+                 annotations_images, test_size=test_size, random_state=random_state
+             )
+         train_split[category_id] = train_annotations_images
+         test_split[category_id] = test_annotations_images
+     return train_split, test_split
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the raw data into public and private datasets with appropriate test/train splits.
+     This version also creates a second, parallel split for validation purposes.
+     """
+     dev_mode = False
+     dev_count = 2  # Copy over n images per category when in dev mode
+
+     # Create directories for the new validation split
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Load raw data and organize by category
+     json_path = raw / "nybg2020/train/metadata.json"
+     with open(json_path, "r", encoding="latin-1") as f:
+         old_train_metadata = json.load(f)
+
+     annotations_images_by_category = {}
+     for annotation, image in list(
+         zip(old_train_metadata["annotations"], old_train_metadata["images"])
+     ):
+         assert annotation["image_id"] == image["id"]
+         category_id = annotation["category_id"]
+         if category_id not in annotations_images_by_category:
+             annotations_images_by_category[category_id] = []
+         annotations_images_by_category[category_id].append(
+             {"annotation": annotation, "image": image}
+         )
+
+     # --- First Split: Create the main train/test sets (80/20 split of raw data) ---
+     logger.info("Performing first split: raw data -> train/test")
+     original_test_size = 0.2
+     train_data_by_cat, test_data_by_cat = _split_data_by_category(
+         annotations_images_by_category,
+         test_size=original_test_size,
+         random_state=0,
+     )
+
+     # Generate the original `public` and `private` outputs
+     _create_dataset_files(
+         train_data_by_cat,
+         test_data_by_cat,
+         old_train_metadata,
+         public,
+         private,
+         raw,
+         dev_mode,
+         dev_count,
+     )
+
+     # --- Second Split: Create the validation train/test sets from the main train set ---
+     # The goal is a validation set (`test_val`) of roughly the same size as the original test set.
+     # Original split: train=0.8*N, test=0.2*N
+     # Second split on train data: we need a test fraction of (0.2*N)/(0.8*N) = 0.25
+     logger.info("Performing second split: train data -> train_val/test_val")
+     validation_test_size = 0.25
+     train_val_data_by_cat, test_val_data_by_cat = _split_data_by_category(
+         train_data_by_cat,  # Use the training data from the first split as input
+         test_size=validation_test_size,
+         random_state=0,  # Use same random state for consistency
+     )
+
+     # Generate the new `public_val` and `private_val` outputs
+     _create_dataset_files(
+         train_val_data_by_cat,
+         test_val_data_by_cat,
+         old_train_metadata,
+         public_val,
+         private_val,
+         raw,
+         dev_mode,
+         dev_count,
+     )
+
+     logger.info("Data preparation complete.")
+     if not dev_mode:
+         # Final sanity check on total annotations
+         total_original = len(old_train_metadata["annotations"])
+         total_in_first_split = sum(len(v) for v in train_data_by_cat.values()) + sum(
+             len(v) for v in test_data_by_cat.values()
+         )
+         total_in_second_split = sum(len(v) for v in train_val_data_by_cat.values()) + sum(
+             len(v) for v in test_val_data_by_cat.values()
+         )
+         assert total_original == total_in_first_split
+         assert total_in_second_split == sum(len(v) for v in train_data_by_cat.values())
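The 0.25 test fraction in the second split above is chosen so that `test_val` comes out the same size as the original test set, as the inline comment notes. A quick arithmetic sketch in plain Python (the sample count is hypothetical):

N = 10_000                 # hypothetical raw sample count
train_1 = 0.8 * N          # first split keeps 80% for train
test_1 = 0.2 * N           # and holds out 20% for test
test_val = 0.25 * train_1  # second split takes 25% of the train portion
assert test_val == test_1  # 0.25 * 0.8 == 0.2, so the two test sets match in size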
@@ -0,0 +1,34 @@
+ import pandas as pd
+ from sklearn.metrics import f1_score
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
+     """
+     `submission` and `answers` are pd.DataFrame with "Id" and "Predicted" columns
+     """
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             f"Submission length {len(submission)} != answers length {len(answers)}"
+         )
+     if "Id" not in submission.columns or "Predicted" not in submission.columns:
+         raise InvalidSubmissionError("Submission must have 'Id' and 'Predicted' columns")
+
+     assert "Id" in answers.columns, "Answers must have 'Id' column"
+     assert "Predicted" in answers.columns, "Answers must have 'Predicted' column"
+
+     # Sort submission and answers by "Id"
+     submission = submission.sort_values("Id")
+     answers = answers.sort_values("Id")
+     if (submission["Id"].values != answers["Id"].values).any():
+         raise InvalidSubmissionError("Submission and answers have mismatched 'Id' columns")
+
+     y_true = [int(y) for y in answers["Predicted"]]
+     y_pred = [int(y) for y in submission["Predicted"]]
+     return y_true, y_pred
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     y_true, y_pred = prepare_for_metric(submission, answers)
+     return f1_score(y_true=y_true, y_pred=y_pred, average="macro")
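This grader computes a macro-averaged F1 over the predicted category ids. A minimal sketch of how it might be exercised, with toy DataFrames (the ids and labels below are illustrative, not from the package); because `prepare_for_metric` sorts both frames by "Id", the submission's row order does not matter:

import pandas as pd

answers = pd.DataFrame({"Id": ["0", "1", "2"], "Predicted": [3, 1, 2]})
submission = pd.DataFrame({"Id": ["2", "0", "1"], "Predicted": [2, 3, 0]})  # shuffled rows
score = grade(submission=submission, answers=answers)  # macro F1 across category ids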
@@ -0,0 +1,251 @@
+ import json
+ import random
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from mlebench.utils import get_logger
+
+ logger = get_logger(__name__)
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the raw data into public and private datasets with appropriate test/train splits.
+
+     `train/metadata.json` is the "table of contents" for our data, with the following structure:
+     (More details at https://www.kaggle.com/competitions/herbarium-2021-fgvc8/data)
+     ```
+     {
+         "annotations" : [annotation],
+         "categories" : [category],
+         "images" : [image],
+         "info" : info,
+         "licenses" : [license],
+         "institutions" : [region]
+     }
+     ```
+     - `images` and `annotations` are both N-length lists corresponding to the N samples.
+       We'll need to split each of these lists into train and test.
+     - The other fields are dataset-wide metadata that we don't need to touch.
+
+     - test/metadata.json has the same structure as train/metadata.json, but without "annotations", "categories", "institutions"
+
+     Other notes:
+     - train/test splits need to occur per category (each category should be in both train and test).
+     - The `test/images` and `train/images` folders have nested subdirs to make it easier to browse:
+       - `train/images` is structured as `{category_id[:3]}/{category_id[3:]}/{image_id}.jpg`
+       - `test/images` is structured as `{image_idx[:3]}/{image_idx}.jpg` (to not reveal the category)
+     - When we create the new splits, we re-assign image indices so that we don't give away labels based on the index:
+       - train images are indexed within their own category
+       - test images follow a flat index after shuffling the categories
+     """
+
+     dev_mode = False
+     dev_count = 2  # Copy over n images per category when in dev mode
+
+     # Create train, test from train split
+     json_path = raw / "train/metadata.json"
+     with open(json_path, "r", encoding="utf-8") as f:
+         old_train_metadata = json.load(f)
+
+     # Organize data by category so that we can split per-category later
+     annotations_images_by_category = {}  # We'll collect both `annotations` and `images` here
+     for annotation, image in list(
+         zip(old_train_metadata["annotations"], old_train_metadata["images"])
+     ):
+         assert (
+             annotation["image_id"] == image["id"]
+         ), f"Mismatching image_id in annotation and image: {annotation['image_id']} vs {image['id']}"
+         category_id = annotation["category_id"]
+         if category_id not in annotations_images_by_category:
+             annotations_images_by_category[category_id] = []
+         annotations_images_by_category[category_id].append(
+             {
+                 "annotation": annotation,
+                 "image": image,
+             }
+         )
+
+     # Split train/test
+     train_sample_count = 0  # Useful for tqdm later
+     train_annotations_images_by_category = {}
+     test_annotations_images_by_category = {}
+     for category_id, annotations_images in tqdm(
+         annotations_images_by_category.items(), desc="Assigning train/test splits"
+     ):
+         test_size = 0.2
+         # Split per category (class): each category needs to appear in both train and test (80:20), matching the original ratio
+         n_samples = len(annotations_images)
+         if n_samples == 1:
+             # If only one sample, put it in train
+             train_annotations_images = annotations_images
+             test_annotations_images = []
+         elif n_samples < 5:  # Fewer than 5 samples: split manually so test is not empty
+             # Ensure at least 1 sample in test
+             test_size = max(1, int(n_samples * test_size))
+             train_annotations_images = annotations_images[:-test_size]
+             test_annotations_images = annotations_images[-test_size:]
+         else:
+             # Original split logic
+             train_annotations_images, test_annotations_images = train_test_split(
+                 annotations_images, test_size=test_size, random_state=0
+             )
+
+         train_annotations_images_by_category[category_id] = train_annotations_images
+         test_annotations_images_by_category[category_id] = test_annotations_images
+         train_sample_count += len(train_annotations_images)
+
+     # Add to train set
+     new_train_metadata = (
+         old_train_metadata.copy()
+     )  # Keep 'categories', 'info', 'licenses', 'institutions'
+     new_train_metadata.update(
+         {
+             "annotations": [],
+             "images": [],
+         }
+     )
+     with tqdm(
+         desc="Creating new train dataset",
+         total=train_sample_count,
+     ) as pbar:
+         for category_id, annotations_images in train_annotations_images_by_category.items():
+             # Create a nested directory from category_id, e.g. 15504 -> "155/04" or 3 -> "000/03"
+             category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
+             (public / "train/images" / category_subdir).mkdir(exist_ok=True, parents=True)
+             for idx, annotation_image in enumerate(annotations_images):
+                 new_annotation = annotation_image["annotation"].copy()
+                 new_train_metadata["annotations"].append(new_annotation)
+
+                 new_image = annotation_image["image"].copy()
+                 new_train_metadata["images"].append(new_image)
+
+                 # Copy file from raw to public
+                 if (
+                     not dev_mode or idx < dev_count
+                 ):  # if dev_mode, only copy the first dev_count images
+                     src_path = raw / "train" / annotation_image["image"]["file_name"]
+                     dst_path = public / "train" / annotation_image["image"]["file_name"]
+                     shutil.copyfile(src=src_path, dst=dst_path)
+
+                 pbar.update(1)
+
+     with open(public / "train/metadata.json", "w") as f:
+         json.dump(new_train_metadata, f, indent=4, sort_keys=True)
+
+     if not dev_mode:
+         assert len(list((public / "train/images").glob("**/*.jpg"))) == len(
+             new_train_metadata["images"]
+         ), f"Mismatching number of images in train_images, got {len(list((public / 'train/images').glob('**/*.jpg')))}"
+         assert len(new_train_metadata["annotations"]) == len(
+             new_train_metadata["images"]
+         ), f"Mismatching number of annotations in train_metadata, got {len(new_train_metadata['annotations'])}"
+
+     # Add to test set
+     new_test_metadata = old_train_metadata.copy()
+     del new_test_metadata["categories"]
+     del new_test_metadata["institutions"]
+     new_test_metadata.update(
+         {
+             "annotations": [],
+             "images": [],
+         }
+     )
+     # Flatten and shuffle test set so that we don't have all the same categories in a row
+     test_annotations_images = [
+         item for sublist in test_annotations_images_by_category.values() for item in sublist
+     ]
+     random.Random(0).shuffle(test_annotations_images)
+     for idx, annotation_image in tqdm(
+         enumerate(test_annotations_images),
+         desc="Creating new test dataset",
+         total=len(test_annotations_images),
+     ):
+
+         # Make new image id; for the test set this is just the index
+         new_image_id = str(idx)
+         # Make new filename from image id, e.g. "000/0.jpg"
+         new_file_name = f"images/{idx // 1000:03d}/{idx}.jpg"
+
+         new_annotation = annotation_image["annotation"].copy()
+         new_annotation["image_id"] = new_image_id
+         new_test_metadata["annotations"].append(new_annotation)
+
+         new_image = annotation_image["image"].copy()
+         new_image["id"] = new_image_id
+         new_image["file_name"] = new_file_name
+         new_test_metadata["images"].append(new_image)
+
+         # Copy file from raw to public
+         if not dev_mode or idx < dev_count:  # if dev_mode, only copy the first dev_count images
+             src_path = raw / "train" / annotation_image["image"]["file_name"]
+             dst_path = public / "test" / new_file_name
+             dst_path.parent.mkdir(exist_ok=True, parents=True)
+             shutil.copyfile(src=src_path, dst=dst_path)
+
+     # Save new test metadata
+     with open(public / "test/metadata.json", "w") as f:
+         # The public test data, of course, doesn't have annotations
+         public_new_test = new_test_metadata.copy()
+         del public_new_test["annotations"]
+         assert public_new_test.keys() == {
+             "images",
+             "info",
+             "licenses",
+         }, f"Public test metadata keys should be 'images', 'info', 'licenses', but found {public_new_test.keys()}"
+         json.dump(public_new_test, f, indent=4, sort_keys=True)
+
+     if not dev_mode:
+         assert len(list((public / "test/images").glob("**/*.jpg"))) == len(
+             new_test_metadata["images"]
+         ), f"Mismatching number of images in test_images, got {len(list((public / 'test/images').glob('**/*.jpg')))}"
+         assert len(new_test_metadata["annotations"]) == len(
+             new_test_metadata["images"]
+         ), f"Mismatching number of annotations in test_metadata, got {len(new_test_metadata['annotations'])}"
+         assert len(new_train_metadata["annotations"]) + len(
+             new_test_metadata["annotations"]
+         ) == len(old_train_metadata["annotations"]), (
+             f"Expected {len(old_train_metadata['annotations'])} annotations in total, but found "
+             f"{len(new_train_metadata['annotations'])} in train and {len(new_test_metadata['annotations'])} in test"
+         )
+
+     # Save private test answers
+     answers_rows = []
+     for image, annotation in zip(new_test_metadata["images"], new_test_metadata["annotations"]):
+         assert (
+             image["id"] == annotation["image_id"]
+         ), f"Mismatching image_id in image and annotation: {image['id']} vs {annotation['image_id']}"
+         answers_rows.append(
+             {
+                 "Id": image["id"],
+                 "Predicted": annotation["category_id"],
+             }
+         )
+     answers_df = pd.DataFrame(answers_rows)
+     answers_df.to_csv(private / "answers.csv", index=False)
+
+     # Create a new sample submission that matches raw/sample_submission.csv, but for the new test set
+     sample_rows = []
+     for image in new_test_metadata["images"]:
+         sample_rows.append(
+             {
+                 "Id": image["id"],
+                 "Predicted": 0,
+             }
+         )
+     sample_df = pd.DataFrame(sample_rows)
+     sample_df.to_csv(public / "sample_submission.csv", index=False)
+
+     assert len(answers_df) == len(
+         new_test_metadata["images"]
+     ), f"Expected {len(new_test_metadata['images'])} rows in answers, but found {len(answers_df)}"
+     assert len(sample_df) == len(
+         answers_df
+     ), f"Expected {len(answers_df)} rows in sample submission, but found {len(sample_df)}"
+     assert answers_df["Id"].equals(
+         sample_df["Id"]
+     ), "Mismatched 'Id' columns between answers and sample submission"