dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import json
import random
import shutil
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from mlebench.utils import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _split_data_by_category(
|
|
17
|
+
data_by_category: Dict[int, List[dict]],
|
|
18
|
+
) -> (Dict[int, List[dict]], Dict[int, List[dict]]):
|
|
19
|
+
"""
|
|
20
|
+
Splits data within each category into train and test sets.
|
|
21
|
+
|
|
22
|
+
This function replicates the original script's splitting logic to ensure
|
|
23
|
+
consistency.
|
|
24
|
+
"""
|
|
25
|
+
train_split_by_category = {}
|
|
26
|
+
test_split_by_category = {}
|
|
27
|
+
|
|
28
|
+
for category_id, annotations_images in data_by_category.items():
|
|
29
|
+
test_size = 0.2
|
|
30
|
+
n_samples = len(annotations_images)
|
|
31
|
+
if n_samples == 1:
|
|
32
|
+
train_annotations_images = annotations_images
|
|
33
|
+
test_annotations_images = []
|
|
34
|
+
elif n_samples < 5:
|
|
35
|
+
test_size = max(1, int(n_samples * test_size))
|
|
36
|
+
train_annotations_images = annotations_images[:-test_size]
|
|
37
|
+
test_annotations_images = annotations_images[-test_size:]
|
|
38
|
+
else:
|
|
39
|
+
train_annotations_images, test_annotations_images = train_test_split(
|
|
40
|
+
annotations_images, test_size=test_size, random_state=0
|
|
41
|
+
)
|
|
42
|
+
train_split_by_category[category_id] = train_annotations_images
|
|
43
|
+
test_split_by_category[category_id] = test_annotations_images
|
|
44
|
+
return train_split_by_category, test_split_by_category
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _process_and_save_split(
    train_data_by_cat: Dict[int, List[dict]],
    test_data_by_cat: Dict[int, List[dict]],
    base_metadata: dict,
    public_dir: Path,
    private_dir: Path,
    raw_data_path: Path,
    dev_mode: bool,
    dev_count: int,
) -> None:
    """
    Processes and saves a single train/test split to the specified directories.

    This function handles:
    - Creating training set metadata and copying images.
    - Creating test set metadata and copying/renaming images (test images are
      re-identified with sequential ids so labels cannot be recovered from
      the original file names).
    - Creating private ground-truth answers (``answers.csv``).
    - Creating a public sample submission file (``sample_submission.csv``).

    Args:
        train_data_by_cat: Category id -> list of {"annotation", "image"}
            records chosen for the training set.
        test_data_by_cat: Same structure, chosen for the held-out test set.
        base_metadata: Template metadata dict to copy top-level keys from.
            NOTE(review): appears to follow a COCO-style layout
            ("annotations"/"images"/"categories") — confirm against the raw
            competition metadata.
        public_dir: Destination for participant-visible data.
        private_dir: Destination for grading-only data.
        raw_data_path: Root of the raw download; images are read from its
            "train" subdirectory.
        dev_mode: When True, only the first ``dev_count`` images per category
            (train) / overall prefix (test) are physically copied; metadata is
            still written in full.
        dev_count: Number of images to copy per group in dev mode.
    """
    # Create required directories
    public_dir.mkdir(exist_ok=True, parents=True)
    private_dir.mkdir(exist_ok=True, parents=True)
    (public_dir / "train/images").mkdir(exist_ok=True, parents=True)
    (public_dir / "test/images").mkdir(exist_ok=True, parents=True)

    # Process train set: shallow-copy the template, then replace the
    # annotation/image lists so the template's own lists are never mutated.
    new_train_metadata = base_metadata.copy()
    new_train_metadata.update({"annotations": [], "images": []})
    train_sample_count = sum(len(v) for v in train_data_by_cat.values())

    with tqdm(
        desc=f"Creating train set for {public_dir.name}",
        total=train_sample_count,
    ) as pbar:
        for category_id, annotations_images in train_data_by_cat.items():
            # Images are bucketed two levels deep by category id,
            # e.g. category 12345 -> "123/45".
            category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
            (public_dir / "train/images" / category_subdir).mkdir(exist_ok=True, parents=True)
            for idx, annotation_image in enumerate(annotations_images):
                # Metadata is recorded for every sample, even when dev mode
                # skips the actual file copy below.
                new_train_metadata["annotations"].append(annotation_image["annotation"].copy())
                new_train_metadata["images"].append(annotation_image["image"].copy())

                if not dev_mode or idx < dev_count:
                    # Train images keep their original relative file names.
                    src_path = raw_data_path / "train" / annotation_image["image"]["file_name"]
                    dst_path = public_dir / "train" / annotation_image["image"]["file_name"]
                    shutil.copyfile(src=src_path, dst=dst_path)
                pbar.update(1)

    with open(public_dir / "train/metadata.json", "w") as f:
        json.dump(new_train_metadata, f, indent=4, sort_keys=True)

    # Process test set: drop label-revealing top-level keys from the template.
    new_test_metadata = base_metadata.copy()
    for key_to_del in ["categories", "institutions"]:
        if key_to_del in new_test_metadata:
            del new_test_metadata[key_to_del]
    new_test_metadata.update({"annotations": [], "images": []})

    # Flatten all per-category test records and shuffle with a fixed seed so
    # the test ordering is reproducible but not grouped by category.
    test_annotations_images = [
        item for sublist in test_data_by_cat.values() for item in sublist
    ]
    random.Random(0).shuffle(test_annotations_images)

    for idx, annotation_image in tqdm(
        enumerate(test_annotations_images),
        desc=f"Creating test set for {public_dir.name}",
        total=len(test_annotations_images),
    ):
        # Re-identify each test image with its sequential index so the
        # original (label-bearing) id/file name is not exposed.
        new_image_id = str(idx)
        # Bucket test images 1000 per directory, e.g. idx 2345 -> "images/002/2345.jpg".
        new_file_name = f"images/{idx // 1000:03d}/{idx}.jpg"

        new_annotation = annotation_image["annotation"].copy()
        new_annotation["image_id"] = new_image_id
        new_test_metadata["annotations"].append(new_annotation)

        new_image = annotation_image["image"].copy()
        new_image["id"] = new_image_id
        new_image["file_name"] = new_file_name
        new_test_metadata["images"].append(new_image)

        if not dev_mode or idx < dev_count:
            # Source is the raw *train* directory: the held-out test set is
            # carved out of the original training data.
            src_path = raw_data_path / "train" / annotation_image["image"]["file_name"]
            dst_path = public_dir / "test" / new_file_name
            dst_path.parent.mkdir(exist_ok=True, parents=True)
            shutil.copyfile(src=src_path, dst=dst_path)

    # Save public test metadata (without answers)
    public_new_test = new_test_metadata.copy()
    del public_new_test["annotations"]
    with open(public_dir / "test/metadata.json", "w") as f:
        json.dump(public_new_test, f, indent=4, sort_keys=True)

    # Save private test answers. Relies on "images" and "annotations" having
    # been appended in lockstep above, so zip pairs each image with its label.
    answers_rows = [
        {"Id": img["id"], "Predicted": ann["category_id"]}
        for img, ann in zip(new_test_metadata["images"], new_test_metadata["annotations"])
    ]
    pd.DataFrame(answers_rows).to_csv(private_dir / "answers.csv", index=False)

    # Save public sample submission (all predictions defaulted to category 0).
    sample_rows = [{"Id": img["id"], "Predicted": 0} for img in new_test_metadata["images"]]
    pd.DataFrame(sample_rows).to_csv(public_dir / "sample_submission.csv", index=False)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Split the raw data into public and private datasets with appropriate test/train splits.

    Two splits are performed:
    1. raw -> train + test (written to `public`/`private`)
    2. train -> train_val + test_val (written to `public_val`/`private_val`)

    The second split reuses the exact same logic as the first, producing a smaller
    validation dataset that mirrors the structure of the main one.
    """
    dev_mode = False
    dev_count = 2

    # Parallel output directories for the validation split, created next to the
    # main public/private directories.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    for out_dir in (public_val, private_val):
        out_dir.mkdir(exist_ok=True)

    with open(raw / "train/metadata.json", "r", encoding="utf-8") as f:
        old_train_metadata = json.load(f)

    # Group (annotation, image) pairs by category so the splits can be stratified.
    annotations_images_by_category = {}
    pairs = zip(old_train_metadata["annotations"], old_train_metadata["images"])
    for annotation, image in pairs:
        bucket = annotations_images_by_category.setdefault(annotation["category_id"], [])
        bucket.append({"annotation": annotation, "image": image})

    # --- SPLIT 1: raw -> train + test ---
    logger.info("Starting initial split: raw -> train + test")
    train_by_category, test_by_category = _split_data_by_category(
        annotations_images_by_category
    )

    _process_and_save_split(
        train_data_by_cat=train_by_category,
        test_data_by_cat=test_by_category,
        base_metadata=old_train_metadata,
        public_dir=public,
        private_dir=private,
        raw_data_path=raw,
        dev_mode=dev_mode,
        dev_count=dev_count,
    )
    logger.info(f"Original train/test sets saved to {public.name} and {private.name}")

    # --- SPLIT 2: train -> train_val + test_val ---
    logger.info("Starting second split: train -> train_val + test_val")
    # The *training* half of split 1 is the input for the validation split.
    train_val_by_category, test_val_by_category = _split_data_by_category(
        train_by_category
    )

    _process_and_save_split(
        train_data_by_cat=train_val_by_category,
        test_data_by_cat=test_val_by_category,
        base_metadata=old_train_metadata,  # original still supplies the top-level keys
        public_dir=public_val,
        private_dir=private_val,
        raw_data_path=raw,
        dev_mode=dev_mode,
        dev_count=dev_count,
    )
    logger.info(f"Validation sets saved to {public_val.name} and {private_val.name}")
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from sklearn.metrics import f1_score
|
|
3
|
+
|
|
4
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """
    Validate `submission` against `answers` and return aligned integer label lists.

    Both frames must have "Id" and "Predicted" columns; rows are aligned by
    sorting each frame on "Id".

    Returns:
        (y_true, y_pred): labels from `answers` and `submission` respectively,
        as plain Python ints in matching row order.

    Raises:
        InvalidSubmissionError: if the lengths differ, required columns are
            missing, the "Id" columns don't match after sorting, or the
            submission's "Predicted" values are not integer-convertible.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length {len(submission)} != answers length {len(answers)}"
        )
    if "Id" not in submission.columns or "Predicted" not in submission.columns:
        raise InvalidSubmissionError("Submission must have 'Id' and 'Predicted' columns")

    # Sort submission and answers by "Id" so predictions line up with answers row-wise.
    submission = submission.sort_values("Id")
    answers = answers.sort_values("Id")
    if (submission["Id"].values != answers["Id"].values).any():
        raise InvalidSubmissionError("Submission and answers have mismatched 'Id' columns")

    y_true = [int(y) for y in answers["Predicted"]]
    # A malformed submission (e.g. strings or NaN in "Predicted") must be reported
    # as an invalid submission, not crash the grader with a raw ValueError/TypeError.
    try:
        y_pred = [int(y) for y in submission["Predicted"]]
    except (TypeError, ValueError) as err:
        raise InvalidSubmissionError(
            "Submission 'Predicted' column must contain integer class ids"
        ) from err
    return y_true, y_pred
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission against the answers using macro-averaged F1."""
    true_labels, predicted_labels = prepare_for_metric(submission, answers)
    return f1_score(y_true=true_labels, y_pred=predicted_labels, average="macro")
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import random
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from mlebench.utils import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the raw data into public and private datasets with appropriate test/train splits.

    Args:
        raw: Directory with the raw Kaggle download (`train_metadata.json`, `train_images/`).
        public: Output directory for competitor-visible data (train/test images,
            metadata, sample submission).
        private: Output directory for the held-out `answers.csv`.

    `train_metadata.json` is the "table of contents" for our data, with the following structure:
    (More details at https://www.kaggle.com/competitions/herbarium-2022-fgvc9/data)
    ```
    {
        "annotations" : [annotation],
        "categories" : [category],
        "genera" : [genus]
        "images" : [image],
        "distances" : [distance],
        "licenses" : [license],
        "institutions" : [institution]
    }
    ```
    - `images` and `annotations` are both N-length lists corresponding to the N samples.
      We'll need to split each of these lists into train and test.
    - The other fields are dataset-wide metadata that we don't need to touch.

    Other notes:
    - train/test splits need to occur per category (each category should be in both train and test).
    - The `test_images` and `train_images` folders have nested subdirs to make it easier to browse
      - `train_images` is structured as `{category_id[:3]}/{category_id[3:]}/{image_id}.jpg`
      - `test_images` is structured as `{image_idx[:3]}/test-{image_idx}.jpg` (to not reveal the category)
    - When we create the new splits, we re-assign image indices so that we don't give away labels based on the index
      - train images are indexed within their own category
      - test images follow a flat index after shuffling the categories
    """

    # Create train, test from train split
    with open(raw / "train_metadata.json") as f:
        old_train_metadata = json.load(f)

    # Organize data by category so that we can split per-category later
    annotations_images_by_category = {}  # We'll collect both `annotations` and `images` here
    for annotation, image in list(
        zip(old_train_metadata["annotations"], old_train_metadata["images"])
    ):
        # The two lists are expected to be parallel: same sample at the same index.
        assert annotation["image_id"] == image["image_id"]
        category_id = annotation["category_id"]
        if category_id not in annotations_images_by_category:
            annotations_images_by_category[category_id] = []
        annotations_images_by_category[category_id].append(
            {
                "annotation": annotation,
                "image": image,
            }
        )

    # Split train/test
    train_sample_count = 0  # Useful for tqdm later
    train_annotations_images_by_category = {}
    test_annotations_images_by_category = {}
    for category_id, annotations_images in tqdm(
        annotations_images_by_category.items(), desc="Assigning train/test splits"
    ):
        # Create split by "category" (class): Each category needs to be in both train and test (80:20)
        # Fixed random_state keeps the split reproducible across runs.
        train_annotations_images, test_annotations_images = train_test_split(
            annotations_images, test_size=0.2, random_state=0
        )
        assert len(train_annotations_images) > 0 and len(test_annotations_images) > 0
        train_annotations_images_by_category[category_id] = train_annotations_images
        test_annotations_images_by_category[category_id] = test_annotations_images
        train_sample_count += len(train_annotations_images)

    # Add to train set
    new_train_metadata = old_train_metadata.copy()  # Keep peripheral metadata
    new_train_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    with tqdm(
        desc="Creating new train dataset",
        total=train_sample_count,
    ) as pbar:
        for category_id, annotations_images in train_annotations_images_by_category.items():
            # Create a nested directory from category_id, e.g. 15504 -> "155/04" or 3 -> "000/03"
            category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
            (public / "train_images" / category_subdir).mkdir(exist_ok=True, parents=True)
            for idx, annotation_image in enumerate(annotations_images):
                # Update the image_id and file_name so that we don't have gaps in the image_id
                # (after doing train/test split, image ids are not contiguous within train)

                # Make new image id from {category_id}__{idx} e.g. 15504__037
                new_image_id = f"{category_id:05d}__{(idx + 1):03d}"
                # Make new filename from image id e.g. "155/04/15504__037.jpg"
                new_file_name = f"{category_subdir}/{new_image_id}.jpg"

                new_annotation = annotation_image["annotation"].copy()
                new_annotation["image_id"] = new_image_id
                new_train_metadata["annotations"].append(new_annotation)

                new_image = annotation_image["image"].copy()
                new_image["image_id"] = new_image_id
                new_image["file_name"] = new_file_name
                new_train_metadata["images"].append(new_image)

                # Copy file from raw to public
                src_path = raw / "train_images" / annotation_image["image"]["file_name"]
                dst_path = public / "train_images" / new_file_name
                shutil.copyfile(src=src_path, dst=dst_path)

                pbar.update(1)

    with open(public / "train_metadata.json", "w") as f:
        json.dump(new_train_metadata, f, indent=4, sort_keys=True)

    # Sanity check: one file on disk per metadata image entry, and parallel lists.
    assert len(list((public / "train_images").glob("**/*.jpg"))) == len(
        new_train_metadata["images"]
    ), (
        f"Expected {len(new_train_metadata['images'])} images in train_images, but found"
        f"{len(list((public / 'train_images').glob('**/*.jpg')))}"
    )
    assert len(new_train_metadata["annotations"]) == len(new_train_metadata["images"]), (
        f"Mismatching number of annotations ({len(new_train_metadata['annotations'])}) "
        f"and images ({len(new_train_metadata['images'])})"
    )

    # Add to test set
    new_test_metadata = {}  # Test doesn't need all that metadata
    new_test_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    # Flatten and shuffle test set so that we don't have all the same categories in a row
    test_annotations_images = [
        item for sublist in test_annotations_images_by_category.values() for item in sublist
    ]
    # Seeded shuffle keeps the test ordering reproducible.
    random.Random(0).shuffle(test_annotations_images)
    for idx, annotation_image in tqdm(
        enumerate(test_annotations_images),
        desc="Creating new test dataset",
        total=len(test_annotations_images),
    ):
        # Update the image_id and file_name so that we don't have gaps in the image_id
        # (after doing train/test split, image ids are not contiguous within train and test)

        # Make new image id, for test set this is just the index
        new_image_id = str(idx)
        # Make new filename from image id e.g. "000/test-000000.jpg"
        # (idx // 1000 buckets at most 1000 test images per subdirectory)
        new_file_name = f"{idx // 1000:03d}/test-{idx:06d}.jpg"

        new_annotation = annotation_image["annotation"].copy()
        new_annotation["image_id"] = new_image_id
        new_test_metadata["annotations"].append(new_annotation)

        new_image = annotation_image["image"].copy()
        new_image["image_id"] = new_image_id
        new_image["file_name"] = new_file_name
        new_test_metadata["images"].append(new_image)

        # Copy file from raw to public
        src_path = raw / "train_images" / annotation_image["image"]["file_name"]
        dst_path = public / "test_images" / new_file_name
        dst_path.parent.mkdir(exist_ok=True, parents=True)
        shutil.copyfile(src=src_path, dst=dst_path)

    # Save new test metadata
    with open(public / "test_metadata.json", "w") as f:
        # The public data only contains the image metadata, not the annotations nor anything else
        json.dump(new_test_metadata["images"], f, indent=4, sort_keys=True)

    assert len(list((public / "test_images").glob("**/*.jpg"))) == len(
        new_test_metadata["images"]
    ), (
        f"Expected {len(new_test_metadata['images'])} images in test_images, but found"
        f"{len(list((public / 'test_images').glob('**/*.jpg')))}"
    )
    assert len(new_test_metadata["annotations"]) == len(new_test_metadata["images"]), (
        f"Mismatching number of annotations ({len(new_test_metadata['annotations'])}) "
        f"and images ({len(new_test_metadata['images'])})"
    )
    # Every original sample must land in exactly one of train/test.
    assert len(new_train_metadata["annotations"]) + len(new_test_metadata["annotations"]) == len(
        old_train_metadata["annotations"]
    ), (
        f"Expected {len(old_train_metadata['annotations'])} annotations in total, but found"
        f"{len(new_train_metadata['annotations'])} in train and {len(new_test_metadata['annotations'])} in test"
    )

    # Save private test answers
    answers_rows = []
    for image, annotation in zip(new_test_metadata["images"], new_test_metadata["annotations"]):
        assert image["image_id"] == annotation["image_id"]
        answers_rows.append(
            {
                "Id": image["image_id"],
                "Predicted": annotation["category_id"],
            }
        )
    answers_df = pd.DataFrame(answers_rows)
    answers_df.to_csv(private / "answers.csv", index=False)

    # Create new sample submission that matches raw/sample_submission.csv, but for the new test set
    # ("Predicted": 42 is just a placeholder value competitors will overwrite)
    sample_rows = []
    for image in new_test_metadata["images"]:
        sample_rows.append(
            {
                "Id": image["image_id"],
                "Predicted": 42,
            }
        )
    sample_df = pd.DataFrame(sample_rows)
    sample_df.to_csv(public / "sample_submission.csv", index=False)

    assert len(answers_df) == len(
        new_test_metadata["images"]
    ), f"Expected {len(new_test_metadata['images'])} rows in answers, but found {len(answers_df)}"
    assert len(sample_df) == len(
        answers_df
    ), f"Expected {len(answers_df)} rows in sample submission, but found {len(sample_df)}"
    assert answers_df["Id"].equals(
        sample_df["Id"]
    ), "Mismatched 'Id' columns between answers and sample submission"
|