labelr 0.10.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/check.py CHANGED
@@ -1,30 +1,64 @@
1
+ import typing
1
2
  from collections import defaultdict
2
3
  from pathlib import Path
3
4
 
4
5
  import imagehash
5
6
  import tqdm
6
7
  from label_studio_sdk.client import LabelStudio
7
- from openfoodfacts.utils import get_image_from_url, get_logger
8
+ from openfoodfacts.types import JSONType
9
+ from openfoodfacts.utils import ImageDownloadItem, get_image_from_url, get_logger
8
10
  from PIL import Image
9
11
 
10
12
  logger = get_logger(__name__)
11
13
 
12
14
 
13
- def check_ls_dataset(ls: LabelStudio, project_id: int):
15
+ def check_ls_dataset(
16
+ ls: LabelStudio,
17
+ project_id: int,
18
+ view_id: int | None = None,
19
+ delete_missing_images: bool = False,
20
+ delete_duplicate_images: bool = False,
21
+ ):
22
+ """Perform sanity checks of a Label Studio dataset.
23
+
24
+ This function checks for:
25
+ - Tasks with missing images (404)
26
+ - Duplicate images based on perceptual hash (pHash)
27
+ - Tasks with multiple annotations
28
+
29
+ This function doesn't modify the dataset, except for optionally
30
+ deleting tasks with missing images (if `delete_missing_images` is True)
31
+ or tasks with duplicate images (if `delete_duplicate_images` is True).
32
+
33
+ Args:
34
+ ls (LabelStudio): Label Studio client instance.
35
+ project_id (int): ID of the Label Studio project to check.
36
+ view_id (int | None): ID of the Label Studio view to check. If None,
37
+ no filtering is done.
38
+ delete_missing_images (bool): Whether to delete tasks with missing
39
+ images.
40
+ delete_duplicate_images (bool): Whether to delete tasks with duplicate
41
+ images. If one task has annotations and the other doesn't, the task
42
+ with annotations will be kept. Otherwise, the most recent task will
43
+ be kept.
44
+ """
14
45
  skipped = 0
15
46
  not_annotated = 0
16
47
  annotated = 0
48
+ deleted = 0
49
+ multiple_annotations = 0
17
50
  hash_map = defaultdict(list)
18
51
  for task in tqdm.tqdm(
19
- ls.tasks.list(project=project_id, fields="all"), desc="tasks"
52
+ ls.tasks.list(project=project_id, fields="all", view=view_id), desc="tasks"
20
53
  ):
21
- annotations = task.annotations
54
+ annotations = typing.cast(list[JSONType], task.annotations)
22
55
 
23
56
  if len(annotations) == 0:
24
57
  not_annotated += 1
25
58
  continue
26
59
  elif len(annotations) > 1:
27
60
  logger.warning("Task has multiple annotations: %s", task.id)
61
+ multiple_annotations += 1
28
62
  continue
29
63
 
30
64
  annotation = annotations[0]
@@ -34,20 +68,47 @@ def check_ls_dataset(ls: LabelStudio, project_id: int):
34
68
 
35
69
  annotated += 1
36
70
  image_url = task.data["image_url"]
37
- image = get_image_from_url(image_url)
38
- image_hash = str(imagehash.phash(image))
71
+ image_struct = typing.cast(
72
+ ImageDownloadItem,
73
+ get_image_from_url(image_url, return_struct=True, error_raise=False),
74
+ )
75
+
76
+ if image_struct.response.status_code == 404:
77
+ logger.warning("Image not found (404): %s", image_url)
78
+
79
+ if delete_missing_images:
80
+ ls.tasks.delete(task.id)
81
+ deleted += 1
82
+ logger.info("Deleted task with missing image: %s", task.id)
83
+ continue
84
+
85
+ if image_struct.image is None:
86
+ logger.warning("Could not open image: %s", image_url)
87
+ continue
88
+
89
+ image_hash = str(imagehash.phash(image_struct.image))
39
90
  hash_map[image_hash].append(task.id)
40
91
 
41
92
  for image_hash, task_ids in hash_map.items():
42
93
  if len(task_ids) > 1:
43
94
  logger.warning("Duplicate images: %s", task_ids)
95
+ if delete_duplicate_images:
96
+ tasks = [ls.tasks.get(id=task_id) for task_id in task_ids]
97
+ # We sort the tasks by the number of annotations, so that we keep the
98
+ # one with at least one annotation.
99
+ for task in sorted(tasks, key=lambda x: len(x.annotations) > 0)[:-1]:
100
+ logger.info("Deleting duplicate task: %s", task.id)
101
+ ls.tasks.delete(task.id)
102
+ deleted += 1
44
103
 
45
104
  logger.info(
46
- "Tasks - annotated: %d, skipped: %d, not annotated: %d",
105
+ "Tasks - annotated: %d, skipped: %d, not annotated: %d, multiple annotations: %d",
47
106
  annotated,
48
107
  skipped,
49
108
  not_annotated,
109
+ multiple_annotations,
50
110
  )
111
+ logger.info("Deleted tasks with missing images: %d", deleted)
51
112
 
52
113
 
53
114
  def check_local_dataset(dataset_dir: Path, remove: bool = False):
labelr/config.py CHANGED
@@ -1 +1,57 @@
1
- LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
1
+ from pathlib import Path
2
+
3
+ from pydantic import BaseModel, Field
4
+ import os
5
+
6
+ CONFIG_PATH = Path("~").expanduser() / ".config/labelr/config.json"
7
+
8
+
9
+ # validate_assignment makes pydantic re-validate the model every time it is updated
10
+ class LabelrConfig(BaseModel, validate_assignment=True):
11
+ label_studio_url: str = Field(
12
+ default="http://127.0.0.1:8080",
13
+ description="URL of the Label Studio instance to use. Defaults to http://127.0.0.1:8080.",
14
+ )
15
+ label_studio_api_key: str | None = Field(
16
+ default=None,
17
+ description="API key for Label Studio.",
18
+ )
19
+
20
+
21
+ def get_config() -> LabelrConfig:
22
+ """Get labelr configuration.
23
+
24
+ The configuration can come from (by order of precedence):
25
+ - Environment variables
26
+ - JSON file (see below)
27
+
28
+ The configuration is stored in a JSON file at ~/.config/labelr/config.json.
29
+
30
+ The following environment variables are supported:
31
+ - LABELR_LABEL_STUDIO_URL
32
+ - LABELR_LABEL_STUDIO_API_KEY
33
+ """
34
+ if CONFIG_PATH.exists():
35
+ config = LabelrConfig.model_validate_json(CONFIG_PATH.read_bytes())
36
+
37
+ if "LABELR_LABEL_STUDIO_URL" in os.environ:
38
+ config.label_studio_url = os.environ["LABELR_LABEL_STUDIO_URL"]
39
+ if "LABELR_LABEL_STUDIO_API_KEY" in os.environ:
40
+ config.label_studio_api_key = os.environ["LABELR_LABEL_STUDIO_API_KEY"]
41
+ return config
42
+ else:
43
+ return LabelrConfig()
44
+
45
+
46
+ def set_file_config(key: str, value: str):
47
+ """Update the labelr configuration.
48
+
49
+ The configuration is stored in a JSON file at ~/.config/labelr/config.json.
50
+ """
51
+ config = get_config()
52
+ setattr(config, key, value)
53
+ CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
54
+ CONFIG_PATH.write_text(config.model_dump_json(indent=2))
55
+
56
+
57
+ config = get_config()
@@ -1,18 +1,21 @@
1
1
  import functools
2
2
  import logging
3
3
  import pickle
4
+ import random
4
5
  import tempfile
6
+ import typing
5
7
  from pathlib import Path
6
8
 
7
9
  import datasets
8
10
  import tqdm
9
11
  from label_studio_sdk.client import LabelStudio
10
12
  from openfoodfacts.images import download_image
13
+ from PIL import Image, ImageOps
11
14
 
12
15
  from labelr.export.common import _pickle_sample_generator
13
16
  from labelr.sample.object_detection import (
14
- HF_DS_OBJECT_DETECTION_FEATURES,
15
17
  format_object_detection_sample_to_hf,
18
+ get_hf_object_detection_features,
16
19
  )
17
20
 
18
21
  logger = logging.getLogger(__name__)
@@ -23,19 +26,47 @@ def export_from_ls_to_hf_object_detection(
23
26
  repo_id: str,
24
27
  label_names: list[str],
25
28
  project_id: int,
29
+ is_openfoodfacts_dataset: bool,
30
+ image_max_size: int | None = None,
31
+ view_id: int | None = None,
26
32
  merge_labels: bool = False,
27
33
  use_aws_cache: bool = True,
28
34
  revision: str = "main",
29
- ):
35
+ ) -> None:
36
+ """Export annotations from a Label Studio project to a Hugging Face
37
+ dataset.
38
+
39
+ The Label Studio project should be an object detection project.
40
+
41
+ Args:
42
+ ls (LabelStudio): Label Studio client instance.
43
+ repo_id (str): Hugging Face repository ID to push the dataset to.
44
+ label_names (list[str]): List of label names in the project.
45
+ project_id (int): Label Studio project ID to export from.
46
+ is_openfoodfacts_dataset (bool): Whether the dataset is an Open Food
47
+ Facts dataset. If True, the dataset will include additional
48
+ metadata fields specific to Open Food Facts (`barcode` and
49
+ `off_image_id`).
50
+ image_max_size (int | None): Maximum size (in pixels) for the images.
51
+ If None, no resizing is performed. Defaults to None.
52
+ view_id (int | None): Label Studio view ID to export from. If None,
53
+ all tasks are exported. Defaults to None.
54
+ merge_labels (bool): Whether to merge all labels into a single label
55
+ named "object". Defaults to False.
56
+ use_aws_cache (bool): Whether to use the AWS image cache when
57
+ downloading images. Defaults to True.
58
+ revision (str): The dataset revision to push to. Defaults to 'main'.
59
+ """
30
60
  if merge_labels:
31
61
  label_names = ["object"]
32
62
 
33
63
  logger.info(
34
- "Project ID: %d, label names: %s, repo_id: %s, revision: %s",
64
+ "Project ID: %d, label names: %s, repo_id: %s, revision: %s, view ID: %s",
35
65
  project_id,
36
66
  label_names,
37
67
  repo_id,
38
68
  revision,
69
+ view_id,
39
70
  )
40
71
 
41
72
  for split in ["train", "val"]:
@@ -45,7 +76,9 @@ def export_from_ls_to_hf_object_detection(
45
76
  tmp_dir = Path(tmp_dir_str)
46
77
  logger.info("Saving samples to temporary directory: %s", tmp_dir)
47
78
  for i, task in tqdm.tqdm(
48
- enumerate(ls.tasks.list(project=project_id, fields="all")),
79
+ enumerate(
80
+ ls.tasks.list(project=project_id, fields="all", view=view_id)
81
+ ),
49
82
  desc="tasks",
50
83
  ):
51
84
  if task.data["split"] != split:
@@ -56,15 +89,17 @@ def export_from_ls_to_hf_object_detection(
56
89
  label_names=label_names,
57
90
  merge_labels=merge_labels,
58
91
  use_aws_cache=use_aws_cache,
92
+ image_max_size=image_max_size,
59
93
  )
60
94
  if sample is not None:
61
95
  # Save output as pickle
62
96
  with open(tmp_dir / f"{split}_{i:05}.pkl", "wb") as f:
63
97
  pickle.dump(sample, f)
64
98
 
99
+ features = get_hf_object_detection_features(is_openfoodfacts_dataset)
65
100
  hf_ds = datasets.Dataset.from_generator(
66
101
  functools.partial(_pickle_sample_generator, tmp_dir),
67
- features=HF_DS_OBJECT_DETECTION_FEATURES,
102
+ features=features,
68
103
  )
69
104
  hf_ds.push_to_hub(repo_id, split=split, revision=revision)
70
105
 
@@ -78,12 +113,32 @@ def export_from_ls_to_ultralytics_object_detection(
78
113
  error_raise: bool = True,
79
114
  merge_labels: bool = False,
80
115
  use_aws_cache: bool = True,
116
+ view_id: int | None = None,
117
+ image_max_size: int | None = None,
81
118
  ):
82
119
  """Export annotations from a Label Studio project to the Ultralytics
83
120
  format.
84
121
 
85
122
  The Label Studio project should be an object detection project with a
86
123
  single rectanglelabels annotation result per task.
124
+
125
+ Args:
126
+ ls (LabelStudio): Label Studio client instance.
127
+ output_dir (Path): Path to the output directory.
128
+ label_names (list[str]): List of label names in the project.
129
+ project_id (int): Label Studio project ID to export from.
130
+ train_ratio (float): Ratio of training samples. The rest will be used
131
+ for validation. Defaults to 0.8.
132
+ error_raise (bool): Whether to raise an error if an image fails to
133
+ download. If False, the image will be skipped. Defaults to True.
134
+ merge_labels (bool): Whether to merge all labels into a single label
135
+ named "object". Defaults to False.
136
+ use_aws_cache (bool): Whether to use the AWS image cache when
137
+ downloading images. Defaults to True.
138
+ view_id (int | None): Label Studio view ID to export from. If None,
139
+ all tasks are exported. Defaults to None.
140
+ image_max_size (int | None): Maximum size (in pixels) for the images.
141
+ If None, no resizing is performed. Defaults to None.
87
142
  """
88
143
  if merge_labels:
89
144
  label_names = ["object"]
@@ -101,7 +156,7 @@ def export_from_ls_to_ultralytics_object_detection(
101
156
  (images_dir / split).mkdir(parents=True, exist_ok=True)
102
157
 
103
158
  for task in tqdm.tqdm(
104
- ls.tasks.list(project=project_id, fields="all"),
159
+ ls.tasks.list(project=project_id, fields="all", view=view_id),
105
160
  desc="tasks",
106
161
  ):
107
162
  split = task.data.get("split")
@@ -179,18 +234,28 @@ def export_from_ls_to_ultralytics_object_detection(
179
234
  has_valid_annotation = True
180
235
 
181
236
  if has_valid_annotation:
182
- download_output = download_image(
237
+ image = download_image(
183
238
  image_url,
184
- return_struct=True,
239
+ return_struct=False,
185
240
  error_raise=error_raise,
186
241
  use_cache=use_aws_cache,
187
242
  )
188
- if download_output is None:
243
+ if image is None:
189
244
  logger.error("Failed to download image: %s", image_url)
190
245
  continue
191
246
 
192
- with (images_dir / split / f"{image_id}.jpg").open("wb") as f:
193
- f.write(download_output.image_bytes)
247
+ image = typing.cast(Image.Image, image)
248
+
249
+ # Rotate image according to exif orientation using Pillow
250
+ ImageOps.exif_transpose(image, in_place=True)
251
+ # Resize image if larger than max size
252
+ if image_max_size is not None and (
253
+ image.width > image_max_size or image.height > image_max_size
254
+ ):
255
+ image.thumbnail(
256
+ (image_max_size, image_max_size), Image.Resampling.LANCZOS
257
+ )
258
+ image.save(images_dir / split / f"{image_id}.jpg", format="JPEG")
194
259
 
195
260
  with (output_dir / "data.yaml").open("w") as f:
196
261
  f.write("path: data\n")
@@ -208,6 +273,7 @@ def export_from_hf_to_ultralytics_object_detection(
208
273
  download_images: bool = True,
209
274
  error_raise: bool = True,
210
275
  use_aws_cache: bool = True,
276
+ image_max_size: int | None = None,
211
277
  revision: str = "main",
212
278
  ):
213
279
  """Export annotations from a Hugging Face dataset project to the
@@ -228,6 +294,8 @@ def export_from_hf_to_ultralytics_object_detection(
228
294
  use_aws_cache (bool): Whether to use the AWS image cache when
229
295
  downloading images. This option is only used if `download_images`
230
296
  is True. Defaults to True.
297
+ image_max_size (int | None): Maximum size (in pixels) for the images.
298
+ If None, no resizing is performed. Defaults to None.
231
299
  revision (str): The dataset revision to load. Defaults to 'main'.
232
300
  """
233
301
  logger.info("Repo ID: %s, revision: %s", repo_id, revision)
@@ -263,21 +331,31 @@ def export_from_hf_to_ultralytics_object_detection(
263
331
  "`download_images` to False."
264
332
  )
265
333
  image_url = sample["meta"]["image_url"]
266
- download_output = download_image(
334
+ image = download_image(
267
335
  image_url,
268
- return_struct=True,
336
+ return_struct=False,
269
337
  error_raise=error_raise,
270
338
  use_cache=use_aws_cache,
271
339
  )
272
- if download_output is None:
340
+ if image is None:
273
341
  logger.error("Failed to download image: %s", image_url)
274
342
  continue
275
-
276
- with (split_images_dir / f"{image_id}.jpg").open("wb") as f:
277
- f.write(download_output.image_bytes)
278
343
  else:
279
344
  image = sample["image"]
280
- image.save(split_images_dir / f"{image_id}.jpg")
345
+
346
+ image = typing.cast(Image.Image, image)
347
+ # Rotate image according to exif orientation using Pillow
348
+ # If the image source is Hugging Face, EXIF data is not preserved,
349
+ # so this step is only useful when downloading images.
350
+ ImageOps.exif_transpose(image, in_place=True)
351
+ # Resize image if larger than max size
352
+ if image_max_size is not None and (
353
+ image.width > image_max_size or image.height > image_max_size
354
+ ):
355
+ image.thumbnail(
356
+ (image_max_size, image_max_size), Image.Resampling.LANCZOS
357
+ )
358
+ image.save(split_images_dir / f"{image_id}.jpg")
281
359
 
282
360
  objects = sample["objects"]
283
361
  bboxes = objects["bbox"]
labelr/main.py CHANGED
@@ -4,11 +4,13 @@ import typer
4
4
  from openfoodfacts.utils import get_logger
5
5
 
6
6
  from labelr.apps import datasets as dataset_app
7
+ from labelr.apps import directus as directus_app
7
8
  from labelr.apps import evaluate as evaluate_app
8
9
  from labelr.apps import google_batch as google_batch_app
9
10
  from labelr.apps import hugging_face as hf_app
10
11
  from labelr.apps import label_studio as ls_app
11
12
  from labelr.apps import train as train_app
13
+ from labelr import config as _config
12
14
 
13
15
  app = typer.Typer(pretty_exceptions_show_locals=False)
14
16
 
@@ -60,6 +62,17 @@ def predict(
60
62
  typer.echo(result)
61
63
 
62
64
 
65
+ @app.command()
66
+ def config(name: str, value: str):
67
+ """Set a Labelr configuration value.
68
+
69
+ The configuration is stored in a JSON file at ~/.config/labelr/config.json.
70
+ """
71
+ typer.echo(f"Set '{name}' to '{value}'")
72
+ _config.set_file_config(name, value)
73
+ typer.echo(f"Configuration saved to {_config.CONFIG_PATH}")
74
+
75
+
63
76
  app.add_typer(
64
77
  ls_app.app,
65
78
  name="ls",
@@ -90,6 +103,9 @@ app.add_typer(
90
103
  name="google-batch",
91
104
  help="Generate datasets and launch batch jobs on Google Gemini.",
92
105
  )
106
+ app.add_typer(
107
+ directus_app.app, name="directus", help="Manage directus collections and items."
108
+ )
93
109
 
94
110
  if __name__ == "__main__":
95
111
  app()
@@ -8,7 +8,7 @@ import PIL
8
8
  from openfoodfacts import Flavor
9
9
  from openfoodfacts.barcode import normalize_barcode
10
10
  from openfoodfacts.images import download_image, generate_image_url
11
- from PIL import ImageOps
11
+ from PIL import Image, ImageOps
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
@@ -153,6 +153,7 @@ def format_object_detection_sample_to_hf(
153
153
  label_names: list[str],
154
154
  merge_labels: bool = False,
155
155
  use_aws_cache: bool = False,
156
+ image_max_size: int | None = None,
156
157
  ) -> dict | None:
157
158
  """Format a Label Studio object detection sample to Hugging Face format.
158
159
 
@@ -163,6 +164,8 @@ def format_object_detection_sample_to_hf(
163
164
  merge_labels: Whether to merge all labels into a single label (the
164
165
  first label in `label_names`).
165
166
  use_aws_cache: Whether to use AWS cache when downloading images.
167
+ image_max_size: Maximum size (in pixels) for the images.
168
+ If None, no resizing is performed. Defaults to None.
166
169
 
167
170
  Returns:
168
171
  The formatted sample, or None in the following cases:
@@ -184,7 +187,8 @@ def format_object_detection_sample_to_hf(
184
187
 
185
188
  for annotation_result in annotation["result"]:
186
189
  if annotation_result["type"] != "rectanglelabels":
187
- raise ValueError("Invalid annotation type: %s" % annotation_result["type"])
190
+ continue
191
+ # raise ValueError("Invalid annotation type: %s" % annotation_result["type"])
188
192
 
189
193
  value = annotation_result["value"]
190
194
  x_min = value["x"] / 100
@@ -205,21 +209,34 @@ def format_object_detection_sample_to_hf(
205
209
  logger.error("Failed to download image: %s", image_url)
206
210
  return None
207
211
 
212
+ image = typing.cast(Image.Image, image)
208
213
  # Correct image orientation using EXIF data
209
214
  # Label Studio provides bounding boxes based on the displayed image (after
210
215
  # eventual EXIF rotation), so we need to apply the same transformation to
211
216
  # the image.
212
217
  # Indeed, Hugging Face stores images without applying EXIF rotation, and
213
218
  # EXIF data is not preserved in the dataset.
214
- ImageOps.exif_transpose(typing.cast(PIL.Image.Image, image), in_place=True)
219
+ ImageOps.exif_transpose(image, in_place=True)
220
+
221
+ # Resize image if larger than max size
222
+ if image_max_size is not None and (
223
+ image.width > image_max_size or image.height > image_max_size
224
+ ):
225
+ image.thumbnail((image_max_size, image_max_size), Image.Resampling.LANCZOS)
226
+
227
+ meta = task_data.get("meta", {})
228
+ barcode = meta.get("barcode", None)
229
+ off_image_id = meta.get("off_image_id", None)
230
+ width = image.width
231
+ height = image.height
215
232
  return {
216
233
  "image_id": task_data["image_id"],
217
234
  "image": image,
218
- "width": task_data["meta"]["width"],
219
- "height": task_data["meta"]["height"],
235
+ "width": width,
236
+ "height": height,
220
237
  "meta": {
221
- "barcode": task_data["meta"]["barcode"],
222
- "off_image_id": task_data["meta"]["off_image_id"],
238
+ "barcode": barcode,
239
+ "off_image_id": off_image_id,
223
240
  "image_url": image_url,
224
241
  },
225
242
  "objects": {
@@ -230,16 +247,23 @@ def format_object_detection_sample_to_hf(
230
247
  }
231
248
 
232
249
 
233
- # The HuggingFace Dataset features
234
- HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
235
- {
250
+ def get_hf_object_detection_features(
251
+ is_openfoodfacts_dataset: bool,
252
+ ) -> datasets.Features:
253
+ """Get the HuggingFace Dataset features for object detection.
254
+
255
+ Args:
256
+ is_openfoodfacts_dataset (bool): Whether the dataset is an Open Food
257
+ Facts dataset. If True, the dataset will include additional
258
+ metadata fields specific to Open Food Facts (`barcode` and
259
+ `off_image_id`).
260
+ """
261
+ features_dict = {
236
262
  "image_id": datasets.Value("string"),
237
263
  "image": datasets.features.Image(),
238
264
  "width": datasets.Value("int64"),
239
265
  "height": datasets.Value("int64"),
240
266
  "meta": {
241
- "barcode": datasets.Value("string"),
242
- "off_image_id": datasets.Value("string"),
243
267
  "image_url": datasets.Value("string"),
244
268
  },
245
269
  "objects": {
@@ -248,4 +272,9 @@ HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
248
272
  "category_name": datasets.Sequence(datasets.Value("string")),
249
273
  },
250
274
  }
251
- )
275
+
276
+ if is_openfoodfacts_dataset:
277
+ features_dict["meta"]["barcode"] = datasets.Value("string")
278
+ features_dict["meta"]["off_image_id"] = datasets.Value("string")
279
+
280
+ return datasets.Features(features_dict)