PyPI - labelr - Versions diffs - 0.3.0__tar.gz → 0.4.1__tar.gz - Mend

labelr 0.3.0.tar.gz → 0.4.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{labelr-0.3.0/src/labelr.egg-info → labelr-0.4.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: labelr
-Version: 0.3.0
+Version: 0.4.1
 Summary: A command-line tool to manage labeling tasks with Label Studio.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown

{labelr-0.3.0 → labelr-0.4.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "labelr"
-version = "0.3.0"
+version = "0.4.1"
 description = "A command-line tool to manage labeling tasks with Label Studio."
 readme = "README.md"
 requires-python = ">=3.10"

{labelr-0.3.0 → labelr-0.4.1}/src/labelr/apps/datasets.py RENAMED Viewed

@@ -6,8 +6,11 @@ from pathlib import Path
 from typing import Annotated, Optional
 import typer
+from openfoodfacts import Flavor
 from openfoodfacts.utils import get_logger
+from labelr.export import export_from_ultralytics_to_hf
 from ..config import LABEL_STUDIO_DEFAULT_URL
 from ..types import ExportDestination, ExportSource, TaskType
@@ -130,6 +133,9 @@ def export(
     from_: Annotated[ExportSource, typer.Option("--from", help="Input source to use")],
     to: Annotated[ExportDestination, typer.Option(help="Where to export the data")],
     api_key: Annotated[Optional[str], typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    task_type: Annotated[
+        TaskType, typer.Option(help="Type of task to export")
+    ] = TaskType.object_detection,
     repo_id: Annotated[
         Optional[str],
         typer.Option(
@@ -148,12 +154,33 @@ def export(
         Optional[Path],
         typer.Option(help="Path to the output directory", file_okay=False),
     ] = None,
+    dataset_dir: Annotated[
+        Optional[Path],
+        typer.Option(help="Path to the dataset directory, only for Ultralytics source"),
+    ] = None,
     download_images: Annotated[
         bool,
         typer.Option(
             help="if True, don't use HF images and download images from the server"
         ),
     ] = False,
+    is_openfoodfacts_dataset: Annotated[
+        bool,
+        typer.Option(
+            help="Whether the Ultralytics dataset is an OpenFoodFacts dataset, only "
+            "for Ultralytics source. This is used to generate the correct image URLs "
+            "each image name."
+        ),
+    ] = True,
+    openfoodfacts_flavor: Annotated[
+        Flavor,
+        typer.Option(
+            help="Flavor of the Open Food Facts dataset to use for image URLs, only "
+            "for Ultralytics source if is_openfoodfacts_dataset is True. This is used to "
+            "generate the correct image URLs each image name. This option is ignored if "
+            "is_openfoodfacts_dataset is False."
+        ),
+    ] = Flavor.off,
     train_ratio: Annotated[
         float,
         typer.Option(
@@ -167,20 +194,38 @@ def export(
             help="Raise an error if an image download fails, only for Ultralytics"
         ),
     ] = True,
+    use_aws_cache: Annotated[
+        bool,
+        typer.Option(
+            help="Use the AWS S3 cache for image downloads instead of images.openfoodfacts.org, "
+            "it is ignored if the export format is not Ultralytics"
+        ),
+    ] = True,
+    merge_labels: Annotated[
+        bool,
+        typer.Option(help="Merge multiple labels into a single label"),
+    ] = False,
 ):
     """Export Label Studio annotation, either to Hugging Face Datasets or
     local files (ultralytics format)."""
     from label_studio_sdk.client import LabelStudio
     from labelr.export import (
-        export_from_hf_to_ultralytics,
-        export_from_ls_to_hf,
-        export_from_ls_to_ultralytics,
+        export_from_hf_to_ultralytics_object_detection,
+        export_from_ls_to_hf_object_detection,
+        export_from_ls_to_ultralytics_object_detection,
     )
     if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
         raise typer.BadParameter("Repository ID is required for export/import with HF")
+    if from_ == ExportSource.ultralytics and dataset_dir is None:
+        raise typer.BadParameter(
+            "Dataset directory is required for export from Ultralytics source"
+        )
+    label_names_list: list[str] | None = None
     if label_names is None:
         if to == ExportDestination.hf:
             raise typer.BadParameter("Label names are required for HF export")
@@ -188,6 +233,9 @@ def export(
             raise typer.BadParameter(
                 "Label names are required for export from LS source"
             )
+    else:
+        label_names = typing.cast(str, label_names)
+        label_names_list = label_names.split(",")
     if from_ == ExportSource.ls:
         if project_id is None:
@@ -199,31 +247,60 @@ def export(
         raise typer.BadParameter("Output directory is required for Ultralytics export")
     if from_ == ExportSource.ls:
+        if task_type != TaskType.object_detection:
+            raise typer.BadParameter(
+                "Only object detection task is currently supported with LS source"
+            )
         ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
-        label_names = typing.cast(str, label_names)
-        label_names_list = label_names.split(",")
         if to == ExportDestination.hf:
             repo_id = typing.cast(str, repo_id)
-            export_from_ls_to_hf(
-                ls, repo_id, label_names_list, typing.cast(int, project_id)
+            export_from_ls_to_hf_object_detection(
+                ls,
+                repo_id=repo_id,
+                label_names=typing.cast(list[str], label_names_list),
+                project_id=typing.cast(int, project_id),
+                merge_labels=merge_labels,
+                use_aws_cache=use_aws_cache,
             )
         elif to == ExportDestination.ultralytics:
-            export_from_ls_to_ultralytics(
+            export_from_ls_to_ultralytics_object_detection(
                 ls,
                 typing.cast(Path, output_dir),
-                label_names_list,
+                typing.cast(list[str], label_names_list),
                 typing.cast(int, project_id),
                 train_ratio=train_ratio,
                 error_raise=error_raise,
+                merge_labels=merge_labels,
+                use_aws_cache=use_aws_cache,
             )
     elif from_ == ExportSource.hf:
+        if task_type != TaskType.object_detection:
+            raise typer.BadParameter(
+                "Only object detection task is currently supported with HF source"
+            )
         if to == ExportDestination.ultralytics:
-            export_from_hf_to_ultralytics(
+            export_from_hf_to_ultralytics_object_detection(
                 typing.cast(str, repo_id),
                 typing.cast(Path, output_dir),
                 download_images=download_images,
                 error_raise=error_raise,
+                use_aws_cache=use_aws_cache,
             )
         else:
             raise typer.BadParameter("Unsupported export format")
+    elif from_ == ExportSource.ultralytics:
+        if task_type != TaskType.classification:
+            raise typer.BadParameter(
+                "Only classification task is currently supported with Ultralytics source"
+            )
+        if to == ExportDestination.hf:
+            export_from_ultralytics_to_hf(
+                task_type=task_type,
+                dataset_dir=typing.cast(Path, dataset_dir),
+                repo_id=typing.cast(str, repo_id),
+                merge_labels=merge_labels,
+                label_names=typing.cast(list[str], label_names_list),
+                is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+                openfoodfacts_flavor=openfoodfacts_flavor,
+            )

{labelr-0.3.0 → labelr-0.4.1}/src/labelr/apps/projects.py RENAMED Viewed

@@ -90,6 +90,8 @@ def add_split(
     train_split: Annotated[
         float, typer.Option(help="fraction of samples to add in train split")
     ],
+    api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    project_id: Annotated[int, typer.Option(help="Label Studio project ID")],
     split_name: Annotated[
         Optional[str],
         typer.Option(
@@ -97,9 +99,7 @@ def add_split(
             "with the task ID file. If --task-id-file is not provided, "
             "this field is ignored."
         ),
-    ],
-    api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
-    project_id: Annotated[int, typer.Option(help="Label Studio project ID")],
+    ] = None,
     train_split_name: Annotated[
         str,
         typer.Option(help="name of the train split"),

{labelr-0.3.0 → labelr-0.4.1}/src/labelr/export.py RENAMED Viewed

@@ -3,16 +3,21 @@ import logging
 import pickle
 import random
 import tempfile
-import typing
 from pathlib import Path
 import datasets
 import tqdm
 from label_studio_sdk.client import LabelStudio
-from openfoodfacts.images import download_image
-from PIL import Image
+from openfoodfacts.images import download_image, generate_image_url
+from openfoodfacts.types import Flavor
+from PIL import Image, ImageOps
-from labelr.sample import HF_DS_FEATURES, format_object_detection_sample_to_hf
+from labelr.sample import (
+    HF_DS_CLASSIFICATION_FEATURES,
+    HF_DS_OBJECT_DETECTION_FEATURES,
+    format_object_detection_sample_to_hf,
+)
+from labelr.types import TaskType
 logger = logging.getLogger(__name__)
@@ -24,13 +29,18 @@ def _pickle_sample_generator(dir: Path):
             yield pickle.load(f)
-def export_from_ls_to_hf(
+def export_from_ls_to_hf_object_detection(
     ls: LabelStudio,
     repo_id: str,
-    category_names: list[str],
+    label_names: list[str],
     project_id: int,
+    merge_labels: bool = False,
+    use_aws_cache: bool = True,
 ):
-    logger.info("Project ID: %d, category names: %s", project_id, category_names)
+    if merge_labels:
+        label_names = ["object"]
+    logger.info("Project ID: %d, label names: %s", project_id, label_names)
     for split in ["train", "val"]:
         logger.info("Processing split: %s", split)
@@ -45,7 +55,11 @@ def export_from_ls_to_hf(
                 if task.data["split"] != split:
                     continue
                 sample = format_object_detection_sample_to_hf(
-                    task.data, task.annotations, category_names
+                    task_data=task.data,
+                    annotations=task.annotations,
+                    label_names=label_names,
+                    merge_labels=merge_labels,
+                    use_aws_cache=use_aws_cache,
                 )
                 if sample is not None:
                     # Save output as pickle
@@ -54,18 +68,20 @@ def export_from_ls_to_hf(
             hf_ds = datasets.Dataset.from_generator(
                 functools.partial(_pickle_sample_generator, tmp_dir),
-                features=HF_DS_FEATURES,
+                features=HF_DS_OBJECT_DETECTION_FEATURES,
             )
             hf_ds.push_to_hub(repo_id, split=split)
-def export_from_ls_to_ultralytics(
+def export_from_ls_to_ultralytics_object_detection(
     ls: LabelStudio,
     output_dir: Path,
-    category_names: list[str],
+    label_names: list[str],
     project_id: int,
     train_ratio: float = 0.8,
     error_raise: bool = True,
+    merge_labels: bool = False,
+    use_aws_cache: bool = True,
 ):
     """Export annotations from a Label Studio project to the Ultralytics
     format.
@@ -73,7 +89,9 @@ def export_from_ls_to_ultralytics(
     The Label Studio project should be an object detection project with a
     single rectanglelabels annotation result per task.
     """
-    logger.info("Project ID: %d, category names: %s", project_id, category_names)
+    if merge_labels:
+        label_names = ["object"]
+    logger.info("Project ID: %d, label names: %s", project_id, label_names)
     data_dir = output_dir / "data"
     data_dir.mkdir(parents=True, exist_ok=True)
@@ -146,25 +164,30 @@ def export_from_ls_to_ultralytics(
                     y_min = value["y"] / 100
                     width = value["width"] / 100
                     height = value["height"] / 100
-                    category_name = value["rectanglelabels"][0]
-                    category_id = category_names.index(category_name)
+                    label_name = (
+                        label_names[0] if merge_labels else value["rectanglelabels"][0]
+                    )
+                    label_id = label_names.index(label_name)
                     # Save the labels in the Ultralytics format:
                     # - one label per line
                     # - each line is a list of 5 elements:
-                    #   - category_id
+                    #   - label_id
                     #   - x_center
                     #   - y_center
                     #   - width
                     #   - height
                     x_center = x_min + width / 2
                     y_center = y_min + height / 2
-                    f.write(f"{category_id} {x_center} {y_center} {width} {height}\n")
+                    f.write(f"{label_id} {x_center} {y_center} {width} {height}\n")
                     has_valid_annotation = True
         if has_valid_annotation:
             download_output = download_image(
-                image_url, return_struct=True, error_raise=error_raise
+                image_url,
+                return_struct=True,
+                error_raise=error_raise,
+                use_cache=use_aws_cache,
             )
             if download_output is None:
                 logger.error("Failed to download image: %s", image_url)
@@ -179,15 +202,16 @@ def export_from_ls_to_ultralytics(
         f.write("val: images/val\n")
         f.write("test:\n")
         f.write("names:\n")
-        for i, category_name in enumerate(category_names):
-            f.write(f"  {i}: {category_name}\n")
+        for i, label_name in enumerate(label_names):
+            f.write(f"  {i}: {label_name}\n")
-def export_from_hf_to_ultralytics(
+def export_from_hf_to_ultralytics_object_detection(
     repo_id: str,
     output_dir: Path,
     download_images: bool = True,
     error_raise: bool = True,
+    use_aws_cache: bool = True,
 ):
     """Export annotations from a Hugging Face dataset project to the
     Ultralytics format.
@@ -213,7 +237,10 @@ def export_from_hf_to_ultralytics(
             if download_images:
                 download_output = download_image(
-                    image_url, return_struct=True, error_raise=error_raise
+                    image_url,
+                    return_struct=True,
+                    error_raise=error_raise,
+                    use_cache=use_aws_cache,
                 )
                 if download_output is None:
                     logger.error("Failed to download image: %s", image_url)
@@ -266,3 +293,127 @@ def export_from_hf_to_ultralytics(
         f.write("names:\n")
         for i, category_name in enumerate(category_names):
             f.write(f"  {i}: {category_name}\n")
+def export_from_ultralytics_to_hf(
+    task_type: TaskType,
+    dataset_dir: Path,
+    repo_id: str,
+    label_names: list[str],
+    merge_labels: bool = False,
+    is_openfoodfacts_dataset: bool = False,
+    openfoodfacts_flavor: Flavor = Flavor.off,
+) -> None:
+    if task_type != TaskType.classification:
+        raise NotImplementedError(
+            "Only classification task is currently supported for Ultralytics to HF export"
+        )
+    if task_type == TaskType.classification:
+        export_from_ultralytics_to_hf_classification(
+            dataset_dir=dataset_dir,
+            repo_id=repo_id,
+            label_names=label_names,
+            merge_labels=merge_labels,
+            is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+            openfoodfacts_flavor=openfoodfacts_flavor,
+        )
+def export_from_ultralytics_to_hf_classification(
+    dataset_dir: Path,
+    repo_id: str,
+    label_names: list[str],
+    merge_labels: bool = False,
+    is_openfoodfacts_dataset: bool = False,
+    openfoodfacts_flavor: Flavor = Flavor.off,
+) -> None:
+    """Export an Ultralytics classification dataset to a Hugging Face dataset.
+    The Ultralytics dataset directory should contain 'train', 'val' and/or
+    'test' subdirectories, each containing subdirectories for each label.
+    Args:
+        dataset_dir (Path): Path to the Ultralytics dataset directory.
+        repo_id (str): Hugging Face repository ID to push the dataset to.
+        label_names (list[str]): List of label names.
+        merge_labels (bool): Whether to merge all labels into a single label
+            named 'object'.
+        is_openfoodfacts_dataset (bool): Whether the dataset is from
+            Open Food Facts. If True, the `off_image_id` and `image_url` will
+            be generated automatically. `off_image_id` is extracted from the
+            image filename.
+        openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
+            is ignored if `is_openfoodfacts_dataset` is False.
+    """
+    logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
+    if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
+        raise ValueError(
+            f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
+        )
+    # Save output as pickle
+    for split in ["train", "val", "test"]:
+        split_dir = dataset_dir / split
+        if not split_dir.is_dir():
+            logger.info("Skipping missing split directory: %s", split_dir)
+            continue
+        with tempfile.TemporaryDirectory() as tmp_dir_str:
+            tmp_dir = Path(tmp_dir_str)
+            for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
+                label_name = label_dir.name
+                if merge_labels:
+                    label_name = "object"
+                if label_name not in label_names:
+                    raise ValueError(
+                        "Label name %s not in provided label names (label names: %s)"
+                        % (label_name, label_names),
+                    )
+                label_id = label_names.index(label_name)
+                for image_path in label_dir.glob("*"):
+                    if is_openfoodfacts_dataset:
+                        image_stem_parts = image_path.stem.split("_")
+                        barcode = image_stem_parts[0]
+                        off_image_id = image_stem_parts[1]
+                        image_id = f"{barcode}_{off_image_id}"
+                        image_url = generate_image_url(
+                            barcode, off_image_id, flavor=openfoodfacts_flavor
+                        )
+                    else:
+                        image_id = image_path.stem
+                        barcode = ""
+                        off_image_id = ""
+                        image_url = ""
+                    image = Image.open(image_path)
+                    image.load()
+                    if image.mode != "RGB":
+                        image = image.convert("RGB")
+                    # Rotate image according to exif orientation using Pillow
+                    ImageOps.exif_transpose(image, in_place=True)
+                    sample = {
+                        "image_id": image_id,
+                        "image": image,
+                        "width": image.width,
+                        "height": image.height,
+                        "meta": {
+                            "barcode": barcode,
+                            "off_image_id": off_image_id,
+                            "image_url": image_url,
+                        },
+                        "category_id": label_id,
+                        "category_name": label_name,
+                    }
+                    with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                        pickle.dump(sample, f)
+            hf_ds = datasets.Dataset.from_generator(
+                functools.partial(_pickle_sample_generator, tmp_dir),
+                features=HF_DS_CLASSIFICATION_FEATURES,
+            )
+            hf_ds.push_to_hub(repo_id, split=split)

{labelr-0.3.0 → labelr-0.4.1}/src/labelr/sample.py RENAMED Viewed

@@ -145,7 +145,11 @@ def format_object_detection_sample_to_ls(
 def format_object_detection_sample_to_hf(
-    task_data: dict, annotations: list[dict], category_names: list[str]
+    task_data: dict,
+    annotations: list[dict],
+    label_names: list[str],
+    merge_labels: bool = False,
+    use_aws_cache: bool = True,
 ) -> dict | None:
     if len(annotations) > 1:
         logger.info("More than one annotation found, skipping")
@@ -156,8 +160,8 @@ def format_object_detection_sample_to_hf(
     annotation = annotations[0]
     bboxes = []
-    bbox_category_ids = []
-    bbox_category_names = []
+    bbox_label_ids = []
+    bbox_label_names = []
     for annotation_result in annotation["result"]:
         if annotation_result["type"] != "rectanglelabels":
@@ -171,12 +175,13 @@ def format_object_detection_sample_to_hf(
         x_max = x_min + width
         y_max = y_min + height
         bboxes.append([y_min, x_min, y_max, x_max])
-        category_name = value["rectanglelabels"][0]
-        bbox_category_names.append(category_name)
-        bbox_category_ids.append(category_names.index(category_name))
+        label_name = label_names[0] if merge_labels else value["rectanglelabels"][0]
+        bbox_label_names.append(label_name)
+        bbox_label_ids.append(label_names.index(label_name))
     image_url = task_data["image_url"]
-    image = download_image(image_url, error_raise=False)
+    image = download_image(image_url, error_raise=False, use_cache=use_aws_cache)
     if image is None:
         logger.error("Failed to download image: %s", image_url)
         return None
@@ -193,14 +198,14 @@ def format_object_detection_sample_to_hf(
         },
         "objects": {
             "bbox": bboxes,
-            "category_id": bbox_category_ids,
-            "category_name": bbox_category_names,
+            "category_id": bbox_label_ids,
+            "category_name": bbox_label_names,
         },
     }
 # The HuggingFace Dataset features
-HF_DS_FEATURES = datasets.Features(
+HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
     {
         "image_id": datasets.Value("string"),
         "image": datasets.features.Image(),
@@ -218,3 +223,20 @@ HF_DS_FEATURES = datasets.Features(
         },
     }
 )
+HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
+    {
+        "image_id": datasets.Value("string"),
+        "image": datasets.features.Image(),
+        "width": datasets.Value("int64"),
+        "height": datasets.Value("int64"),
+        "meta": {
+            "barcode": datasets.Value("string"),
+            "off_image_id": datasets.Value("string"),
+            "image_url": datasets.Value("string"),
+        },
+        "category_id": datasets.Value("int64"),
+        "category_name": datasets.Value("string"),
+    }
+)

{labelr-0.3.0 → labelr-0.4.1}/src/labelr/types.py RENAMED Viewed

@@ -4,6 +4,7 @@ import enum
 class ExportSource(str, enum.Enum):
     hf = "hf"
     ls = "ls"
+    ultralytics = "ultralytics"
 class ExportDestination(str, enum.Enum):

{labelr-0.3.0 → labelr-0.4.1/src/labelr.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: labelr
-Version: 0.3.0
+Version: 0.4.1
 Summary: A command-line tool to manage labeling tasks with Label Studio.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown