labelr 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/check.py CHANGED
@@ -1,30 +1,64 @@
+ import typing
  from collections import defaultdict
  from pathlib import Path

  import imagehash
  import tqdm
  from label_studio_sdk.client import LabelStudio
- from openfoodfacts.utils import get_image_from_url, get_logger
+ from openfoodfacts.types import JSONType
+ from openfoodfacts.utils import ImageDownloadItem, get_image_from_url, get_logger
  from PIL import Image

  logger = get_logger(__name__)

- def check_ls_dataset(ls: LabelStudio, project_id: int):
+ def check_ls_dataset(
+     ls: LabelStudio,
+     project_id: int,
+     view_id: int | None = None,
+     delete_missing_images: bool = False,
+     delete_duplicate_images: bool = False,
+ ):
+     """Perform sanity checks on a Label Studio dataset.
+
+     This function checks for:
+     - Tasks with missing images (404)
+     - Duplicate images based on perceptual hash (pHash)
+     - Tasks with multiple annotations
+
+     This function doesn't modify the dataset, except for optionally deleting
+     tasks with missing or duplicate images when `delete_missing_images` or
+     `delete_duplicate_images` is set to True.
+
+     Args:
+         ls (LabelStudio): Label Studio client instance.
+         project_id (int): ID of the Label Studio project to check.
+         view_id (int | None): ID of the Label Studio view to check. If None,
+             no filtering is done.
+         delete_missing_images (bool): Whether to delete tasks with missing
+             images.
+         delete_duplicate_images (bool): Whether to delete tasks with
+             duplicate images. If one task has annotations and the other
+             doesn't, the task with annotations is kept. Otherwise, the most
+             recent task is kept.
+     """
      skipped = 0
      not_annotated = 0
      annotated = 0
+     deleted = 0
+     multiple_annotations = 0
      hash_map = defaultdict(list)
      for task in tqdm.tqdm(
-         ls.tasks.list(project=project_id, fields="all"), desc="tasks"
+         ls.tasks.list(project=project_id, fields="all", view=view_id), desc="tasks"
      ):
-         annotations = task.annotations
+         annotations = typing.cast(list[JSONType], task.annotations)

          if len(annotations) == 0:
              not_annotated += 1
              continue
          elif len(annotations) > 1:
              logger.warning("Task has multiple annotations: %s", task.id)
+             multiple_annotations += 1
              continue

          annotation = annotations[0]
@@ -34,20 +68,47 @@ def check_ls_dataset(ls: LabelStudio, project_id: int):

          annotated += 1
          image_url = task.data["image_url"]
-         image = get_image_from_url(image_url)
-         image_hash = str(imagehash.phash(image))
+         image_struct = typing.cast(
+             ImageDownloadItem,
+             get_image_from_url(image_url, return_struct=True, error_raise=False),
+         )
+
+         if image_struct.response.status_code == 404:
+             logger.warning("Image not found (404): %s", image_url)
+
+             if delete_missing_images:
+                 ls.tasks.delete(task.id)
+                 deleted += 1
+                 logger.info("Deleted task with missing image: %s", task.id)
+             continue
+
+         if image_struct.image is None:
+             logger.warning("Could not open image: %s", image_url)
+             continue
+
+         image_hash = str(imagehash.phash(image_struct.image))
          hash_map[image_hash].append(task.id)

      for image_hash, task_ids in hash_map.items():
          if len(task_ids) > 1:
              logger.warning("Duplicate images: %s", task_ids)
+             if delete_duplicate_images:
+                 tasks = [ls.tasks.get(id=task_id) for task_id in task_ids]
+                 # We sort the tasks by whether they have annotations, so that
+                 # we keep one with at least one annotation.
+                 for task in sorted(tasks, key=lambda x: len(x.annotations) > 0)[:-1]:
+                     logger.info("Deleting duplicate task: %s", task.id)
+                     ls.tasks.delete(task.id)
+                     deleted += 1

      logger.info(
-         "Tasks - annotated: %d, skipped: %d, not annotated: %d",
+         "Tasks - annotated: %d, skipped: %d, not annotated: %d, multiple annotations: %d",
          annotated,
          skipped,
          not_annotated,
+         multiple_annotations,
      )
+     logger.info("Deleted tasks with missing images: %d", deleted)


  def check_local_dataset(dataset_dir: Path, remove: bool = False):
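
As a usage sketch (not part of the package), here is how the extended `check_ls_dataset` might be called; the base URL and API key are placeholders for your own Label Studio instance:

    from label_studio_sdk.client import LabelStudio

    from labelr.check import check_ls_dataset

    # Placeholder credentials; point these at your own instance.
    ls = LabelStudio(base_url="http://127.0.0.1:8080", api_key="<your-api-key>")

    # Read-only pass: report 404 images, pHash duplicates and
    # tasks carrying more than one annotation.
    check_ls_dataset(ls, project_id=1)

    # Destructive pass: additionally delete tasks whose image is
    # missing or duplicated (annotated duplicates are kept).
    check_ls_dataset(
        ls,
        project_id=1,
        delete_missing_images=True,
        delete_duplicate_images=True,
    )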
labelr/config.py CHANGED
@@ -1 +1,57 @@
- LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
+ from pathlib import Path
+
+ from pydantic import BaseModel, Field
+ import os
+
+ CONFIG_PATH = Path("~").expanduser() / ".config/.labelr/config.json"
+
+
+ # validate_assignment makes the model validate every time it is updated
+ class LabelrConfig(BaseModel, validate_assignment=True):
+     label_studio_url: str = Field(
+         default="http://127.0.0.1:8080",
+         description="URL of the Label Studio instance to use. Defaults to http://127.0.0.1:8080.",
+     )
+     label_studio_api_key: str | None = Field(
+         default=None,
+         description="API key for Label Studio.",
+     )
+
+
+ def get_config() -> LabelrConfig:
+     """Get the labelr configuration.
+
+     The configuration can come from (in order of precedence):
+     - Environment variables
+     - A JSON file (see below)
+
+     The configuration is stored in a JSON file at ~/.config/.labelr/config.json.
+
+     The following environment variables are supported:
+     - LABELR_LABEL_STUDIO_URL
+     - LABELR_LABEL_STUDIO_API_KEY
+     """
+     if CONFIG_PATH.exists():
+         config = LabelrConfig.model_validate_json(CONFIG_PATH.read_bytes())
+
+         if "LABELR_LABEL_STUDIO_URL" in os.environ:
+             config.label_studio_url = os.environ["LABELR_LABEL_STUDIO_URL"]
+         if "LABELR_LABEL_STUDIO_API_KEY" in os.environ:
+             config.label_studio_api_key = os.environ["LABELR_LABEL_STUDIO_API_KEY"]
+         return config
+     else:
+         return LabelrConfig()
+
+
+ def set_file_config(key: str, value: str):
+     """Update the labelr configuration file.
+
+     The configuration is stored in a JSON file at ~/.config/.labelr/config.json.
+     """
+     config = get_config()
+     setattr(config, key, value)
+     CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
+     CONFIG_PATH.write_text(config.model_dump_json(indent=2))
+
+
+ config = get_config()
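
A usage sketch of the precedence described in the docstring above (the key and URL values are placeholders): once a config file exists, environment variables override its values for the current process.

    import os

    from labelr.config import get_config, set_file_config

    # Persist a value to ~/.config/.labelr/config.json ...
    set_file_config("label_studio_api_key", "<file-key>")

    # ... then override the URL for this process via the environment.
    os.environ["LABELR_LABEL_STUDIO_URL"] = "https://annotate.example.org"

    config = get_config()
    print(config.label_studio_url)      # from the environment variable
    print(config.label_studio_api_key)  # from the config file

Note that the environment variables are only applied when the config file exists; with no file, `get_config()` returns the defaults.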
labelr/export/classification.py ADDED
@@ -0,0 +1,114 @@
+ import functools
+ import logging
+ import pickle
+ import tempfile
+ from pathlib import Path
+
+ import datasets
+ from openfoodfacts.images import generate_image_url
+ from openfoodfacts.types import Flavor
+ from PIL import Image, ImageOps
+
+ from labelr.export.common import _pickle_sample_generator
+ from labelr.sample.classification import HF_DS_CLASSIFICATION_FEATURES
+
+ logger = logging.getLogger(__name__)
+
+
+ def export_from_ultralytics_to_hf_classification(
+     dataset_dir: Path,
+     repo_id: str,
+     label_names: list[str],
+     merge_labels: bool = False,
+     is_openfoodfacts_dataset: bool = False,
+     openfoodfacts_flavor: Flavor = Flavor.off,
+ ) -> None:
+     """Export an Ultralytics classification dataset to a Hugging Face dataset.
+
+     The Ultralytics dataset directory should contain 'train', 'val' and/or
+     'test' subdirectories, each containing one subdirectory per label.
+
+     Args:
+         dataset_dir (Path): Path to the Ultralytics dataset directory.
+         repo_id (str): Hugging Face repository ID to push the dataset to.
+         label_names (list[str]): List of label names.
+         merge_labels (bool): Whether to merge all labels into a single label
+             named 'object'.
+         is_openfoodfacts_dataset (bool): Whether the dataset is from
+             Open Food Facts. If True, `off_image_id` and `image_url` are
+             generated automatically; `off_image_id` is extracted from the
+             image filename.
+         openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
+             Ignored if `is_openfoodfacts_dataset` is False.
+     """
+     logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
+
+     if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
+         raise ValueError(
+             f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
+         )
+
+     for split in ["train", "val", "test"]:
+         split_dir = dataset_dir / split
+
+         if not split_dir.is_dir():
+             logger.info("Skipping missing split directory: %s", split_dir)
+             continue
+
+         with tempfile.TemporaryDirectory() as tmp_dir_str:
+             tmp_dir = Path(tmp_dir_str)
+             for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
+                 label_name = label_dir.name
+                 if merge_labels:
+                     label_name = "object"
+                 if label_name not in label_names:
+                     raise ValueError(
+                         "Label name %s not in provided label names (label names: %s)"
+                         % (label_name, label_names),
+                     )
+                 label_id = label_names.index(label_name)
+
+                 for image_path in label_dir.glob("*"):
+                     if is_openfoodfacts_dataset:
+                         image_stem_parts = image_path.stem.split("_")
+                         barcode = image_stem_parts[0]
+                         off_image_id = image_stem_parts[1]
+                         image_id = f"{barcode}_{off_image_id}"
+                         image_url = generate_image_url(
+                             barcode, off_image_id, flavor=openfoodfacts_flavor
+                         )
+                     else:
+                         image_id = image_path.stem
+                         barcode = ""
+                         off_image_id = ""
+                         image_url = ""
+                     image = Image.open(image_path)
+                     image.load()
+
+                     if image.mode != "RGB":
+                         image = image.convert("RGB")
+
+                     # Rotate image according to exif orientation using Pillow
+                     ImageOps.exif_transpose(image, in_place=True)
+                     sample = {
+                         "image_id": image_id,
+                         "image": image,
+                         "width": image.width,
+                         "height": image.height,
+                         "meta": {
+                             "barcode": barcode,
+                             "off_image_id": off_image_id,
+                             "image_url": image_url,
+                         },
+                         "category_id": label_id,
+                         "category_name": label_name,
+                     }
+                     # Save output as pickle
+                     with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                         pickle.dump(sample, f)
+
+             hf_ds = datasets.Dataset.from_generator(
+                 functools.partial(_pickle_sample_generator, tmp_dir),
+                 features=HF_DS_CLASSIFICATION_FEATURES,
+             )
+             hf_ds.push_to_hub(repo_id, split=split)
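
A minimal usage sketch, assuming an Ultralytics-style classification layout on disk; the directory, repository ID and label names below are placeholders:

    from pathlib import Path

    from labelr.export.classification import (
        export_from_ultralytics_to_hf_classification,
    )

    # Expected layout: dataset/{train,val,test}/<label_name>/*.jpg
    export_from_ultralytics_to_hf_classification(
        dataset_dir=Path("dataset"),
        repo_id="my-org/my-classification-dataset",
        label_names=["nutrition_table", "other"],
    )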
labelr/export/common.py ADDED
@@ -0,0 +1,42 @@
+ import pickle
+ from pathlib import Path
+
+ from openfoodfacts.types import Flavor
+
+ from labelr.types import TaskType
+
+
+ def _pickle_sample_generator(dir: Path):
+     """Generator that yields samples from pickles in a directory."""
+     for pkl in dir.glob("*.pkl"):
+         with open(pkl, "rb") as f:
+             yield pickle.load(f)
+
+
+ def export_from_ultralytics_to_hf(
+     task_type: TaskType,
+     dataset_dir: Path,
+     repo_id: str,
+     label_names: list[str],
+     merge_labels: bool = False,
+     is_openfoodfacts_dataset: bool = False,
+     openfoodfacts_flavor: Flavor = Flavor.off,
+ ) -> None:
+     from labelr.export.classification import (
+         export_from_ultralytics_to_hf_classification,
+     )
+
+     if task_type != TaskType.classification:
+         raise NotImplementedError(
+             "Only classification task is currently supported for Ultralytics to HF export"
+         )
+
+     if task_type == TaskType.classification:
+         export_from_ultralytics_to_hf_classification(
+             dataset_dir=dataset_dir,
+             repo_id=repo_id,
+             label_names=label_names,
+             merge_labels=merge_labels,
+             is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+             openfoodfacts_flavor=openfoodfacts_flavor,
+         )
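
The dispatcher can be called the same way; classification is the only `TaskType` it currently accepts (placeholder paths and names again):

    from pathlib import Path

    from labelr.export.common import export_from_ultralytics_to_hf
    from labelr.types import TaskType

    export_from_ultralytics_to_hf(
        task_type=TaskType.classification,  # anything else raises NotImplementedError
        dataset_dir=Path("dataset"),
        repo_id="my-org/my-classification-dataset",
        label_names=["nutrition_table", "other"],
    )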
labelr/export/llm.py ADDED
@@ -0,0 +1,91 @@
+ import functools
+ import logging
+ import pickle
+ import tempfile
+ import typing
+ from collections.abc import Iterator
+ from pathlib import Path
+
+ import datasets
+ import tqdm
+ from PIL import Image, ImageOps
+
+ from labelr.export.common import _pickle_sample_generator
+ from labelr.sample.llm import (
+     HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+     LLMImageExtractionSample,
+ )
+ from labelr.utils import PathWithContext
+
+ logger = logging.getLogger(__name__)
+
+
+ def export_to_hf_llm_image_extraction(
+     sample_iter: Iterator[LLMImageExtractionSample],
+     split: str,
+     repo_id: str,
+     revision: str = "main",
+     tmp_dir: Path | None = None,
+     image_max_size: int | None = None,
+ ) -> None:
+     """Export LLM image extraction samples to a Hugging Face dataset.
+
+     Args:
+         sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
+             to export.
+         split (str): Name of the dataset split (e.g., 'train', 'val').
+         repo_id (str): Hugging Face repository ID to push the dataset to.
+         revision (str): Revision (branch, tag or commit) to use for the
+             Hugging Face Datasets repository.
+         tmp_dir (Path | None): Temporary directory to use for intermediate
+             files. If None, a temporary directory will be created
+             automatically.
+         image_max_size (int | None): Maximum size (in pixels) for the images.
+     """
+     logger.info(
+         "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s, image_max_size: %s",
+         repo_id,
+         revision,
+         split,
+         tmp_dir,
+         image_max_size,
+     )
+
+     tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
+     if tmp_dir:
+         tmp_dir.mkdir(parents=True, exist_ok=True)
+         tmp_dir_with_context = PathWithContext(tmp_dir)
+     else:
+         tmp_dir_with_context = tempfile.TemporaryDirectory()
+
+     with tmp_dir_with_context as tmp_dir_str:
+         tmp_dir = Path(tmp_dir_str)
+         for sample in tqdm.tqdm(sample_iter, desc="samples"):
+             image = sample.image
+             # Rotate image according to exif orientation using Pillow
+             image = typing.cast(Image.Image, ImageOps.exif_transpose(image))
+
+             if image_max_size is not None:
+                 if image.height > image_max_size or image.width > image_max_size:
+                     image.thumbnail(
+                         (image_max_size, image_max_size),
+                         Image.Resampling.LANCZOS,
+                     )
+             image_id = sample.image_id
+             json_sample = {
+                 "image_id": image_id,
+                 "image": image,
+                 "meta": {
+                     k: v for k, v in sample.meta.model_dump().items() if v is not None
+                 },
+                 "output": sample.output,
+             }
+             # Save output as pickle
+             with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                 pickle.dump(json_sample, f)
+
+         hf_ds = datasets.Dataset.from_generator(
+             functools.partial(_pickle_sample_generator, tmp_dir),
+             features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+         )
+         hf_ds.push_to_hub(repo_id, split=split, revision=revision)
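
A usage sketch under stated assumptions: `samples` stands in for an iterator of `LLMImageExtractionSample` objects built elsewhere, and the repository ID is a placeholder.

    from labelr.export.llm import export_to_hf_llm_image_extraction

    # `samples` is assumed to be an Iterator[LLMImageExtractionSample]
    # produced by upstream code (e.g. from Label Studio annotations).
    export_to_hf_llm_image_extraction(
        sample_iter=samples,
        split="train",
        repo_id="my-org/my-llm-extraction-dataset",
        image_max_size=1024,  # downscale anything larger than 1024 px
    )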