labelr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- labelr/__init__.py +0 -0
- labelr/__main__.py +4 -0
- labelr/annotate.py +107 -0
- labelr/apps/__init__.py +0 -0
- labelr/apps/datasets.py +227 -0
- labelr/apps/projects.py +353 -0
- labelr/apps/users.py +36 -0
- labelr/check.py +86 -0
- labelr/config.py +1 -0
- labelr/export.py +270 -0
- labelr/main.py +269 -0
- labelr/sample.py +186 -0
- labelr/triton/object_detection.py +241 -0
- labelr/types.py +16 -0
- labelr-0.1.0.dist-info/LICENSE +661 -0
- labelr-0.1.0.dist-info/METADATA +160 -0
- labelr-0.1.0.dist-info/RECORD +20 -0
- labelr-0.1.0.dist-info/WHEEL +5 -0
- labelr-0.1.0.dist-info/entry_points.txt +2 -0
- labelr-0.1.0.dist-info/top_level.txt +1 -0
labelr/apps/users.py
ADDED
@@ -0,0 +1,36 @@
+from typing import Annotated
+
+import typer
+
+from ..config import LABEL_STUDIO_DEFAULT_URL
+
+app = typer.Typer()
+
+# Label Studio user management
+
+
+@app.command()
+def list(
+    api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
+):
+    """List all users in Label Studio."""
+    from label_studio_sdk.client import LabelStudio
+
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
+
+    for user in ls.users.list():
+        print(f"{user.id:02d}: {user.email}")
+
+
+@app.command()
+def delete(
+    user_id: int,
+    api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
+):
+    """Delete a user from Label Studio."""
+    from label_studio_sdk.client import LabelStudio
+
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
+    ls.users.delete(user_id)
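For orientation, a minimal usage sketch (not part of the package) that drives this `users` sub-app through Typer's test runner; the API key value is a placeholder, and the import path follows the file layout above.

# Hypothetical usage sketch: list Label Studio users via the `users` sub-app.
from typer.testing import CliRunner

from labelr.apps.users import app

runner = CliRunner()
# `--api-key` can also be supplied via the LABEL_STUDIO_API_KEY environment variable.
result = runner.invoke(app, ["list", "--api-key", "YOUR_API_KEY"])
print(result.output)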
labelr/check.py
ADDED
@@ -0,0 +1,86 @@
+from collections import defaultdict
+from pathlib import Path
+
+import imagehash
+import tqdm
+from label_studio_sdk.client import LabelStudio
+from openfoodfacts.utils import get_image_from_url, get_logger
+from PIL import Image
+
+logger = get_logger(__name__)
+
+
+def check_ls_dataset(ls: LabelStudio, project_id: int):
+    skipped = 0
+    not_annotated = 0
+    annotated = 0
+    hash_map = defaultdict(list)
+    for task in tqdm.tqdm(
+        ls.tasks.list(project=project_id, fields="all"), desc="tasks"
+    ):
+        annotations = task.annotations
+
+        if len(annotations) == 0:
+            not_annotated += 1
+            continue
+        elif len(annotations) > 1:
+            logger.warning("Task has multiple annotations: %s", task.id)
+            continue
+
+        annotation = annotations[0]
+
+        if annotation["was_cancelled"]:
+            skipped += 1
+
+        annotated += 1
+        image_url = task.data["image_url"]
+        image = get_image_from_url(image_url)
+        image_hash = str(imagehash.phash(image))
+        hash_map[image_hash].append(task.id)
+
+    for image_hash, task_ids in hash_map.items():
+        if len(task_ids) > 1:
+            logger.warning("Duplicate images: %s", task_ids)
+
+    logger.info(
+        "Tasks - annotated: %d, skipped: %d, not annotated: %d",
+        annotated,
+        skipped,
+        not_annotated,
+    )
+
+
+def check_local_dataset(dataset_dir: Path, remove: bool = False):
+    hash_map = defaultdict(list)
+    for path in tqdm.tqdm(dataset_dir.glob("**/*.jpg"), desc="images"):
+        if path.is_file() and path.suffix in [
+            ".jpg",
+            ".jpeg",
+            ".png",
+            ".webp",
+            ".bmp",
+            ".tiff",
+            ".gif",
+        ]:
+            image = Image.open(path)
+            image_hash = str(imagehash.phash(image))
+            logger.debug("Image hash: %s", image_hash)
+            hash_map[image_hash].append(path)
+
+    duplicated = 0
+    to_remove = []
+    for image_hash, image_paths in hash_map.items():
+        if len(image_paths) > 1:
+            logger.warning(
+                "Duplicate images: %s",
+                [str(x.relative_to(dataset_dir)) for x in image_paths],
+            )
+            duplicated += 1
+            to_remove.append(image_paths[0])
+
+    logger.info("Total duplicated groups: %d", duplicated)
+
+    if remove and to_remove:
+        for path in to_remove:
+            logger.info("Removing: %s", str(path))
+            path.unlink()
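A minimal sketch (not part of the package) of the local duplicate check, assuming a directory of JPEG images; the path is a placeholder. Note that the glob pattern above only matches `*.jpg`, so the wider suffix list is only reached for files with that extension.

# Hypothetical usage sketch: report perceptual-hash duplicates without deleting anything.
from pathlib import Path

from labelr.check import check_local_dataset

check_local_dataset(Path("datasets/price-tags"), remove=False)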
labelr/config.py
ADDED
@@ -0,0 +1 @@
+LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
labelr/export.py
ADDED
@@ -0,0 +1,270 @@
+import functools
+import logging
+import pickle
+import random
+import tempfile
+import typing
+from pathlib import Path
+
+import datasets
+import tqdm
+from label_studio_sdk.client import LabelStudio
+from openfoodfacts.images import download_image
+from PIL import Image
+
+from labelr.sample import HF_DS_FEATURES, format_object_detection_sample_to_hf
+
+logger = logging.getLogger(__name__)
+
+
+def _pickle_sample_generator(dir: Path):
+    """Generator that yields samples from pickles in a directory."""
+    for pkl in dir.glob("*.pkl"):
+        with open(pkl, "rb") as f:
+            yield pickle.load(f)
+
+
+def export_from_ls_to_hf(
+    ls: LabelStudio,
+    repo_id: str,
+    category_names: list[str],
+    project_id: int,
+):
+    logger.info("Project ID: %d, category names: %s", project_id, category_names)
+
+    for split in ["train", "val"]:
+        logger.info("Processing split: %s", split)
+
+        with tempfile.TemporaryDirectory() as tmp_dir_str:
+            tmp_dir = Path(tmp_dir_str)
+            logger.info("Saving samples to temporary directory: %s", tmp_dir)
+            for i, task in tqdm.tqdm(
+                enumerate(ls.tasks.list(project=project_id, fields="all")),
+                desc="tasks",
+            ):
+                if task.data["split"] != split:
+                    continue
+                sample = format_object_detection_sample_to_hf(
+                    task.data, task.annotations, category_names
+                )
+                if sample is not None:
+                    # Save output as pickle
+                    with open(tmp_dir / f"{split}_{i:05}.pkl", "wb") as f:
+                        pickle.dump(sample, f)
+
+            hf_ds = datasets.Dataset.from_generator(
+                functools.partial(_pickle_sample_generator, tmp_dir),
+                features=HF_DS_FEATURES,
+            )
+            hf_ds.push_to_hub(repo_id, split=split)
+
+
+def export_from_ls_to_ultralytics(
+    ls: LabelStudio,
+    output_dir: Path,
+    category_names: list[str],
+    project_id: int,
+    train_ratio: float = 0.8,
+    error_raise: bool = True,
+):
+    """Export annotations from a Label Studio project to the Ultralytics
+    format.
+
+    The Label Studio project should be an object detection project with a
+    single rectanglelabels annotation result per task.
+    """
+    logger.info("Project ID: %d, category names: %s", project_id, category_names)
+
+    data_dir = output_dir / "data"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    split_warning_displayed = False
+
+    # NOTE: before, all images were sent to val, the last split
+    label_dir = data_dir / "labels"
+    images_dir = data_dir / "images"
+    for split in ["train", "val"]:
+        (label_dir / split).mkdir(parents=True, exist_ok=True)
+        (images_dir / split).mkdir(parents=True, exist_ok=True)
+
+    for task in tqdm.tqdm(
+        ls.tasks.list(project=project_id, fields="all"),
+        desc="tasks",
+    ):
+        split = task.data.get("split")
+
+        if split is None:
+            if not split_warning_displayed:
+                logger.warning(
+                    "Split information not found, assigning randomly. "
+                    "To avoid this, set the `split` field in the task data."
+                )
+                split_warning_displayed = True
+            split = "train" if random.random() < train_ratio else "val"
+
+        elif split not in ["train", "val"]:
+            raise ValueError("Invalid split name: %s", split)
+
+        if len(task.annotations) > 1:
+            logger.warning("More than one annotation found, skipping")
+            continue
+        elif len(task.annotations) == 0:
+            logger.debug("No annotation found, skipping")
+            continue
+
+        annotation = task.annotations[0]
+        if annotation["was_cancelled"] is True:
+            logger.debug("Annotation was cancelled, skipping")
+            continue
+
+        if "image_id" not in task.data:
+            raise ValueError(
+                "`image_id` field not found in task data. "
+                "Make sure the task data contains the `image_id` "
+                "field, which should be a unique identifier for the image."
+            )
+        if "image_url" not in task.data:
+            raise ValueError(
+                "`image_url` field not found in task data. "
+                "Make sure the task data contains the `image_url` "
+                "field, which should be the URL of the image."
+            )
+        image_id = task.data["image_id"]
+        image_url = task.data["image_url"]
+
+        has_valid_annotation = False
+        with (label_dir / split / f"{image_id}.txt").open("w") as f:
+            if not any(
+                annotation_result["type"] == "rectanglelabels"
+                for annotation_result in annotation["result"]
+            ):
+                continue
+
+            for annotation_result in annotation["result"]:
+                if annotation_result["type"] == "rectanglelabels":
+                    value = annotation_result["value"]
+                    x_min = value["x"] / 100
+                    y_min = value["y"] / 100
+                    width = value["width"] / 100
+                    height = value["height"] / 100
+                    category_name = value["rectanglelabels"][0]
+                    category_id = category_names.index(category_name)
+
+                    # Save the labels in the Ultralytics format:
+                    # - one label per line
+                    # - each line is a list of 5 elements:
+                    #   - category_id
+                    #   - x_center
+                    #   - y_center
+                    #   - width
+                    #   - height
+                    x_center = x_min + width / 2
+                    y_center = y_min + height / 2
+                    f.write(f"{category_id} {x_center} {y_center} {width} {height}\n")
+                    has_valid_annotation = True
+
+        if has_valid_annotation:
+            download_output = download_image(
+                image_url, return_bytes=True, error_raise=error_raise
+            )
+            if download_output is None:
+                logger.error("Failed to download image: %s", image_url)
+                continue
+
+            _, image_bytes = typing.cast(tuple[Image.Image, bytes], download_output)
+
+            with (images_dir / split / f"{image_id}.jpg").open("wb") as f:
+                f.write(image_bytes)
+
+    with (output_dir / "data.yaml").open("w") as f:
+        f.write("path: data\n")
+        f.write("train: images/train\n")
+        f.write("val: images/val\n")
+        f.write("test:\n")
+        f.write("names:\n")
+        for i, category_name in enumerate(category_names):
+            f.write(f"  {i}: {category_name}\n")
+
+
+def export_from_hf_to_ultralytics(
+    repo_id: str,
+    output_dir: Path,
+    download_images: bool = True,
+    error_raise: bool = True,
+):
+    """Export annotations from a Hugging Face dataset project to the
+    Ultralytics format.
+
+    The Label Studio project should be an object detection project with a
+    single rectanglelabels annotation result per task.
+    """
+    logger.info("Repo ID: %s", repo_id)
+    ds = datasets.load_dataset(repo_id)
+    data_dir = output_dir / "data"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    category_id_to_name = {}
+
+    for split in ["train", "val"]:
+        split_labels_dir = data_dir / "labels" / split
+        split_labels_dir.mkdir(parents=True, exist_ok=True)
+        split_images_dir = data_dir / "images" / split
+        split_images_dir.mkdir(parents=True, exist_ok=True)
+
+        for sample in tqdm.tqdm(ds[split], desc="samples"):
+            image_id = sample["image_id"]
+            image_url = sample["meta"]["image_url"]
+
+            if download_images:
+                download_output = download_image(
+                    image_url, return_bytes=True, error_raise=error_raise
+                )
+                if download_output is None:
+                    logger.error("Failed to download image: %s", image_url)
+                    continue
+                _, image_bytes = download_output
+                with (split_images_dir / f"{image_id}.jpg").open("wb") as f:
+                    f.write(image_bytes)
+            else:
+                image = sample["image"]
+                image.save(split_images_dir / f"{image_id}.jpg")
+
+            objects = sample["objects"]
+            bboxes = objects["bbox"]
+            category_ids = objects["category_id"]
+            category_names = objects["category_name"]
+
+            with (split_labels_dir / f"{image_id}.txt").open("w") as f:
+                for bbox, category_id, category_name in zip(
+                    bboxes, category_ids, category_names
+                ):
+                    if category_id not in category_id_to_name:
+                        category_id_to_name[category_id] = category_name
+                    y_min, x_min, y_max, x_max = bbox
+                    y_min = min(max(y_min, 0.0), 1.0)
+                    x_min = min(max(x_min, 0.0), 1.0)
+                    y_max = min(max(y_max, 0.0), 1.0)
+                    x_max = min(max(x_max, 0.0), 1.0)
+                    width = x_max - x_min
+                    height = y_max - y_min
+                    # Save the labels in the Ultralytics format:
+                    # - one label per line
+                    # - each line is a list of 5 elements:
+                    #   - category_id
+                    #   - x_center
+                    #   - y_center
+                    #   - width
+                    #   - height
+                    x_center = x_min + width / 2
+                    y_center = y_min + height / 2
+                    f.write(f"{category_id} {x_center} {y_center} {width} {height}\n")
+
+    category_names = [
+        x[1] for x in sorted(category_id_to_name.items(), key=lambda x: x[0])
+    ]
+    with (output_dir / "data.yaml").open("w") as f:
+        f.write("path: data\n")
+        f.write("train: images/train\n")
+        f.write("val: images/val\n")
+        f.write("test:\n")
+        f.write("names:\n")
+        for i, category_name in enumerate(category_names):
+            f.write(f"  {i}: {category_name}\n")
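For orientation, a minimal sketch (not part of the package) of an export to the Ultralytics layout; the project ID, category list, output directory and API key are placeholders. As implemented above, the call writes data/images/{train,val}, data/labels/{train,val} and a data.yaml file under the chosen output directory.

# Hypothetical usage sketch: export a Label Studio object-detection project.
from pathlib import Path

from label_studio_sdk.client import LabelStudio

from labelr.export import export_from_ls_to_ultralytics

ls = LabelStudio(
    base_url="https://annotate.openfoodfacts.org", api_key="YOUR_API_KEY"
)
export_from_ls_to_ultralytics(
    ls,
    output_dir=Path("datasets/price-tags-yolo"),
    category_names=["price-tag"],
    project_id=42,
    train_ratio=0.8,
)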
labelr/main.py
ADDED
@@ -0,0 +1,269 @@
+from typing import Annotated, Optional
+
+import typer
+from openfoodfacts.utils import get_logger
+
+from labelr.apps import datasets as dataset_app
+from labelr.apps import projects as project_app
+from labelr.apps import users as user_app
+from labelr.config import LABEL_STUDIO_DEFAULT_URL
+
+app = typer.Typer(pretty_exceptions_show_locals=False)
+
+logger = get_logger()
+
+
+@app.command()
+def predict_object(
+    model_name: Annotated[
+        str, typer.Option(help="Name of the object detection model to run")
+    ],
+    image_url: Annotated[str, typer.Option(help="URL of the image to process")],
+    triton_uri: Annotated[
+        str, typer.Option(help="URI (host+port) of the Triton Inference Server")
+    ],
+    threshold: float = 0.5,
+):
+    from openfoodfacts.utils import get_image_from_url
+
+    from labelr.triton.object_detection import ObjectDetectionModelRegistry
+
+    model = ObjectDetectionModelRegistry.get(model_name)
+    image = get_image_from_url(image_url)
+    output = model.detect_from_image(image, triton_uri=triton_uri)
+    results = output.select(threshold=threshold)
+
+    for result in results:
+        typer.echo(result)
+
+
+# Temporary scripts
+
+
+@app.command()
+def skip_rotated_images(
+    api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    project_id: Annotated[int, typer.Option(help="Label Studio project ID")],
+    updated_by: Annotated[
+        Optional[int], typer.Option(help="User ID to declare as annotator")
+    ] = None,
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
+):
+    import requests
+    import tqdm
+    from label_studio_sdk.client import LabelStudio
+    from label_studio_sdk.types.task import Task
+    from openfoodfacts.ocr import OCRResult
+
+    session = requests.Session()
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
+
+    task: Task
+    for task in tqdm.tqdm(
+        ls.tasks.list(project=project_id, fields="all"), desc="tasks"
+    ):
+        if any(annotation["was_cancelled"] for annotation in task.annotations):
+            continue
+
+        assert task.total_annotations == 1, (
+            "Task has multiple annotations (%s)" % task.id
+        )
+        task_id = task.id
+
+        annotation = task.annotations[0]
+        annotation_id = annotation["id"]
+
+        ocr_url = task.data["image_url"].replace(".jpg", ".json")
+        ocr_result = OCRResult.from_url(ocr_url, session=session, error_raise=False)
+
+        if ocr_result is None:
+            logger.warning("No OCR result for task: %s", task_id)
+            continue
+
+        orientation_result = ocr_result.get_orientation()
+
+        if orientation_result is None:
+            # logger.info("No orientation for task: %s", task_id)
+            continue
+
+        orientation = orientation_result.orientation.name
+        if orientation != "up":
+            logger.info(
+                "Skipping rotated image for task: %s (orientation: %s)",
+                task_id,
+                orientation,
+            )
+            ls.annotations.update(
+                id=annotation_id,
+                was_cancelled=True,
+                updated_by=updated_by,
+            )
+        elif orientation == "up":
+            logger.debug("Keeping annotation for task: %s", task_id)
+
+
+@app.command()
+def fix_label(
+    api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    project_id: Annotated[int, typer.Option(help="Label Studio project ID")],
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
+):
+    import tqdm
+    from label_studio_sdk.client import LabelStudio
+    from label_studio_sdk.types.task import Task
+
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
+
+    task: Task
+    for task in tqdm.tqdm(
+        ls.tasks.list(project=project_id, fields="all"), desc="tasks"
+    ):
+        for prediction in task.predictions:
+            updated = False
+            if "result" in prediction:
+                for result in prediction["result"]:
+                    value = result["value"]
+                    if "rectanglelabels" in value and value["rectanglelabels"] != [
+                        "price-tag"
+                    ]:
+                        value["rectanglelabels"] = ["price-tag"]
+                        updated = True
+
+            if updated:
+                print(f"Updating prediction {prediction['id']}, task {task.id}")
+                ls.predictions.update(prediction["id"], result=prediction["result"])
+
+        for annotation in task.annotations:
+            updated = False
+            if "result" in annotation:
+                for result in annotation["result"]:
+                    value = result["value"]
+                    if "rectanglelabels" in value and value["rectanglelabels"] != [
+                        "price-tag"
+                    ]:
+                        value["rectanglelabels"] = ["price-tag"]
+                        updated = True
+
+            if updated:
+                print(f"Updating annotation {annotation['id']}, task {task.id}")
+                ls.annotations.update(annotation["id"], result=annotation["result"])
+
+
+@app.command()
+def select_price_tag_images(
+    api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    project_id: Annotated[int, typer.Option(help="Label Studio project ID")],
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
+):
+    import typing
+    from pathlib import Path
+    from typing import Any
+    from urllib.parse import urlparse
+
+    import requests
+    import tqdm
+    from label_studio_sdk.client import LabelStudio
+    from label_studio_sdk.types.task import Task
+
+    session = requests.Session()
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
+
+    proof_paths = (Path(__file__).parent / "proof.txt").read_text().splitlines()
+    task: Task
+    for task in tqdm.tqdm(
+        ls.tasks.list(project=project_id, include="data,id"), desc="tasks"
+    ):
+        data = typing.cast(dict[str, Any], task.data)
+
+        if "is_raw_product_shelf" in data:
+            continue
+        image_url = data["image_url"]
+        file_path = urlparse(image_url).path.replace("/img/", "")
+        r = session.get(
+            f"https://robotoff.openfoodfacts.org/api/v1/images/predict?image_url={image_url}&models=price_proof_classification",
+        )
+
+        if r.status_code != 200:
+            print(
+                f"Failed to get prediction for {image_url}, error: {r.text} (status: {r.status_code})"
+            )
+            continue
+
+        prediction = r.json()["predictions"]["price_proof_classification"][0]["label"]
+
+        is_raw_preduct_shelf = False
+        if prediction in ("PRICE_TAG", "SHELF"):
+            is_raw_preduct_shelf = file_path in proof_paths
+
+        ls.tasks.update(
+            task.id,
+            data={
+                **data,
+                "is_raw_product_shelf": "true" if is_raw_preduct_shelf else "false",
+            },
+        )
+
+
+@app.command()
+def add_predicted_category(
+    api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    project_id: Annotated[int, typer.Option(help="Label Studio project ID")],
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
+):
+    import typing
+    from typing import Any
+
+    import requests
+    import tqdm
+    from label_studio_sdk.client import LabelStudio
+    from label_studio_sdk.types.task import Task
+
+    session = requests.Session()
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
+
+    task: Task
+    for task in tqdm.tqdm(
+        ls.tasks.list(project=project_id, include="data,id"), desc="tasks"
+    ):
+        data = typing.cast(dict[str, Any], task.data)
+
+        if "predicted_category" in data:
+            continue
+        image_url = data["image_url"]
+        r = session.get(
+            f"https://robotoff.openfoodfacts.org/api/v1/images/predict?image_url={image_url}&models=price_proof_classification",
+        )
+
+        if r.status_code != 200:
+            print(
+                f"Failed to get prediction for {image_url}, error: {r.text} (status: {r.status_code})"
+            )
+            continue
+
+        predicted_category = r.json()["predictions"]["price_proof_classification"][0][
+            "label"
+        ]
+
+        ls.tasks.update(
+            task.id,
+            data={
+                **data,
+                "predicted_category": predicted_category,
+            },
+        )
+
+
+app.add_typer(user_app.app, name="users", help="Manage Label Studio users")
+app.add_typer(
+    project_app.app,
+    name="projects",
+    help="Manage Label Studio projects (create, import data, etc.)",
+)
+app.add_typer(
+    dataset_app.app,
+    name="datasets",
+    help="Manage datasets (convert, export, check, etc.)",
+)
+
+if __name__ == "__main__":
+    app()
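Finally, a minimal sketch (not part of the package) of driving the assembled CLI through Typer's test runner; the model name, image URL and Triton URI are placeholders, and the command name follows Typer's underscore-to-dash conversion of `predict_object`.

# Hypothetical usage sketch: run object detection through the top-level app.
from typer.testing import CliRunner

from labelr.main import app

runner = CliRunner()
result = runner.invoke(
    app,
    [
        "predict-object",
        "--model-name", "price_tag_detection",
        "--image-url", "https://example.com/proof.jpg",
        "--triton-uri", "localhost:8001",
        "--threshold", "0.5",
    ],
)
print(result.output)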