labelr 0.10.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/annotate.py CHANGED
@@ -1,66 +1,15 @@
 import random
 import string
 
-from openfoodfacts.types import JSONType
 from openfoodfacts.utils import get_logger
 
-logger = get_logger(__name__)
-
-
-def format_annotation_results_from_robotoff(
-    objects: list[JSONType],
-    image_width: int,
-    image_height: int,
-    label_mapping: dict[str, str] | None = None,
-) -> list[JSONType]:
-    """Format annotation results from Robotoff prediction endpoint into
-    Label Studio format."""
-    annotation_results = []
-    for object_ in objects:
-        bounding_box = object_["bounding_box"]
-        label_name = object_["label"]
+from ultralytics import Results
 
-        if label_mapping:
-            label_name = label_mapping.get(label_name, label_name)
-
-        # These are relative coordinates (between 0.0 and 1.0)
-        y_min, x_min, y_max, x_max = bounding_box
-        # Make sure the coordinates are within the image boundaries,
-        # and convert them to percentages
-        y_min = min(max(0, y_min), 1.0) * 100
-        x_min = min(max(0, x_min), 1.0) * 100
-        y_max = min(max(0, y_max), 1.0) * 100
-        x_max = min(max(0, x_max), 1.0) * 100
-        x = x_min
-        y = y_min
-        width = x_max - x_min
-        height = y_max - y_min
-
-        id_ = generate_id()
-        annotation_results.append(
-            {
-                "id": id_,
-                "type": "rectanglelabels",
-                "from_name": "label",
-                "to_name": "image",
-                "original_width": image_width,
-                "original_height": image_height,
-                "image_rotation": 0,
-                "value": {
-                    "rotation": 0,
-                    "x": x,
-                    "y": y,
-                    "width": width,
-                    "height": height,
-                    "rectanglelabels": [label_name],
-                },
-            },
-        )
-    return annotation_results
+logger = get_logger(__name__)
 
 
 def format_annotation_results_from_ultralytics(
-    results: "Results",
+    results: Results,
     labels: list[str],
     label_mapping: dict[str, str] | None = None,
 ) -> list[dict]:
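
Note: the removed `format_annotation_results_from_robotoff` helper clamped Robotoff's relative `[y_min, x_min, y_max, x_max]` coordinates to [0, 1] and converted them into Label Studio percentage rectangles. A minimal standalone sketch of that conversion, with hypothetical values:

    # Relative coordinates are clamped to [0, 1] and scaled to percentages,
    # as in the removed helper above.
    bounding_box = (0.10, 0.20, 0.50, 0.80)  # hypothetical [y_min, x_min, y_max, x_max]
    y_min, x_min, y_max, x_max = (min(max(0.0, v), 1.0) * 100 for v in bounding_box)
    value = {
        "x": x_min,               # 20.0
        "y": y_min,               # 10.0
        "width": x_max - x_min,   # 60.0
        "height": y_max - y_min,  # 40.0
        "rectanglelabels": ["brand"],  # hypothetical label name
    }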
labelr/apps/datasets.py CHANGED
@@ -18,7 +18,8 @@ from labelr.export.object_detection import (
     export_from_ls_to_ultralytics_object_detection,
 )
 
-from ..config import LABEL_STUDIO_DEFAULT_URL
+from . import typer_description
+from ..config import config
 from ..types import ExportDestination, ExportSource, TaskType
 
 app = typer.Typer()
@@ -125,7 +126,9 @@ def convert_object_detection_dataset(
 def export(
     from_: Annotated[ExportSource, typer.Option("--from", help="Input source to use")],
     to: Annotated[ExportDestination, typer.Option(help="Where to export the data")],
-    api_key: Annotated[Optional[str], typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    api_key: Annotated[
+        str | None, typer.Option(help=typer_description.LABEL_STUDIO_API_KEY)
+    ] = config.label_studio_api_key,
     task_type: Annotated[
         TaskType, typer.Option(help="Type of task to export")
     ] = TaskType.object_detection,
@@ -142,7 +145,16 @@ def export(
     project_id: Annotated[
         Optional[int], typer.Option(help="Label Studio Project ID")
     ] = None,
-    label_studio_url: Optional[str] = LABEL_STUDIO_DEFAULT_URL,
+    view_id: Annotated[
+        int | None,
+        typer.Option(
+            help="ID of the Label Studio view, if any. This option is useful "
+            "to filter the task to export."
+        ),
+    ] = None,
+    label_studio_url: Annotated[
+        str, typer.Option(help=typer_description.LABEL_STUDIO_URL)
+    ] = config.label_studio_url,
     output_dir: Annotated[
         Optional[Path],
         typer.Option(
@@ -163,11 +175,15 @@ def export(
     is_openfoodfacts_dataset: Annotated[
         bool,
         typer.Option(
-            help="Whether the Ultralytics dataset is an OpenFoodFacts dataset, only "
-            "for Ultralytics source. This is used to generate the correct image URLs "
-            "each image name."
+            help="Whether the Ultralytics dataset is an Open Food Facts dataset, only "
+            "for Ultralytics source. This is used:\n"
+            "- to generate the correct image URLs from each image name, when exporting "
+            "from Ultralytics to Hugging Face Datasets.\n"
+            "- to include additional metadata fields specific to Open Food Facts "
+            "(`barcode` and `off_image_id`) when exporting from Label Studio to "
+            "Hugging Face Datasets."
         ),
-    ] = True,
+    ] = False,
     openfoodfacts_flavor: Annotated[
         Flavor,
         typer.Option(
@@ -181,9 +197,18 @@ def export(
         float,
         typer.Option(
             help="Train ratio for splitting the dataset, if the split name is not "
-            "provided (typically, if the source is Label Studio)"
+            "provided. Only used if the source is Label Studio and the destination "
+            "is Ultralytics."
         ),
     ] = 0.8,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed."
+            "Otherwise, the longest side of the image will be resized to this value, "
+            "keeping the aspect ratio."
+        ),
+    ] = None,
     error_raise: Annotated[
         bool,
         typer.Option(
@@ -260,9 +285,12 @@ def export(
                 repo_id=repo_id,
                 label_names=typing.cast(list[str], label_names_list),
                 project_id=typing.cast(int, project_id),
+                is_openfoodfacts_dataset=is_openfoodfacts_dataset,
                 merge_labels=merge_labels,
                 use_aws_cache=use_aws_cache,
                 revision=revision,
+                view_id=view_id,
+                image_max_size=image_max_size,
             )
         elif to == ExportDestination.ultralytics:
             export_from_ls_to_ultralytics_object_detection(
@@ -274,6 +302,8 @@ def export(
                 error_raise=error_raise,
                 merge_labels=merge_labels,
                 use_aws_cache=use_aws_cache,
+                view_id=view_id,
+                image_max_size=image_max_size,
             )
 
     elif from_ == ExportSource.hf:
@@ -289,6 +319,7 @@ def export(
                 error_raise=error_raise,
                 use_aws_cache=use_aws_cache,
                 revision=revision,
+                image_max_size=image_max_size,
             )
         else:
             raise typer.BadParameter("Unsupported export format")
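
Note: the new `image_max_size` option caps the longest side of each exported image while keeping the aspect ratio. The actual resizing code is not part of this diff; a minimal sketch of the idea, assuming Pillow:

    from PIL import Image

    def resize_to_max_size(image: Image.Image, max_size: int) -> Image.Image:
        """Downscale so the longest side is at most `max_size`, keeping the aspect ratio."""
        width, height = image.size
        longest = max(width, height)
        if longest <= max_size:
            return image  # already small enough, no resizing
        scale = max_size / longest
        return image.resize(
            (round(width * scale), round(height * scale)), Image.Resampling.LANCZOS
        )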
@@ -327,7 +358,8 @@ def export_llm_ds(
     tmp_dir: Annotated[
         Path | None,
         typer.Option(
-            help="Path to a temporary directory to use for image processing",
+            help="Path to the temporary directory used to store intermediate sample files "
+            "created when building the HF dataset.",
         ),
     ] = None,
     image_max_size: Annotated[
@@ -354,3 +386,102 @@ def export_llm_ds(
         tmp_dir=tmp_dir,
         image_max_size=image_max_size,
     )
+
+
+@app.command()
+def update_llm_ds(
+    dataset_path: Annotated[
+        Path, typer.Option(help="Path to the JSONL containing the updates.")
+    ],
+    repo_id: Annotated[
+        str, typer.Option(help="Hugging Face Datasets repository ID to update")
+    ],
+    split: Annotated[str, typer.Option(help="Dataset split to use")],
+    revision: Annotated[
+        str,
+        typer.Option(
+            help="Revision (branch, tag or commit) to use when pushing the new version "
+            "of the Hugging Face Dataset."
+        ),
+    ] = "main",
+    tmp_dir: Annotated[
+        Path | None,
+        typer.Option(
+            help="Path to a temporary directory to use for image processing",
+        ),
+    ] = None,
+    show_diff: Annotated[
+        bool,
+        typer.Option(
+            help="Show the differences between the original sample and the update. If "
+            "True, the updated dataset is not pushed to the Hub. Useful to review the "
+            "updates before applying them.",
+        ),
+    ] = False,
+):
+    """Update an existing LLM image extraction dataset, by updating the
+    `output` field of each sample in the dataset.
+
+    The `--dataset_path` JSONL file should contain items with two fields:
+
+    - `image_id`: The image ID of the sample to update in the Hugging Face
+      dataset.
+    - `output`: The new output data to set for the sample.
+    """
+    import sys
+    from difflib import Differ
+
+    import orjson
+    from datasets import load_dataset
+    from diskcache import Cache
+
+    dataset = load_dataset(repo_id, split=split)
+
+    # Populate cache with the updates
+    cache = Cache(directory=tmp_dir or None)
+    with dataset_path.open("r") as f:
+        for line in map(orjson.loads, f):
+            if "image_id" not in line or "output" not in line:
+                raise ValueError(
+                    "Each item in the update JSONL file must contain `image_id` and `output` fields"
+                )
+            image_id = line["image_id"]
+            output = line["output"]
+
+            if not isinstance(output, str):
+                output = orjson.dumps(output).decode("utf-8")
+
+            cache[image_id] = output
+
+    def apply_updates(sample):
+        image_id = sample["image_id"]
+        if image_id in cache:
+            cached_item = cache[image_id]
+            sample["output"] = cached_item
+        return sample
+
+    if show_diff:
+        differ = Differ()
+        for sample in dataset:
+            image_id = sample["image_id"]
+            if image_id in cache:
+                cached_item = orjson.loads(cache[image_id])
+                original_item = orjson.loads(sample["output"])
+                cached_item_str = orjson.dumps(
+                    cached_item, option=orjson.OPT_INDENT_2
+                ).decode("utf8")
+                original_item_str = orjson.dumps(
+                    original_item, option=orjson.OPT_INDENT_2
+                ).decode("utf8")
+                diff = list(
+                    differ.compare(
+                        original_item_str.splitlines(keepends=True),
+                        cached_item_str.splitlines(keepends=True),
+                    )
+                )
+                sys.stdout.writelines(diff)
+                sys.stdout.write("\n" + "-" * 30 + "\n")
+
+    else:
+        updated_dataset = dataset.map(apply_updates, batched=False)
+        updated_dataset.push_to_hub(repo_id, split=split, revision=revision)
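
Note: the update file consumed by the new `update_llm_ds` command is a JSONL file where each line carries an `image_id` and the new `output` (either a JSON string or an object; both are accepted by the code above). A sketch that writes such a file, with hypothetical image IDs and payloads:

    import orjson
    from pathlib import Path

    # Hypothetical updates; the real `output` structure depends on the dataset's schema.
    updates = [
        {"image_id": "1234567890123_1", "output": {"product_name": "Granola", "quantity": "500 g"}},
        {"image_id": "1234567890123_2", "output": {"product_name": "Muesli"}},
    ]
    with Path("updates.jsonl").open("wb") as f:
        for item in updates:
            f.write(orjson.dumps(item) + b"\n")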
@@ -0,0 +1,212 @@
+from pathlib import Path
+from typing import Annotated
+
+import requests
+import typer
+
+app = typer.Typer()
+
+
+DEFAULT_DIRECTUS_URL = "http://localhost:8055"
+
+
+def _list_endpoint_iter(
+    url: str,
+    session: requests.Session,
+    page_size: int,
+    method: str = "GET",
+    list_field: str | None = "data",
+    **kwargs,
+):
+    """Iterate over paginated Directus endpoint.
+
+    Args:
+        url (str): URL of the Directus endpoint.
+        session (requests.Session): Requests session to use for making HTTP
+            requests.
+        page_size (int): Number of items to fetch per page.
+        method (str, optional): HTTP method to use. Defaults to "GET".
+        list_field (str | None, optional): Field in the response JSON that
+            contains the list of items. If None, the entire response is used as
+            the list. Defaults to "data".
+        **kwargs: Additional keyword arguments to pass to the requests method.
+    Yields:
+        dict: Items from the Directus endpoint.
+    """
+    page = 0
+    next_page = True
+    params = kwargs.pop("params", {})
+
+    while next_page:
+        params["offset"] = page * page_size
+        params["limit"] = page_size
+        r = session.request(method=method, url=url, params=params, **kwargs)
+        r.raise_for_status()
+        response = r.json()
+        items = response[list_field] if list_field else response
+        if len(items) > 0:
+            yield from items
+        else:
+            next_page = False
+        page += 1
+
+
+def iter_items(
+    collection_name: str,
+    url: str,
+    session: requests.Session,
+    page_size: int = 50,
+    **kwargs,
+):
+    """Iterate over items in a Directus collection.
+
+    Args:
+        collection_name (str): Name of the Directus collection.
+        url (str): Base URL of the Directus server.
+        session (requests.Session): Requests session to use for making HTTP
+            requests.
+        page_size (int, optional): Number of items to fetch per page. Defaults
+            to 50.
+        **kwargs: Additional keyword arguments to pass to the requests method.
+    Yields:
+        dict: Items from the Directus collection.
+    """
+    yield from _list_endpoint_iter(
+        url=f"{url}/items/{collection_name}",
+        session=session,
+        page_size=page_size,
+        **kwargs,
+    )
+
+
+@app.command()
+def upload_data(
+    dataset_path: Annotated[
+        Path,
+        typer.Option(
+            help="Path to the dataset JSONL file to upload from.",
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+        ),
+    ],
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+):
+    """Upload data to a Directus collection."""
+    import orjson
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    with dataset_path.open("r") as f:
+        for item in tqdm.tqdm(map(orjson.loads, f), desc="items"):
+            r = session.post(
+                f"{directus_url}/items/{collection}",
+                json=item,
+            )
+            print(r.json())
+            r.raise_for_status()
+
+
+@app.command()
+def update_items(
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+    sort: Annotated[
+        str | None,
+        typer.Option(help="The field to sort items by, defaults to None (no sorting)."),
+    ] = None,
+    skip: Annotated[
+        int, typer.Option(help="Number of items to skip, defaults to 0.")
+    ] = 0,
+):
+    """Update items in a Directus collection.
+
+    **Warning**: This command requires you to implement the processing
+    function inside the command. It is provided as a template for batch
+    updating items in a Directus collection.
+    """
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    params = {} if sort is None else {"sort[]": sort}
+    for i, item in tqdm.tqdm(
+        enumerate(
+            iter_items(
+                collection_name=collection,
+                url=directus_url,
+                session=session,
+                params=params,
+            )
+        )
+    ):
+        if i < skip:
+            typer.echo(f"Skipping item {i}")
+            continue
+
+        item_id = item["id"]
+        # Implement your processing function here
+        # It should return a dict with the fields to update only
+        # If no update is needed, it should return None
+        patch_item = None
+
+        if patch_item is not None:
+            r = session.patch(
+                f"{directus_url}/items/{collection}/{item_id}",
+                json=patch_item,
+            )
+            r.raise_for_status()
+
+
+@app.command()
+def export_data(
+    output_path: Annotated[
+        Path, typer.Option(help="Path to the file to export to.", allow_dash=True)
+    ],
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+):
+    """Export a directus collection to a JSONL file."""
+    import sys
+
+    import orjson
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    f = sys.stdout if output_path.as_posix() == "-" else output_path.open("w")
+    with f:
+        for item in tqdm.tqdm(
+            iter_items(
+                collection_name=collection,
+                url=directus_url,
+                session=session,
+            )
+        ):
+            f.write(orjson.dumps(item).decode("utf-8") + "\n")
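
Note: the new Directus module above exposes `iter_items`, a paginated iterator over a collection. A minimal usage sketch outside the CLI; the import path, collection name, and server URL are assumptions, not shown in this diff:

    import requests

    # Hypothetical import path for the new module added in this release.
    from labelr.apps.directus import iter_items

    session = requests.Session()
    # Iterate over every item of a hypothetical "products" collection, 100 per page.
    for item in iter_items(
        collection_name="products",
        url="http://localhost:8055",
        session=session,
        page_size=100,
    ):
        print(item["id"])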
@@ -7,6 +7,7 @@ import typer
 from google.genai.types import JSONSchema as GoogleJSONSchema
 from google.genai.types import Schema as GoogleSchema
 from openfoodfacts import Flavor
+from openfoodfacts.types import JSONType
 from pydantic import BaseModel
 
 from labelr.google_genai import generate_batch_dataset, launch_batch_job
@@ -14,6 +15,40 @@ from labelr.google_genai import generate_batch_dataset, launch_batch_job
 app = typer.Typer()
 
 
+def _check_json_schema(item: JSONType) -> None:
+    if item.get("type") == "object":
+        required_fields = item.get("required", [])
+        all_fields = item.get("properties", [])
+        diff = set(all_fields) - set(required_fields)
+        if diff:
+            raise ValueError(
+                f"fields '{diff}' must be marked as required in the JSONSchema. "
+                "All fields with type 'object' must be required."
+            )
+    return None
+
+
+def check_json_schema(json_schema: JSONType) -> None:
+    """Check that for all `object`s, all fields are marked as required.
+
+    This is important to check, as otherwise the structured generation
+    backend may prevent the model to generate these fields.
+    This is the case as of vLLM 0.13 and xgrammars as backend.
+
+    To prevent this, we ask all fields to be marked as required.
+    """
+    stack = [json_schema]
+
+    for def_item in json_schema.get("$defs", {}).values():
+        stack.append(def_item)
+
+    while stack:
+        item = stack.pop()
+        _check_json_schema(item)
+        for sub_item in item.get("properties", {}).values():
+            stack.append(sub_item)
+
+
 def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
     """Google doesn't support natively OpenAPI schemas, so we convert them to
     Google `Schema` (a subset of OpenAPI)."""
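
Note: the new `check_json_schema` helper rejects any `object` whose properties are not all listed under `required`. A short illustration (assumes `check_json_schema` from the hunk above is in scope; the schema contents are hypothetical):

    # Fails: "brand" is a property but is not listed under "required", so the
    # structured-generation backend could silently drop it (see the docstring above).
    bad_schema = {
        "type": "object",
        "properties": {"name": {"type": "string"}, "brand": {"type": "string"}},
        "required": ["name"],
    }

    # Passes: every property of the object is marked as required.
    good_schema = {
        "type": "object",
        "properties": {"name": {"type": "string"}, "brand": {"type": "string"}},
        "required": ["name", "brand"],
    }

    check_json_schema(good_schema)  # returns None
    check_json_schema(bad_schema)   # raises ValueError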
@@ -260,6 +295,9 @@ def upload_training_dataset_from_predictions(
     print(f"Instructions: {instructions}")
     json_schema = orjson.loads(json_schema_path.read_text())
 
+    # We check that all fields are marked as required
+    check_json_schema(json_schema)
+
     api = HfApi()
     config = {
         "instructions": instructions,