labelr 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/apps/datasets.py CHANGED
@@ -12,9 +12,14 @@ import typer
 from openfoodfacts import Flavor
 from openfoodfacts.utils import get_logger
 
-from labelr.export import export_from_ultralytics_to_hf
-
-from ..config import LABEL_STUDIO_DEFAULT_URL
+from labelr.export.common import export_from_ultralytics_to_hf
+from labelr.export.object_detection import (
+    export_from_ls_to_hf_object_detection,
+    export_from_ls_to_ultralytics_object_detection,
+)
+
+from . import typer_description
+from ..config import config
 from ..types import ExportDestination, ExportSource, TaskType
 
 app = typer.Typer()
@@ -99,7 +104,9 @@ def convert_object_detection_dataset(
     Studio format, and save it to a JSON file."""
     from datasets import load_dataset
 
-    from labelr.sample import format_object_detection_sample_from_hf_to_ls
+    from labelr.sample.object_detection import (
+        format_object_detection_sample_from_hf_to_ls,
+    )
 
     logger.info("Loading dataset: %s", repo_id)
     ds = load_dataset(repo_id)
@@ -119,7 +126,9 @@ def convert_object_detection_dataset(
 def export(
     from_: Annotated[ExportSource, typer.Option("--from", help="Input source to use")],
     to: Annotated[ExportDestination, typer.Option(help="Where to export the data")],
-    api_key: Annotated[Optional[str], typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    api_key: Annotated[
+        str | None, typer.Option(help=typer_description.LABEL_STUDIO_API_KEY)
+    ] = config.label_studio_api_key,
     task_type: Annotated[
         TaskType, typer.Option(help="Type of task to export")
     ] = TaskType.object_detection,
@@ -136,7 +145,16 @@ def export(
     project_id: Annotated[
         Optional[int], typer.Option(help="Label Studio Project ID")
     ] = None,
-    label_studio_url: Optional[str] = LABEL_STUDIO_DEFAULT_URL,
+    view_id: Annotated[
+        int | None,
+        typer.Option(
+            help="ID of the Label Studio view, if any. This option is useful "
+            "to filter the task to export."
+        ),
+    ] = None,
+    label_studio_url: Annotated[
+        str, typer.Option(help=typer_description.LABEL_STUDIO_URL)
+    ] = config.label_studio_url,
     output_dir: Annotated[
         Optional[Path],
         typer.Option(
@@ -157,11 +175,15 @@ def export(
     is_openfoodfacts_dataset: Annotated[
         bool,
         typer.Option(
-            help="Whether the Ultralytics dataset is an OpenFoodFacts dataset, only "
-            "for Ultralytics source. This is used to generate the correct image URLs "
-            "each image name."
+            help="Whether the Ultralytics dataset is an Open Food Facts dataset, only "
+            "for Ultralytics source. This is used:\n"
+            "- to generate the correct image URLs from each image name, when exporting "
+            "from Ultralytics to Hugging Face Datasets.\n"
+            "- to include additional metadata fields specific to Open Food Facts "
+            "(`barcode` and `off_image_id`) when exporting from Label Studio to "
+            "Hugging Face Datasets."
         ),
-    ] = True,
+    ] = False,
     openfoodfacts_flavor: Annotated[
         Flavor,
         typer.Option(
@@ -175,9 +197,18 @@ def export(
         float,
         typer.Option(
             help="Train ratio for splitting the dataset, if the split name is not "
-            "provided (typically, if the source is Label Studio)"
+            "provided. Only used if the source is Label Studio and the destination "
+            "is Ultralytics."
         ),
     ] = 0.8,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed."
+            "Otherwise, the longest side of the image will be resized to this value, "
+            "keeping the aspect ratio."
+        ),
+    ] = None,
     error_raise: Annotated[
         bool,
         typer.Option(
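
The new `image_max_size` option documented above resizes the longest image side to the given value while keeping the aspect ratio. As a rough illustration of that behaviour (not the package's actual implementation; the helper name and the use of Pillow are assumptions), the computation looks like:

from PIL import Image


def resize_longest_side(image: Image.Image, image_max_size: int) -> Image.Image:
    # Illustrative only: cap the longest side at image_max_size, preserving aspect ratio.
    width, height = image.size
    if max(width, height) <= image_max_size:
        return image  # already small enough, no resizing needed
    scale = image_max_size / max(width, height)
    return image.resize((round(width * scale), round(height * scale)))
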
@@ -207,10 +238,8 @@ def export(
     local files (ultralytics format)."""
     from label_studio_sdk.client import LabelStudio
 
-    from labelr.export import (
+    from labelr.export.object_detection import (
         export_from_hf_to_ultralytics_object_detection,
-        export_from_ls_to_hf_object_detection,
-        export_from_ls_to_ultralytics_object_detection,
     )
 
     if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
@@ -256,9 +285,12 @@ def export(
                 repo_id=repo_id,
                 label_names=typing.cast(list[str], label_names_list),
                 project_id=typing.cast(int, project_id),
+                is_openfoodfacts_dataset=is_openfoodfacts_dataset,
                 merge_labels=merge_labels,
                 use_aws_cache=use_aws_cache,
                 revision=revision,
+                view_id=view_id,
+                image_max_size=image_max_size,
             )
         elif to == ExportDestination.ultralytics:
             export_from_ls_to_ultralytics_object_detection(
@@ -270,6 +302,8 @@ def export(
                 error_raise=error_raise,
                 merge_labels=merge_labels,
                 use_aws_cache=use_aws_cache,
+                view_id=view_id,
+                image_max_size=image_max_size,
             )
 
     elif from_ == ExportSource.hf:
@@ -285,6 +319,7 @@ def export(
                 error_raise=error_raise,
                 use_aws_cache=use_aws_cache,
                 revision=revision,
+                image_max_size=image_max_size,
             )
         else:
             raise typer.BadParameter("Unsupported export format")
@@ -303,3 +338,150 @@ def export(
             is_openfoodfacts_dataset=is_openfoodfacts_dataset,
             openfoodfacts_flavor=openfoodfacts_flavor,
         )
+
+
+@app.command()
+def export_llm_ds(
+    dataset_path: Annotated[
+        Path, typer.Option(..., help="Path to the JSONL dataset file")
+    ],
+    repo_id: Annotated[
+        str, typer.Option(..., help="Hugging Face Datasets repository ID to export to")
+    ],
+    split: Annotated[str, typer.Option(..., help="Dataset split to export")],
+    revision: Annotated[
+        str,
+        typer.Option(
+            help="Revision (branch, tag or commit) for the Hugging Face Datasets repository."
+        ),
+    ] = "main",
+    tmp_dir: Annotated[
+        Path | None,
+        typer.Option(
+            help="Path to the temporary directory used to store intermediate sample files "
+            "created when building the HF dataset.",
+        ),
+    ] = None,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
+        ),
+    ] = None,
+):
+    """Export LLM image extraction dataset with images only to Hugging Face
+    Datasets.
+    """
+    from labelr.export.llm import export_to_hf_llm_image_extraction
+    from labelr.sample.llm import load_llm_image_extraction_dataset_from_jsonl
+
+    sample_iter = load_llm_image_extraction_dataset_from_jsonl(
+        dataset_path=dataset_path
+    )
+    export_to_hf_llm_image_extraction(
+        sample_iter,
+        split=split,
+        repo_id=repo_id,
+        revision=revision,
+        tmp_dir=tmp_dir,
+        image_max_size=image_max_size,
+    )
+
+
+@app.command()
+def update_llm_ds(
+    dataset_path: Annotated[
+        Path, typer.Option(help="Path to the JSONL containing the updates.")
+    ],
+    repo_id: Annotated[
+        str, typer.Option(help="Hugging Face Datasets repository ID to update")
+    ],
+    split: Annotated[str, typer.Option(help="Dataset split to use")],
+    revision: Annotated[
+        str,
+        typer.Option(
+            help="Revision (branch, tag or commit) to use when pushing the new version "
+            "of the Hugging Face Dataset."
+        ),
+    ] = "main",
+    tmp_dir: Annotated[
+        Path | None,
+        typer.Option(
+            help="Path to a temporary directory to use for image processing",
+        ),
+    ] = None,
+    show_diff: Annotated[
+        bool,
+        typer.Option(
+            help="Show the differences between the original sample and the update. If "
+            "True, the updated dataset is not pushed to the Hub. Useful to review the "
+            "updates before applying them.",
+        ),
+    ] = False,
+):
+    """Update an existing LLM image extraction dataset, by updating the
+    `output` field of each sample in the dataset.
+
+    The `--dataset_path` JSONL file should contain items with two fields:
+
+    - `image_id`: The image ID of the sample to update in the Hugging Face
+      dataset.
+    - `output`: The new output data to set for the sample.
+    """
+    import sys
+    from difflib import Differ
+
+    import orjson
+    from datasets import load_dataset
+    from diskcache import Cache
+
+    dataset = load_dataset(repo_id, split=split)
+
+    # Populate cache with the updates
+    cache = Cache(directory=tmp_dir or None)
+    with dataset_path.open("r") as f:
+        for line in map(orjson.loads, f):
+            if "image_id" not in line or "output" not in line:
+                raise ValueError(
+                    "Each item in the update JSONL file must contain `image_id` and `output` fields"
+                )
+            image_id = line["image_id"]
+            output = line["output"]
+
+            if not isinstance(output, str):
+                output = orjson.dumps(output).decode("utf-8")
+
+            cache[image_id] = output
+
+    def apply_updates(sample):
+        image_id = sample["image_id"]
+        if image_id in cache:
+            cached_item = cache[image_id]
+            sample["output"] = cached_item
+        return sample
+
+    if show_diff:
+        differ = Differ()
+        for sample in dataset:
+            image_id = sample["image_id"]
+            if image_id in cache:
+                cached_item = orjson.loads(cache[image_id])
+                original_item = orjson.loads(sample["output"])
+                cached_item_str = orjson.dumps(
+                    cached_item, option=orjson.OPT_INDENT_2
+                ).decode("utf8")
+                original_item_str = orjson.dumps(
+                    original_item, option=orjson.OPT_INDENT_2
+                ).decode("utf8")
+                diff = list(
+                    differ.compare(
+                        original_item_str.splitlines(keepends=True),
+                        cached_item_str.splitlines(keepends=True),
+                    )
+                )
+                sys.stdout.writelines(diff)
+                sys.stdout.write("\n" + "-" * 30 + "\n")
+
+    else:
+        updated_dataset = dataset.map(apply_updates, batched=False)
+        updated_dataset.push_to_hub(repo_id, split=split, revision=revision)
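
For reference, the JSONL file consumed by the new `update_llm_ds` command pairs an `image_id` with the new `output` value, which may be a JSON object or an already-serialized string (see the command body above). A minimal sketch of producing such a file; the file name and values are hypothetical:

import orjson

updates = [
    {"image_id": "img-001", "output": {"product_name": "Example"}},
    {"image_id": "img-002", "output": '{"product_name": "Other"}'},  # pre-serialized output is also accepted
]

# Write one JSON object per line, as expected by --dataset-path.
with open("updates.jsonl", "wb") as f:
    for item in updates:
        f.write(orjson.dumps(item) + b"\n")
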
@@ -0,0 +1,212 @@
+from pathlib import Path
+from typing import Annotated
+
+import requests
+import typer
+
+app = typer.Typer()
+
+
+DEFAULT_DIRECTUS_URL = "http://localhost:8055"
+
+
+def _list_endpoint_iter(
+    url: str,
+    session: requests.Session,
+    page_size: int,
+    method: str = "GET",
+    list_field: str | None = "data",
+    **kwargs,
+):
+    """Iterate over paginated Directus endpoint.
+
+    Args:
+        url (str): URL of the Directus endpoint.
+        session (requests.Session): Requests session to use for making HTTP
+            requests.
+        page_size (int): Number of items to fetch per page.
+        method (str, optional): HTTP method to use. Defaults to "GET".
+        list_field (str | None, optional): Field in the response JSON that
+            contains the list of items. If None, the entire response is used as
+            the list. Defaults to "data".
+        **kwargs: Additional keyword arguments to pass to the requests method.
+    Yields:
+        dict: Items from the Directus endpoint.
+    """
+    page = 0
+    next_page = True
+    params = kwargs.pop("params", {})
+
+    while next_page:
+        params["offset"] = page * page_size
+        params["limit"] = page_size
+        r = session.request(method=method, url=url, params=params, **kwargs)
+        r.raise_for_status()
+        response = r.json()
+        items = response[list_field] if list_field else response
+        if len(items) > 0:
+            yield from items
+        else:
+            next_page = False
+        page += 1
+
+
+def iter_items(
+    collection_name: str,
+    url: str,
+    session: requests.Session,
+    page_size: int = 50,
+    **kwargs,
+):
+    """Iterate over items in a Directus collection.
+
+    Args:
+        collection_name (str): Name of the Directus collection.
+        url (str): Base URL of the Directus server.
+        session (requests.Session): Requests session to use for making HTTP
+            requests.
+        page_size (int, optional): Number of items to fetch per page. Defaults
+            to 50.
+        **kwargs: Additional keyword arguments to pass to the requests method.
+    Yields:
+        dict: Items from the Directus collection.
+    """
+    yield from _list_endpoint_iter(
+        url=f"{url}/items/{collection_name}",
+        session=session,
+        page_size=page_size,
+        **kwargs,
+    )
+
+
+@app.command()
+def upload_data(
+    dataset_path: Annotated[
+        Path,
+        typer.Option(
+            help="Path to the dataset JSONL file to upload from.",
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+        ),
+    ],
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+):
+    """Upload data to a Directus collection."""
+    import orjson
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    with dataset_path.open("r") as f:
+        for item in tqdm.tqdm(map(orjson.loads, f), desc="items"):
+            r = session.post(
+                f"{directus_url}/items/{collection}",
+                json=item,
+            )
+            print(r.json())
+            r.raise_for_status()
+
+
+@app.command()
+def update_items(
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+    sort: Annotated[
+        str | None,
+        typer.Option(help="The field to sort items by, defaults to None (no sorting)."),
+    ] = None,
+    skip: Annotated[
+        int, typer.Option(help="Number of items to skip, defaults to 0.")
+    ] = 0,
+):
+    """Update items in a Directus collection.
+
+    **Warning**: This command requires you to implement the processing
+    function inside the command. It is provided as a template for batch
+    updating items in a Directus collection.
+    """
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    params = {} if sort is None else {"sort[]": sort}
+    for i, item in tqdm.tqdm(
+        enumerate(
+            iter_items(
+                collection_name=collection,
+                url=directus_url,
+                session=session,
+                params=params,
+            )
+        )
+    ):
+        if i < skip:
+            typer.echo(f"Skipping item {i}")
+            continue
+
+        item_id = item["id"]
+        # Implement your processing function here
+        # It should return a dict with the fields to update only
+        # If no update is needed, it should return None
+        patch_item = None
+
+        if patch_item is not None:
+            r = session.patch(
+                f"{directus_url}/items/{collection}/{item_id}",
+                json=patch_item,
+            )
+            r.raise_for_status()
+
+
+@app.command()
+def export_data(
+    output_path: Annotated[
+        Path, typer.Option(help="Path to the file to export to.", allow_dash=True)
+    ],
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+):
+    """Export a directus collection to a JSONL file."""
+    import sys
+
+    import orjson
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    f = sys.stdout if output_path.as_posix() == "-" else output_path.open("w")
+    with f:
+        for item in tqdm.tqdm(
+            iter_items(
+                collection_name=collection,
+                url=directus_url,
+                session=session,
+            )
+        ):
+            f.write(orjson.dumps(item).decode("utf-8") + "\n")
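
A minimal usage sketch for the pagination helpers added above. The collection name is a placeholder, and since the module path of this new file is not shown in the diff, `iter_items` is assumed to be importable or already in scope:

import requests

# iter_items is the generator defined in the new file above; its import path is not shown here.
session = requests.Session()
for item in iter_items(
    collection_name="products",  # hypothetical collection
    url="http://localhost:8055",  # DEFAULT_DIRECTUS_URL from the file above
    session=session,
    page_size=100,
):
    print(item["id"])
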
@@ -7,6 +7,7 @@ import typer
 from google.genai.types import JSONSchema as GoogleJSONSchema
 from google.genai.types import Schema as GoogleSchema
 from openfoodfacts import Flavor
+from openfoodfacts.types import JSONType
 from pydantic import BaseModel
 
 from labelr.google_genai import generate_batch_dataset, launch_batch_job
@@ -14,6 +15,40 @@ from labelr.google_genai import generate_batch_dataset, launch_batch_job
 app = typer.Typer()
 
 
+def _check_json_schema(item: JSONType) -> None:
+    if item.get("type") == "object":
+        required_fields = item.get("required", [])
+        all_fields = item.get("properties", [])
+        diff = set(all_fields) - set(required_fields)
+        if diff:
+            raise ValueError(
+                f"fields '{diff}' must be marked as required in the JSONSchema. "
+                "All fields with type 'object' must be required."
+            )
+    return None
+
+
+def check_json_schema(json_schema: JSONType) -> None:
+    """Check that for all `object`s, all fields are marked as required.
+
+    This is important to check, as otherwise the structured generation
+    backend may prevent the model to generate these fields.
+    This is the case as of vLLM 0.13 and xgrammars as backend.
+
+    To prevent this, we ask all fields to be marked as required.
+    """
+    stack = [json_schema]
+
+    for def_item in json_schema.get("$defs", {}).values():
+        stack.append(def_item)
+
+    while stack:
+        item = stack.pop()
+        _check_json_schema(item)
+        for sub_item in item.get("properties", {}).values():
+            stack.append(sub_item)
+
+
 def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
     """Google doesn't support natively OpenAPI schemas, so we convert them to
     Google `Schema` (a subset of OpenAPI)."""
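
A quick sketch of what the new `check_json_schema` helper enforces, assuming the function defined above is in scope; the schemas themselves are made up:

# Every property of an object must also appear in "required".
good_schema = {
    "type": "object",
    "properties": {
        "product_name": {"type": "string"},
        "brand": {"type": "string"},
    },
    "required": ["product_name", "brand"],
}
check_json_schema(good_schema)  # passes silently

bad_schema = {
    "type": "object",
    "properties": {
        "product_name": {"type": "string"},
        "brand": {"type": "string"},
    },
    "required": ["product_name"],  # "brand" is missing from "required"
}
check_json_schema(bad_schema)  # raises ValueError
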
@@ -239,6 +274,12 @@ def upload_training_dataset_from_predictions(
             help="Whether to raise an error on invalid samples instead of skipping them",
         ),
     ] = False,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
+        ),
+    ] = None,
 ):
     """Upload a training dataset to a Hugging Face Datasets repository from a
     Gemini batch prediction file."""
@@ -247,13 +288,16 @@ def upload_training_dataset_from_predictions(
     import orjson
     from huggingface_hub import HfApi
 
-    from labelr.export import export_to_hf_llm_image_extraction
+    from labelr.export.llm import export_to_hf_llm_image_extraction
     from labelr.google_genai import generate_sample_iter
 
     instructions = instructions_path.read_text()
     print(f"Instructions: {instructions}")
     json_schema = orjson.loads(json_schema_path.read_text())
 
+    # We check that all fields are marked as required
+    check_json_schema(json_schema)
+
     api = HfApi()
     config = {
         "instructions": instructions,
@@ -286,4 +330,5 @@ def upload_training_dataset_from_predictions(
         repo_id=repo_id,
         revision=revision,
         tmp_dir=tmp_dir,
+        image_max_size=image_max_size,
     )