labelr 0.8.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {labelr-0.8.0/src/labelr.egg-info → labelr-0.9.0}/PKG-INFO +10 -1
  2. {labelr-0.8.0 → labelr-0.9.0}/pyproject.toml +8 -2
  3. labelr-0.9.0/src/labelr/apps/google_batch.py +289 -0
  4. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/export.py +63 -0
  5. labelr-0.9.0/src/labelr/google_genai.py +415 -0
  6. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/main.py +6 -0
  7. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/sample.py +43 -1
  8. labelr-0.9.0/src/labelr/utils.py +48 -0
  9. {labelr-0.8.0 → labelr-0.9.0/src/labelr.egg-info}/PKG-INFO +10 -1
  10. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/SOURCES.txt +2 -1
  11. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/requires.txt +10 -0
  12. labelr-0.8.0/src/labelr/evaluate/llm.py +0 -0
  13. labelr-0.8.0/src/labelr/utils.py +0 -13
  14. {labelr-0.8.0 → labelr-0.9.0}/LICENSE +0 -0
  15. {labelr-0.8.0 → labelr-0.9.0}/README.md +0 -0
  16. {labelr-0.8.0 → labelr-0.9.0}/setup.cfg +0 -0
  17. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/__init__.py +0 -0
  18. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/__main__.py +0 -0
  19. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/annotate.py +0 -0
  20. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/__init__.py +0 -0
  21. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/datasets.py +0 -0
  22. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/evaluate.py +0 -0
  23. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/hugging_face.py +0 -0
  24. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/label_studio.py +0 -0
  25. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/train.py +0 -0
  26. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/check.py +0 -0
  27. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/config.py +0 -0
  28. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/dataset_features.py +0 -0
  29. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/evaluate/__init__.py +0 -0
  30. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/evaluate/object_detection.py +0 -0
  31. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/project_config.py +0 -0
  32. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/types.py +0 -0
  33. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/dependency_links.txt +0 -0
  34. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/entry_points.txt +0 -0
  35. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: labelr
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: A command-line tool to manage labeling tasks with Label Studio.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -13,10 +13,19 @@ Requires-Dist: openfoodfacts>=2.9.0
13
13
  Requires-Dist: typer>=0.15.1
14
14
  Requires-Dist: google-cloud-batch==0.18.0
15
15
  Requires-Dist: huggingface-hub
16
+ Requires-Dist: deepdiff>=8.6.1
17
+ Requires-Dist: rapidfuzz>=3.14.3
18
+ Requires-Dist: aiohttp
19
+ Requires-Dist: aiofiles
20
+ Requires-Dist: orjson
16
21
  Provides-Extra: ultralytics
17
22
  Requires-Dist: ultralytics==8.3.223; extra == "ultralytics"
18
23
  Provides-Extra: fiftyone
19
24
  Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
25
+ Provides-Extra: google
26
+ Requires-Dist: google-genai>=1.56.0; extra == "google"
27
+ Requires-Dist: gcloud-aio-storage; extra == "google"
28
+ Requires-Dist: google-cloud-storage; extra == "google"
20
29
  Dynamic: license-file
21
30
 
22
31
  # Labelr
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "labelr"
3
- version = "0.8.0"
3
+ version = "0.9.0"
4
4
  description = "A command-line tool to manage labeling tasks with Label Studio."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -12,7 +12,12 @@ dependencies = [
12
12
  "openfoodfacts>=2.9.0",
13
13
  "typer>=0.15.1",
14
14
  "google-cloud-batch==0.18.0",
15
- "huggingface-hub"
15
+ "huggingface-hub",
16
+ "deepdiff>=8.6.1",
17
+ "rapidfuzz>=3.14.3",
18
+ "aiohttp",
19
+ "aiofiles",
20
+ "orjson",
16
21
  ]
17
22
 
18
23
  [project.scripts]
@@ -25,6 +30,7 @@ ultralytics = [
25
30
  fiftyone = [
26
31
  "fiftyone~=1.10.0"
27
32
  ]
33
+ google = ["google-genai >= 1.56.0", "gcloud-aio-storage", "google-cloud-storage"]
28
34
 
29
35
  [tool.uv]
30
36
  package = true
@@ -0,0 +1,289 @@
1
+ import asyncio
2
+ import importlib
3
+ from pathlib import Path
4
+ from typing import Annotated, Any
5
+
6
+ import typer
7
+ from google.genai.types import JSONSchema as GoogleJSONSchema
8
+ from google.genai.types import Schema as GoogleSchema
9
+ from openfoodfacts import Flavor
10
+ from pydantic import BaseModel
11
+
12
+ from labelr.google_genai import generate_batch_dataset, launch_batch_job
13
+
14
+ app = typer.Typer()
15
+
16
+
17
+ def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
18
+ """Google doesn't natively support OpenAPI schemas, so we convert them to
19
+ Google `Schema` (a subset of OpenAPI)."""
20
+ return GoogleSchema.from_json_schema(
21
+ json_schema=GoogleJSONSchema.model_validate(schema.model_json_schema())
22
+ ).model_dump(mode="json", exclude_none=True, exclude_unset=True)
23
+
24
+
25
+ @app.command()
26
+ def generate_dataset(
27
+ data_path: Annotated[
28
+ Path,
29
+ typer.Option(
30
+ ...,
31
+ help="Path to a JSONL file containing the raw batch samples.",
32
+ exists=True,
33
+ dir_okay=False,
34
+ resolve_path=True,
35
+ ),
36
+ ],
37
+ output_path: Annotated[
38
+ Path,
39
+ typer.Option(
40
+ ...,
41
+ help="Path where to write the generated dataset file.",
42
+ exists=False,
43
+ dir_okay=False,
44
+ resolve_path=True,
45
+ ),
46
+ ],
47
+ config_module: Annotated[
48
+ str,
49
+ typer.Option(
50
+ ...,
51
+ help="Python module path (e.g., 'myschema') containing two variables: "
52
+ "OUTPUT_SCHEMA (a Pydantic class representing the output schema) and "
53
+ "INSTRUCTIONS (a str containing instructions to add before each sample).",
54
+ ),
55
+ ],
56
+ bucket_name: Annotated[
57
+ str,
58
+ typer.Option(
59
+ ...,
60
+ help="Name of the GCS bucket where the images are stored.",
61
+ ),
62
+ ] = "robotoff-batch",
63
+ bucket_dir_name: Annotated[
64
+ str,
65
+ typer.Option(
66
+ ...,
67
+ help="Directory name in the GCS bucket where the images are stored.",
68
+ ),
69
+ ] = "gemini-batch-images",
70
+ max_concurrent_uploads: Annotated[
71
+ int,
72
+ typer.Option(
73
+ ...,
74
+ help="Maximum number of concurrent uploads to GCS.",
75
+ ),
76
+ ] = 30,
77
+ base_image_dir: Annotated[
78
+ Path | None,
79
+ typer.Option(
80
+ ...,
81
+ help="Base directory to resolve local image paths from.",
82
+ ),
83
+ ] = None,
84
+ from_key: Annotated[
85
+ str | None,
86
+ typer.Option(
87
+ ...,
88
+ help="If specified, resume processing from this sample key.",
89
+ ),
90
+ ] = None,
91
+ skip_upload: Annotated[
92
+ bool, typer.Option(..., help="Skip uploading images to GCS")
93
+ ] = False,
94
+ thinking_level: Annotated[
95
+ str | None,
96
+ typer.Option(
97
+ ...,
98
+ help="Thinking level to use for the generation config.",
99
+ ),
100
+ ] = None,
101
+ ):
102
+ """Generate a dataset file in JSONL format to be used for batch
103
+ processing, using Gemini Batch Inference."""
104
+ typer.echo(f"Uploading images from '{data_path}' to GCS bucket '{bucket_name}'...")
105
+ typer.echo(f"Writing updated dataset to {output_path}...")
106
+ typer.echo(f"Max concurrent uploads: {max_concurrent_uploads}...")
107
+ typer.echo(f"Base image directory: {base_image_dir}...")
108
+ typer.echo(f"From key: {from_key}...")
109
+ typer.echo(f"Skip upload: {skip_upload}...")
110
+ typer.echo(f"Thinking level: {thinking_level}...")
111
+
112
+ module = importlib.import_module(config_module)
113
+ base_cls = getattr(module, "OUTPUT_SCHEMA")
114
+
115
+ if not issubclass(base_cls, BaseModel):
116
+ typer.echo(
117
+ f"Error: {config_module}.OUTPUT_SCHEMA is not a subclass of pydantic.BaseModel"
118
+ )
119
+ raise typer.Exit(code=1)
120
+
121
+ instructions = getattr(module, "INSTRUCTIONS", None) or None
122
+
123
+ if instructions:
124
+ typer.echo(f"Using instructions: '{instructions}'...")
125
+ else:
126
+ typer.echo("No instructions provided.")
127
+
128
+ # JSON Schema is supoorted natively by Vertex AI and Gemini APIs,
129
+ # but not yet on Batch Inference...
130
+ # So we convert the JSON schema to Google internal "Schema"
131
+ # google_json_schema = base_cls.model_json_schema()
132
+ google_json_schema = convert_pydantic_model_to_google_schema(base_cls)
133
+ asyncio.run(
134
+ generate_batch_dataset(
135
+ data_path=data_path,
136
+ output_path=output_path,
137
+ google_json_schema=google_json_schema,
138
+ instructions=instructions,
139
+ bucket_name=bucket_name,
140
+ bucket_dir_name=bucket_dir_name,
141
+ max_concurrent_uploads=max_concurrent_uploads,
142
+ base_image_dir=base_image_dir,
143
+ from_key=from_key,
144
+ skip_upload=skip_upload,
145
+ thinking_level=thinking_level,
146
+ )
147
+ )
148
+
149
+
150
+ @app.command(name="launch-batch-job")
151
+ def launch_batch_job_command(
152
+ run_name: Annotated[str, typer.Argument(..., help="Name of the batch job run")],
153
+ dataset_path: Annotated[Path, typer.Option(..., help="Path to the dataset file")],
154
+ model: Annotated[str, typer.Option(..., help="Model to use for the batch job")],
155
+ location: Annotated[
156
+ str,
157
+ typer.Option(..., help="GCP location where to run the batch job"),
158
+ ] = "europe-west4",
159
+ ):
160
+ """Launch a Gemini Batch Inference job."""
161
+ launch_batch_job(
162
+ run_name=run_name,
163
+ dataset_path=dataset_path,
164
+ model=model,
165
+ location=location,
166
+ )
167
+
168
+
169
+ @app.command()
170
+ def upload_training_dataset_from_predictions(
171
+ prediction_path: Annotated[
172
+ Path,
173
+ typer.Argument(
174
+ ...,
175
+ help="Path to the prediction JSONL file generated by Google Inference Batch",
176
+ exists=True,
177
+ dir_okay=False,
178
+ readable=True,
179
+ ),
180
+ ],
181
+ instructions_path: Annotated[
182
+ Path,
183
+ typer.Option(
184
+ ...,
185
+ help="Path to the file with the instruction prompt for the model",
186
+ exists=True,
187
+ dir_okay=False,
188
+ readable=True,
189
+ ),
190
+ ],
191
+ json_schema_path: Annotated[
192
+ Path,
193
+ typer.Option(
194
+ ...,
195
+ help="Path to the file with the JSON schema to follow",
196
+ dir_okay=False,
197
+ readable=True,
198
+ ),
199
+ ],
200
+ repo_id: Annotated[
201
+ str, typer.Option(help="Hugging Face Datasets repository ID to push to")
202
+ ],
203
+ revision: Annotated[
204
+ str,
205
+ typer.Option(
206
+ help="Revision (branch, tag or commit) to use for the Hugging Face Datasets repository"
207
+ ),
208
+ ] = "main",
209
+ is_openfoodfacts_dataset: Annotated[
210
+ bool, typer.Option(..., help="Whether this is an Open Food Facts dataset")
211
+ ] = False,
212
+ openfoodfacts_flavor: Annotated[
213
+ Flavor,
214
+ typer.Option(
215
+ ...,
216
+ help="Open Food Facts flavor of the dataset (if applicable)",
217
+ ),
218
+ ] = Flavor.off,
219
+ split: Annotated[str, typer.Option(..., help="Name of the split")] = "train",
220
+ tmp_dir: Annotated[
221
+ Path | None,
222
+ typer.Option(
223
+ ...,
224
+ help="Temporary directory to use for intermediate files, default to a temporary directory "
225
+ "generated automatically. This is useful to relaunch the command if it fails midway.",
226
+ ),
227
+ ] = None,
228
+ skip: Annotated[int, typer.Option(..., help="Number of samples to skip")] = 0,
229
+ limit: Annotated[
230
+ int | None,
231
+ typer.Option(
232
+ ..., help="Limit number of samples to process, or None for no limit"
233
+ ),
234
+ ] = None,
235
+ raise_on_invalid_sample: Annotated[
236
+ bool,
237
+ typer.Option(
238
+ ...,
239
+ help="Whether to raise an error on invalid samples instead of skipping them",
240
+ ),
241
+ ] = False,
242
+ ):
243
+ """Upload a training dataset to a Hugging Face Datasets repository from a
244
+ Gemini batch prediction file."""
245
+ import tempfile
246
+
247
+ import orjson
248
+ from huggingface_hub import HfApi
249
+
250
+ from labelr.export import export_to_hf_llm_image_extraction
251
+ from labelr.google_genai import generate_sample_iter
252
+
253
+ instructions = instructions_path.read_text()
254
+ print(f"Instructions: {instructions}")
255
+ json_schema = orjson.loads(json_schema_path.read_text())
256
+
257
+ api = HfApi()
258
+ config = {
259
+ "instructions": instructions,
260
+ "json_schema": json_schema,
261
+ }
262
+ with tempfile.TemporaryDirectory() as config_tmp_dir_str:
263
+ config_tmp_dir = Path(config_tmp_dir_str)
264
+ config_path = config_tmp_dir / "config.json"
265
+ config_path.write_text(
266
+ orjson.dumps(config, option=orjson.OPT_INDENT_2).decode("utf-8")
267
+ )
268
+ api.upload_file(
269
+ path_or_fileobj=config_path,
270
+ path_in_repo="config.json",
271
+ repo_id=repo_id,
272
+ repo_type="dataset",
273
+ )
274
+ sample_iter = generate_sample_iter(
275
+ prediction_path=prediction_path,
276
+ json_schema=json_schema,
277
+ is_openfoodfacts_dataset=is_openfoodfacts_dataset,
278
+ openfoodfacts_flavor=openfoodfacts_flavor,
279
+ skip=skip,
280
+ limit=limit,
281
+ raise_on_invalid_sample=raise_on_invalid_sample,
282
+ )
283
+ export_to_hf_llm_image_extraction(
284
+ sample_iter=sample_iter,
285
+ split=split,
286
+ repo_id=repo_id,
287
+ revision=revision,
288
+ tmp_dir=tmp_dir,
289
+ )
@@ -3,6 +3,7 @@ import logging
3
3
  import pickle
4
4
  import random
5
5
  import tempfile
6
+ from collections.abc import Iterator
6
7
  from pathlib import Path
7
8
 
8
9
  import datasets
@@ -14,10 +15,13 @@ from PIL import Image, ImageOps
14
15
 
15
16
  from labelr.sample import (
16
17
  HF_DS_CLASSIFICATION_FEATURES,
18
+ HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
17
19
  HF_DS_OBJECT_DETECTION_FEATURES,
20
+ LLMImageExtractionSample,
18
21
  format_object_detection_sample_to_hf,
19
22
  )
20
23
  from labelr.types import TaskType
24
+ from labelr.utils import PathWithContext
21
25
 
22
26
  logger = logging.getLogger(__name__)
23
27
 
@@ -455,3 +459,62 @@ def export_from_ultralytics_to_hf_classification(
455
459
  features=HF_DS_CLASSIFICATION_FEATURES,
456
460
  )
457
461
  hf_ds.push_to_hub(repo_id, split=split)
462
+
463
+
464
+ def export_to_hf_llm_image_extraction(
465
+ sample_iter: Iterator[LLMImageExtractionSample],
466
+ split: str,
467
+ repo_id: str,
468
+ revision: str = "main",
469
+ tmp_dir: Path | None = None,
470
+ ) -> None:
471
+ """Export LLM image extraction samples to a Hugging Face dataset.
472
+
473
+ Args:
474
+ sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
475
+ to export.
476
+ split (str): Name of the dataset split (e.g., 'train', 'val').
477
+ repo_id (str): Hugging Face repository ID to push the dataset to.
478
+ revision (str): Revision (branch, tag or commit) to use for the
479
+ Hugging Face Datasets repository.
480
+ tmp_dir (Path | None): Temporary directory to use for intermediate
481
+ files. If None, a temporary directory will be created
482
+ automatically.
483
+ """
484
+ logger.info(
485
+ "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s",
486
+ repo_id,
487
+ revision,
488
+ split,
489
+ tmp_dir,
490
+ )
491
+
492
+ tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
493
+ if tmp_dir:
494
+ tmp_dir.mkdir(parents=True, exist_ok=True)
495
+ tmp_dir_with_context = PathWithContext(tmp_dir)
496
+ else:
497
+ tmp_dir_with_context = tempfile.TemporaryDirectory()
498
+
499
+ with tmp_dir_with_context as tmp_dir_str:
500
+ tmp_dir = Path(tmp_dir_str)
501
+ for sample in tqdm.tqdm(sample_iter, desc="samples"):
502
+ image = sample.image
503
+ # Rotate image according to exif orientation using Pillow
504
+ image = ImageOps.exif_transpose(image)
505
+ image_id = sample.image_id
506
+ sample = {
507
+ "image_id": image_id,
508
+ "image": image,
509
+ "meta": sample.meta.model_dump(),
510
+ "output": sample.output,
511
+ }
512
+ # Save output as pickle
513
+ with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
514
+ pickle.dump(sample, f)
515
+
516
+ hf_ds = datasets.Dataset.from_generator(
517
+ functools.partial(_pickle_sample_generator, tmp_dir),
518
+ features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
519
+ )
520
+ hf_ds.push_to_hub(repo_id, split=split, revision=revision)
@@ -0,0 +1,415 @@
1
+ import asyncio
2
+ import mimetypes
3
+ from collections.abc import Iterator
4
+ from pathlib import Path
5
+ from typing import Literal
6
+ from urllib.parse import urlparse
7
+
8
+ import aiofiles
9
+ import jsonschema
10
+ import orjson
11
+ import typer
12
+ from gcloud.aio.storage import Storage
13
+ from openfoodfacts import Flavor
14
+ from openfoodfacts.images import download_image, generate_image_url
15
+ from tqdm.asyncio import tqdm
16
+
17
+ from labelr.sample import LLMImageExtractionSample, SampleMeta
18
+ from labelr.utils import download_image_from_gcs
19
+
20
+ try:
21
+ import google.genai # noqa: F401
22
+ except ImportError:
23
+ raise ImportError(
24
+ "The 'google-genai' package is required to use this module. "
25
+ "Please install labelr with the 'google' extra: "
26
+ "`pip install labelr[google]`"
27
+ )
28
+ import aiohttp
29
+ from google import genai
30
+ from google.cloud import storage
31
+ from google.genai.types import CreateBatchJobConfig, HttpOptions
32
+ from google.genai.types import JSONSchema as GoogleJSONSchema
33
+ from google.genai.types import Schema as GoogleSchema
34
+ from openfoodfacts.types import JSONType
35
+ from pydantic import BaseModel
36
+
37
+
38
+ class RawBatchSamplePart(BaseModel):
39
+ type: Literal["text", "image"]
40
+ data: str
41
+
42
+
43
+ class RawBatchSample(BaseModel):
44
+ key: str
45
+ parts: list[RawBatchSamplePart]
46
+ meta: JSONType = {}
47
+
48
+
49
+ def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> JSONType:
50
+ """Google doesn't support natively OpenAPI schemas, so we convert them to
51
+ Google `Schema` (a subset of OpenAPI)."""
52
+ return GoogleSchema.from_json_schema(
53
+ json_schema=GoogleJSONSchema.model_validate(schema.model_json_schema())
54
+ ).model_dump(mode="json", exclude_none=True, exclude_unset=True)
55
+
56
+
57
+ async def download_image(url: str, session: aiohttp.ClientSession) -> bytes:
58
+ """Download an image from a URL and return its content as bytes.
59
+
60
+ Args:
61
+ url (str): URL of the image to download.
62
+ Returns:
63
+ bytes: Content of the downloaded image.
64
+ """
65
+ async with session.get(url) as response:
66
+ response.raise_for_status()
67
+ return await response.read()
68
+
69
+
70
+ async def download_image_from_filesystem(url: str, base_dir: Path) -> bytes:
71
+ """Download an image from the filesystem and return its content as bytes.
72
+
73
+ Args:
74
+ url (str): URL of the image to download.
75
+ base_dir (Path): Base directory where images are stored.
76
+ Returns:
77
+ bytes: Content of the downloaded image.
78
+ """
79
+ file_path = urlparse(url).path[1:] # Remove leading '/'
80
+ full_file_path = base_dir / file_path
81
+ async with aiofiles.open(full_file_path, "rb") as f:
82
+ return await f.read()
83
+
84
+
85
+ async def upload_to_gcs(
86
+ image_url: str,
87
+ bucket_name: str,
88
+ blob_name: str,
89
+ session: aiohttp.ClientSession,
90
+ base_image_dir: Path | None = None,
91
+ ) -> dict:
92
+ """Upload data to Google Cloud Storage.
93
+ Args:
94
+ bucket_name (str): Name of the GCS bucket.
95
+ blob_name (str): Name of the blob (object) in the bucket.
96
+ data (bytes): Data to upload.
97
+ session (aiohttp.ClientSession): HTTP session to use for downloading
98
+ the image.
99
+ base_image_dir (Path | None): If provided, images will be read from
100
+ the filesystem under this base directory instead of downloading
101
+ them from their URLs.
102
+ Returns:
103
+ dict: Status of the upload operation.
104
+ """
105
+ if base_image_dir is None:
106
+ image_data = await download_image(image_url, session)
107
+ else:
108
+ image_data = await download_image_from_filesystem(image_url, base_image_dir)
109
+
110
+ client = Storage(session=session)
111
+
112
+ status = await client.upload(
113
+ bucket_name,
114
+ blob_name,
115
+ image_data,
116
+ )
117
+ return status
118
+
119
+
120
+ async def upload_to_gcs_format_async(
121
+ sample: RawBatchSample,
122
+ google_json_schema: JSONType,
123
+ instructions: str | None,
124
+ bucket_name: str,
125
+ bucket_dir_name: str,
126
+ session: aiohttp.ClientSession,
127
+ base_image_dir: Path | None = None,
128
+ skip_upload: bool = False,
129
+ thinking_level: str | None = None,
130
+ ) -> JSONType | None:
131
+ parts: list[JSONType] = []
132
+
133
+ if instructions:
134
+ parts.append({"text": instructions})
135
+
136
+ for part in sample.parts:
137
+ if part.type == "image":
138
+ mime_type, _ = mimetypes.guess_type(part.data)
139
+ if mime_type is None:
140
+ raise ValueError(f"Cannot guess mimetype of file: {part.data}")
141
+
142
+ file_uri = part.data
143
+ image_blob_name = f"{bucket_dir_name}/{sample.key}/{Path(file_uri).name}"
144
+ # Download the image from the URL
145
+ if not skip_upload:
146
+ try:
147
+ await upload_to_gcs(
148
+ image_url=file_uri,
149
+ bucket_name=bucket_name,
150
+ blob_name=image_blob_name,
151
+ session=session,
152
+ base_image_dir=base_image_dir,
153
+ )
154
+ except FileNotFoundError:
155
+ return None
156
+
157
+ parts.append(
158
+ {
159
+ "file_data": {
160
+ "file_uri": f"gs://{bucket_name}/{image_blob_name}",
161
+ "mime_type": mime_type,
162
+ }
163
+ }
164
+ )
165
+ else:
166
+ parts.append({"text": part.data})
167
+
168
+ generation_config = {
169
+ "responseMimeType": "application/json",
170
+ "response_json_schema": google_json_schema,
171
+ }
172
+
173
+ if thinking_level is not None:
174
+ generation_config["thinkingConfig"] = {"thinkingLevel": thinking_level}
175
+
176
+ return {
177
+ "key": f"key:{sample.key}",
178
+ "request": {
179
+ "contents": [
180
+ {
181
+ "parts": parts,
182
+ "role": "user",
183
+ }
184
+ ],
185
+ "generationConfig": generation_config,
186
+ },
187
+ }
188
+
189
+
190
+ async def generate_batch_dataset(
191
+ data_path: Path,
192
+ output_path: Path,
193
+ google_json_schema: JSONType,
194
+ instructions: str | None,
195
+ bucket_name: str,
196
+ bucket_dir_name: str,
197
+ max_concurrent_uploads: int = 30,
198
+ base_image_dir: Path | None = None,
199
+ from_key: str | None = None,
200
+ skip_upload: bool = False,
201
+ thinking_level: str | None = None,
202
+ ):
203
+ limiter = asyncio.Semaphore(max_concurrent_uploads)
204
+ ignore = True if from_key is None else False
205
+ missing_files = 0
206
+ async with aiohttp.ClientSession() as session:
207
+ async with asyncio.TaskGroup() as tg:
208
+ async with (
209
+ aiofiles.open(data_path, "r") as input_file,
210
+ aiofiles.open(output_path, "wb") as output_file,
211
+ ):
212
+ async with limiter:
213
+ tasks = set()
214
+ async for line in tqdm(input_file, desc="samples"):
215
+ # print(f"line: {line}")
216
+ sample = RawBatchSample.model_validate_json(line)
217
+ # print(f"sample: {sample}")
218
+ record_key = sample.key
219
+ if from_key is not None and ignore:
220
+ if record_key == from_key:
221
+ ignore = False
222
+ else:
223
+ continue
224
+ task = tg.create_task(
225
+ upload_to_gcs_format_async(
226
+ sample=sample,
227
+ google_json_schema=google_json_schema,
228
+ instructions=instructions,
229
+ bucket_name=bucket_name,
230
+ bucket_dir_name=bucket_dir_name,
231
+ session=session,
232
+ base_image_dir=base_image_dir,
233
+ skip_upload=skip_upload,
234
+ thinking_level=thinking_level,
235
+ )
236
+ )
237
+ tasks.add(task)
238
+
239
+ if len(tasks) >= max_concurrent_uploads:
240
+ for task in tasks:
241
+ await task
242
+ updated_record = task.result()
243
+ if updated_record is not None:
244
+ await output_file.write(
245
+ orjson.dumps(updated_record) + "\n".encode()
246
+ )
247
+ else:
248
+ missing_files += 1
249
+ tasks.clear()
250
+
251
+ for task in tasks:
252
+ await task
253
+ updated_record = task.result()
254
+ if updated_record is not None:
255
+ await output_file.write(
256
+ orjson.dumps(updated_record) + "\n".encode()
257
+ )
258
+ else:
259
+ missing_files += 1
260
+
261
+ typer.echo(
262
+ f"Upload and dataset update completed. Wrote updated dataset to {output_path}. "
263
+ f"Missing files: {missing_files}."
264
+ )
265
+
266
+
267
+ def launch_batch_job(
268
+ run_name: str,
269
+ dataset_path: Path,
270
+ model: str,
271
+ location: str,
272
+ ):
273
+ """Launch a Gemini Batch Inference job.
274
+
275
+ Args:
276
+ run_name (str): Name of the batch run.
277
+ dataset_path (Path): Path to the dataset file in JSONL format.
278
+ model (str): Model to use for the batch job. Example:
279
+ 'gemini-2.5-flash'.
280
+ location (str): Location for the Vertex AI resources. Example:
281
+ 'europe-west4'.
282
+ """
283
+ # We upload the dataset to a GCS bucket using the Google Cloud Storage client.
284
+
285
+ if model == "gemini-3-pro-preview" and location != "global":
286
+ typer.echo(
287
+ "Warning: only 'global' location is supported for 'gemini-3-pro-preview' model. Overriding location to 'global'."
288
+ )
289
+ location = "global"
290
+
291
+ storage_client = storage.Client()
292
+ bucket_name = "robotoff-batch" # Replace with your bucket name
293
+ run_dir = f"gemini-batch/{run_name}"
294
+ input_file_blob_name = f"{run_dir}/inputs.jsonl"
295
+ bucket = storage_client.bucket(bucket_name)
296
+ blob = bucket.blob(input_file_blob_name)
297
+ blob.upload_from_filename(dataset_path)
298
+
299
+ client = genai.Client(
300
+ http_options=HttpOptions(api_version="v1"),
301
+ vertexai=True,
302
+ location=location,
303
+ )
304
+ output_uri = f"gs://{bucket_name}/{run_dir}"
305
+ job = client.batches.create(
306
+ model=model,
307
+ src=f"gs://{bucket_name}/{input_file_blob_name}",
308
+ config=CreateBatchJobConfig(dest=output_uri),
309
+ )
310
+ print(job)
311
+
312
+
313
+ def generate_sample_iter(
314
+ prediction_path: Path,
315
+ json_schema: JSONType,
316
+ skip: int = 0,
317
+ limit: int | None = None,
318
+ is_openfoodfacts_dataset: bool = False,
319
+ openfoodfacts_flavor: Flavor = Flavor.off,
320
+ raise_on_invalid_sample: bool = False,
321
+ ) -> Iterator[LLMImageExtractionSample]:
322
+ """Generate training samples from a Gemini Batch Inference prediction
323
+ JSONL file.
324
+
325
+ Args:
326
+ prediction_path (Path): Path to the prediction JSONL file.
327
+ json_schema (JSONType): JSON schema to validate the predictions.
328
+ skip (int): Number of initial samples to skip.
329
+ limit (int | None): Maximum number of samples to generate.
330
+ is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
331
+ Facts.
332
+ openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
333
+ Yields:
334
+ Iterator[LLMImageExtractionSample]: Generated samples.
335
+ """
336
+ skipped = 0
337
+ invalid = 0
338
+ with prediction_path.open("r") as f_in:
339
+ for i, sample_str in enumerate(f_in):
340
+ if i < skip:
341
+ skipped += 1
342
+ continue
343
+ if limit is not None and i >= skip + limit:
344
+ break
345
+ sample = orjson.loads(sample_str)
346
+ try:
347
+ yield generate_sample_from_prediction(
348
+ json_schema=json_schema,
349
+ sample=sample,
350
+ is_openfoodfacts_dataset=is_openfoodfacts_dataset,
351
+ openfoodfacts_flavor=openfoodfacts_flavor,
352
+ )
353
+ except Exception as e:
354
+ if raise_on_invalid_sample:
355
+ raise
356
+ else:
357
+ typer.echo(
358
+ f"Skipping invalid sample at line {i + 1} in {prediction_path}: {e}"
359
+ )
360
+ invalid += 1
361
+ continue
362
+ if skipped > 0:
363
+ typer.echo(f"Skipped {skipped} samples.")
364
+ if invalid > 0:
365
+ typer.echo(f"Skipped {invalid} invalid samples.")
366
+
367
+
368
+ def generate_sample_from_prediction(
369
+ json_schema: JSONType,
370
+ sample: JSONType,
371
+ is_openfoodfacts_dataset: bool = False,
372
+ openfoodfacts_flavor: Flavor = Flavor.off,
373
+ ) -> LLMImageExtractionSample:
374
+ """Generate a LLMImageExtractionSample from a prediction sample.
375
+ Args:
376
+ json_schema (JSONType): JSON schema to validate the predictions.
377
+ sample (JSONType): Prediction sample.
378
+ is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
379
+ Facts.
380
+ openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
381
+ Returns:
382
+ LLMImageExtractionSample: Generated sample.
383
+ """
384
+ image_id = sample["key"][len("key:") :]
385
+ response_str = sample["response"]["candidates"][0]["content"]["parts"][0]["text"]
386
+ image_uri = sample["request"]["contents"][0]["parts"][1]["file_data"]["file_uri"]
387
+ image = download_image_from_gcs(image_uri=image_uri)
388
+ response = orjson.loads(response_str)
389
+ jsonschema.validate(response, json_schema)
390
+
391
+ if is_openfoodfacts_dataset:
392
+ image_stem_parts = image_id.split("_")
393
+ barcode = image_stem_parts[0]
394
+ off_image_id = image_stem_parts[1]
395
+ image_id = f"{barcode}_{off_image_id}"
396
+ image_url = generate_image_url(
397
+ barcode, off_image_id, flavor=openfoodfacts_flavor
398
+ )
399
+ else:
400
+ image_id = image_id
401
+ barcode = ""
402
+ off_image_id = ""
403
+ image_url = ""
404
+
405
+ sample_meta = SampleMeta(
406
+ barcode=barcode,
407
+ off_image_id=off_image_id,
408
+ image_url=image_url,
409
+ )
410
+ return LLMImageExtractionSample(
411
+ image_id=image_id,
412
+ image=image,
413
+ output=orjson.dumps(response).decode("utf-8"),
414
+ meta=sample_meta,
415
+ )
@@ -5,6 +5,7 @@ from openfoodfacts.utils import get_logger
5
5
 
6
6
  from labelr.apps import datasets as dataset_app
7
7
  from labelr.apps import evaluate as evaluate_app
8
+ from labelr.apps import google_batch as google_batch_app
8
9
  from labelr.apps import hugging_face as hf_app
9
10
  from labelr.apps import label_studio as ls_app
10
11
  from labelr.apps import train as train_app
@@ -84,6 +85,11 @@ app.add_typer(
84
85
  name="evaluate",
85
86
  help="Visualize and evaluate trained models.",
86
87
  )
88
+ app.add_typer(
89
+ google_batch_app.app,
90
+ name="google-batch",
91
+ help="Generate datasets and launch batch jobs on Google Gemini.",
92
+ )
87
93
 
88
94
  if __name__ == "__main__":
89
95
  app()
@@ -8,7 +8,8 @@ import PIL
8
8
  from openfoodfacts import Flavor
9
9
  from openfoodfacts.barcode import normalize_barcode
10
10
  from openfoodfacts.images import download_image, generate_image_url
11
- from PIL import ImageOps
11
+ from PIL import Image, ImageOps
12
+ from pydantic import BaseModel, Field
12
13
 
13
14
  logger = logging.getLogger(__name__)
14
15
 
@@ -230,6 +231,34 @@ def format_object_detection_sample_to_hf(
230
231
  }
231
232
 
232
233
 
234
+ class SampleMeta(BaseModel):
235
+ barcode: str | None = Field(
236
+ ..., description="The barcode of the product, if applicable"
237
+ )
238
+ off_image_id: str | None = Field(
239
+ ...,
240
+ description="The Open Food Facts image ID associated with the image, if applicable",
241
+ )
242
+ image_url: str | None = Field(
243
+ ..., description="The URL of the image, if applicable"
244
+ )
245
+
246
+
247
+ class LLMImageExtractionSample(BaseModel):
248
+ class Config:
249
+ # required to allow PIL Image type
250
+ arbitrary_types_allowed = True
251
+
252
+ image_id: str = Field(
253
+ ...,
254
+ description="unique ID for the image. For Open Food Facts images, it follows the "
255
+ "format `barcode:imgid`",
256
+ )
257
+ image: Image.Image = Field(..., description="Image to extract information from")
258
+ output: str = Field(..., description="Expected response of the LLM")
259
+ meta: SampleMeta = Field(..., description="Metadata associated with the sample")
260
+
261
+
233
262
  # The HuggingFace Dataset features
234
263
  HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
235
264
  {
@@ -266,3 +295,16 @@ HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
266
295
  "category_name": datasets.Value("string"),
267
296
  }
268
297
  )
298
+
299
+ HF_DS_LLM_IMAGE_EXTRACTION_FEATURES = datasets.Features(
300
+ {
301
+ "image_id": datasets.Value("string"),
302
+ "image": datasets.features.Image(),
303
+ "output": datasets.features.Value("string"),
304
+ "meta": {
305
+ "barcode": datasets.Value("string"),
306
+ "off_image_id": datasets.Value("string"),
307
+ "image_url": datasets.Value("string"),
308
+ },
309
+ }
310
+ )
@@ -0,0 +1,48 @@
1
+ import io
2
+ from pathlib import Path
3
+
4
+ from google.cloud import storage
5
+ from PIL import Image
6
+
7
+
8
+ def parse_hf_repo_id(hf_repo_id: str) -> tuple[str, str]:
9
+ """Parse the repo_id and the revision from a hf_repo_id in the format:
10
+ `org/repo-name@revision`.
11
+
12
+ Returns a tuple (repo_id, revision), with revision = 'main' if it
13
+ was not provided.
14
+ """
15
+ if "@" in hf_repo_id:
16
+ hf_repo_id, revision = hf_repo_id.split("@", 1)
17
+ else:
18
+ revision = "main"
19
+
20
+ return hf_repo_id, revision
21
+
22
+
23
+ def download_image_from_gcs(image_uri: str) -> Image.Image:
24
+ """Download an image from a Google Cloud Storage URI and return it as a
25
+ PIL Image."""
26
+ storage_client = storage.Client()
27
+ bucket_name, blob_name = image_uri.replace("gs://", "").split("/", 1)
28
+ bucket = storage_client.bucket(bucket_name)
29
+ blob = bucket.blob(blob_name)
30
+ image_data = blob.download_as_bytes()
31
+ return Image.open(io.BytesIO(image_data))
32
+
33
+
34
+ class PathWithContext:
35
+ """A context manager that yields a Path object.
36
+
37
+ This is useful to have a common interface with tempfile.TemporaryDirectory
38
+ without actually creating a temporary directory.
39
+ """
40
+
41
+ def __init__(self, path: Path):
42
+ self.path = path
43
+
44
+ def __enter__(self) -> Path:
45
+ return self.path
46
+
47
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
48
+ pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: labelr
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: A command-line tool to manage labeling tasks with Label Studio.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -13,10 +13,19 @@ Requires-Dist: openfoodfacts>=2.9.0
13
13
  Requires-Dist: typer>=0.15.1
14
14
  Requires-Dist: google-cloud-batch==0.18.0
15
15
  Requires-Dist: huggingface-hub
16
+ Requires-Dist: deepdiff>=8.6.1
17
+ Requires-Dist: rapidfuzz>=3.14.3
18
+ Requires-Dist: aiohttp
19
+ Requires-Dist: aiofiles
20
+ Requires-Dist: orjson
16
21
  Provides-Extra: ultralytics
17
22
  Requires-Dist: ultralytics==8.3.223; extra == "ultralytics"
18
23
  Provides-Extra: fiftyone
19
24
  Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
25
+ Provides-Extra: google
26
+ Requires-Dist: google-genai>=1.56.0; extra == "google"
27
+ Requires-Dist: gcloud-aio-storage; extra == "google"
28
+ Requires-Dist: google-cloud-storage; extra == "google"
20
29
  Dynamic: license-file
21
30
 
22
31
  # Labelr
@@ -8,6 +8,7 @@ src/labelr/check.py
8
8
  src/labelr/config.py
9
9
  src/labelr/dataset_features.py
10
10
  src/labelr/export.py
11
+ src/labelr/google_genai.py
11
12
  src/labelr/main.py
12
13
  src/labelr/project_config.py
13
14
  src/labelr/sample.py
@@ -22,9 +23,9 @@ src/labelr.egg-info/top_level.txt
22
23
  src/labelr/apps/__init__.py
23
24
  src/labelr/apps/datasets.py
24
25
  src/labelr/apps/evaluate.py
26
+ src/labelr/apps/google_batch.py
25
27
  src/labelr/apps/hugging_face.py
26
28
  src/labelr/apps/label_studio.py
27
29
  src/labelr/apps/train.py
28
30
  src/labelr/evaluate/__init__.py
29
- src/labelr/evaluate/llm.py
30
31
  src/labelr/evaluate/object_detection.py
@@ -6,9 +6,19 @@ openfoodfacts>=2.9.0
6
6
  typer>=0.15.1
7
7
  google-cloud-batch==0.18.0
8
8
  huggingface-hub
9
+ deepdiff>=8.6.1
10
+ rapidfuzz>=3.14.3
11
+ aiohttp
12
+ aiofiles
13
+ orjson
9
14
 
10
15
  [fiftyone]
11
16
  fiftyone~=1.10.0
12
17
 
18
+ [google]
19
+ google-genai>=1.56.0
20
+ gcloud-aio-storage
21
+ google-cloud-storage
22
+
13
23
  [ultralytics]
14
24
  ultralytics==8.3.223
File without changes
@@ -1,13 +0,0 @@
1
- def parse_hf_repo_id(hf_repo_id: str) -> tuple[str, str]:
2
- """Parse the repo_id and the revision from a hf_repo_id in the format:
3
- `org/repo-name@revision`.
4
-
5
- Returns a tuple (repo_id, revision), with revision = 'main' if it
6
- was not provided.
7
- """
8
- if "@" in hf_repo_id:
9
- hf_repo_id, revision = hf_repo_id.split("@", 1)
10
- else:
11
- revision = "main"
12
-
13
- return hf_repo_id, revision
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes