PyPI - labelr - Versions diffs - 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

labelr: labelr-0.9.0-py3-none-any.whl → labelr-0.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

labelr/apps/datasets.py +56 -5
labelr/apps/google_batch.py +8 -1
labelr/apps/label_studio.py +1 -1
labelr/export/__init__.py +0 -0
labelr/export/classification.py +114 -0
labelr/export/common.py +42 -0
labelr/export/llm.py +91 -0
labelr/{export.py → export/object_detection.py} +3 -201
labelr/google_genai.py +9 -3
labelr/sample/__init__.py +0 -0
labelr/sample/classification.py +17 -0
labelr/sample/common.py +14 -0
labelr/sample/llm.py +75 -0
labelr/{sample.py → sample/object_detection.py} +1 -60
labelr/utils.py +55 -5
{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/METADATA +4 -5
labelr-0.10.0.dist-info/RECORD +36 -0
labelr-0.9.0.dist-info/RECORD +0 -28
{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/WHEEL +0 -0
{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/entry_points.txt +0 -0
{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/licenses/LICENSE +0 -0
{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/top_level.txt +0 -0

labelr/apps/datasets.py CHANGED Viewed

@@ -12,7 +12,11 @@ import typer
 from openfoodfacts import Flavor
 from openfoodfacts.utils import get_logger
-from labelr.export import export_from_ultralytics_to_hf
+from labelr.export.common import export_from_ultralytics_to_hf
+from labelr.export.object_detection import (
+    export_from_ls_to_hf_object_detection,
+    export_from_ls_to_ultralytics_object_detection,
+)
 from ..config import LABEL_STUDIO_DEFAULT_URL
 from ..types import ExportDestination, ExportSource, TaskType
@@ -99,7 +103,9 @@ def convert_object_detection_dataset(
     Studio format, and save it to a JSON file."""
     from datasets import load_dataset
-    from labelr.sample import format_object_detection_sample_from_hf_to_ls
+    from labelr.sample.object_detection import (
+        format_object_detection_sample_from_hf_to_ls,
+    )
     logger.info("Loading dataset: %s", repo_id)
     ds = load_dataset(repo_id)
@@ -207,10 +213,8 @@ def export(
     local files (ultralytics format)."""
     from label_studio_sdk.client import LabelStudio
-    from labelr.export import (
+    from labelr.export.object_detection import (
         export_from_hf_to_ultralytics_object_detection,
-        export_from_ls_to_hf_object_detection,
-        export_from_ls_to_ultralytics_object_detection,
     )
     if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
@@ -303,3 +307,50 @@ def export(
                 is_openfoodfacts_dataset=is_openfoodfacts_dataset,
                 openfoodfacts_flavor=openfoodfacts_flavor,
             )
+@app.command()
+def export_llm_ds(
+    dataset_path: Annotated[
+        Path, typer.Option(..., help="Path to the JSONL dataset file")
+    ],
+    repo_id: Annotated[
+        str, typer.Option(..., help="Hugging Face Datasets repository ID to export to")
+    ],
+    split: Annotated[str, typer.Option(..., help="Dataset split to export")],
+    revision: Annotated[
+        str,
+        typer.Option(
+            help="Revision (branch, tag or commit) for the Hugging Face Datasets repository."
+        ),
+    ] = "main",
+    tmp_dir: Annotated[
+        Path | None,
+        typer.Option(
+            help="Path to a temporary directory to use for image processing",
+        ),
+    ] = None,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
+        ),
+    ] = None,
+):
+    """Export LLM image extraction dataset with images only to Hugging Face
+    Datasets.
+    """
+    from labelr.export.llm import export_to_hf_llm_image_extraction
+    from labelr.sample.llm import load_llm_image_extraction_dataset_from_jsonl
+    sample_iter = load_llm_image_extraction_dataset_from_jsonl(
+        dataset_path=dataset_path
+    )
+    export_to_hf_llm_image_extraction(
+        sample_iter,
+        split=split,
+        repo_id=repo_id,
+        revision=revision,
+        tmp_dir=tmp_dir,
+        image_max_size=image_max_size,
+    )

labelr/apps/google_batch.py CHANGED Viewed

@@ -239,6 +239,12 @@ def upload_training_dataset_from_predictions(
             help="Whether to raise an error on invalid samples instead of skipping them",
         ),
     ] = False,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
+        ),
+    ] = None,
 ):
     """Upload a training dataset to a Hugging Face Datasets repository from a
     Gemini batch prediction file."""
@@ -247,7 +253,7 @@ def upload_training_dataset_from_predictions(
     import orjson
     from huggingface_hub import HfApi
-    from labelr.export import export_to_hf_llm_image_extraction
+    from labelr.export.llm import export_to_hf_llm_image_extraction
     from labelr.google_genai import generate_sample_iter
     instructions = instructions_path.read_text()
@@ -286,4 +292,5 @@ def upload_training_dataset_from_predictions(
         repo_id=repo_id,
         revision=revision,
         tmp_dir=tmp_dir,
+        image_max_size=image_max_size,
     )

labelr/apps/label_studio.py CHANGED Viewed

@@ -398,7 +398,7 @@ def create_dataset_file(
     from openfoodfacts.images import extract_barcode_from_url, extract_source_from_url
     from openfoodfacts.utils import get_image_from_url
-    from labelr.sample import format_object_detection_sample_to_ls
+    from labelr.sample.object_detection import format_object_detection_sample_to_ls
     logger.info("Loading dataset: %s", input_file)

labelr/export/__init__.py ADDED Viewed

File without changes

labelr/export/classification.py ADDED Viewed

@@ -0,0 +1,114 @@
+import functools
+import logging
+import pickle
+import tempfile
+from pathlib import Path
+import datasets
+from openfoodfacts.images import generate_image_url
+from openfoodfacts.types import Flavor
+from PIL import Image, ImageOps
+from labelr.export.common import _pickle_sample_generator
+from labelr.sample.classification import HF_DS_CLASSIFICATION_FEATURES
+logger = logging.getLogger(__name__)
+def export_from_ultralytics_to_hf_classification(
+    dataset_dir: Path,
+    repo_id: str,
+    label_names: list[str],
+    merge_labels: bool = False,
+    is_openfoodfacts_dataset: bool = False,
+    openfoodfacts_flavor: Flavor = Flavor.off,
+) -> None:
+    """Export an Ultralytics classification dataset to a Hugging Face dataset.
+    The Ultralytics dataset directory should contain 'train', 'val' and/or
+    'test' subdirectories, each containing subdirectories for each label.
+    Args:
+        dataset_dir (Path): Path to the Ultralytics dataset directory.
+        repo_id (str): Hugging Face repository ID to push the dataset to.
+        label_names (list[str]): List of label names.
+        merge_labels (bool): Whether to merge all labels into a single label
+            named 'object'.
+        is_openfoodfacts_dataset (bool): Whether the dataset is from
+            Open Food Facts. If True, the `off_image_id` and `image_url` will
+            be generated automatically. `off_image_id` is extracted from the
+            image filename.
+        openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
+            is ignored if `is_openfoodfacts_dataset` is False.
+    """
+    logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
+    if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
+        raise ValueError(
+            f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
+        )
+    # Save output as pickle
+    for split in ["train", "val", "test"]:
+        split_dir = dataset_dir / split
+        if not split_dir.is_dir():
+            logger.info("Skipping missing split directory: %s", split_dir)
+            continue
+        with tempfile.TemporaryDirectory() as tmp_dir_str:
+            tmp_dir = Path(tmp_dir_str)
+            for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
+                label_name = label_dir.name
+                if merge_labels:
+                    label_name = "object"
+                if label_name not in label_names:
+                    raise ValueError(
+                        "Label name %s not in provided label names (label names: %s)"
+                        % (label_name, label_names),
+                    )
+                label_id = label_names.index(label_name)
+                for image_path in label_dir.glob("*"):
+                    if is_openfoodfacts_dataset:
+                        image_stem_parts = image_path.stem.split("_")
+                        barcode = image_stem_parts[0]
+                        off_image_id = image_stem_parts[1]
+                        image_id = f"{barcode}_{off_image_id}"
+                        image_url = generate_image_url(
+                            barcode, off_image_id, flavor=openfoodfacts_flavor
+                        )
+                    else:
+                        image_id = image_path.stem
+                        barcode = ""
+                        off_image_id = ""
+                        image_url = ""
+                    image = Image.open(image_path)
+                    image.load()
+                    if image.mode != "RGB":
+                        image = image.convert("RGB")
+                    # Rotate image according to exif orientation using Pillow
+                    ImageOps.exif_transpose(image, in_place=True)
+                    sample = {
+                        "image_id": image_id,
+                        "image": image,
+                        "width": image.width,
+                        "height": image.height,
+                        "meta": {
+                            "barcode": barcode,
+                            "off_image_id": off_image_id,
+                            "image_url": image_url,
+                        },
+                        "category_id": label_id,
+                        "category_name": label_name,
+                    }
+                    with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                        pickle.dump(sample, f)
+            hf_ds = datasets.Dataset.from_generator(
+                functools.partial(_pickle_sample_generator, tmp_dir),
+                features=HF_DS_CLASSIFICATION_FEATURES,
+            )
+            hf_ds.push_to_hub(repo_id, split=split)

labelr/export/common.py ADDED Viewed

@@ -0,0 +1,42 @@
+import pickle
+from pathlib import Path
+from openfoodfacts.types import Flavor
+from labelr.types import TaskType
+def _pickle_sample_generator(dir: Path):
+    """Generator that yields samples from pickles in a directory."""
+    for pkl in dir.glob("*.pkl"):
+        with open(pkl, "rb") as f:
+            yield pickle.load(f)
+def export_from_ultralytics_to_hf(
+    task_type: TaskType,
+    dataset_dir: Path,
+    repo_id: str,
+    label_names: list[str],
+    merge_labels: bool = False,
+    is_openfoodfacts_dataset: bool = False,
+    openfoodfacts_flavor: Flavor = Flavor.off,
+) -> None:
+    from labelr.export.classification import (
+        export_from_ultralytics_to_hf_classification,
+    )
+    if task_type != TaskType.classification:
+        raise NotImplementedError(
+            "Only classification task is currently supported for Ultralytics to HF export"
+        )
+    if task_type == TaskType.classification:
+        export_from_ultralytics_to_hf_classification(
+            dataset_dir=dataset_dir,
+            repo_id=repo_id,
+            label_names=label_names,
+            merge_labels=merge_labels,
+            is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+            openfoodfacts_flavor=openfoodfacts_flavor,
+        )

labelr/export/llm.py ADDED Viewed

@@ -0,0 +1,91 @@
+import functools
+import logging
+import pickle
+import tempfile
+import typing
+from collections.abc import Iterator
+from pathlib import Path
+import datasets
+import tqdm
+from PIL import Image, ImageOps
+from labelr.export.common import _pickle_sample_generator
+from labelr.sample.llm import (
+    HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+    LLMImageExtractionSample,
+)
+from labelr.utils import PathWithContext
+logger = logging.getLogger(__name__)
+def export_to_hf_llm_image_extraction(
+    sample_iter: Iterator[LLMImageExtractionSample],
+    split: str,
+    repo_id: str,
+    revision: str = "main",
+    tmp_dir: Path | None = None,
+    image_max_size: int | None = None,
+) -> None:
+    """Export LLM image extraction samples to a Hugging Face dataset.
+    Args:
+        sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
+            to export.
+        split (str): Name of the dataset split (e.g., 'train', 'val').
+        repo_id (str): Hugging Face repository ID to push the dataset to.
+        revision (str): Revision (branch, tag or commit) to use for the
+            Hugging Face Datasets repository.
+        tmp_dir (Path | None): Temporary directory to use for intermediate
+            files. If None, a temporary directory will be created
+            automatically.
+        image_max_size (int | None): Maximum size (in pixels) for the images.
+    """
+    logger.info(
+        "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s, image_max_size: %s",
+        repo_id,
+        revision,
+        split,
+        tmp_dir,
+        image_max_size,
+    )
+    tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
+    if tmp_dir:
+        tmp_dir.mkdir(parents=True, exist_ok=True)
+        tmp_dir_with_context = PathWithContext(tmp_dir)
+    else:
+        tmp_dir_with_context = tempfile.TemporaryDirectory()
+    with tmp_dir_with_context as tmp_dir_str:
+        tmp_dir = Path(tmp_dir_str)
+        for sample in tqdm.tqdm(sample_iter, desc="samples"):
+            image = sample.image
+            # Rotate image according to exif orientation using Pillow
+            image = typing.cast(Image.Image, ImageOps.exif_transpose(image))
+            if image_max_size is not None:
+                if image.height > image_max_size or image.width > image_max_size:
+                    image.thumbnail(
+                        (image_max_size, image_max_size),
+                        Image.Resampling.LANCZOS,
+                    )
+            image_id = sample.image_id
+            json_sample = {
+                "image_id": image_id,
+                "image": image,
+                "meta": {
+                    k: v for k, v in sample.meta.model_dump().items() if v is not None
+                },
+                "output": sample.output,
+            }
+            # Save output as pickle
+            with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                pickle.dump(json_sample, f)
+        hf_ds = datasets.Dataset.from_generator(
+            functools.partial(_pickle_sample_generator, tmp_dir),
+            features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+        )
+        hf_ds.push_to_hub(repo_id, split=split, revision=revision)

labelr/{export.py → export/object_detection.py} RENAMED Viewed

@@ -1,38 +1,23 @@
 import functools
 import logging
 import pickle
-import random
 import tempfile
-from collections.abc import Iterator
 from pathlib import Path
 import datasets
 import tqdm
 from label_studio_sdk.client import LabelStudio
-from openfoodfacts.images import download_image, generate_image_url
-from openfoodfacts.types import Flavor
-from PIL import Image, ImageOps
+from openfoodfacts.images import download_image
-from labelr.sample import (
-    HF_DS_CLASSIFICATION_FEATURES,
-    HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+from labelr.export.common import _pickle_sample_generator
+from labelr.sample.object_detection import (
     HF_DS_OBJECT_DETECTION_FEATURES,
-    LLMImageExtractionSample,
     format_object_detection_sample_to_hf,
 )
-from labelr.types import TaskType
-from labelr.utils import PathWithContext
 logger = logging.getLogger(__name__)
-def _pickle_sample_generator(dir: Path):
-    """Generator that yields samples from pickles in a directory."""
-    for pkl in dir.glob("*.pkl"):
-        with open(pkl, "rb") as f:
-            yield pickle.load(f)
 def export_from_ls_to_hf_object_detection(
     ls: LabelStudio,
     repo_id: str,
@@ -335,186 +320,3 @@ def export_from_hf_to_ultralytics_object_detection(
         f.write("names:\n")
         for i, category_name in enumerate(category_names):
             f.write(f"  {i}: {category_name}\n")
-def export_from_ultralytics_to_hf(
-    task_type: TaskType,
-    dataset_dir: Path,
-    repo_id: str,
-    label_names: list[str],
-    merge_labels: bool = False,
-    is_openfoodfacts_dataset: bool = False,
-    openfoodfacts_flavor: Flavor = Flavor.off,
-) -> None:
-    if task_type != TaskType.classification:
-        raise NotImplementedError(
-            "Only classification task is currently supported for Ultralytics to HF export"
-        )
-    if task_type == TaskType.classification:
-        export_from_ultralytics_to_hf_classification(
-            dataset_dir=dataset_dir,
-            repo_id=repo_id,
-            label_names=label_names,
-            merge_labels=merge_labels,
-            is_openfoodfacts_dataset=is_openfoodfacts_dataset,
-            openfoodfacts_flavor=openfoodfacts_flavor,
-        )
-def export_from_ultralytics_to_hf_classification(
-    dataset_dir: Path,
-    repo_id: str,
-    label_names: list[str],
-    merge_labels: bool = False,
-    is_openfoodfacts_dataset: bool = False,
-    openfoodfacts_flavor: Flavor = Flavor.off,
-) -> None:
-    """Export an Ultralytics classification dataset to a Hugging Face dataset.
-    The Ultralytics dataset directory should contain 'train', 'val' and/or
-    'test' subdirectories, each containing subdirectories for each label.
-    Args:
-        dataset_dir (Path): Path to the Ultralytics dataset directory.
-        repo_id (str): Hugging Face repository ID to push the dataset to.
-        label_names (list[str]): List of label names.
-        merge_labels (bool): Whether to merge all labels into a single label
-            named 'object'.
-        is_openfoodfacts_dataset (bool): Whether the dataset is from
-            Open Food Facts. If True, the `off_image_id` and `image_url` will
-            be generated automatically. `off_image_id` is extracted from the
-            image filename.
-        openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
-            is ignored if `is_openfoodfacts_dataset` is False.
-    """
-    logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
-    if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
-        raise ValueError(
-            f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
-        )
-    # Save output as pickle
-    for split in ["train", "val", "test"]:
-        split_dir = dataset_dir / split
-        if not split_dir.is_dir():
-            logger.info("Skipping missing split directory: %s", split_dir)
-            continue
-        with tempfile.TemporaryDirectory() as tmp_dir_str:
-            tmp_dir = Path(tmp_dir_str)
-            for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
-                label_name = label_dir.name
-                if merge_labels:
-                    label_name = "object"
-                if label_name not in label_names:
-                    raise ValueError(
-                        "Label name %s not in provided label names (label names: %s)"
-                        % (label_name, label_names),
-                    )
-                label_id = label_names.index(label_name)
-                for image_path in label_dir.glob("*"):
-                    if is_openfoodfacts_dataset:
-                        image_stem_parts = image_path.stem.split("_")
-                        barcode = image_stem_parts[0]
-                        off_image_id = image_stem_parts[1]
-                        image_id = f"{barcode}_{off_image_id}"
-                        image_url = generate_image_url(
-                            barcode, off_image_id, flavor=openfoodfacts_flavor
-                        )
-                    else:
-                        image_id = image_path.stem
-                        barcode = ""
-                        off_image_id = ""
-                        image_url = ""
-                    image = Image.open(image_path)
-                    image.load()
-                    if image.mode != "RGB":
-                        image = image.convert("RGB")
-                    # Rotate image according to exif orientation using Pillow
-                    ImageOps.exif_transpose(image, in_place=True)
-                    sample = {
-                        "image_id": image_id,
-                        "image": image,
-                        "width": image.width,
-                        "height": image.height,
-                        "meta": {
-                            "barcode": barcode,
-                            "off_image_id": off_image_id,
-                            "image_url": image_url,
-                        },
-                        "category_id": label_id,
-                        "category_name": label_name,
-                    }
-                    with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
-                        pickle.dump(sample, f)
-            hf_ds = datasets.Dataset.from_generator(
-                functools.partial(_pickle_sample_generator, tmp_dir),
-                features=HF_DS_CLASSIFICATION_FEATURES,
-            )
-            hf_ds.push_to_hub(repo_id, split=split)
-def export_to_hf_llm_image_extraction(
-    sample_iter: Iterator[LLMImageExtractionSample],
-    split: str,
-    repo_id: str,
-    revision: str = "main",
-    tmp_dir: Path | None = None,
-) -> None:
-    """Export LLM image extraction samples to a Hugging Face dataset.
-    Args:
-        sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
-            to export.
-        split (str): Name of the dataset split (e.g., 'train', 'val').
-        repo_id (str): Hugging Face repository ID to push the dataset to.
-        revision (str): Revision (branch, tag or commit) to use for the
-            Hugging Face Datasets repository.
-        tmp_dir (Path | None): Temporary directory to use for intermediate
-            files. If None, a temporary directory will be created
-            automatically.
-    """
-    logger.info(
-        "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s",
-        repo_id,
-        revision,
-        split,
-        tmp_dir,
-    )
-    tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
-    if tmp_dir:
-        tmp_dir.mkdir(parents=True, exist_ok=True)
-        tmp_dir_with_context = PathWithContext(tmp_dir)
-    else:
-        tmp_dir_with_context = tempfile.TemporaryDirectory()
-    with tmp_dir_with_context as tmp_dir_str:
-        tmp_dir = Path(tmp_dir_str)
-        for sample in tqdm.tqdm(sample_iter, desc="samples"):
-            image = sample.image
-            # Rotate image according to exif orientation using Pillow
-            image = ImageOps.exif_transpose(image)
-            image_id = sample.image_id
-            sample = {
-                "image_id": image_id,
-                "image": image,
-                "meta": sample.meta.model_dump(),
-                "output": sample.output,
-            }
-            # Save output as pickle
-            with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
-                pickle.dump(sample, f)
-        hf_ds = datasets.Dataset.from_generator(
-            functools.partial(_pickle_sample_generator, tmp_dir),
-            features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
-        )
-        hf_ds.push_to_hub(repo_id, split=split, revision=revision)

labelr/google_genai.py CHANGED Viewed

@@ -11,10 +11,11 @@ import orjson
 import typer
 from gcloud.aio.storage import Storage
 from openfoodfacts import Flavor
-from openfoodfacts.images import download_image, generate_image_url
+from openfoodfacts.images import generate_image_url
 from tqdm.asyncio import tqdm
-from labelr.sample import LLMImageExtractionSample, SampleMeta
+from labelr.sample.common import SampleMeta
+from labelr.sample.llm import LLMImageExtractionSample
 from labelr.utils import download_image_from_gcs
 try:
@@ -335,6 +336,7 @@ def generate_sample_iter(
     """
     skipped = 0
     invalid = 0
+    storage_client = storage.Client()
     with prediction_path.open("r") as f_in:
         for i, sample_str in enumerate(f_in):
             if i < skip:
@@ -349,6 +351,7 @@ def generate_sample_iter(
                     sample=sample,
                     is_openfoodfacts_dataset=is_openfoodfacts_dataset,
                     openfoodfacts_flavor=openfoodfacts_flavor,
+                    storage_client=storage_client,
                 )
             except Exception as e:
                 if raise_on_invalid_sample:
@@ -370,6 +373,7 @@ def generate_sample_from_prediction(
     sample: JSONType,
     is_openfoodfacts_dataset: bool = False,
     openfoodfacts_flavor: Flavor = Flavor.off,
+    storage_client: storage.Client | None = None,
 ) -> LLMImageExtractionSample:
     """Generate a LLMImageExtractionSample from a prediction sample.
     Args:
@@ -378,13 +382,15 @@ def generate_sample_from_prediction(
         is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
             Facts.
         openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
+        storage_client (storage.Client | None): Optional Google Cloud Storage
+            client. If not provided, a new client will be created.
     Returns:
         LLMImageExtractionSample: Generated sample.
     """
     image_id = sample["key"][len("key:") :]
     response_str = sample["response"]["candidates"][0]["content"]["parts"][0]["text"]
     image_uri = sample["request"]["contents"][0]["parts"][1]["file_data"]["file_uri"]
-    image = download_image_from_gcs(image_uri=image_uri)
+    image = download_image_from_gcs(image_uri=image_uri, client=storage_client)
     response = orjson.loads(response_str)
     jsonschema.validate(response, json_schema)

labelr/sample/__init__.py ADDED Viewed

File without changes

labelr/sample/classification.py ADDED Viewed

@@ -0,0 +1,17 @@
+import datasets
+HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
+    {
+        "image_id": datasets.Value("string"),
+        "image": datasets.features.Image(),
+        "width": datasets.Value("int64"),
+        "height": datasets.Value("int64"),
+        "meta": {
+            "barcode": datasets.Value("string"),
+            "off_image_id": datasets.Value("string"),
+            "image_url": datasets.Value("string"),
+        },
+        "category_id": datasets.Value("int64"),
+        "category_name": datasets.Value("string"),
+    }
+)

labelr/sample/common.py ADDED Viewed

@@ -0,0 +1,14 @@
+from pydantic import BaseModel, Field
+class SampleMeta(BaseModel):
+    barcode: str | None = Field(
+        ..., description="The barcode of the product, if applicable"
+    )
+    off_image_id: str | None = Field(
+        ...,
+        description="The Open Food Facts image ID associated with the image, if applicable",
+    )
+    image_url: str | None = Field(
+        ..., description="The URL of the image, if applicable"
+    )

labelr/sample/llm.py ADDED Viewed

@@ -0,0 +1,75 @@
+import typing
+from collections.abc import Iterator
+from pathlib import Path
+import datasets
+import orjson
+from PIL import Image
+from pydantic import BaseModel, Field
+from labelr.sample.common import SampleMeta
+from labelr.utils import download_image
+class LLMImageExtractionSample(BaseModel):
+    class Config:
+        # required to allow PIL Image type
+        arbitrary_types_allowed = True
+    image_id: str = Field(
+        ...,
+        description="unique ID for the image. For Open Food Facts images, it follows the "
+        "format `barcode:imgid`",
+    )
+    image: Image.Image = Field(..., description="Image to extract information from")
+    output: str | None = Field(..., description="Expected response of the LLM")
+    meta: SampleMeta = Field(..., description="Metadata associated with the sample")
+HF_DS_LLM_IMAGE_EXTRACTION_FEATURES = datasets.Features(
+    {
+        "image_id": datasets.Value("string"),
+        "image": datasets.features.Image(),
+        "output": datasets.features.Value("string"),
+        "meta": {
+            "barcode": datasets.Value("string"),
+            "off_image_id": datasets.Value("string"),
+            "image_url": datasets.Value("string"),
+        },
+    }
+)
+def load_llm_image_extraction_dataset_from_jsonl(
+    dataset_path: Path, **kwargs
+) -> Iterator[LLMImageExtractionSample]:
+    """Load a Hugging Face dataset for LLM image extraction from a JSONL file.
+    Args:
+        dataset_path (Path): Path to the JSONL dataset file.
+        **kwargs: Additional keyword arguments to pass to the image downloader.
+    Yields:
+        Iterator[LLMImageExtractionSample]: Iterator of LLM image extraction
+            samples.
+    """
+    with dataset_path.open("r") as f:
+        for line in f:
+            item = orjson.loads(line)
+            image_id = item["image_id"]
+            image_url = item["image_url"]
+            image = typing.cast(Image.Image, download_image(image_url, **kwargs))
+            barcode = item.pop("barcode", None)
+            off_image_id = item.pop("off_image_id", None)
+            output = item.pop("output", None)
+            meta = SampleMeta(
+                barcode=barcode,
+                off_image_id=off_image_id,
+                image_url=image_url,
+            )
+            sample = LLMImageExtractionSample(
+                image_id=image_id,
+                image=image,
+                output=output,
+                meta=meta,
+            )
+            yield sample

labelr/{sample.py → sample/object_detection.py} RENAMED Viewed

@@ -8,8 +8,7 @@ import PIL
 from openfoodfacts import Flavor
 from openfoodfacts.barcode import normalize_barcode
 from openfoodfacts.images import download_image, generate_image_url
-from PIL import Image, ImageOps
-from pydantic import BaseModel, Field
+from PIL import ImageOps
 logger = logging.getLogger(__name__)
@@ -231,34 +230,6 @@ def format_object_detection_sample_to_hf(
     }
-class SampleMeta(BaseModel):
-    barcode: str | None = Field(
-        ..., description="The barcode of the product, if applicable"
-    )
-    off_image_id: str | None = Field(
-        ...,
-        description="The Open Food Facts image ID associated with the image, if applicable",
-    )
-    image_url: str | None = Field(
-        ..., description="The URL of the image, if applicable"
-    )
-class LLMImageExtractionSample(BaseModel):
-    class Config:
-        # required to allow PIL Image type
-        arbitrary_types_allowed = True
-    image_id: str = Field(
-        ...,
-        description="unique ID for the image. For Open Food Facts images, it follows the "
-        "format `barcode:imgid`",
-    )
-    image: Image.Image = Field(..., description="Image to extract information from")
-    output: str = Field(..., description="Expected response of the LLM")
-    meta: SampleMeta = Field(..., description="Metadata associated with the sample")
 # The HuggingFace Dataset features
 HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
     {
@@ -278,33 +249,3 @@ HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
         },
     }
 )
-HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
-    {
-        "image_id": datasets.Value("string"),
-        "image": datasets.features.Image(),
-        "width": datasets.Value("int64"),
-        "height": datasets.Value("int64"),
-        "meta": {
-            "barcode": datasets.Value("string"),
-            "off_image_id": datasets.Value("string"),
-            "image_url": datasets.Value("string"),
-        },
-        "category_id": datasets.Value("int64"),
-        "category_name": datasets.Value("string"),
-    }
-)
-HF_DS_LLM_IMAGE_EXTRACTION_FEATURES = datasets.Features(
-    {
-        "image_id": datasets.Value("string"),
-        "image": datasets.features.Image(),
-        "output": datasets.features.Value("string"),
-        "meta": {
-            "barcode": datasets.Value("string"),
-            "off_image_id": datasets.Value("string"),
-            "image_url": datasets.Value("string"),
-        },
-    }
-)

labelr/utils.py CHANGED Viewed

@@ -2,6 +2,8 @@ import io
 from pathlib import Path
 from google.cloud import storage
+from openfoodfacts.images import download_image as _download_image
+from openfoodfacts.utils import ImageDownloadItem
 from PIL import Image
@@ -20,15 +22,63 @@ def parse_hf_repo_id(hf_repo_id: str) -> tuple[str, str]:
     return hf_repo_id, revision
-def download_image_from_gcs(image_uri: str) -> Image.Image:
+def download_image(
+    image: str | tuple[str, str],
+    *,
+    error_raise: bool = True,
+    return_struct: bool = False,
+    **kwargs,
+) -> Image.Image | ImageDownloadItem | None:
+    """Download an image from a URL or GCS URI and return it as a PIL Image.
+    Args:
+        image (str | tuple[str, str]): The URL or GCS URI of the image.
+        error_raise (bool): Whether to raise an error if the image cannot be
+            downloaded.
+        return_struct (bool): Whether to return an ImageDownloadItem struct
+            instead of a PIL Image.
+        **kwargs: Additional arguments to pass to the download function.
+    Returns:
+        Image.Image | ImageDownloadItem: The downloaded image as a PIL Image
+            or an ImageDownloadItem struct.
+    """
+    if isinstance(image, str) and image.startswith("gs://"):
+        return download_image_from_gcs(image, return_struct=return_struct, **kwargs)
+    return _download_image(
+        image,
+        error_raise=error_raise,
+        return_struct=return_struct,
+        **kwargs,
+    )
+def download_image_from_gcs(
+    image_uri: str, client: storage.Client | None = None, return_struct: bool = False
+) -> Image.Image | ImageDownloadItem:
     """Download an image from a Google Cloud Storage URI and return it as a
-    PIL Image."""
-    storage_client = storage.Client()
+    PIL Image.
+    Args:
+        image_uri (str): The GCS URI of the image
+            (e.g., gs://bucket_name/path/to/image.jpg).
+        client (storage.Client | None): An optional Google Cloud Storage
+            client. If not provided, a new client will be created.
+    """
+    if client is None:
+        client = storage.Client()
     bucket_name, blob_name = image_uri.replace("gs://", "").split("/", 1)
-    bucket = storage_client.bucket(bucket_name)
+    bucket = client.bucket(bucket_name)
     blob = bucket.blob(blob_name)
     image_data = blob.download_as_bytes()
-    return Image.open(io.BytesIO(image_data))
+    pil_image = Image.open(io.BytesIO(image_data))
+    if return_struct:
+        return ImageDownloadItem(
+            url=image_uri,
+            image=pil_image,
+            error=None,
+        )
+    return pil_image
 class PathWithContext:

{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: labelr
-Version: 0.9.0
+Version: 0.10.0
 Summary: A command-line tool to manage labeling tasks with Label Studio.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -18,14 +18,13 @@ Requires-Dist: rapidfuzz>=3.14.3
 Requires-Dist: aiohttp
 Requires-Dist: aiofiles
 Requires-Dist: orjson
+Requires-Dist: google-cloud-storage
+Requires-Dist: gcloud-aio-storage
+Requires-Dist: google-genai>=1.56.0
 Provides-Extra: ultralytics
 Requires-Dist: ultralytics==8.3.223; extra == "ultralytics"
 Provides-Extra: fiftyone
 Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
-Provides-Extra: google
-Requires-Dist: google-genai>=1.56.0; extra == "google"
-Requires-Dist: gcloud-aio-storage; extra == "google"
-Requires-Dist: google-cloud-storage; extra == "google"
 Dynamic: license-file
 # Labelr

labelr-0.10.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,36 @@
+labelr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/__main__.py,sha256=G4e95-IfhI-lOmkOBP6kQ8wl1x_Fl7dZlLOYr90K83c,66
+labelr/annotate.py,sha256=3fJ9FYbcozcOoKuhNtzPHV8sSnp-45FsNnMc8UeBHGU,3503
+labelr/check.py,sha256=3wK6mE0UsKvoBNm0_lyWhCMq7gxkv5r50pvO70damXY,2476
+labelr/config.py,sha256=3RXF_NdkSuHvfVMGMlYmjlw45fU77zQkLX7gmZq7NxM,64
+labelr/dataset_features.py,sha256=ZC9QAUw9oKHqyUPla2h3xQFaRT9sHq8hkPNN4RDDwmo,1257
+labelr/google_genai.py,sha256=x5p98eYoI887QMBDgziFxEW9WNdZ8Cw0EHjAFQ71SaE,14728
+labelr/main.py,sha256=OTiJSkD_TrzQmQQm291FhknD-HQQTWfBEBgImxqL0KM,2634
+labelr/project_config.py,sha256=CIHEcgSOfXb53naHWEBkTDm2V9m3abAu8C54VSzHjAs,1260
+labelr/types.py,sha256=8CHfLyifF_N94OYDhG-7IcWboOh9o0Z_0LBtQapT8TQ,313
+labelr/utils.py,sha256=8Yp0L2MCIdUYSjvmF4U5iiaBpaZJbYw4rHJOMhCCudE,3075
+labelr/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/apps/datasets.py,sha256=tAD6TZSnwh7uhkleSfDP0PFqztXC1S3Vx2aMSVCFfRU,12725
+labelr/apps/evaluate.py,sha256=UC4CuSKa4vgR5xTBZ-dFgp_1pYnkM55s2IJgix0YtkI,1157
+labelr/apps/google_batch.py,sha256=Mlz5jRVcR1XzRJg2HLte3rIhiOk4xQQjjLAJsc3lJjo,9572
+labelr/apps/hugging_face.py,sha256=B0GaDZeUZj2A7nEeC1OtCANb0DqvBkhWwFWM_9Nm2kU,1608
+labelr/apps/label_studio.py,sha256=lQ7K16noA4Mnr1hc0oxya1sgGgABWnpIIJTM5ENp7so,16869
+labelr/apps/train.py,sha256=wmOSpO9JsrwCXYMgRg2srMbV5B5TvnlfhAKPqUt6wSg,7328
+labelr/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/evaluate/object_detection.py,sha256=QJIwrDY-Vsy0-It6tZSkN3qgAlmIu2W1-kGdmibiPSQ,3349
+labelr/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/export/classification.py,sha256=rnm99vGMJy1UkdXiZ8t_TgFe3CyLBBYowWwzaZeniIs,4699
+labelr/export/common.py,sha256=lJ-ZDOMKGpC48fCuEnIrA8sZBhXGZOcghBbsLM1h66o,1252
+labelr/export/llm.py,sha256=Jlopi0EQ4YUWLe_s-kTFcISTzO1QmdX-qXQxayO6E-k,3186
+labelr/export/object_detection.py,sha256=91ywkPago7WgbY2COQKpwjFLYAAsXeGOu7TkGHi17OU,12338
+labelr/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/sample/classification.py,sha256=7Z5hvxG6q6wfJMYj00JWbRBhfjOyhjaL8fpJjgBi9N8,539
+labelr/sample/common.py,sha256=f0XDS6s0z6Vw4G2FDELJ1VQSe5Tsh0q3-3VU9unK9eY,431
+labelr/sample/llm.py,sha256=zAsI3TmfGCbBPv4_hNtYR4Np3yAmUDzXGAvlQLF6V6w,2474
+labelr/sample/object_detection.py,sha256=XZasR_k4AxzsiWdVMC2ZnyjfA14PKJPrx1U-XPr5tWQ,8427
+labelr-0.10.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+labelr-0.10.0.dist-info/METADATA,sha256=pS2Ipq-aICU3TluuqSNocGP5-V8ztLk6X_udwwnECPk,7243
+labelr-0.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+labelr-0.10.0.dist-info/entry_points.txt,sha256=OACukVeR_2z54i8yQuWqqk_jdEHlyTwmTFOFBmxPp1k,43
+labelr-0.10.0.dist-info/top_level.txt,sha256=bjZo50aGZhXIcZYpYOX4sdAQcamxh8nwfEh7A9RD_Ag,7
+labelr-0.10.0.dist-info/RECORD,,

labelr-0.9.0.dist-info/RECORD DELETED Viewed

@@ -1,28 +0,0 @@
-labelr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-labelr/__main__.py,sha256=G4e95-IfhI-lOmkOBP6kQ8wl1x_Fl7dZlLOYr90K83c,66
-labelr/annotate.py,sha256=3fJ9FYbcozcOoKuhNtzPHV8sSnp-45FsNnMc8UeBHGU,3503
-labelr/check.py,sha256=3wK6mE0UsKvoBNm0_lyWhCMq7gxkv5r50pvO70damXY,2476
-labelr/config.py,sha256=3RXF_NdkSuHvfVMGMlYmjlw45fU77zQkLX7gmZq7NxM,64
-labelr/dataset_features.py,sha256=ZC9QAUw9oKHqyUPla2h3xQFaRT9sHq8hkPNN4RDDwmo,1257
-labelr/export.py,sha256=aPfQ-RaK3C2WJrzbETYdC9kRe0MTpCRs0nu5l2SqiRg,20092
-labelr/google_genai.py,sha256=vn_UNQOxUDOTTTWz-emAVErjOtQmnlxM_m8yo2q01Ok,14401
-labelr/main.py,sha256=OTiJSkD_TrzQmQQm291FhknD-HQQTWfBEBgImxqL0KM,2634
-labelr/project_config.py,sha256=CIHEcgSOfXb53naHWEBkTDm2V9m3abAu8C54VSzHjAs,1260
-labelr/sample.py,sha256=VL-iKDvLaIeViJ0TaBY9uCbv0ey528fkaRTYE-Zr12I,10347
-labelr/types.py,sha256=8CHfLyifF_N94OYDhG-7IcWboOh9o0Z_0LBtQapT8TQ,313
-labelr/utils.py,sha256=-zLOWLbvLwtNFtzzwZ6RjJD9GstoYR-gt4wz9r6u9lE,1363
-labelr/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-labelr/apps/datasets.py,sha256=kJQWwm3mjA2uWIA8O_DslM7OS5ht5mgWqcFC_zF4gCo,11187
-labelr/apps/evaluate.py,sha256=UC4CuSKa4vgR5xTBZ-dFgp_1pYnkM55s2IJgix0YtkI,1157
-labelr/apps/google_batch.py,sha256=BMcfBkDwfu-zOOR80bYmtEy6k_Qc70m7K7wmp4Ww0r8,9335
-labelr/apps/hugging_face.py,sha256=B0GaDZeUZj2A7nEeC1OtCANb0DqvBkhWwFWM_9Nm2kU,1608
-labelr/apps/label_studio.py,sha256=su9shoi0K9PmI8RBLipV2KQf_MRjkF5vy5-JUcbXr5A,16852
-labelr/apps/train.py,sha256=wmOSpO9JsrwCXYMgRg2srMbV5B5TvnlfhAKPqUt6wSg,7328
-labelr/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-labelr/evaluate/object_detection.py,sha256=QJIwrDY-Vsy0-It6tZSkN3qgAlmIu2W1-kGdmibiPSQ,3349
-labelr-0.9.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-labelr-0.9.0.dist-info/METADATA,sha256=cNkf4LPmbO_k3UuR7O7NtcCwRF-Z5c-yIyQRAocsjww,7322
-labelr-0.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-labelr-0.9.0.dist-info/entry_points.txt,sha256=OACukVeR_2z54i8yQuWqqk_jdEHlyTwmTFOFBmxPp1k,43
-labelr-0.9.0.dist-info/top_level.txt,sha256=bjZo50aGZhXIcZYpYOX4sdAQcamxh8nwfEh7A9RD_Ag,7
-labelr-0.9.0.dist-info/RECORD,,

{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

labelr 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

labelr 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl