labelr 0.8.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {labelr-0.8.0/src/labelr.egg-info → labelr-0.9.0}/PKG-INFO +10 -1
  2. {labelr-0.8.0 → labelr-0.9.0}/pyproject.toml +8 -2
  3. labelr-0.9.0/src/labelr/apps/google_batch.py +289 -0
  4. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/export.py +63 -0
  5. labelr-0.9.0/src/labelr/google_genai.py +415 -0
  6. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/main.py +6 -0
  7. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/sample.py +43 -1
  8. labelr-0.9.0/src/labelr/utils.py +48 -0
  9. {labelr-0.8.0 → labelr-0.9.0/src/labelr.egg-info}/PKG-INFO +10 -1
  10. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/SOURCES.txt +2 -1
  11. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/requires.txt +10 -0
  12. labelr-0.8.0/src/labelr/evaluate/llm.py +0 -0
  13. labelr-0.8.0/src/labelr/utils.py +0 -13
  14. {labelr-0.8.0 → labelr-0.9.0}/LICENSE +0 -0
  15. {labelr-0.8.0 → labelr-0.9.0}/README.md +0 -0
  16. {labelr-0.8.0 → labelr-0.9.0}/setup.cfg +0 -0
  17. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/__init__.py +0 -0
  18. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/__main__.py +0 -0
  19. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/annotate.py +0 -0
  20. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/__init__.py +0 -0
  21. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/datasets.py +0 -0
  22. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/evaluate.py +0 -0
  23. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/hugging_face.py +0 -0
  24. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/label_studio.py +0 -0
  25. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/apps/train.py +0 -0
  26. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/check.py +0 -0
  27. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/config.py +0 -0
  28. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/dataset_features.py +0 -0
  29. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/evaluate/__init__.py +0 -0
  30. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/evaluate/object_detection.py +0 -0
  31. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/project_config.py +0 -0
  32. {labelr-0.8.0 → labelr-0.9.0}/src/labelr/types.py +0 -0
  33. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/dependency_links.txt +0 -0
  34. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/entry_points.txt +0 -0
  35. {labelr-0.8.0 → labelr-0.9.0}/src/labelr.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: labelr
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: A command-line tool to manage labeling tasks with Label Studio.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -13,10 +13,19 @@ Requires-Dist: openfoodfacts>=2.9.0
13
13
  Requires-Dist: typer>=0.15.1
14
14
  Requires-Dist: google-cloud-batch==0.18.0
15
15
  Requires-Dist: huggingface-hub
16
+ Requires-Dist: deepdiff>=8.6.1
17
+ Requires-Dist: rapidfuzz>=3.14.3
18
+ Requires-Dist: aiohttp
19
+ Requires-Dist: aiofiles
20
+ Requires-Dist: orjson
16
21
  Provides-Extra: ultralytics
17
22
  Requires-Dist: ultralytics==8.3.223; extra == "ultralytics"
18
23
  Provides-Extra: fiftyone
19
24
  Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
25
+ Provides-Extra: google
26
+ Requires-Dist: google-genai>=1.56.0; extra == "google"
27
+ Requires-Dist: gcloud-aio-storage; extra == "google"
28
+ Requires-Dist: google-cloud-storage; extra == "google"
20
29
  Dynamic: license-file
21
30
 
22
31
  # Labelr
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "labelr"
3
- version = "0.8.0"
3
+ version = "0.9.0"
4
4
  description = "A command-line tool to manage labeling tasks with Label Studio."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -12,7 +12,12 @@ dependencies = [
12
12
  "openfoodfacts>=2.9.0",
13
13
  "typer>=0.15.1",
14
14
  "google-cloud-batch==0.18.0",
15
- "huggingface-hub"
15
+ "huggingface-hub",
16
+ "deepdiff>=8.6.1",
17
+ "rapidfuzz>=3.14.3",
18
+ "aiohttp",
19
+ "aiofiles",
20
+ "orjson",
16
21
  ]
17
22
 
18
23
  [project.scripts]
@@ -25,6 +30,7 @@ ultralytics = [
25
30
  fiftyone = [
26
31
  "fiftyone~=1.10.0"
27
32
  ]
33
+ google = ["google-genai >= 1.56.0", "gcloud-aio-storage", "google-cloud-storage"]
28
34
 
29
35
  [tool.uv]
30
36
  package = true
@@ -0,0 +1,289 @@
1
+ import asyncio
2
+ import importlib
3
+ from pathlib import Path
4
+ from typing import Annotated, Any
5
+
6
+ import typer
7
+ from google.genai.types import JSONSchema as GoogleJSONSchema
8
+ from google.genai.types import Schema as GoogleSchema
9
+ from openfoodfacts import Flavor
10
+ from pydantic import BaseModel
11
+
12
+ from labelr.google_genai import generate_batch_dataset, launch_batch_job
13
+
14
+ app = typer.Typer()
15
+
16
+
17
+ def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
18
+ """Google doesn't natively support OpenAPI schemas, so we convert them to
19
+ Google `Schema` (a subset of OpenAPI)."""
20
+ return GoogleSchema.from_json_schema(
21
+ json_schema=GoogleJSONSchema.model_validate(schema.model_json_schema())
22
+ ).model_dump(mode="json", exclude_none=True, exclude_unset=True)
23
+
24
+
25
+ @app.command()
26
+ def generate_dataset(
27
+ data_path: Annotated[
28
+ Path,
29
+ typer.Option(
30
+ ...,
31
+ help="Path to a JSONL file containing the raw batch samples.",
32
+ exists=True,
33
+ dir_okay=False,
34
+ resolve_path=True,
35
+ ),
36
+ ],
37
+ output_path: Annotated[
38
+ Path,
39
+ typer.Option(
40
+ ...,
41
+ help="Path where to write the generated dataset file.",
42
+ exists=False,
43
+ dir_okay=False,
44
+ resolve_path=True,
45
+ ),
46
+ ],
47
+ config_module: Annotated[
48
+ str,
49
+ typer.Option(
50
+ ...,
51
+ help="Python module path (e.g., 'myschema') containing two variables: "
52
+ "OUTPUT_SCHEMA (a Pydantic class representing the output schema) and "
53
+ "INSTRUCTIONS (a str containing instructions to add before each sample).",
54
+ ),
55
+ ],
56
+ bucket_name: Annotated[
57
+ str,
58
+ typer.Option(
59
+ ...,
60
+ help="Name of the GCS bucket where the images are stored.",
61
+ ),
62
+ ] = "robotoff-batch",
63
+ bucket_dir_name: Annotated[
64
+ str,
65
+ typer.Option(
66
+ ...,
67
+ help="Directory name in the GCS bucket where the images are stored.",
68
+ ),
69
+ ] = "gemini-batch-images",
70
+ max_concurrent_uploads: Annotated[
71
+ int,
72
+ typer.Option(
73
+ ...,
74
+ help="Maximum number of concurrent uploads to GCS.",
75
+ ),
76
+ ] = 30,
77
+ base_image_dir: Annotated[
78
+ Path | None,
79
+ typer.Option(
80
+ ...,
81
+ help="Base directory to resolve local image paths from.",
82
+ ),
83
+ ] = None,
84
+ from_key: Annotated[
85
+ str | None,
86
+ typer.Option(
87
+ ...,
88
+ help="If specified, resume processing from this sample key.",
89
+ ),
90
+ ] = None,
91
+ skip_upload: Annotated[
92
+ bool, typer.Option(..., help="Skip uploading images to GCS")
93
+ ] = False,
94
+ thinking_level: Annotated[
95
+ str | None,
96
+ typer.Option(
97
+ ...,
98
+ help="Thinking level to use for the generation config.",
99
+ ),
100
+ ] = None,
101
+ ):
102
+ """Generate a dataset file in JSONL format to be used for batch
103
+ processing, using Gemini Batch Inference."""
104
+ typer.echo(f"Uploading images from '{data_path}' to GCS bucket '{bucket_name}'...")
105
+ typer.echo(f"Writing updated dataset to {output_path}...")
106
+ typer.echo(f"Max concurrent uploads: {max_concurrent_uploads}...")
107
+ typer.echo(f"Base image directory: {base_image_dir}...")
108
+ typer.echo(f"From key: {from_key}...")
109
+ typer.echo(f"Skip upload: {skip_upload}...")
110
+ typer.echo(f"Thinking level: {thinking_level}...")
111
+
112
+ module = importlib.import_module(config_module)
113
+ base_cls = getattr(module, "OUTPUT_SCHEMA")
114
+
115
+ if not issubclass(base_cls, BaseModel):
116
+ typer.echo(
117
+ f"Error: {config_module}.OUTPUT_SCHEMA is not a subclass of pydantic.BaseModel"
118
+ )
119
+ raise typer.Exit(code=1)
120
+
121
+ instructions = getattr(module, "INSTRUCTIONS", None) or None
122
+
123
+ if instructions:
124
+ typer.echo(f"Using instructions: '{instructions}'...")
125
+ else:
126
+ typer.echo("No instructions provided.")
127
+
128
+ # JSON Schema is supoorted natively by Vertex AI and Gemini APIs,
129
+ # but not yet on Batch Inference...
130
+ # So we convert the JSON schema to Google internal "Schema"
131
+ # google_json_schema = base_cls.model_json_schema()
132
+ google_json_schema = convert_pydantic_model_to_google_schema(base_cls)
133
+ asyncio.run(
134
+ generate_batch_dataset(
135
+ data_path=data_path,
136
+ output_path=output_path,
137
+ google_json_schema=google_json_schema,
138
+ instructions=instructions,
139
+ bucket_name=bucket_name,
140
+ bucket_dir_name=bucket_dir_name,
141
+ max_concurrent_uploads=max_concurrent_uploads,
142
+ base_image_dir=base_image_dir,
143
+ from_key=from_key,
144
+ skip_upload=skip_upload,
145
+ thinking_level=thinking_level,
146
+ )
147
+ )
148
+
149
+
150
+ @app.command(name="launch-batch-job")
151
+ def launch_batch_job_command(
152
+ run_name: Annotated[str, typer.Argument(..., help="Name of the batch job run")],
153
+ dataset_path: Annotated[Path, typer.Option(..., help="Path to the dataset file")],
154
+ model: Annotated[str, typer.Option(..., help="Model to use for the batch job")],
155
+ location: Annotated[
156
+ str,
157
+ typer.Option(..., help="GCP location where to run the batch job"),
158
+ ] = "europe-west4",
159
+ ):
160
+ """Launch a Gemini Batch Inference job."""
161
+ launch_batch_job(
162
+ run_name=run_name,
163
+ dataset_path=dataset_path,
164
+ model=model,
165
+ location=location,
166
+ )
167
+
168
+
169
+ @app.command()
170
+ def upload_training_dataset_from_predictions(
171
+ prediction_path: Annotated[
172
+ Path,
173
+ typer.Argument(
174
+ ...,
175
+ help="Path to the prediction JSONL file generated by Google Inference Batch",
176
+ exists=True,
177
+ dir_okay=False,
178
+ readable=True,
179
+ ),
180
+ ],
181
+ instructions_path: Annotated[
182
+ Path,
183
+ typer.Option(
184
+ ...,
185
+ help="Path to the file with the instruction prompt for the model",
186
+ exists=True,
187
+ dir_okay=False,
188
+ readable=True,
189
+ ),
190
+ ],
191
+ json_schema_path: Annotated[
192
+ Path,
193
+ typer.Option(
194
+ ...,
195
+ help="Path to the file with the JSON schema to follow",
196
+ dir_okay=False,
197
+ readable=True,
198
+ ),
199
+ ],
200
+ repo_id: Annotated[
201
+ str, typer.Option(help="Hugging Face Datasets repository ID to push to")
202
+ ],
203
+ revision: Annotated[
204
+ str,
205
+ typer.Option(
206
+ help="Revision (branch, tag or commit) to use for the Hugging Face Datasets repository"
207
+ ),
208
+ ] = "main",
209
+ is_openfoodfacts_dataset: Annotated[
210
+ bool, typer.Option(..., help="Whether this is an Open Food Facts dataset")
211
+ ] = False,
212
+ openfoodfacts_flavor: Annotated[
213
+ Flavor,
214
+ typer.Option(
215
+ ...,
216
+ help="Open Food Facts flavor of the dataset (if applicable)",
217
+ ),
218
+ ] = Flavor.off,
219
+ split: Annotated[str, typer.Option(..., help="Name of the split")] = "train",
220
+ tmp_dir: Annotated[
221
+ Path | None,
222
+ typer.Option(
223
+ ...,
224
+ help="Temporary directory to use for intermediate files, default to a temporary directory "
225
+ "generated automatically. This is useful to relaunch the command if it fails midway.",
226
+ ),
227
+ ] = None,
228
+ skip: Annotated[int, typer.Option(..., help="Number of samples to skip")] = 0,
229
+ limit: Annotated[
230
+ int | None,
231
+ typer.Option(
232
+ ..., help="Limit number of samples to process, or None for no limit"
233
+ ),
234
+ ] = None,
235
+ raise_on_invalid_sample: Annotated[
236
+ bool,
237
+ typer.Option(
238
+ ...,
239
+ help="Whether to raise an error on invalid samples instead of skipping them",
240
+ ),
241
+ ] = False,
242
+ ):
243
+ """Upload a training dataset to a Hugging Face Datasets repository from a
244
+ Gemini batch prediction file."""
245
+ import tempfile
246
+
247
+ import orjson
248
+ from huggingface_hub import HfApi
249
+
250
+ from labelr.export import export_to_hf_llm_image_extraction
251
+ from labelr.google_genai import generate_sample_iter
252
+
253
+ instructions = instructions_path.read_text()
254
+ print(f"Instructions: {instructions}")
255
+ json_schema = orjson.loads(json_schema_path.read_text())
256
+
257
+ api = HfApi()
258
+ config = {
259
+ "instructions": instructions,
260
+ "json_schema": json_schema,
261
+ }
262
+ with tempfile.TemporaryDirectory() as config_tmp_dir_str:
263
+ config_tmp_dir = Path(config_tmp_dir_str)
264
+ config_path = config_tmp_dir / "config.json"
265
+ config_path.write_text(
266
+ orjson.dumps(config, option=orjson.OPT_INDENT_2).decode("utf-8")
267
+ )
268
+ api.upload_file(
269
+ path_or_fileobj=config_path,
270
+ path_in_repo="config.json",
271
+ repo_id=repo_id,
272
+ repo_type="dataset",
273
+ )
274
+ sample_iter = generate_sample_iter(
275
+ prediction_path=prediction_path,
276
+ json_schema=json_schema,
277
+ is_openfoodfacts_dataset=is_openfoodfacts_dataset,
278
+ openfoodfacts_flavor=openfoodfacts_flavor,
279
+ skip=skip,
280
+ limit=limit,
281
+ raise_on_invalid_sample=raise_on_invalid_sample,
282
+ )
283
+ export_to_hf_llm_image_extraction(
284
+ sample_iter=sample_iter,
285
+ split=split,
286
+ repo_id=repo_id,
287
+ revision=revision,
288
+ tmp_dir=tmp_dir,
289
+ )
@@ -3,6 +3,7 @@ import logging
3
3
  import pickle
4
4
  import random
5
5
  import tempfile
6
+ from collections.abc import Iterator
6
7
  from pathlib import Path
7
8
 
8
9
  import datasets
@@ -14,10 +15,13 @@ from PIL import Image, ImageOps
14
15
 
15
16
  from labelr.sample import (
16
17
  HF_DS_CLASSIFICATION_FEATURES,
18
+ HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
17
19
  HF_DS_OBJECT_DETECTION_FEATURES,
20
+ LLMImageExtractionSample,
18
21
  format_object_detection_sample_to_hf,
19
22
  )
20
23
  from labelr.types import TaskType
24
+ from labelr.utils import PathWithContext
21
25
 
22
26
  logger = logging.getLogger(__name__)
23
27
 
@@ -455,3 +459,62 @@ def export_from_ultralytics_to_hf_classification(
455
459
  features=HF_DS_CLASSIFICATION_FEATURES,
456
460
  )
457
461
  hf_ds.push_to_hub(repo_id, split=split)
462
+
463
+
464
+ def export_to_hf_llm_image_extraction(
465
+ sample_iter: Iterator[LLMImageExtractionSample],
466
+ split: str,
467
+ repo_id: str,
468
+ revision: str = "main",
469
+ tmp_dir: Path | None = None,
470
+ ) -> None:
471
+ """Export LLM image extraction samples to a Hugging Face dataset.
472
+
473
+ Args:
474
+ sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
475
+ to export.
476
+ split (str): Name of the dataset split (e.g., 'train', 'val').
477
+ repo_id (str): Hugging Face repository ID to push the dataset to.
478
+ revision (str): Revision (branch, tag or commit) to use for the
479
+ Hugging Face Datasets repository.
480
+ tmp_dir (Path | None): Temporary directory to use for intermediate
481
+ files. If None, a temporary directory will be created
482
+ automatically.
483
+ """
484
+ logger.info(
485
+ "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s",
486
+ repo_id,
487
+ revision,
488
+ split,
489
+ tmp_dir,
490
+ )
491
+
492
+ tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
493
+ if tmp_dir:
494
+ tmp_dir.mkdir(parents=True, exist_ok=True)
495
+ tmp_dir_with_context = PathWithContext(tmp_dir)
496
+ else:
497
+ tmp_dir_with_context = tempfile.TemporaryDirectory()
498
+
499
+ with tmp_dir_with_context as tmp_dir_str:
500
+ tmp_dir = Path(tmp_dir_str)
501
+ for sample in tqdm.tqdm(sample_iter, desc="samples"):
502
+ image = sample.image
503
+ # Rotate image according to exif orientation using Pillow
504
+ image = ImageOps.exif_transpose(image)
505
+ image_id = sample.image_id
506
+ sample = {
507
+ "image_id": image_id,
508
+ "image": image,
509
+ "meta": sample.meta.model_dump(),
510
+ "output": sample.output,
511
+ }
512
+ # Save output as pickle
513
+ with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
514
+ pickle.dump(sample, f)
515
+
516
+ hf_ds = datasets.Dataset.from_generator(
517
+ functools.partial(_pickle_sample_generator, tmp_dir),
518
+ features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
519
+ )
520
+ hf_ds.push_to_hub(repo_id, split=split, revision=revision)
@@ -0,0 +1,415 @@
1
+ import asyncio
2
+ import mimetypes
3
+ from collections.abc import Iterator
4
+ from pathlib import Path
5
+ from typing import Literal
6
+ from urllib.parse import urlparse
7
+
8
+ import aiofiles
9
+ import jsonschema
10
+ import orjson
11
+ import typer
12
+ from gcloud.aio.storage import Storage
13
+ from openfoodfacts import Flavor
14
+ from openfoodfacts.images import download_image, generate_image_url
15
+ from tqdm.asyncio import tqdm
16
+
17
+ from labelr.sample import LLMImageExtractionSample, SampleMeta
18
+ from labelr.utils import download_image_from_gcs
19
+
20
+ try:
21
+ import google.genai # noqa: F401
22
+ except ImportError:
23
+ raise ImportError(
24
+ "The 'google-genai' package is required to use this module. "
25
+ "Please install labelr with the 'google' extra: "
26
+ "`pip install labelr[google]`"
27
+ )
28
+ import aiohttp
29
+ from google import genai
30
+ from google.cloud import storage
31
+ from google.genai.types import CreateBatchJobConfig, HttpOptions
32
+ from google.genai.types import JSONSchema as GoogleJSONSchema
33
+ from google.genai.types import Schema as GoogleSchema
34
+ from openfoodfacts.types import JSONType
35
+ from pydantic import BaseModel
36
+
37
+
38
+ class RawBatchSamplePart(BaseModel):
39
+ type: Literal["text", "image"]
40
+ data: str
41
+
42
+
43
+ class RawBatchSample(BaseModel):
44
+ key: str
45
+ parts: list[RawBatchSamplePart]
46
+ meta: JSONType = {}
47
+
48
+
49
+ def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> JSONType:
50
+ """Google doesn't support natively OpenAPI schemas, so we convert them to
51
+ Google `Schema` (a subset of OpenAPI)."""
52
+ return GoogleSchema.from_json_schema(
53
+ json_schema=GoogleJSONSchema.model_validate(schema.model_json_schema())
54
+ ).model_dump(mode="json", exclude_none=True, exclude_unset=True)
55
+
56
+
57
+ async def download_image(url: str, session: aiohttp.ClientSession) -> bytes:
58
+ """Download an image from a URL and return its content as bytes.
59
+
60
+ Args:
61
+ url (str): URL of the image to download.
62
+ Returns:
63
+ bytes: Content of the downloaded image.
64
+ """
65
+ async with session.get(url) as response:
66
+ response.raise_for_status()
67
+ return await response.read()
68
+
69
+
70
+ async def download_image_from_filesystem(url: str, base_dir: Path) -> bytes:
71
+ """Download an image from the filesystem and return its content as bytes.
72
+
73
+ Args:
74
+ url (str): URL of the image to download.
75
+ base_dir (Path): Base directory where images are stored.
76
+ Returns:
77
+ bytes: Content of the downloaded image.
78
+ """
79
+ file_path = urlparse(url).path[1:] # Remove leading '/'
80
+ full_file_path = base_dir / file_path
81
+ async with aiofiles.open(full_file_path, "rb") as f:
82
+ return await f.read()
83
+
84
+
85
+ async def upload_to_gcs(
86
+ image_url: str,
87
+ bucket_name: str,
88
+ blob_name: str,
89
+ session: aiohttp.ClientSession,
90
+ base_image_dir: Path | None = None,
91
+ ) -> dict:
92
+ """Upload data to Google Cloud Storage.
93
+ Args:
94
+ bucket_name (str): Name of the GCS bucket.
95
+ blob_name (str): Name of the blob (object) in the bucket.
96
+ data (bytes): Data to upload.
97
+ session (aiohttp.ClientSession): HTTP session to use for downloading
98
+ the image.
99
+ base_image_dir (Path | None): If provided, images will be read from
100
+ the filesystem under this base directory instead of downloading
101
+ them from their URLs.
102
+ Returns:
103
+ dict: Status of the upload operation.
104
+ """
105
+ if base_image_dir is None:
106
+ image_data = await download_image(image_url, session)
107
+ else:
108
+ image_data = await download_image_from_filesystem(image_url, base_image_dir)
109
+
110
+ client = Storage(session=session)
111
+
112
+ status = await client.upload(
113
+ bucket_name,
114
+ blob_name,
115
+ image_data,
116
+ )
117
+ return status
118
+
119
+
120
+ async def upload_to_gcs_format_async(
121
+ sample: RawBatchSample,
122
+ google_json_schema: JSONType,
123
+ instructions: str | None,
124
+ bucket_name: str,
125
+ bucket_dir_name: str,
126
+ session: aiohttp.ClientSession,
127
+ base_image_dir: Path | None = None,
128
+ skip_upload: bool = False,
129
+ thinking_level: str | None = None,
130
+ ) -> JSONType | None:
131
+ parts: list[JSONType] = []
132
+
133
+ if instructions:
134
+ parts.append({"text": instructions})
135
+
136
+ for part in sample.parts:
137
+ if part.type == "image":
138
+ mime_type, _ = mimetypes.guess_type(part.data)
139
+ if mime_type is None:
140
+ raise ValueError(f"Cannot guess mimetype of file: {part.data}")
141
+
142
+ file_uri = part.data
143
+ image_blob_name = f"{bucket_dir_name}/{sample.key}/{Path(file_uri).name}"
144
+ # Download the image from the URL
145
+ if not skip_upload:
146
+ try:
147
+ await upload_to_gcs(
148
+ image_url=file_uri,
149
+ bucket_name=bucket_name,
150
+ blob_name=image_blob_name,
151
+ session=session,
152
+ base_image_dir=base_image_dir,
153
+ )
154
+ except FileNotFoundError:
155
+ return None
156
+
157
+ parts.append(
158
+ {
159
+ "file_data": {
160
+ "file_uri": f"gs://{bucket_name}/{image_blob_name}",
161
+ "mime_type": mime_type,
162
+ }
163
+ }
164
+ )
165
+ else:
166
+ parts.append({"text": part.data})
167
+
168
+ generation_config = {
169
+ "responseMimeType": "application/json",
170
+ "response_json_schema": google_json_schema,
171
+ }
172
+
173
+ if thinking_level is not None:
174
+ generation_config["thinkingConfig"] = {"thinkingLevel": thinking_level}
175
+
176
+ return {
177
+ "key": f"key:{sample.key}",
178
+ "request": {
179
+ "contents": [
180
+ {
181
+ "parts": parts,
182
+ "role": "user",
183
+ }
184
+ ],
185
+ "generationConfig": generation_config,
186
+ },
187
+ }
188
+
189
+
190
+ async def generate_batch_dataset(
191
+ data_path: Path,
192
+ output_path: Path,
193
+ google_json_schema: JSONType,
194
+ instructions: str | None,
195
+ bucket_name: str,
196
+ bucket_dir_name: str,
197
+ max_concurrent_uploads: int = 30,
198
+ base_image_dir: Path | None = None,
199
+ from_key: str | None = None,
200
+ skip_upload: bool = False,
201
+ thinking_level: str | None = None,
202
+ ):
203
+ limiter = asyncio.Semaphore(max_concurrent_uploads)
204
+ ignore = True if from_key is None else False
205
+ missing_files = 0
206
+ async with aiohttp.ClientSession() as session:
207
+ async with asyncio.TaskGroup() as tg:
208
+ async with (
209
+ aiofiles.open(data_path, "r") as input_file,
210
+ aiofiles.open(output_path, "wb") as output_file,
211
+ ):
212
+ async with limiter:
213
+ tasks = set()
214
+ async for line in tqdm(input_file, desc="samples"):
215
+ # print(f"line: {line}")
216
+ sample = RawBatchSample.model_validate_json(line)
217
+ # print(f"sample: {sample}")
218
+ record_key = sample.key
219
+ if from_key is not None and ignore:
220
+ if record_key == from_key:
221
+ ignore = False
222
+ else:
223
+ continue
224
+ task = tg.create_task(
225
+ upload_to_gcs_format_async(
226
+ sample=sample,
227
+ google_json_schema=google_json_schema,
228
+ instructions=instructions,
229
+ bucket_name=bucket_name,
230
+ bucket_dir_name=bucket_dir_name,
231
+ session=session,
232
+ base_image_dir=base_image_dir,
233
+ skip_upload=skip_upload,
234
+ thinking_level=thinking_level,
235
+ )
236
+ )
237
+ tasks.add(task)
238
+
239
+ if len(tasks) >= max_concurrent_uploads:
240
+ for task in tasks:
241
+ await task
242
+ updated_record = task.result()
243
+ if updated_record is not None:
244
+ await output_file.write(
245
+ orjson.dumps(updated_record) + "\n".encode()
246
+ )
247
+ else:
248
+ missing_files += 1
249
+ tasks.clear()
250
+
251
+ for task in tasks:
252
+ await task
253
+ updated_record = task.result()
254
+ if updated_record is not None:
255
+ await output_file.write(
256
+ orjson.dumps(updated_record) + "\n".encode()
257
+ )
258
+ else:
259
+ missing_files += 1
260
+
261
+ typer.echo(
262
+ f"Upload and dataset update completed. Wrote updated dataset to {output_path}. "
263
+ f"Missing files: {missing_files}."
264
+ )
265
+
266
+
267
+ def launch_batch_job(
268
+ run_name: str,
269
+ dataset_path: Path,
270
+ model: str,
271
+ location: str,
272
+ ):
273
+ """Launch a Gemini Batch Inference job.
274
+
275
+ Args:
276
+ run_name (str): Name of the batch run.
277
+ dataset_path (Path): Path to the dataset file in JSONL format.
278
+ model (str): Model to use for the batch job. Example:
279
+ 'gemini-2.5-flash'.
280
+ location (str): Location for the Vertex AI resources. Example:
281
+ 'europe-west4'.
282
+ """
283
+ # We upload the dataset to a GCS bucket using the Google Cloud Storage client.
284
+
285
+ if model == "gemini-3-pro-preview" and location != "global":
286
+ typer.echo(
287
+ "Warning: only 'global' location is supported for 'gemini-3-pro-preview' model. Overriding location to 'global'."
288
+ )
289
+ location = "global"
290
+
291
+ storage_client = storage.Client()
292
+ bucket_name = "robotoff-batch" # Replace with your bucket name
293
+ run_dir = f"gemini-batch/{run_name}"
294
+ input_file_blob_name = f"{run_dir}/inputs.jsonl"
295
+ bucket = storage_client.bucket(bucket_name)
296
+ blob = bucket.blob(input_file_blob_name)
297
+ blob.upload_from_filename(dataset_path)
298
+
299
+ client = genai.Client(
300
+ http_options=HttpOptions(api_version="v1"),
301
+ vertexai=True,
302
+ location=location,
303
+ )
304
+ output_uri = f"gs://{bucket_name}/{run_dir}"
305
+ job = client.batches.create(
306
+ model=model,
307
+ src=f"gs://{bucket_name}/{input_file_blob_name}",
308
+ config=CreateBatchJobConfig(dest=output_uri),
309
+ )
310
+ print(job)
311
+
312
+
313
+ def generate_sample_iter(
314
+ prediction_path: Path,
315
+ json_schema: JSONType,
316
+ skip: int = 0,
317
+ limit: int | None = None,
318
+ is_openfoodfacts_dataset: bool = False,
319
+ openfoodfacts_flavor: Flavor = Flavor.off,
320
+ raise_on_invalid_sample: bool = False,
321
+ ) -> Iterator[LLMImageExtractionSample]:
322
+ """Generate training samples from a Gemini Batch Inference prediction
323
+ JSONL file.
324
+
325
+ Args:
326
+ prediction_path (Path): Path to the prediction JSONL file.
327
+ json_schema (JSONType): JSON schema to validate the predictions.
328
+ skip (int): Number of initial samples to skip.
329
+ limit (int | None): Maximum number of samples to generate.
330
+ is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
331
+ Facts.
332
+ openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
333
+ Yields:
334
+ Iterator[LLMImageExtractionSample]: Generated samples.
335
+ """
336
+ skipped = 0
337
+ invalid = 0
338
+ with prediction_path.open("r") as f_in:
339
+ for i, sample_str in enumerate(f_in):
340
+ if i < skip:
341
+ skipped += 1
342
+ continue
343
+ if limit is not None and i >= skip + limit:
344
+ break
345
+ sample = orjson.loads(sample_str)
346
+ try:
347
+ yield generate_sample_from_prediction(
348
+ json_schema=json_schema,
349
+ sample=sample,
350
+ is_openfoodfacts_dataset=is_openfoodfacts_dataset,
351
+ openfoodfacts_flavor=openfoodfacts_flavor,
352
+ )
353
+ except Exception as e:
354
+ if raise_on_invalid_sample:
355
+ raise
356
+ else:
357
+ typer.echo(
358
+ f"Skipping invalid sample at line {i + 1} in {prediction_path}: {e}"
359
+ )
360
+ invalid += 1
361
+ continue
362
+ if skipped > 0:
363
+ typer.echo(f"Skipped {skipped} samples.")
364
+ if invalid > 0:
365
+ typer.echo(f"Skipped {invalid} invalid samples.")
366
+
367
+
368
+ def generate_sample_from_prediction(
369
+ json_schema: JSONType,
370
+ sample: JSONType,
371
+ is_openfoodfacts_dataset: bool = False,
372
+ openfoodfacts_flavor: Flavor = Flavor.off,
373
+ ) -> LLMImageExtractionSample:
374
+ """Generate a LLMImageExtractionSample from a prediction sample.
375
+ Args:
376
+ json_schema (JSONType): JSON schema to validate the predictions.
377
+ sample (JSONType): Prediction sample.
378
+ is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
379
+ Facts.
380
+ openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
381
+ Returns:
382
+ LLMImageExtractionSample: Generated sample.
383
+ """
384
+ image_id = sample["key"][len("key:") :]
385
+ response_str = sample["response"]["candidates"][0]["content"]["parts"][0]["text"]
386
+ image_uri = sample["request"]["contents"][0]["parts"][1]["file_data"]["file_uri"]
387
+ image = download_image_from_gcs(image_uri=image_uri)
388
+ response = orjson.loads(response_str)
389
+ jsonschema.validate(response, json_schema)
390
+
391
+ if is_openfoodfacts_dataset:
392
+ image_stem_parts = image_id.split("_")
393
+ barcode = image_stem_parts[0]
394
+ off_image_id = image_stem_parts[1]
395
+ image_id = f"{barcode}_{off_image_id}"
396
+ image_url = generate_image_url(
397
+ barcode, off_image_id, flavor=openfoodfacts_flavor
398
+ )
399
+ else:
400
+ image_id = image_id
401
+ barcode = ""
402
+ off_image_id = ""
403
+ image_url = ""
404
+
405
+ sample_meta = SampleMeta(
406
+ barcode=barcode,
407
+ off_image_id=off_image_id,
408
+ image_url=image_url,
409
+ )
410
+ return LLMImageExtractionSample(
411
+ image_id=image_id,
412
+ image=image,
413
+ output=orjson.dumps(response).decode("utf-8"),
414
+ meta=sample_meta,
415
+ )
@@ -5,6 +5,7 @@ from openfoodfacts.utils import get_logger
5
5
 
6
6
  from labelr.apps import datasets as dataset_app
7
7
  from labelr.apps import evaluate as evaluate_app
8
+ from labelr.apps import google_batch as google_batch_app
8
9
  from labelr.apps import hugging_face as hf_app
9
10
  from labelr.apps import label_studio as ls_app
10
11
  from labelr.apps import train as train_app
@@ -84,6 +85,11 @@ app.add_typer(
84
85
  name="evaluate",
85
86
  help="Visualize and evaluate trained models.",
86
87
  )
88
+ app.add_typer(
89
+ google_batch_app.app,
90
+ name="google-batch",
91
+ help="Generate datasets and launch batch jobs on Google Gemini.",
92
+ )
87
93
 
88
94
  if __name__ == "__main__":
89
95
  app()
@@ -8,7 +8,8 @@ import PIL
8
8
  from openfoodfacts import Flavor
9
9
  from openfoodfacts.barcode import normalize_barcode
10
10
  from openfoodfacts.images import download_image, generate_image_url
11
- from PIL import ImageOps
11
+ from PIL import Image, ImageOps
12
+ from pydantic import BaseModel, Field
12
13
 
13
14
  logger = logging.getLogger(__name__)
14
15
 
@@ -230,6 +231,34 @@ def format_object_detection_sample_to_hf(
230
231
  }
231
232
 
232
233
 
234
+ class SampleMeta(BaseModel):
235
+ barcode: str | None = Field(
236
+ ..., description="The barcode of the product, if applicable"
237
+ )
238
+ off_image_id: str | None = Field(
239
+ ...,
240
+ description="The Open Food Facts image ID associated with the image, if applicable",
241
+ )
242
+ image_url: str | None = Field(
243
+ ..., description="The URL of the image, if applicable"
244
+ )
245
+
246
+
247
+ class LLMImageExtractionSample(BaseModel):
248
+ class Config:
249
+ # required to allow PIL Image type
250
+ arbitrary_types_allowed = True
251
+
252
+ image_id: str = Field(
253
+ ...,
254
+ description="unique ID for the image. For Open Food Facts images, it follows the "
255
+ "format `barcode:imgid`",
256
+ )
257
+ image: Image.Image = Field(..., description="Image to extract information from")
258
+ output: str = Field(..., description="Expected response of the LLM")
259
+ meta: SampleMeta = Field(..., description="Metadata associated with the sample")
260
+
261
+
233
262
  # The HuggingFace Dataset features
234
263
  HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
235
264
  {
@@ -266,3 +295,16 @@ HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
266
295
  "category_name": datasets.Value("string"),
267
296
  }
268
297
  )
298
+
299
+ HF_DS_LLM_IMAGE_EXTRACTION_FEATURES = datasets.Features(
300
+ {
301
+ "image_id": datasets.Value("string"),
302
+ "image": datasets.features.Image(),
303
+ "output": datasets.features.Value("string"),
304
+ "meta": {
305
+ "barcode": datasets.Value("string"),
306
+ "off_image_id": datasets.Value("string"),
307
+ "image_url": datasets.Value("string"),
308
+ },
309
+ }
310
+ )
@@ -0,0 +1,48 @@
1
+ import io
2
+ from pathlib import Path
3
+
4
+ from google.cloud import storage
5
+ from PIL import Image
6
+
7
+
8
+ def parse_hf_repo_id(hf_repo_id: str) -> tuple[str, str]:
9
+ """Parse the repo_id and the revision from a hf_repo_id in the format:
10
+ `org/repo-name@revision`.
11
+
12
+ Returns a tuple (repo_id, revision), with revision = 'main' if it
13
+ was not provided.
14
+ """
15
+ if "@" in hf_repo_id:
16
+ hf_repo_id, revision = hf_repo_id.split("@", 1)
17
+ else:
18
+ revision = "main"
19
+
20
+ return hf_repo_id, revision
21
+
22
+
23
+ def download_image_from_gcs(image_uri: str) -> Image.Image:
24
+ """Download an image from a Google Cloud Storage URI and return it as a
25
+ PIL Image."""
26
+ storage_client = storage.Client()
27
+ bucket_name, blob_name = image_uri.replace("gs://", "").split("/", 1)
28
+ bucket = storage_client.bucket(bucket_name)
29
+ blob = bucket.blob(blob_name)
30
+ image_data = blob.download_as_bytes()
31
+ return Image.open(io.BytesIO(image_data))
32
+
33
+
34
+ class PathWithContext:
35
+ """A context manager that yields a Path object.
36
+
37
+ This is useful to have a common interface with tempfile.TemporaryDirectory
38
+ without actually creating a temporary directory.
39
+ """
40
+
41
+ def __init__(self, path: Path):
42
+ self.path = path
43
+
44
+ def __enter__(self) -> Path:
45
+ return self.path
46
+
47
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
48
+ pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: labelr
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: A command-line tool to manage labeling tasks with Label Studio.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -13,10 +13,19 @@ Requires-Dist: openfoodfacts>=2.9.0
13
13
  Requires-Dist: typer>=0.15.1
14
14
  Requires-Dist: google-cloud-batch==0.18.0
15
15
  Requires-Dist: huggingface-hub
16
+ Requires-Dist: deepdiff>=8.6.1
17
+ Requires-Dist: rapidfuzz>=3.14.3
18
+ Requires-Dist: aiohttp
19
+ Requires-Dist: aiofiles
20
+ Requires-Dist: orjson
16
21
  Provides-Extra: ultralytics
17
22
  Requires-Dist: ultralytics==8.3.223; extra == "ultralytics"
18
23
  Provides-Extra: fiftyone
19
24
  Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
25
+ Provides-Extra: google
26
+ Requires-Dist: google-genai>=1.56.0; extra == "google"
27
+ Requires-Dist: gcloud-aio-storage; extra == "google"
28
+ Requires-Dist: google-cloud-storage; extra == "google"
20
29
  Dynamic: license-file
21
30
 
22
31
  # Labelr
@@ -8,6 +8,7 @@ src/labelr/check.py
8
8
  src/labelr/config.py
9
9
  src/labelr/dataset_features.py
10
10
  src/labelr/export.py
11
+ src/labelr/google_genai.py
11
12
  src/labelr/main.py
12
13
  src/labelr/project_config.py
13
14
  src/labelr/sample.py
@@ -22,9 +23,9 @@ src/labelr.egg-info/top_level.txt
22
23
  src/labelr/apps/__init__.py
23
24
  src/labelr/apps/datasets.py
24
25
  src/labelr/apps/evaluate.py
26
+ src/labelr/apps/google_batch.py
25
27
  src/labelr/apps/hugging_face.py
26
28
  src/labelr/apps/label_studio.py
27
29
  src/labelr/apps/train.py
28
30
  src/labelr/evaluate/__init__.py
29
- src/labelr/evaluate/llm.py
30
31
  src/labelr/evaluate/object_detection.py
@@ -6,9 +6,19 @@ openfoodfacts>=2.9.0
6
6
  typer>=0.15.1
7
7
  google-cloud-batch==0.18.0
8
8
  huggingface-hub
9
+ deepdiff>=8.6.1
10
+ rapidfuzz>=3.14.3
11
+ aiohttp
12
+ aiofiles
13
+ orjson
9
14
 
10
15
  [fiftyone]
11
16
  fiftyone~=1.10.0
12
17
 
18
+ [google]
19
+ google-genai>=1.56.0
20
+ gcloud-aio-storage
21
+ google-cloud-storage
22
+
13
23
  [ultralytics]
14
24
  ultralytics==8.3.223
File without changes
@@ -1,13 +0,0 @@
1
- def parse_hf_repo_id(hf_repo_id: str) -> tuple[str, str]:
2
- """Parse the repo_id and the revision from a hf_repo_id in the format:
3
- `org/repo-name@revision`.
4
-
5
- Returns a tuple (repo_id, revision), with revision = 'main' if it
6
- was not provided.
7
- """
8
- if "@" in hf_repo_id:
9
- hf_repo_id, revision = hf_repo_id.split("@", 1)
10
- else:
11
- revision = "main"
12
-
13
- return hf_repo_id, revision
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes