labelr 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/apps/datasets.py CHANGED
@@ -12,7 +12,11 @@ import typer
12
12
  from openfoodfacts import Flavor
13
13
  from openfoodfacts.utils import get_logger
14
14
 
15
- from labelr.export import export_from_ultralytics_to_hf
15
+ from labelr.export.common import export_from_ultralytics_to_hf
16
+ from labelr.export.object_detection import (
17
+ export_from_ls_to_hf_object_detection,
18
+ export_from_ls_to_ultralytics_object_detection,
19
+ )
16
20
 
17
21
  from ..config import LABEL_STUDIO_DEFAULT_URL
18
22
  from ..types import ExportDestination, ExportSource, TaskType
@@ -99,7 +103,9 @@ def convert_object_detection_dataset(
99
103
  Studio format, and save it to a JSON file."""
100
104
  from datasets import load_dataset
101
105
 
102
- from labelr.sample import format_object_detection_sample_from_hf_to_ls
106
+ from labelr.sample.object_detection import (
107
+ format_object_detection_sample_from_hf_to_ls,
108
+ )
103
109
 
104
110
  logger.info("Loading dataset: %s", repo_id)
105
111
  ds = load_dataset(repo_id)
@@ -207,10 +213,8 @@ def export(
207
213
  local files (ultralytics format)."""
208
214
  from label_studio_sdk.client import LabelStudio
209
215
 
210
- from labelr.export import (
216
+ from labelr.export.object_detection import (
211
217
  export_from_hf_to_ultralytics_object_detection,
212
- export_from_ls_to_hf_object_detection,
213
- export_from_ls_to_ultralytics_object_detection,
214
218
  )
215
219
 
216
220
  if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
@@ -303,3 +307,50 @@ def export(
303
307
  is_openfoodfacts_dataset=is_openfoodfacts_dataset,
304
308
  openfoodfacts_flavor=openfoodfacts_flavor,
305
309
  )
310
+
311
+
312
+ @app.command()
313
+ def export_llm_ds(
314
+ dataset_path: Annotated[
315
+ Path, typer.Option(..., help="Path to the JSONL dataset file")
316
+ ],
317
+ repo_id: Annotated[
318
+ str, typer.Option(..., help="Hugging Face Datasets repository ID to export to")
319
+ ],
320
+ split: Annotated[str, typer.Option(..., help="Dataset split to export")],
321
+ revision: Annotated[
322
+ str,
323
+ typer.Option(
324
+ help="Revision (branch, tag or commit) for the Hugging Face Datasets repository."
325
+ ),
326
+ ] = "main",
327
+ tmp_dir: Annotated[
328
+ Path | None,
329
+ typer.Option(
330
+ help="Path to a temporary directory to use for image processing",
331
+ ),
332
+ ] = None,
333
+ image_max_size: Annotated[
334
+ int | None,
335
+ typer.Option(
336
+ help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
337
+ ),
338
+ ] = None,
339
+ ):
340
+ """Export LLM image extraction dataset with images only to Hugging Face
341
+ Datasets.
342
+ """
343
+ from labelr.export.llm import export_to_hf_llm_image_extraction
344
+ from labelr.sample.llm import load_llm_image_extraction_dataset_from_jsonl
345
+
346
+ sample_iter = load_llm_image_extraction_dataset_from_jsonl(
347
+ dataset_path=dataset_path
348
+ )
349
+ export_to_hf_llm_image_extraction(
350
+ sample_iter,
351
+ split=split,
352
+ repo_id=repo_id,
353
+ revision=revision,
354
+ tmp_dir=tmp_dir,
355
+ image_max_size=image_max_size,
356
+ )
@@ -239,6 +239,12 @@ def upload_training_dataset_from_predictions(
239
239
  help="Whether to raise an error on invalid samples instead of skipping them",
240
240
  ),
241
241
  ] = False,
242
+ image_max_size: Annotated[
243
+ int | None,
244
+ typer.Option(
245
+ help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
246
+ ),
247
+ ] = None,
242
248
  ):
243
249
  """Upload a training dataset to a Hugging Face Datasets repository from a
244
250
  Gemini batch prediction file."""
@@ -247,7 +253,7 @@ def upload_training_dataset_from_predictions(
247
253
  import orjson
248
254
  from huggingface_hub import HfApi
249
255
 
250
- from labelr.export import export_to_hf_llm_image_extraction
256
+ from labelr.export.llm import export_to_hf_llm_image_extraction
251
257
  from labelr.google_genai import generate_sample_iter
252
258
 
253
259
  instructions = instructions_path.read_text()
@@ -286,4 +292,5 @@ def upload_training_dataset_from_predictions(
286
292
  repo_id=repo_id,
287
293
  revision=revision,
288
294
  tmp_dir=tmp_dir,
295
+ image_max_size=image_max_size,
289
296
  )
@@ -398,7 +398,7 @@ def create_dataset_file(
398
398
  from openfoodfacts.images import extract_barcode_from_url, extract_source_from_url
399
399
  from openfoodfacts.utils import get_image_from_url
400
400
 
401
- from labelr.sample import format_object_detection_sample_to_ls
401
+ from labelr.sample.object_detection import format_object_detection_sample_to_ls
402
402
 
403
403
  logger.info("Loading dataset: %s", input_file)
404
404
 
File without changes
@@ -0,0 +1,114 @@
1
+ import functools
2
+ import logging
3
+ import pickle
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import datasets
8
+ from openfoodfacts.images import generate_image_url
9
+ from openfoodfacts.types import Flavor
10
+ from PIL import Image, ImageOps
11
+
12
+ from labelr.export.common import _pickle_sample_generator
13
+ from labelr.sample.classification import HF_DS_CLASSIFICATION_FEATURES
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ def export_from_ultralytics_to_hf_classification(
19
+ dataset_dir: Path,
20
+ repo_id: str,
21
+ label_names: list[str],
22
+ merge_labels: bool = False,
23
+ is_openfoodfacts_dataset: bool = False,
24
+ openfoodfacts_flavor: Flavor = Flavor.off,
25
+ ) -> None:
26
+ """Export an Ultralytics classification dataset to a Hugging Face dataset.
27
+
28
+ The Ultralytics dataset directory should contain 'train', 'val' and/or
29
+ 'test' subdirectories, each containing subdirectories for each label.
30
+
31
+ Args:
32
+ dataset_dir (Path): Path to the Ultralytics dataset directory.
33
+ repo_id (str): Hugging Face repository ID to push the dataset to.
34
+ label_names (list[str]): List of label names.
35
+ merge_labels (bool): Whether to merge all labels into a single label
36
+ named 'object'.
37
+ is_openfoodfacts_dataset (bool): Whether the dataset is from
38
+ Open Food Facts. If True, the `off_image_id` and `image_url` will
39
+ be generated automatically. `off_image_id` is extracted from the
40
+ image filename.
41
+ openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
42
+ is ignored if `is_openfoodfacts_dataset` is False.
43
+ """
44
+ logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
45
+
46
+ if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
47
+ raise ValueError(
48
+ f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
49
+ )
50
+
51
+ # Save output as pickle
52
+ for split in ["train", "val", "test"]:
53
+ split_dir = dataset_dir / split
54
+
55
+ if not split_dir.is_dir():
56
+ logger.info("Skipping missing split directory: %s", split_dir)
57
+ continue
58
+
59
+ with tempfile.TemporaryDirectory() as tmp_dir_str:
60
+ tmp_dir = Path(tmp_dir_str)
61
+ for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
62
+ label_name = label_dir.name
63
+ if merge_labels:
64
+ label_name = "object"
65
+ if label_name not in label_names:
66
+ raise ValueError(
67
+ "Label name %s not in provided label names (label names: %s)"
68
+ % (label_name, label_names),
69
+ )
70
+ label_id = label_names.index(label_name)
71
+
72
+ for image_path in label_dir.glob("*"):
73
+ if is_openfoodfacts_dataset:
74
+ image_stem_parts = image_path.stem.split("_")
75
+ barcode = image_stem_parts[0]
76
+ off_image_id = image_stem_parts[1]
77
+ image_id = f"{barcode}_{off_image_id}"
78
+ image_url = generate_image_url(
79
+ barcode, off_image_id, flavor=openfoodfacts_flavor
80
+ )
81
+ else:
82
+ image_id = image_path.stem
83
+ barcode = ""
84
+ off_image_id = ""
85
+ image_url = ""
86
+ image = Image.open(image_path)
87
+ image.load()
88
+
89
+ if image.mode != "RGB":
90
+ image = image.convert("RGB")
91
+
92
+ # Rotate image according to exif orientation using Pillow
93
+ ImageOps.exif_transpose(image, in_place=True)
94
+ sample = {
95
+ "image_id": image_id,
96
+ "image": image,
97
+ "width": image.width,
98
+ "height": image.height,
99
+ "meta": {
100
+ "barcode": barcode,
101
+ "off_image_id": off_image_id,
102
+ "image_url": image_url,
103
+ },
104
+ "category_id": label_id,
105
+ "category_name": label_name,
106
+ }
107
+ with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
108
+ pickle.dump(sample, f)
109
+
110
+ hf_ds = datasets.Dataset.from_generator(
111
+ functools.partial(_pickle_sample_generator, tmp_dir),
112
+ features=HF_DS_CLASSIFICATION_FEATURES,
113
+ )
114
+ hf_ds.push_to_hub(repo_id, split=split)
@@ -0,0 +1,42 @@
1
+ import pickle
2
+ from pathlib import Path
3
+
4
+ from openfoodfacts.types import Flavor
5
+
6
+ from labelr.types import TaskType
7
+
8
+
9
+ def _pickle_sample_generator(dir: Path):
10
+ """Generator that yields samples from pickles in a directory."""
11
+ for pkl in dir.glob("*.pkl"):
12
+ with open(pkl, "rb") as f:
13
+ yield pickle.load(f)
14
+
15
+
16
+ def export_from_ultralytics_to_hf(
17
+ task_type: TaskType,
18
+ dataset_dir: Path,
19
+ repo_id: str,
20
+ label_names: list[str],
21
+ merge_labels: bool = False,
22
+ is_openfoodfacts_dataset: bool = False,
23
+ openfoodfacts_flavor: Flavor = Flavor.off,
24
+ ) -> None:
25
+ from labelr.export.classification import (
26
+ export_from_ultralytics_to_hf_classification,
27
+ )
28
+
29
+ if task_type != TaskType.classification:
30
+ raise NotImplementedError(
31
+ "Only classification task is currently supported for Ultralytics to HF export"
32
+ )
33
+
34
+ if task_type == TaskType.classification:
35
+ export_from_ultralytics_to_hf_classification(
36
+ dataset_dir=dataset_dir,
37
+ repo_id=repo_id,
38
+ label_names=label_names,
39
+ merge_labels=merge_labels,
40
+ is_openfoodfacts_dataset=is_openfoodfacts_dataset,
41
+ openfoodfacts_flavor=openfoodfacts_flavor,
42
+ )
labelr/export/llm.py ADDED
@@ -0,0 +1,91 @@
1
+ import functools
2
+ import logging
3
+ import pickle
4
+ import tempfile
5
+ import typing
6
+ from collections.abc import Iterator
7
+ from pathlib import Path
8
+
9
+ import datasets
10
+ import tqdm
11
+ from PIL import Image, ImageOps
12
+
13
+ from labelr.export.common import _pickle_sample_generator
14
+ from labelr.sample.llm import (
15
+ HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
16
+ LLMImageExtractionSample,
17
+ )
18
+ from labelr.utils import PathWithContext
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ def export_to_hf_llm_image_extraction(
24
+ sample_iter: Iterator[LLMImageExtractionSample],
25
+ split: str,
26
+ repo_id: str,
27
+ revision: str = "main",
28
+ tmp_dir: Path | None = None,
29
+ image_max_size: int | None = None,
30
+ ) -> None:
31
+ """Export LLM image extraction samples to a Hugging Face dataset.
32
+
33
+ Args:
34
+ sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
35
+ to export.
36
+ split (str): Name of the dataset split (e.g., 'train', 'val').
37
+ repo_id (str): Hugging Face repository ID to push the dataset to.
38
+ revision (str): Revision (branch, tag or commit) to use for the
39
+ Hugging Face Datasets repository.
40
+ tmp_dir (Path | None): Temporary directory to use for intermediate
41
+ files. If None, a temporary directory will be created
42
+ automatically.
43
+ image_max_size (int | None): Maximum size (in pixels) for the images.
44
+ """
45
+ logger.info(
46
+ "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s, image_max_size: %s",
47
+ repo_id,
48
+ revision,
49
+ split,
50
+ tmp_dir,
51
+ image_max_size,
52
+ )
53
+
54
+ tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
55
+ if tmp_dir:
56
+ tmp_dir.mkdir(parents=True, exist_ok=True)
57
+ tmp_dir_with_context = PathWithContext(tmp_dir)
58
+ else:
59
+ tmp_dir_with_context = tempfile.TemporaryDirectory()
60
+
61
+ with tmp_dir_with_context as tmp_dir_str:
62
+ tmp_dir = Path(tmp_dir_str)
63
+ for sample in tqdm.tqdm(sample_iter, desc="samples"):
64
+ image = sample.image
65
+ # Rotate image according to exif orientation using Pillow
66
+ image = typing.cast(Image.Image, ImageOps.exif_transpose(image))
67
+
68
+ if image_max_size is not None:
69
+ if image.height > image_max_size or image.width > image_max_size:
70
+ image.thumbnail(
71
+ (image_max_size, image_max_size),
72
+ Image.Resampling.LANCZOS,
73
+ )
74
+ image_id = sample.image_id
75
+ json_sample = {
76
+ "image_id": image_id,
77
+ "image": image,
78
+ "meta": {
79
+ k: v for k, v in sample.meta.model_dump().items() if v is not None
80
+ },
81
+ "output": sample.output,
82
+ }
83
+ # Save output as pickle
84
+ with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
85
+ pickle.dump(json_sample, f)
86
+
87
+ hf_ds = datasets.Dataset.from_generator(
88
+ functools.partial(_pickle_sample_generator, tmp_dir),
89
+ features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
90
+ )
91
+ hf_ds.push_to_hub(repo_id, split=split, revision=revision)
@@ -1,38 +1,23 @@
1
1
  import functools
2
2
  import logging
3
3
  import pickle
4
- import random
5
4
  import tempfile
6
- from collections.abc import Iterator
7
5
  from pathlib import Path
8
6
 
9
7
  import datasets
10
8
  import tqdm
11
9
  from label_studio_sdk.client import LabelStudio
12
- from openfoodfacts.images import download_image, generate_image_url
13
- from openfoodfacts.types import Flavor
14
- from PIL import Image, ImageOps
10
+ from openfoodfacts.images import download_image
15
11
 
16
- from labelr.sample import (
17
- HF_DS_CLASSIFICATION_FEATURES,
18
- HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
12
+ from labelr.export.common import _pickle_sample_generator
13
+ from labelr.sample.object_detection import (
19
14
  HF_DS_OBJECT_DETECTION_FEATURES,
20
- LLMImageExtractionSample,
21
15
  format_object_detection_sample_to_hf,
22
16
  )
23
- from labelr.types import TaskType
24
- from labelr.utils import PathWithContext
25
17
 
26
18
  logger = logging.getLogger(__name__)
27
19
 
28
20
 
29
- def _pickle_sample_generator(dir: Path):
30
- """Generator that yields samples from pickles in a directory."""
31
- for pkl in dir.glob("*.pkl"):
32
- with open(pkl, "rb") as f:
33
- yield pickle.load(f)
34
-
35
-
36
21
  def export_from_ls_to_hf_object_detection(
37
22
  ls: LabelStudio,
38
23
  repo_id: str,
@@ -335,186 +320,3 @@ def export_from_hf_to_ultralytics_object_detection(
335
320
  f.write("names:\n")
336
321
  for i, category_name in enumerate(category_names):
337
322
  f.write(f" {i}: {category_name}\n")
338
-
339
-
340
- def export_from_ultralytics_to_hf(
341
- task_type: TaskType,
342
- dataset_dir: Path,
343
- repo_id: str,
344
- label_names: list[str],
345
- merge_labels: bool = False,
346
- is_openfoodfacts_dataset: bool = False,
347
- openfoodfacts_flavor: Flavor = Flavor.off,
348
- ) -> None:
349
- if task_type != TaskType.classification:
350
- raise NotImplementedError(
351
- "Only classification task is currently supported for Ultralytics to HF export"
352
- )
353
-
354
- if task_type == TaskType.classification:
355
- export_from_ultralytics_to_hf_classification(
356
- dataset_dir=dataset_dir,
357
- repo_id=repo_id,
358
- label_names=label_names,
359
- merge_labels=merge_labels,
360
- is_openfoodfacts_dataset=is_openfoodfacts_dataset,
361
- openfoodfacts_flavor=openfoodfacts_flavor,
362
- )
363
-
364
-
365
- def export_from_ultralytics_to_hf_classification(
366
- dataset_dir: Path,
367
- repo_id: str,
368
- label_names: list[str],
369
- merge_labels: bool = False,
370
- is_openfoodfacts_dataset: bool = False,
371
- openfoodfacts_flavor: Flavor = Flavor.off,
372
- ) -> None:
373
- """Export an Ultralytics classification dataset to a Hugging Face dataset.
374
-
375
- The Ultralytics dataset directory should contain 'train', 'val' and/or
376
- 'test' subdirectories, each containing subdirectories for each label.
377
-
378
- Args:
379
- dataset_dir (Path): Path to the Ultralytics dataset directory.
380
- repo_id (str): Hugging Face repository ID to push the dataset to.
381
- label_names (list[str]): List of label names.
382
- merge_labels (bool): Whether to merge all labels into a single label
383
- named 'object'.
384
- is_openfoodfacts_dataset (bool): Whether the dataset is from
385
- Open Food Facts. If True, the `off_image_id` and `image_url` will
386
- be generated automatically. `off_image_id` is extracted from the
387
- image filename.
388
- openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
389
- is ignored if `is_openfoodfacts_dataset` is False.
390
- """
391
- logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
392
-
393
- if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
394
- raise ValueError(
395
- f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
396
- )
397
-
398
- # Save output as pickle
399
- for split in ["train", "val", "test"]:
400
- split_dir = dataset_dir / split
401
-
402
- if not split_dir.is_dir():
403
- logger.info("Skipping missing split directory: %s", split_dir)
404
- continue
405
-
406
- with tempfile.TemporaryDirectory() as tmp_dir_str:
407
- tmp_dir = Path(tmp_dir_str)
408
- for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
409
- label_name = label_dir.name
410
- if merge_labels:
411
- label_name = "object"
412
- if label_name not in label_names:
413
- raise ValueError(
414
- "Label name %s not in provided label names (label names: %s)"
415
- % (label_name, label_names),
416
- )
417
- label_id = label_names.index(label_name)
418
-
419
- for image_path in label_dir.glob("*"):
420
- if is_openfoodfacts_dataset:
421
- image_stem_parts = image_path.stem.split("_")
422
- barcode = image_stem_parts[0]
423
- off_image_id = image_stem_parts[1]
424
- image_id = f"{barcode}_{off_image_id}"
425
- image_url = generate_image_url(
426
- barcode, off_image_id, flavor=openfoodfacts_flavor
427
- )
428
- else:
429
- image_id = image_path.stem
430
- barcode = ""
431
- off_image_id = ""
432
- image_url = ""
433
- image = Image.open(image_path)
434
- image.load()
435
-
436
- if image.mode != "RGB":
437
- image = image.convert("RGB")
438
-
439
- # Rotate image according to exif orientation using Pillow
440
- ImageOps.exif_transpose(image, in_place=True)
441
- sample = {
442
- "image_id": image_id,
443
- "image": image,
444
- "width": image.width,
445
- "height": image.height,
446
- "meta": {
447
- "barcode": barcode,
448
- "off_image_id": off_image_id,
449
- "image_url": image_url,
450
- },
451
- "category_id": label_id,
452
- "category_name": label_name,
453
- }
454
- with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
455
- pickle.dump(sample, f)
456
-
457
- hf_ds = datasets.Dataset.from_generator(
458
- functools.partial(_pickle_sample_generator, tmp_dir),
459
- features=HF_DS_CLASSIFICATION_FEATURES,
460
- )
461
- hf_ds.push_to_hub(repo_id, split=split)
462
-
463
-
464
- def export_to_hf_llm_image_extraction(
465
- sample_iter: Iterator[LLMImageExtractionSample],
466
- split: str,
467
- repo_id: str,
468
- revision: str = "main",
469
- tmp_dir: Path | None = None,
470
- ) -> None:
471
- """Export LLM image extraction samples to a Hugging Face dataset.
472
-
473
- Args:
474
- sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
475
- to export.
476
- split (str): Name of the dataset split (e.g., 'train', 'val').
477
- repo_id (str): Hugging Face repository ID to push the dataset to.
478
- revision (str): Revision (branch, tag or commit) to use for the
479
- Hugging Face Datasets repository.
480
- tmp_dir (Path | None): Temporary directory to use for intermediate
481
- files. If None, a temporary directory will be created
482
- automatically.
483
- """
484
- logger.info(
485
- "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s",
486
- repo_id,
487
- revision,
488
- split,
489
- tmp_dir,
490
- )
491
-
492
- tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
493
- if tmp_dir:
494
- tmp_dir.mkdir(parents=True, exist_ok=True)
495
- tmp_dir_with_context = PathWithContext(tmp_dir)
496
- else:
497
- tmp_dir_with_context = tempfile.TemporaryDirectory()
498
-
499
- with tmp_dir_with_context as tmp_dir_str:
500
- tmp_dir = Path(tmp_dir_str)
501
- for sample in tqdm.tqdm(sample_iter, desc="samples"):
502
- image = sample.image
503
- # Rotate image according to exif orientation using Pillow
504
- image = ImageOps.exif_transpose(image)
505
- image_id = sample.image_id
506
- sample = {
507
- "image_id": image_id,
508
- "image": image,
509
- "meta": sample.meta.model_dump(),
510
- "output": sample.output,
511
- }
512
- # Save output as pickle
513
- with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
514
- pickle.dump(sample, f)
515
-
516
- hf_ds = datasets.Dataset.from_generator(
517
- functools.partial(_pickle_sample_generator, tmp_dir),
518
- features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
519
- )
520
- hf_ds.push_to_hub(repo_id, split=split, revision=revision)
labelr/google_genai.py CHANGED
@@ -11,10 +11,11 @@ import orjson
11
11
  import typer
12
12
  from gcloud.aio.storage import Storage
13
13
  from openfoodfacts import Flavor
14
- from openfoodfacts.images import download_image, generate_image_url
14
+ from openfoodfacts.images import generate_image_url
15
15
  from tqdm.asyncio import tqdm
16
16
 
17
- from labelr.sample import LLMImageExtractionSample, SampleMeta
17
+ from labelr.sample.common import SampleMeta
18
+ from labelr.sample.llm import LLMImageExtractionSample
18
19
  from labelr.utils import download_image_from_gcs
19
20
 
20
21
  try:
@@ -335,6 +336,7 @@ def generate_sample_iter(
335
336
  """
336
337
  skipped = 0
337
338
  invalid = 0
339
+ storage_client = storage.Client()
338
340
  with prediction_path.open("r") as f_in:
339
341
  for i, sample_str in enumerate(f_in):
340
342
  if i < skip:
@@ -349,6 +351,7 @@ def generate_sample_iter(
349
351
  sample=sample,
350
352
  is_openfoodfacts_dataset=is_openfoodfacts_dataset,
351
353
  openfoodfacts_flavor=openfoodfacts_flavor,
354
+ storage_client=storage_client,
352
355
  )
353
356
  except Exception as e:
354
357
  if raise_on_invalid_sample:
@@ -370,6 +373,7 @@ def generate_sample_from_prediction(
370
373
  sample: JSONType,
371
374
  is_openfoodfacts_dataset: bool = False,
372
375
  openfoodfacts_flavor: Flavor = Flavor.off,
376
+ storage_client: storage.Client | None = None,
373
377
  ) -> LLMImageExtractionSample:
374
378
  """Generate a LLMImageExtractionSample from a prediction sample.
375
379
  Args:
@@ -378,13 +382,15 @@ def generate_sample_from_prediction(
378
382
  is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
379
383
  Facts.
380
384
  openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
385
+ storage_client (storage.Client | None): Optional Google Cloud Storage
386
+ client. If not provided, a new client will be created.
381
387
  Returns:
382
388
  LLMImageExtractionSample: Generated sample.
383
389
  """
384
390
  image_id = sample["key"][len("key:") :]
385
391
  response_str = sample["response"]["candidates"][0]["content"]["parts"][0]["text"]
386
392
  image_uri = sample["request"]["contents"][0]["parts"][1]["file_data"]["file_uri"]
387
- image = download_image_from_gcs(image_uri=image_uri)
393
+ image = download_image_from_gcs(image_uri=image_uri, client=storage_client)
388
394
  response = orjson.loads(response_str)
389
395
  jsonschema.validate(response, json_schema)
390
396
 
File without changes
@@ -0,0 +1,17 @@
1
+ import datasets
2
+
3
+ HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
4
+ {
5
+ "image_id": datasets.Value("string"),
6
+ "image": datasets.features.Image(),
7
+ "width": datasets.Value("int64"),
8
+ "height": datasets.Value("int64"),
9
+ "meta": {
10
+ "barcode": datasets.Value("string"),
11
+ "off_image_id": datasets.Value("string"),
12
+ "image_url": datasets.Value("string"),
13
+ },
14
+ "category_id": datasets.Value("int64"),
15
+ "category_name": datasets.Value("string"),
16
+ }
17
+ )
@@ -0,0 +1,14 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
+ class SampleMeta(BaseModel):
5
+ barcode: str | None = Field(
6
+ ..., description="The barcode of the product, if applicable"
7
+ )
8
+ off_image_id: str | None = Field(
9
+ ...,
10
+ description="The Open Food Facts image ID associated with the image, if applicable",
11
+ )
12
+ image_url: str | None = Field(
13
+ ..., description="The URL of the image, if applicable"
14
+ )
labelr/sample/llm.py ADDED
@@ -0,0 +1,75 @@
1
+ import typing
2
+ from collections.abc import Iterator
3
+ from pathlib import Path
4
+
5
+ import datasets
6
+ import orjson
7
+ from PIL import Image
8
+ from pydantic import BaseModel, Field
9
+
10
+ from labelr.sample.common import SampleMeta
11
+ from labelr.utils import download_image
12
+
13
+
14
+ class LLMImageExtractionSample(BaseModel):
15
+ class Config:
16
+ # required to allow PIL Image type
17
+ arbitrary_types_allowed = True
18
+
19
+ image_id: str = Field(
20
+ ...,
21
+ description="unique ID for the image. For Open Food Facts images, it follows the "
22
+ "format `barcode:imgid`",
23
+ )
24
+ image: Image.Image = Field(..., description="Image to extract information from")
25
+ output: str | None = Field(..., description="Expected response of the LLM")
26
+ meta: SampleMeta = Field(..., description="Metadata associated with the sample")
27
+
28
+
29
+ HF_DS_LLM_IMAGE_EXTRACTION_FEATURES = datasets.Features(
30
+ {
31
+ "image_id": datasets.Value("string"),
32
+ "image": datasets.features.Image(),
33
+ "output": datasets.features.Value("string"),
34
+ "meta": {
35
+ "barcode": datasets.Value("string"),
36
+ "off_image_id": datasets.Value("string"),
37
+ "image_url": datasets.Value("string"),
38
+ },
39
+ }
40
+ )
41
+
42
+
43
+ def load_llm_image_extraction_dataset_from_jsonl(
44
+ dataset_path: Path, **kwargs
45
+ ) -> Iterator[LLMImageExtractionSample]:
46
+ """Load a Hugging Face dataset for LLM image extraction from a JSONL file.
47
+
48
+ Args:
49
+ dataset_path (Path): Path to the JSONL dataset file.
50
+ **kwargs: Additional keyword arguments to pass to the image downloader.
51
+ Yields:
52
+ Iterator[LLMImageExtractionSample]: Iterator of LLM image extraction
53
+ samples.
54
+ """
55
+ with dataset_path.open("r") as f:
56
+ for line in f:
57
+ item = orjson.loads(line)
58
+ image_id = item["image_id"]
59
+ image_url = item["image_url"]
60
+ image = typing.cast(Image.Image, download_image(image_url, **kwargs))
61
+ barcode = item.pop("barcode", None)
62
+ off_image_id = item.pop("off_image_id", None)
63
+ output = item.pop("output", None)
64
+ meta = SampleMeta(
65
+ barcode=barcode,
66
+ off_image_id=off_image_id,
67
+ image_url=image_url,
68
+ )
69
+ sample = LLMImageExtractionSample(
70
+ image_id=image_id,
71
+ image=image,
72
+ output=output,
73
+ meta=meta,
74
+ )
75
+ yield sample
@@ -8,8 +8,7 @@ import PIL
8
8
  from openfoodfacts import Flavor
9
9
  from openfoodfacts.barcode import normalize_barcode
10
10
  from openfoodfacts.images import download_image, generate_image_url
11
- from PIL import Image, ImageOps
12
- from pydantic import BaseModel, Field
11
+ from PIL import ImageOps
13
12
 
14
13
  logger = logging.getLogger(__name__)
15
14
 
@@ -231,34 +230,6 @@ def format_object_detection_sample_to_hf(
231
230
  }
232
231
 
233
232
 
234
- class SampleMeta(BaseModel):
235
- barcode: str | None = Field(
236
- ..., description="The barcode of the product, if applicable"
237
- )
238
- off_image_id: str | None = Field(
239
- ...,
240
- description="The Open Food Facts image ID associated with the image, if applicable",
241
- )
242
- image_url: str | None = Field(
243
- ..., description="The URL of the image, if applicable"
244
- )
245
-
246
-
247
- class LLMImageExtractionSample(BaseModel):
248
- class Config:
249
- # required to allow PIL Image type
250
- arbitrary_types_allowed = True
251
-
252
- image_id: str = Field(
253
- ...,
254
- description="unique ID for the image. For Open Food Facts images, it follows the "
255
- "format `barcode:imgid`",
256
- )
257
- image: Image.Image = Field(..., description="Image to extract information from")
258
- output: str = Field(..., description="Expected response of the LLM")
259
- meta: SampleMeta = Field(..., description="Metadata associated with the sample")
260
-
261
-
262
233
  # The HuggingFace Dataset features
263
234
  HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
264
235
  {
@@ -278,33 +249,3 @@ HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
278
249
  },
279
250
  }
280
251
  )
281
-
282
-
283
- HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
284
- {
285
- "image_id": datasets.Value("string"),
286
- "image": datasets.features.Image(),
287
- "width": datasets.Value("int64"),
288
- "height": datasets.Value("int64"),
289
- "meta": {
290
- "barcode": datasets.Value("string"),
291
- "off_image_id": datasets.Value("string"),
292
- "image_url": datasets.Value("string"),
293
- },
294
- "category_id": datasets.Value("int64"),
295
- "category_name": datasets.Value("string"),
296
- }
297
- )
298
-
299
- HF_DS_LLM_IMAGE_EXTRACTION_FEATURES = datasets.Features(
300
- {
301
- "image_id": datasets.Value("string"),
302
- "image": datasets.features.Image(),
303
- "output": datasets.features.Value("string"),
304
- "meta": {
305
- "barcode": datasets.Value("string"),
306
- "off_image_id": datasets.Value("string"),
307
- "image_url": datasets.Value("string"),
308
- },
309
- }
310
- )
labelr/utils.py CHANGED
@@ -2,6 +2,8 @@ import io
2
2
  from pathlib import Path
3
3
 
4
4
  from google.cloud import storage
5
+ from openfoodfacts.images import download_image as _download_image
6
+ from openfoodfacts.utils import ImageDownloadItem
5
7
  from PIL import Image
6
8
 
7
9
 
@@ -20,15 +22,63 @@ def parse_hf_repo_id(hf_repo_id: str) -> tuple[str, str]:
20
22
  return hf_repo_id, revision
21
23
 
22
24
 
23
- def download_image_from_gcs(image_uri: str) -> Image.Image:
25
+ def download_image(
26
+ image: str | tuple[str, str],
27
+ *,
28
+ error_raise: bool = True,
29
+ return_struct: bool = False,
30
+ **kwargs,
31
+ ) -> Image.Image | ImageDownloadItem | None:
32
+ """Download an image from a URL or GCS URI and return it as a PIL Image.
33
+ Args:
34
+ image (str | tuple[str, str]): The URL or GCS URI of the image.
35
+ error_raise (bool): Whether to raise an error if the image cannot be
36
+ downloaded.
37
+ return_struct (bool): Whether to return an ImageDownloadItem struct
38
+ instead of a PIL Image.
39
+ **kwargs: Additional arguments to pass to the download function.
40
+ Returns:
41
+ Image.Image | ImageDownloadItem: The downloaded image as a PIL Image
42
+ or an ImageDownloadItem struct.
43
+ """
44
+ if isinstance(image, str) and image.startswith("gs://"):
45
+ return download_image_from_gcs(image, return_struct=return_struct, **kwargs)
46
+ return _download_image(
47
+ image,
48
+ error_raise=error_raise,
49
+ return_struct=return_struct,
50
+ **kwargs,
51
+ )
52
+
53
+
54
+ def download_image_from_gcs(
55
+ image_uri: str, client: storage.Client | None = None, return_struct: bool = False
56
+ ) -> Image.Image | ImageDownloadItem:
24
57
  """Download an image from a Google Cloud Storage URI and return it as a
25
- PIL Image."""
26
- storage_client = storage.Client()
58
+ PIL Image.
59
+
60
+ Args:
61
+ image_uri (str): The GCS URI of the image
62
+ (e.g., gs://bucket_name/path/to/image.jpg).
63
+ client (storage.Client | None): An optional Google Cloud Storage
64
+ client. If not provided, a new client will be created.
65
+ """
66
+ if client is None:
67
+ client = storage.Client()
68
+
27
69
  bucket_name, blob_name = image_uri.replace("gs://", "").split("/", 1)
28
- bucket = storage_client.bucket(bucket_name)
70
+ bucket = client.bucket(bucket_name)
29
71
  blob = bucket.blob(blob_name)
30
72
  image_data = blob.download_as_bytes()
31
- return Image.open(io.BytesIO(image_data))
73
+ pil_image = Image.open(io.BytesIO(image_data))
74
+
75
+ if return_struct:
76
+ return ImageDownloadItem(
77
+ url=image_uri,
78
+ image=pil_image,
79
+ error=None,
80
+ )
81
+ return pil_image
32
82
 
33
83
 
34
84
  class PathWithContext:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: labelr
3
- Version: 0.9.0
3
+ Version: 0.10.0
4
4
  Summary: A command-line tool to manage labeling tasks with Label Studio.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -18,14 +18,13 @@ Requires-Dist: rapidfuzz>=3.14.3
18
18
  Requires-Dist: aiohttp
19
19
  Requires-Dist: aiofiles
20
20
  Requires-Dist: orjson
21
+ Requires-Dist: google-cloud-storage
22
+ Requires-Dist: gcloud-aio-storage
23
+ Requires-Dist: google-genai>=1.56.0
21
24
  Provides-Extra: ultralytics
22
25
  Requires-Dist: ultralytics==8.3.223; extra == "ultralytics"
23
26
  Provides-Extra: fiftyone
24
27
  Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
25
- Provides-Extra: google
26
- Requires-Dist: google-genai>=1.56.0; extra == "google"
27
- Requires-Dist: gcloud-aio-storage; extra == "google"
28
- Requires-Dist: google-cloud-storage; extra == "google"
29
28
  Dynamic: license-file
30
29
 
31
30
  # Labelr
@@ -0,0 +1,36 @@
1
+ labelr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ labelr/__main__.py,sha256=G4e95-IfhI-lOmkOBP6kQ8wl1x_Fl7dZlLOYr90K83c,66
3
+ labelr/annotate.py,sha256=3fJ9FYbcozcOoKuhNtzPHV8sSnp-45FsNnMc8UeBHGU,3503
4
+ labelr/check.py,sha256=3wK6mE0UsKvoBNm0_lyWhCMq7gxkv5r50pvO70damXY,2476
5
+ labelr/config.py,sha256=3RXF_NdkSuHvfVMGMlYmjlw45fU77zQkLX7gmZq7NxM,64
6
+ labelr/dataset_features.py,sha256=ZC9QAUw9oKHqyUPla2h3xQFaRT9sHq8hkPNN4RDDwmo,1257
7
+ labelr/google_genai.py,sha256=x5p98eYoI887QMBDgziFxEW9WNdZ8Cw0EHjAFQ71SaE,14728
8
+ labelr/main.py,sha256=OTiJSkD_TrzQmQQm291FhknD-HQQTWfBEBgImxqL0KM,2634
9
+ labelr/project_config.py,sha256=CIHEcgSOfXb53naHWEBkTDm2V9m3abAu8C54VSzHjAs,1260
10
+ labelr/types.py,sha256=8CHfLyifF_N94OYDhG-7IcWboOh9o0Z_0LBtQapT8TQ,313
11
+ labelr/utils.py,sha256=8Yp0L2MCIdUYSjvmF4U5iiaBpaZJbYw4rHJOMhCCudE,3075
12
+ labelr/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ labelr/apps/datasets.py,sha256=tAD6TZSnwh7uhkleSfDP0PFqztXC1S3Vx2aMSVCFfRU,12725
14
+ labelr/apps/evaluate.py,sha256=UC4CuSKa4vgR5xTBZ-dFgp_1pYnkM55s2IJgix0YtkI,1157
15
+ labelr/apps/google_batch.py,sha256=Mlz5jRVcR1XzRJg2HLte3rIhiOk4xQQjjLAJsc3lJjo,9572
16
+ labelr/apps/hugging_face.py,sha256=B0GaDZeUZj2A7nEeC1OtCANb0DqvBkhWwFWM_9Nm2kU,1608
17
+ labelr/apps/label_studio.py,sha256=lQ7K16noA4Mnr1hc0oxya1sgGgABWnpIIJTM5ENp7so,16869
18
+ labelr/apps/train.py,sha256=wmOSpO9JsrwCXYMgRg2srMbV5B5TvnlfhAKPqUt6wSg,7328
19
+ labelr/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ labelr/evaluate/object_detection.py,sha256=QJIwrDY-Vsy0-It6tZSkN3qgAlmIu2W1-kGdmibiPSQ,3349
21
+ labelr/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ labelr/export/classification.py,sha256=rnm99vGMJy1UkdXiZ8t_TgFe3CyLBBYowWwzaZeniIs,4699
23
+ labelr/export/common.py,sha256=lJ-ZDOMKGpC48fCuEnIrA8sZBhXGZOcghBbsLM1h66o,1252
24
+ labelr/export/llm.py,sha256=Jlopi0EQ4YUWLe_s-kTFcISTzO1QmdX-qXQxayO6E-k,3186
25
+ labelr/export/object_detection.py,sha256=91ywkPago7WgbY2COQKpwjFLYAAsXeGOu7TkGHi17OU,12338
26
+ labelr/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
+ labelr/sample/classification.py,sha256=7Z5hvxG6q6wfJMYj00JWbRBhfjOyhjaL8fpJjgBi9N8,539
28
+ labelr/sample/common.py,sha256=f0XDS6s0z6Vw4G2FDELJ1VQSe5Tsh0q3-3VU9unK9eY,431
29
+ labelr/sample/llm.py,sha256=zAsI3TmfGCbBPv4_hNtYR4Np3yAmUDzXGAvlQLF6V6w,2474
30
+ labelr/sample/object_detection.py,sha256=XZasR_k4AxzsiWdVMC2ZnyjfA14PKJPrx1U-XPr5tWQ,8427
31
+ labelr-0.10.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
32
+ labelr-0.10.0.dist-info/METADATA,sha256=pS2Ipq-aICU3TluuqSNocGP5-V8ztLk6X_udwwnECPk,7243
33
+ labelr-0.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
34
+ labelr-0.10.0.dist-info/entry_points.txt,sha256=OACukVeR_2z54i8yQuWqqk_jdEHlyTwmTFOFBmxPp1k,43
35
+ labelr-0.10.0.dist-info/top_level.txt,sha256=bjZo50aGZhXIcZYpYOX4sdAQcamxh8nwfEh7A9RD_Ag,7
36
+ labelr-0.10.0.dist-info/RECORD,,
@@ -1,28 +0,0 @@
1
- labelr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- labelr/__main__.py,sha256=G4e95-IfhI-lOmkOBP6kQ8wl1x_Fl7dZlLOYr90K83c,66
3
- labelr/annotate.py,sha256=3fJ9FYbcozcOoKuhNtzPHV8sSnp-45FsNnMc8UeBHGU,3503
4
- labelr/check.py,sha256=3wK6mE0UsKvoBNm0_lyWhCMq7gxkv5r50pvO70damXY,2476
5
- labelr/config.py,sha256=3RXF_NdkSuHvfVMGMlYmjlw45fU77zQkLX7gmZq7NxM,64
6
- labelr/dataset_features.py,sha256=ZC9QAUw9oKHqyUPla2h3xQFaRT9sHq8hkPNN4RDDwmo,1257
7
- labelr/export.py,sha256=aPfQ-RaK3C2WJrzbETYdC9kRe0MTpCRs0nu5l2SqiRg,20092
8
- labelr/google_genai.py,sha256=vn_UNQOxUDOTTTWz-emAVErjOtQmnlxM_m8yo2q01Ok,14401
9
- labelr/main.py,sha256=OTiJSkD_TrzQmQQm291FhknD-HQQTWfBEBgImxqL0KM,2634
10
- labelr/project_config.py,sha256=CIHEcgSOfXb53naHWEBkTDm2V9m3abAu8C54VSzHjAs,1260
11
- labelr/sample.py,sha256=VL-iKDvLaIeViJ0TaBY9uCbv0ey528fkaRTYE-Zr12I,10347
12
- labelr/types.py,sha256=8CHfLyifF_N94OYDhG-7IcWboOh9o0Z_0LBtQapT8TQ,313
13
- labelr/utils.py,sha256=-zLOWLbvLwtNFtzzwZ6RjJD9GstoYR-gt4wz9r6u9lE,1363
14
- labelr/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- labelr/apps/datasets.py,sha256=kJQWwm3mjA2uWIA8O_DslM7OS5ht5mgWqcFC_zF4gCo,11187
16
- labelr/apps/evaluate.py,sha256=UC4CuSKa4vgR5xTBZ-dFgp_1pYnkM55s2IJgix0YtkI,1157
17
- labelr/apps/google_batch.py,sha256=BMcfBkDwfu-zOOR80bYmtEy6k_Qc70m7K7wmp4Ww0r8,9335
18
- labelr/apps/hugging_face.py,sha256=B0GaDZeUZj2A7nEeC1OtCANb0DqvBkhWwFWM_9Nm2kU,1608
19
- labelr/apps/label_studio.py,sha256=su9shoi0K9PmI8RBLipV2KQf_MRjkF5vy5-JUcbXr5A,16852
20
- labelr/apps/train.py,sha256=wmOSpO9JsrwCXYMgRg2srMbV5B5TvnlfhAKPqUt6wSg,7328
21
- labelr/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- labelr/evaluate/object_detection.py,sha256=QJIwrDY-Vsy0-It6tZSkN3qgAlmIu2W1-kGdmibiPSQ,3349
23
- labelr-0.9.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
24
- labelr-0.9.0.dist-info/METADATA,sha256=cNkf4LPmbO_k3UuR7O7NtcCwRF-Z5c-yIyQRAocsjww,7322
25
- labelr-0.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
26
- labelr-0.9.0.dist-info/entry_points.txt,sha256=OACukVeR_2z54i8yQuWqqk_jdEHlyTwmTFOFBmxPp1k,43
27
- labelr-0.9.0.dist-info/top_level.txt,sha256=bjZo50aGZhXIcZYpYOX4sdAQcamxh8nwfEh7A9RD_Ag,7
28
- labelr-0.9.0.dist-info/RECORD,,