labelr-0.9.0-py3-none-any.whl → labelr-0.10.0-py3-none-any.whl
- labelr/apps/datasets.py +56 -5
- labelr/apps/google_batch.py +8 -1
- labelr/apps/label_studio.py +1 -1
- labelr/export/__init__.py +0 -0
- labelr/export/classification.py +114 -0
- labelr/export/common.py +42 -0
- labelr/export/llm.py +91 -0
- labelr/{export.py → export/object_detection.py} +3 -201
- labelr/google_genai.py +9 -3
- labelr/sample/__init__.py +0 -0
- labelr/sample/classification.py +17 -0
- labelr/sample/common.py +14 -0
- labelr/sample/llm.py +75 -0
- labelr/{sample.py → sample/object_detection.py} +1 -60
- labelr/utils.py +55 -5
- {labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/METADATA +4 -5
- labelr-0.10.0.dist-info/RECORD +36 -0
- labelr-0.9.0.dist-info/RECORD +0 -28
- {labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/WHEEL +0 -0
- {labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/entry_points.txt +0 -0
- {labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/top_level.txt +0 -0
labelr/apps/datasets.py
CHANGED
@@ -12,7 +12,11 @@ import typer
 from openfoodfacts import Flavor
 from openfoodfacts.utils import get_logger
 
-from labelr.export import export_from_ultralytics_to_hf
+from labelr.export.common import export_from_ultralytics_to_hf
+from labelr.export.object_detection import (
+    export_from_ls_to_hf_object_detection,
+    export_from_ls_to_ultralytics_object_detection,
+)
 
 from ..config import LABEL_STUDIO_DEFAULT_URL
 from ..types import ExportDestination, ExportSource, TaskType
@@ -99,7 +103,9 @@ def convert_object_detection_dataset(
     Studio format, and save it to a JSON file."""
     from datasets import load_dataset
 
-    from labelr.sample import
+    from labelr.sample.object_detection import (
+        format_object_detection_sample_from_hf_to_ls,
+    )
 
     logger.info("Loading dataset: %s", repo_id)
     ds = load_dataset(repo_id)
@@ -207,10 +213,8 @@ def export(
     local files (ultralytics format)."""
     from label_studio_sdk.client import LabelStudio
 
-    from labelr.export import (
+    from labelr.export.object_detection import (
         export_from_hf_to_ultralytics_object_detection,
-        export_from_ls_to_hf_object_detection,
-        export_from_ls_to_ultralytics_object_detection,
     )
 
     if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
@@ -303,3 +307,50 @@ def export(
         is_openfoodfacts_dataset=is_openfoodfacts_dataset,
         openfoodfacts_flavor=openfoodfacts_flavor,
     )
+
+
+@app.command()
+def export_llm_ds(
+    dataset_path: Annotated[
+        Path, typer.Option(..., help="Path to the JSONL dataset file")
+    ],
+    repo_id: Annotated[
+        str, typer.Option(..., help="Hugging Face Datasets repository ID to export to")
+    ],
+    split: Annotated[str, typer.Option(..., help="Dataset split to export")],
+    revision: Annotated[
+        str,
+        typer.Option(
+            help="Revision (branch, tag or commit) for the Hugging Face Datasets repository."
+        ),
+    ] = "main",
+    tmp_dir: Annotated[
+        Path | None,
+        typer.Option(
+            help="Path to a temporary directory to use for image processing",
+        ),
+    ] = None,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
+        ),
+    ] = None,
+):
+    """Export LLM image extraction dataset with images only to Hugging Face
+    Datasets.
+    """
+    from labelr.export.llm import export_to_hf_llm_image_extraction
+    from labelr.sample.llm import load_llm_image_extraction_dataset_from_jsonl
+
+    sample_iter = load_llm_image_extraction_dataset_from_jsonl(
+        dataset_path=dataset_path
+    )
+    export_to_hf_llm_image_extraction(
+        sample_iter,
+        split=split,
+        repo_id=repo_id,
+        revision=revision,
+        tmp_dir=tmp_dir,
+        image_max_size=image_max_size,
+    )
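Note: the new export_llm_ds command is a thin CLI wrapper over the two functions it imports. A minimal sketch of the equivalent Python calls, with a hypothetical JSONL path and repository ID:

    from pathlib import Path

    from labelr.export.llm import export_to_hf_llm_image_extraction
    from labelr.sample.llm import load_llm_image_extraction_dataset_from_jsonl

    # Lazily yields LLMImageExtractionSample objects, one per JSONL line
    sample_iter = load_llm_image_extraction_dataset_from_jsonl(
        dataset_path=Path("dataset.jsonl")  # hypothetical input file
    )
    export_to_hf_llm_image_extraction(
        sample_iter,
        split="train",
        repo_id="my-org/my-dataset",  # hypothetical Hub repository
        image_max_size=1024,          # cap the longest image side at 1024 px
    )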
labelr/apps/google_batch.py
CHANGED
@@ -239,6 +239,12 @@ def upload_training_dataset_from_predictions(
             help="Whether to raise an error on invalid samples instead of skipping them",
         ),
     ] = False,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
+        ),
+    ] = None,
 ):
     """Upload a training dataset to a Hugging Face Datasets repository from a
     Gemini batch prediction file."""
@@ -247,7 +253,7 @@ def upload_training_dataset_from_predictions(
     import orjson
     from huggingface_hub import HfApi
 
-    from labelr.export import export_to_hf_llm_image_extraction
+    from labelr.export.llm import export_to_hf_llm_image_extraction
     from labelr.google_genai import generate_sample_iter
 
     instructions = instructions_path.read_text()
@@ -286,4 +292,5 @@ def upload_training_dataset_from_predictions(
         repo_id=repo_id,
         revision=revision,
         tmp_dir=tmp_dir,
+        image_max_size=image_max_size,
     )
labelr/apps/label_studio.py
CHANGED
@@ -398,7 +398,7 @@ def create_dataset_file(
     from openfoodfacts.images import extract_barcode_from_url, extract_source_from_url
     from openfoodfacts.utils import get_image_from_url
 
-    from labelr.sample import format_object_detection_sample_to_ls
+    from labelr.sample.object_detection import format_object_detection_sample_to_ls
 
     logger.info("Loading dataset: %s", input_file)
 
labelr/export/__init__.py
ADDED
File without changes (new empty file)
labelr/export/classification.py
ADDED
@@ -0,0 +1,114 @@
+import functools
+import logging
+import pickle
+import tempfile
+from pathlib import Path
+
+import datasets
+from openfoodfacts.images import generate_image_url
+from openfoodfacts.types import Flavor
+from PIL import Image, ImageOps
+
+from labelr.export.common import _pickle_sample_generator
+from labelr.sample.classification import HF_DS_CLASSIFICATION_FEATURES
+
+logger = logging.getLogger(__name__)
+
+
+def export_from_ultralytics_to_hf_classification(
+    dataset_dir: Path,
+    repo_id: str,
+    label_names: list[str],
+    merge_labels: bool = False,
+    is_openfoodfacts_dataset: bool = False,
+    openfoodfacts_flavor: Flavor = Flavor.off,
+) -> None:
+    """Export an Ultralytics classification dataset to a Hugging Face dataset.
+
+    The Ultralytics dataset directory should contain 'train', 'val' and/or
+    'test' subdirectories, each containing subdirectories for each label.
+
+    Args:
+        dataset_dir (Path): Path to the Ultralytics dataset directory.
+        repo_id (str): Hugging Face repository ID to push the dataset to.
+        label_names (list[str]): List of label names.
+        merge_labels (bool): Whether to merge all labels into a single label
+            named 'object'.
+        is_openfoodfacts_dataset (bool): Whether the dataset is from
+            Open Food Facts. If True, the `off_image_id` and `image_url` will
+            be generated automatically. `off_image_id` is extracted from the
+            image filename.
+        openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
+            is ignored if `is_openfoodfacts_dataset` is False.
+    """
+    logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
+
+    if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
+        raise ValueError(
+            f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
+        )
+
+    # Save output as pickle
+    for split in ["train", "val", "test"]:
+        split_dir = dataset_dir / split
+
+        if not split_dir.is_dir():
+            logger.info("Skipping missing split directory: %s", split_dir)
+            continue
+
+        with tempfile.TemporaryDirectory() as tmp_dir_str:
+            tmp_dir = Path(tmp_dir_str)
+            for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
+                label_name = label_dir.name
+                if merge_labels:
+                    label_name = "object"
+                if label_name not in label_names:
+                    raise ValueError(
+                        "Label name %s not in provided label names (label names: %s)"
+                        % (label_name, label_names),
+                    )
+                label_id = label_names.index(label_name)
+
+                for image_path in label_dir.glob("*"):
+                    if is_openfoodfacts_dataset:
+                        image_stem_parts = image_path.stem.split("_")
+                        barcode = image_stem_parts[0]
+                        off_image_id = image_stem_parts[1]
+                        image_id = f"{barcode}_{off_image_id}"
+                        image_url = generate_image_url(
+                            barcode, off_image_id, flavor=openfoodfacts_flavor
+                        )
+                    else:
+                        image_id = image_path.stem
+                        barcode = ""
+                        off_image_id = ""
+                        image_url = ""
+                    image = Image.open(image_path)
+                    image.load()
+
+                    if image.mode != "RGB":
+                        image = image.convert("RGB")
+
+                    # Rotate image according to exif orientation using Pillow
+                    ImageOps.exif_transpose(image, in_place=True)
+                    sample = {
+                        "image_id": image_id,
+                        "image": image,
+                        "width": image.width,
+                        "height": image.height,
+                        "meta": {
+                            "barcode": barcode,
+                            "off_image_id": off_image_id,
+                            "image_url": image_url,
+                        },
+                        "category_id": label_id,
+                        "category_name": label_name,
+                    }
+                    with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                        pickle.dump(sample, f)
+
+            hf_ds = datasets.Dataset.from_generator(
+                functools.partial(_pickle_sample_generator, tmp_dir),
+                features=HF_DS_CLASSIFICATION_FEATURES,
+            )
+            hf_ds.push_to_hub(repo_id, split=split)
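Note: a sketch of the directory layout export_from_ultralytics_to_hf_classification expects and a hypothetical invocation (the path, repo ID and label names are illustrative):

    from pathlib import Path

    from labelr.export.classification import (
        export_from_ultralytics_to_hf_classification,
    )

    # Expected Ultralytics classification layout (any subset of the splits):
    #   dataset/
    #     train/<label_name>/<images>
    #     val/<label_name>/<images>
    #     test/<label_name>/<images>
    export_from_ultralytics_to_hf_classification(
        dataset_dir=Path("dataset"),         # hypothetical dataset root
        repo_id="my-org/my-classification",  # hypothetical Hub repository
        # position in this list becomes category_id via label_names.index()
        label_names=["nutrition-table", "other"],
    )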
labelr/export/common.py
ADDED
@@ -0,0 +1,42 @@
+import pickle
+from pathlib import Path
+
+from openfoodfacts.types import Flavor
+
+from labelr.types import TaskType
+
+
+def _pickle_sample_generator(dir: Path):
+    """Generator that yields samples from pickles in a directory."""
+    for pkl in dir.glob("*.pkl"):
+        with open(pkl, "rb") as f:
+            yield pickle.load(f)
+
+
+def export_from_ultralytics_to_hf(
+    task_type: TaskType,
+    dataset_dir: Path,
+    repo_id: str,
+    label_names: list[str],
+    merge_labels: bool = False,
+    is_openfoodfacts_dataset: bool = False,
+    openfoodfacts_flavor: Flavor = Flavor.off,
+) -> None:
+    from labelr.export.classification import (
+        export_from_ultralytics_to_hf_classification,
+    )
+
+    if task_type != TaskType.classification:
+        raise NotImplementedError(
+            "Only classification task is currently supported for Ultralytics to HF export"
+        )
+
+    if task_type == TaskType.classification:
+        export_from_ultralytics_to_hf_classification(
+            dataset_dir=dataset_dir,
+            repo_id=repo_id,
+            label_names=label_names,
+            merge_labels=merge_labels,
+            is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+            openfoodfacts_flavor=openfoodfacts_flavor,
+        )
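Note: _pickle_sample_generator is the read side of the spill-to-disk pattern every exporter in this release uses: each sample (PIL image included) is pickled to its own file, then streamed back through datasets.Dataset.from_generator so a whole split never has to sit in memory at once. A self-contained sketch with toy payloads:

    import functools
    import pickle
    import tempfile
    from pathlib import Path

    import datasets

    from labelr.export.common import _pickle_sample_generator

    with tempfile.TemporaryDirectory() as tmp_dir_str:
        tmp_dir = Path(tmp_dir_str)
        # Write side: one pickle per sample (toy samples for illustration)
        for i in range(3):
            with open(tmp_dir / f"train_{i}.pkl", "wb") as f:
                pickle.dump({"image_id": str(i)}, f)
        # Read side: samples are re-yielded lazily, never all at once
        ds = datasets.Dataset.from_generator(
            functools.partial(_pickle_sample_generator, tmp_dir)
        )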
labelr/export/llm.py
ADDED
@@ -0,0 +1,91 @@
+import functools
+import logging
+import pickle
+import tempfile
+import typing
+from collections.abc import Iterator
+from pathlib import Path
+
+import datasets
+import tqdm
+from PIL import Image, ImageOps
+
+from labelr.export.common import _pickle_sample_generator
+from labelr.sample.llm import (
+    HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+    LLMImageExtractionSample,
+)
+from labelr.utils import PathWithContext
+
+logger = logging.getLogger(__name__)
+
+
+def export_to_hf_llm_image_extraction(
+    sample_iter: Iterator[LLMImageExtractionSample],
+    split: str,
+    repo_id: str,
+    revision: str = "main",
+    tmp_dir: Path | None = None,
+    image_max_size: int | None = None,
+) -> None:
+    """Export LLM image extraction samples to a Hugging Face dataset.
+
+    Args:
+        sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
+            to export.
+        split (str): Name of the dataset split (e.g., 'train', 'val').
+        repo_id (str): Hugging Face repository ID to push the dataset to.
+        revision (str): Revision (branch, tag or commit) to use for the
+            Hugging Face Datasets repository.
+        tmp_dir (Path | None): Temporary directory to use for intermediate
+            files. If None, a temporary directory will be created
+            automatically.
+        image_max_size (int | None): Maximum size (in pixels) for the images.
+    """
+    logger.info(
+        "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s, image_max_size: %s",
+        repo_id,
+        revision,
+        split,
+        tmp_dir,
+        image_max_size,
+    )
+
+    tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
+    if tmp_dir:
+        tmp_dir.mkdir(parents=True, exist_ok=True)
+        tmp_dir_with_context = PathWithContext(tmp_dir)
+    else:
+        tmp_dir_with_context = tempfile.TemporaryDirectory()
+
+    with tmp_dir_with_context as tmp_dir_str:
+        tmp_dir = Path(tmp_dir_str)
+        for sample in tqdm.tqdm(sample_iter, desc="samples"):
+            image = sample.image
+            # Rotate image according to exif orientation using Pillow
+            image = typing.cast(Image.Image, ImageOps.exif_transpose(image))
+
+            if image_max_size is not None:
+                if image.height > image_max_size or image.width > image_max_size:
+                    image.thumbnail(
+                        (image_max_size, image_max_size),
+                        Image.Resampling.LANCZOS,
+                    )
+            image_id = sample.image_id
+            json_sample = {
+                "image_id": image_id,
+                "image": image,
+                "meta": {
+                    k: v for k, v in sample.meta.model_dump().items() if v is not None
+                },
+                "output": sample.output,
+            }
+            # Save output as pickle
+            with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                pickle.dump(json_sample, f)
+
+        hf_ds = datasets.Dataset.from_generator(
+            functools.partial(_pickle_sample_generator, tmp_dir),
+            features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+        )
+        hf_ds.push_to_hub(repo_id, split=split, revision=revision)
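Note: Image.thumbnail resizes in place and preserves the aspect ratio, so image_max_size caps the longest side rather than forcing a square. A minimal illustration with a toy image:

    from PIL import Image

    image = Image.new("RGB", (4000, 2000))  # toy 2:1 image
    # Downscales only (never upscales); the result here is 1024x512
    image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
    assert image.size == (1024, 512)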
labelr/{export.py → export/object_detection.py}
RENAMED
@@ -1,38 +1,23 @@
 import functools
 import logging
 import pickle
-import random
 import tempfile
-from collections.abc import Iterator
 from pathlib import Path
 
 import datasets
 import tqdm
 from label_studio_sdk.client import LabelStudio
-from openfoodfacts.images import download_image
-from openfoodfacts.types import Flavor
-from PIL import Image, ImageOps
+from openfoodfacts.images import download_image
 
-from labelr.
-
-    HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+from labelr.export.common import _pickle_sample_generator
+from labelr.sample.object_detection import (
     HF_DS_OBJECT_DETECTION_FEATURES,
-    LLMImageExtractionSample,
     format_object_detection_sample_to_hf,
 )
-from labelr.types import TaskType
-from labelr.utils import PathWithContext
 
 logger = logging.getLogger(__name__)
 
 
-def _pickle_sample_generator(dir: Path):
-    """Generator that yields samples from pickles in a directory."""
-    for pkl in dir.glob("*.pkl"):
-        with open(pkl, "rb") as f:
-            yield pickle.load(f)
-
-
 def export_from_ls_to_hf_object_detection(
     ls: LabelStudio,
     repo_id: str,
@@ -335,186 +320,3 @@ def export_from_hf_to_ultralytics_object_detection(
         f.write("names:\n")
         for i, category_name in enumerate(category_names):
             f.write(f" {i}: {category_name}\n")
-
-
-def export_from_ultralytics_to_hf(
-    task_type: TaskType,
-    dataset_dir: Path,
-    repo_id: str,
-    label_names: list[str],
-    merge_labels: bool = False,
-    is_openfoodfacts_dataset: bool = False,
-    openfoodfacts_flavor: Flavor = Flavor.off,
-) -> None:
-    if task_type != TaskType.classification:
-        raise NotImplementedError(
-            "Only classification task is currently supported for Ultralytics to HF export"
-        )
-
-    if task_type == TaskType.classification:
-        export_from_ultralytics_to_hf_classification(
-            dataset_dir=dataset_dir,
-            repo_id=repo_id,
-            label_names=label_names,
-            merge_labels=merge_labels,
-            is_openfoodfacts_dataset=is_openfoodfacts_dataset,
-            openfoodfacts_flavor=openfoodfacts_flavor,
-        )
-
-
-def export_from_ultralytics_to_hf_classification(
-    dataset_dir: Path,
-    repo_id: str,
-    label_names: list[str],
-    merge_labels: bool = False,
-    is_openfoodfacts_dataset: bool = False,
-    openfoodfacts_flavor: Flavor = Flavor.off,
-) -> None:
-    """Export an Ultralytics classification dataset to a Hugging Face dataset.
-
-    The Ultralytics dataset directory should contain 'train', 'val' and/or
-    'test' subdirectories, each containing subdirectories for each label.
-
-    Args:
-        dataset_dir (Path): Path to the Ultralytics dataset directory.
-        repo_id (str): Hugging Face repository ID to push the dataset to.
-        label_names (list[str]): List of label names.
-        merge_labels (bool): Whether to merge all labels into a single label
-            named 'object'.
-        is_openfoodfacts_dataset (bool): Whether the dataset is from
-            Open Food Facts. If True, the `off_image_id` and `image_url` will
-            be generated automatically. `off_image_id` is extracted from the
-            image filename.
-        openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
-            is ignored if `is_openfoodfacts_dataset` is False.
-    """
-    logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
-
-    if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
-        raise ValueError(
-            f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
-        )
-
-    # Save output as pickle
-    for split in ["train", "val", "test"]:
-        split_dir = dataset_dir / split
-
-        if not split_dir.is_dir():
-            logger.info("Skipping missing split directory: %s", split_dir)
-            continue
-
-        with tempfile.TemporaryDirectory() as tmp_dir_str:
-            tmp_dir = Path(tmp_dir_str)
-            for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
-                label_name = label_dir.name
-                if merge_labels:
-                    label_name = "object"
-                if label_name not in label_names:
-                    raise ValueError(
-                        "Label name %s not in provided label names (label names: %s)"
-                        % (label_name, label_names),
-                    )
-                label_id = label_names.index(label_name)
-
-                for image_path in label_dir.glob("*"):
-                    if is_openfoodfacts_dataset:
-                        image_stem_parts = image_path.stem.split("_")
-                        barcode = image_stem_parts[0]
-                        off_image_id = image_stem_parts[1]
-                        image_id = f"{barcode}_{off_image_id}"
-                        image_url = generate_image_url(
-                            barcode, off_image_id, flavor=openfoodfacts_flavor
-                        )
-                    else:
-                        image_id = image_path.stem
-                        barcode = ""
-                        off_image_id = ""
-                        image_url = ""
-                    image = Image.open(image_path)
-                    image.load()
-
-                    if image.mode != "RGB":
-                        image = image.convert("RGB")
-
-                    # Rotate image according to exif orientation using Pillow
-                    ImageOps.exif_transpose(image, in_place=True)
-                    sample = {
-                        "image_id": image_id,
-                        "image": image,
-                        "width": image.width,
-                        "height": image.height,
-                        "meta": {
-                            "barcode": barcode,
-                            "off_image_id": off_image_id,
-                            "image_url": image_url,
-                        },
-                        "category_id": label_id,
-                        "category_name": label_name,
-                    }
-                    with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
-                        pickle.dump(sample, f)
-
-            hf_ds = datasets.Dataset.from_generator(
-                functools.partial(_pickle_sample_generator, tmp_dir),
-                features=HF_DS_CLASSIFICATION_FEATURES,
-            )
-            hf_ds.push_to_hub(repo_id, split=split)
-
-
-def export_to_hf_llm_image_extraction(
-    sample_iter: Iterator[LLMImageExtractionSample],
-    split: str,
-    repo_id: str,
-    revision: str = "main",
-    tmp_dir: Path | None = None,
-) -> None:
-    """Export LLM image extraction samples to a Hugging Face dataset.
-
-    Args:
-        sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
-            to export.
-        split (str): Name of the dataset split (e.g., 'train', 'val').
-        repo_id (str): Hugging Face repository ID to push the dataset to.
-        revision (str): Revision (branch, tag or commit) to use for the
-            Hugging Face Datasets repository.
-        tmp_dir (Path | None): Temporary directory to use for intermediate
-            files. If None, a temporary directory will be created
-            automatically.
-    """
-    logger.info(
-        "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s",
-        repo_id,
-        revision,
-        split,
-        tmp_dir,
-    )
-
-    tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
-    if tmp_dir:
-        tmp_dir.mkdir(parents=True, exist_ok=True)
-        tmp_dir_with_context = PathWithContext(tmp_dir)
-    else:
-        tmp_dir_with_context = tempfile.TemporaryDirectory()
-
-    with tmp_dir_with_context as tmp_dir_str:
-        tmp_dir = Path(tmp_dir_str)
-        for sample in tqdm.tqdm(sample_iter, desc="samples"):
-            image = sample.image
-            # Rotate image according to exif orientation using Pillow
-            image = ImageOps.exif_transpose(image)
-            image_id = sample.image_id
-            sample = {
-                "image_id": image_id,
-                "image": image,
-                "meta": sample.meta.model_dump(),
-                "output": sample.output,
-            }
-            # Save output as pickle
-            with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
-                pickle.dump(sample, f)
-
-        hf_ds = datasets.Dataset.from_generator(
-            functools.partial(_pickle_sample_generator, tmp_dir),
-            features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
-        )
-        hf_ds.push_to_hub(repo_id, split=split, revision=revision)
labelr/google_genai.py
CHANGED
@@ -11,10 +11,11 @@ import orjson
 import typer
 from gcloud.aio.storage import Storage
 from openfoodfacts import Flavor
-from openfoodfacts.images import
+from openfoodfacts.images import generate_image_url
 from tqdm.asyncio import tqdm
 
-from labelr.sample import
+from labelr.sample.common import SampleMeta
+from labelr.sample.llm import LLMImageExtractionSample
 from labelr.utils import download_image_from_gcs
 
 try:
@@ -335,6 +336,7 @@ def generate_sample_iter(
     """
    skipped = 0
    invalid = 0
+    storage_client = storage.Client()
    with prediction_path.open("r") as f_in:
        for i, sample_str in enumerate(f_in):
            if i < skip:
@@ -349,6 +351,7 @@ def generate_sample_iter(
                    sample=sample,
                    is_openfoodfacts_dataset=is_openfoodfacts_dataset,
                    openfoodfacts_flavor=openfoodfacts_flavor,
+                    storage_client=storage_client,
                )
            except Exception as e:
                if raise_on_invalid_sample:
@@ -370,6 +373,7 @@ def generate_sample_from_prediction(
     sample: JSONType,
     is_openfoodfacts_dataset: bool = False,
     openfoodfacts_flavor: Flavor = Flavor.off,
+    storage_client: storage.Client | None = None,
 ) -> LLMImageExtractionSample:
     """Generate a LLMImageExtractionSample from a prediction sample.
     Args:
@@ -378,13 +382,15 @@ def generate_sample_from_prediction(
         is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
             Facts.
         openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
+        storage_client (storage.Client | None): Optional Google Cloud Storage
+            client. If not provided, a new client will be created.
     Returns:
         LLMImageExtractionSample: Generated sample.
     """
     image_id = sample["key"][len("key:") :]
     response_str = sample["response"]["candidates"][0]["content"]["parts"][0]["text"]
     image_uri = sample["request"]["contents"][0]["parts"][1]["file_data"]["file_uri"]
-    image = download_image_from_gcs(image_uri=image_uri)
+    image = download_image_from_gcs(image_uri=image_uri, client=storage_client)
     response = orjson.loads(response_str)
     jsonschema.validate(response, json_schema)
 
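Note: this change hoists the google.cloud.storage client out of the per-sample loop; before it, download_image_from_gcs created a fresh client (and a fresh auth handshake) for every image. The reuse pattern, sketched with a hypothetical URI:

    from google.cloud import storage

    from labelr.utils import download_image_from_gcs

    client = storage.Client()  # created once, reused for every blob
    image = download_image_from_gcs(
        image_uri="gs://my-bucket/path/to/image.jpg",  # hypothetical URI
        client=client,
    )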
labelr/sample/__init__.py
ADDED
File without changes (new empty file)

labelr/sample/classification.py
ADDED
@@ -0,0 +1,17 @@
+import datasets
+
+HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
+    {
+        "image_id": datasets.Value("string"),
+        "image": datasets.features.Image(),
+        "width": datasets.Value("int64"),
+        "height": datasets.Value("int64"),
+        "meta": {
+            "barcode": datasets.Value("string"),
+            "off_image_id": datasets.Value("string"),
+            "image_url": datasets.Value("string"),
+        },
+        "category_id": datasets.Value("int64"),
+        "category_name": datasets.Value("string"),
+    }
+)
labelr/sample/common.py
ADDED
@@ -0,0 +1,14 @@
+from pydantic import BaseModel, Field
+
+
+class SampleMeta(BaseModel):
+    barcode: str | None = Field(
+        ..., description="The barcode of the product, if applicable"
+    )
+    off_image_id: str | None = Field(
+        ...,
+        description="The Open Food Facts image ID associated with the image, if applicable",
+    )
+    image_url: str | None = Field(
+        ..., description="The URL of the image, if applicable"
+    )
labelr/sample/llm.py
ADDED
@@ -0,0 +1,75 @@
+import typing
+from collections.abc import Iterator
+from pathlib import Path
+
+import datasets
+import orjson
+from PIL import Image
+from pydantic import BaseModel, Field
+
+from labelr.sample.common import SampleMeta
+from labelr.utils import download_image
+
+
+class LLMImageExtractionSample(BaseModel):
+    class Config:
+        # required to allow PIL Image type
+        arbitrary_types_allowed = True
+
+    image_id: str = Field(
+        ...,
+        description="unique ID for the image. For Open Food Facts images, it follows the "
+        "format `barcode:imgid`",
+    )
+    image: Image.Image = Field(..., description="Image to extract information from")
+    output: str | None = Field(..., description="Expected response of the LLM")
+    meta: SampleMeta = Field(..., description="Metadata associated with the sample")
+
+
+HF_DS_LLM_IMAGE_EXTRACTION_FEATURES = datasets.Features(
+    {
+        "image_id": datasets.Value("string"),
+        "image": datasets.features.Image(),
+        "output": datasets.features.Value("string"),
+        "meta": {
+            "barcode": datasets.Value("string"),
+            "off_image_id": datasets.Value("string"),
+            "image_url": datasets.Value("string"),
+        },
+    }
+)
+
+
+def load_llm_image_extraction_dataset_from_jsonl(
+    dataset_path: Path, **kwargs
+) -> Iterator[LLMImageExtractionSample]:
+    """Load a Hugging Face dataset for LLM image extraction from a JSONL file.
+
+    Args:
+        dataset_path (Path): Path to the JSONL dataset file.
+        **kwargs: Additional keyword arguments to pass to the image downloader.
+    Yields:
+        Iterator[LLMImageExtractionSample]: Iterator of LLM image extraction
+            samples.
+    """
+    with dataset_path.open("r") as f:
+        for line in f:
+            item = orjson.loads(line)
+            image_id = item["image_id"]
+            image_url = item["image_url"]
+            image = typing.cast(Image.Image, download_image(image_url, **kwargs))
+            barcode = item.pop("barcode", None)
+            off_image_id = item.pop("off_image_id", None)
+            output = item.pop("output", None)
+            meta = SampleMeta(
+                barcode=barcode,
+                off_image_id=off_image_id,
+                image_url=image_url,
+            )
+            sample = LLMImageExtractionSample(
+                image_id=image_id,
+                image=image,
+                output=output,
+                meta=meta,
+            )
+            yield sample
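Note: from the loader above, each JSONL line must carry image_id and image_url, while barcode, off_image_id and output are optional. A hypothetical record, serialized with orjson for illustration:

    import orjson

    line = orjson.dumps(
        {
            "image_id": "3017620422003:1",             # hypothetical values
            "image_url": "https://example.com/1.jpg",
            "barcode": "3017620422003",
            "off_image_id": "1",
            "output": '{"brand": "Acme"}',  # expected LLM answer, as a string
        }
    )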
labelr/{sample.py → sample/object_detection.py}
RENAMED
@@ -8,8 +8,7 @@ import PIL
 from openfoodfacts import Flavor
 from openfoodfacts.barcode import normalize_barcode
 from openfoodfacts.images import download_image, generate_image_url
-from PIL import
-from pydantic import BaseModel, Field
+from PIL import ImageOps
 
 logger = logging.getLogger(__name__)
 
@@ -231,34 +230,6 @@ def format_object_detection_sample_to_hf(
     }
 
 
-class SampleMeta(BaseModel):
-    barcode: str | None = Field(
-        ..., description="The barcode of the product, if applicable"
-    )
-    off_image_id: str | None = Field(
-        ...,
-        description="The Open Food Facts image ID associated with the image, if applicable",
-    )
-    image_url: str | None = Field(
-        ..., description="The URL of the image, if applicable"
-    )
-
-
-class LLMImageExtractionSample(BaseModel):
-    class Config:
-        # required to allow PIL Image type
-        arbitrary_types_allowed = True
-
-    image_id: str = Field(
-        ...,
-        description="unique ID for the image. For Open Food Facts images, it follows the "
-        "format `barcode:imgid`",
-    )
-    image: Image.Image = Field(..., description="Image to extract information from")
-    output: str = Field(..., description="Expected response of the LLM")
-    meta: SampleMeta = Field(..., description="Metadata associated with the sample")
-
-
 # The HuggingFace Dataset features
 HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
     {
@@ -278,33 +249,3 @@ HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
         },
     }
 )
-
-
-HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
-    {
-        "image_id": datasets.Value("string"),
-        "image": datasets.features.Image(),
-        "width": datasets.Value("int64"),
-        "height": datasets.Value("int64"),
-        "meta": {
-            "barcode": datasets.Value("string"),
-            "off_image_id": datasets.Value("string"),
-            "image_url": datasets.Value("string"),
-        },
-        "category_id": datasets.Value("int64"),
-        "category_name": datasets.Value("string"),
-    }
-)
-
-HF_DS_LLM_IMAGE_EXTRACTION_FEATURES = datasets.Features(
-    {
-        "image_id": datasets.Value("string"),
-        "image": datasets.features.Image(),
-        "output": datasets.features.Value("string"),
-        "meta": {
-            "barcode": datasets.Value("string"),
-            "off_image_id": datasets.Value("string"),
-            "image_url": datasets.Value("string"),
-        },
-    }
-)
labelr/utils.py
CHANGED
@@ -2,6 +2,8 @@ import io
 from pathlib import Path
 
 from google.cloud import storage
+from openfoodfacts.images import download_image as _download_image
+from openfoodfacts.utils import ImageDownloadItem
 from PIL import Image
 
 
@@ -20,15 +22,63 @@ def parse_hf_repo_id(hf_repo_id: str) -> tuple[str, str]:
     return hf_repo_id, revision
 
 
-def
+def download_image(
+    image: str | tuple[str, str],
+    *,
+    error_raise: bool = True,
+    return_struct: bool = False,
+    **kwargs,
+) -> Image.Image | ImageDownloadItem | None:
+    """Download an image from a URL or GCS URI and return it as a PIL Image.
+    Args:
+        image (str | tuple[str, str]): The URL or GCS URI of the image.
+        error_raise (bool): Whether to raise an error if the image cannot be
+            downloaded.
+        return_struct (bool): Whether to return an ImageDownloadItem struct
+            instead of a PIL Image.
+        **kwargs: Additional arguments to pass to the download function.
+    Returns:
+        Image.Image | ImageDownloadItem: The downloaded image as a PIL Image
+            or an ImageDownloadItem struct.
+    """
+    if isinstance(image, str) and image.startswith("gs://"):
+        return download_image_from_gcs(image, return_struct=return_struct, **kwargs)
+    return _download_image(
+        image,
+        error_raise=error_raise,
+        return_struct=return_struct,
+        **kwargs,
+    )
+
+
+def download_image_from_gcs(
+    image_uri: str, client: storage.Client | None = None, return_struct: bool = False
+) -> Image.Image | ImageDownloadItem:
     """Download an image from a Google Cloud Storage URI and return it as a
-    PIL Image.
-
+    PIL Image.
+
+    Args:
+        image_uri (str): The GCS URI of the image
+            (e.g., gs://bucket_name/path/to/image.jpg).
+        client (storage.Client | None): An optional Google Cloud Storage
+            client. If not provided, a new client will be created.
+    """
+    if client is None:
+        client = storage.Client()
+
     bucket_name, blob_name = image_uri.replace("gs://", "").split("/", 1)
-    bucket =
+    bucket = client.bucket(bucket_name)
     blob = bucket.blob(blob_name)
     image_data = blob.download_as_bytes()
-
+    pil_image = Image.open(io.BytesIO(image_data))
+
+    if return_struct:
+        return ImageDownloadItem(
+            url=image_uri,
+            image=pil_image,
+            error=None,
+        )
+    return pil_image
 
 
 class PathWithContext:
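Note: the new labelr.utils.download_image is a small dispatcher: gs:// URIs are routed to download_image_from_gcs, everything else falls through to openfoodfacts' download_image. A sketch with hypothetical URLs:

    from labelr.utils import download_image

    # HTTP(S) URLs are delegated to openfoodfacts.images.download_image
    web_image = download_image("https://example.com/image.jpg")

    # gs:// URIs take the Google Cloud Storage path instead
    gcs_image = download_image("gs://my-bucket/path/to/image.jpg")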
{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: labelr
-Version: 0.9.0
+Version: 0.10.0
 Summary: A command-line tool to manage labeling tasks with Label Studio.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -18,14 +18,13 @@ Requires-Dist: rapidfuzz>=3.14.3
 Requires-Dist: aiohttp
 Requires-Dist: aiofiles
 Requires-Dist: orjson
+Requires-Dist: google-cloud-storage
+Requires-Dist: gcloud-aio-storage
+Requires-Dist: google-genai>=1.56.0
 Provides-Extra: ultralytics
 Requires-Dist: ultralytics==8.3.223; extra == "ultralytics"
 Provides-Extra: fiftyone
 Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
-Provides-Extra: google
-Requires-Dist: google-genai>=1.56.0; extra == "google"
-Requires-Dist: gcloud-aio-storage; extra == "google"
-Requires-Dist: google-cloud-storage; extra == "google"
 Dynamic: license-file
 
 # Labelr
labelr-0.10.0.dist-info/RECORD
ADDED
@@ -0,0 +1,36 @@
+labelr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/__main__.py,sha256=G4e95-IfhI-lOmkOBP6kQ8wl1x_Fl7dZlLOYr90K83c,66
+labelr/annotate.py,sha256=3fJ9FYbcozcOoKuhNtzPHV8sSnp-45FsNnMc8UeBHGU,3503
+labelr/check.py,sha256=3wK6mE0UsKvoBNm0_lyWhCMq7gxkv5r50pvO70damXY,2476
+labelr/config.py,sha256=3RXF_NdkSuHvfVMGMlYmjlw45fU77zQkLX7gmZq7NxM,64
+labelr/dataset_features.py,sha256=ZC9QAUw9oKHqyUPla2h3xQFaRT9sHq8hkPNN4RDDwmo,1257
+labelr/google_genai.py,sha256=x5p98eYoI887QMBDgziFxEW9WNdZ8Cw0EHjAFQ71SaE,14728
+labelr/main.py,sha256=OTiJSkD_TrzQmQQm291FhknD-HQQTWfBEBgImxqL0KM,2634
+labelr/project_config.py,sha256=CIHEcgSOfXb53naHWEBkTDm2V9m3abAu8C54VSzHjAs,1260
+labelr/types.py,sha256=8CHfLyifF_N94OYDhG-7IcWboOh9o0Z_0LBtQapT8TQ,313
+labelr/utils.py,sha256=8Yp0L2MCIdUYSjvmF4U5iiaBpaZJbYw4rHJOMhCCudE,3075
+labelr/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/apps/datasets.py,sha256=tAD6TZSnwh7uhkleSfDP0PFqztXC1S3Vx2aMSVCFfRU,12725
+labelr/apps/evaluate.py,sha256=UC4CuSKa4vgR5xTBZ-dFgp_1pYnkM55s2IJgix0YtkI,1157
+labelr/apps/google_batch.py,sha256=Mlz5jRVcR1XzRJg2HLte3rIhiOk4xQQjjLAJsc3lJjo,9572
+labelr/apps/hugging_face.py,sha256=B0GaDZeUZj2A7nEeC1OtCANb0DqvBkhWwFWM_9Nm2kU,1608
+labelr/apps/label_studio.py,sha256=lQ7K16noA4Mnr1hc0oxya1sgGgABWnpIIJTM5ENp7so,16869
+labelr/apps/train.py,sha256=wmOSpO9JsrwCXYMgRg2srMbV5B5TvnlfhAKPqUt6wSg,7328
+labelr/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/evaluate/object_detection.py,sha256=QJIwrDY-Vsy0-It6tZSkN3qgAlmIu2W1-kGdmibiPSQ,3349
+labelr/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/export/classification.py,sha256=rnm99vGMJy1UkdXiZ8t_TgFe3CyLBBYowWwzaZeniIs,4699
+labelr/export/common.py,sha256=lJ-ZDOMKGpC48fCuEnIrA8sZBhXGZOcghBbsLM1h66o,1252
+labelr/export/llm.py,sha256=Jlopi0EQ4YUWLe_s-kTFcISTzO1QmdX-qXQxayO6E-k,3186
+labelr/export/object_detection.py,sha256=91ywkPago7WgbY2COQKpwjFLYAAsXeGOu7TkGHi17OU,12338
+labelr/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+labelr/sample/classification.py,sha256=7Z5hvxG6q6wfJMYj00JWbRBhfjOyhjaL8fpJjgBi9N8,539
+labelr/sample/common.py,sha256=f0XDS6s0z6Vw4G2FDELJ1VQSe5Tsh0q3-3VU9unK9eY,431
+labelr/sample/llm.py,sha256=zAsI3TmfGCbBPv4_hNtYR4Np3yAmUDzXGAvlQLF6V6w,2474
+labelr/sample/object_detection.py,sha256=XZasR_k4AxzsiWdVMC2ZnyjfA14PKJPrx1U-XPr5tWQ,8427
+labelr-0.10.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+labelr-0.10.0.dist-info/METADATA,sha256=pS2Ipq-aICU3TluuqSNocGP5-V8ztLk6X_udwwnECPk,7243
+labelr-0.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+labelr-0.10.0.dist-info/entry_points.txt,sha256=OACukVeR_2z54i8yQuWqqk_jdEHlyTwmTFOFBmxPp1k,43
+labelr-0.10.0.dist-info/top_level.txt,sha256=bjZo50aGZhXIcZYpYOX4sdAQcamxh8nwfEh7A9RD_Ag,7
+labelr-0.10.0.dist-info/RECORD,,
labelr-0.9.0.dist-info/RECORD
DELETED
@@ -1,28 +0,0 @@
-labelr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-labelr/__main__.py,sha256=G4e95-IfhI-lOmkOBP6kQ8wl1x_Fl7dZlLOYr90K83c,66
-labelr/annotate.py,sha256=3fJ9FYbcozcOoKuhNtzPHV8sSnp-45FsNnMc8UeBHGU,3503
-labelr/check.py,sha256=3wK6mE0UsKvoBNm0_lyWhCMq7gxkv5r50pvO70damXY,2476
-labelr/config.py,sha256=3RXF_NdkSuHvfVMGMlYmjlw45fU77zQkLX7gmZq7NxM,64
-labelr/dataset_features.py,sha256=ZC9QAUw9oKHqyUPla2h3xQFaRT9sHq8hkPNN4RDDwmo,1257
-labelr/export.py,sha256=aPfQ-RaK3C2WJrzbETYdC9kRe0MTpCRs0nu5l2SqiRg,20092
-labelr/google_genai.py,sha256=vn_UNQOxUDOTTTWz-emAVErjOtQmnlxM_m8yo2q01Ok,14401
-labelr/main.py,sha256=OTiJSkD_TrzQmQQm291FhknD-HQQTWfBEBgImxqL0KM,2634
-labelr/project_config.py,sha256=CIHEcgSOfXb53naHWEBkTDm2V9m3abAu8C54VSzHjAs,1260
-labelr/sample.py,sha256=VL-iKDvLaIeViJ0TaBY9uCbv0ey528fkaRTYE-Zr12I,10347
-labelr/types.py,sha256=8CHfLyifF_N94OYDhG-7IcWboOh9o0Z_0LBtQapT8TQ,313
-labelr/utils.py,sha256=-zLOWLbvLwtNFtzzwZ6RjJD9GstoYR-gt4wz9r6u9lE,1363
-labelr/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-labelr/apps/datasets.py,sha256=kJQWwm3mjA2uWIA8O_DslM7OS5ht5mgWqcFC_zF4gCo,11187
-labelr/apps/evaluate.py,sha256=UC4CuSKa4vgR5xTBZ-dFgp_1pYnkM55s2IJgix0YtkI,1157
-labelr/apps/google_batch.py,sha256=BMcfBkDwfu-zOOR80bYmtEy6k_Qc70m7K7wmp4Ww0r8,9335
-labelr/apps/hugging_face.py,sha256=B0GaDZeUZj2A7nEeC1OtCANb0DqvBkhWwFWM_9Nm2kU,1608
-labelr/apps/label_studio.py,sha256=su9shoi0K9PmI8RBLipV2KQf_MRjkF5vy5-JUcbXr5A,16852
-labelr/apps/train.py,sha256=wmOSpO9JsrwCXYMgRg2srMbV5B5TvnlfhAKPqUt6wSg,7328
-labelr/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-labelr/evaluate/object_detection.py,sha256=QJIwrDY-Vsy0-It6tZSkN3qgAlmIu2W1-kGdmibiPSQ,3349
-labelr-0.9.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-labelr-0.9.0.dist-info/METADATA,sha256=cNkf4LPmbO_k3UuR7O7NtcCwRF-Z5c-yIyQRAocsjww,7322
-labelr-0.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-labelr-0.9.0.dist-info/entry_points.txt,sha256=OACukVeR_2z54i8yQuWqqk_jdEHlyTwmTFOFBmxPp1k,43
-labelr-0.9.0.dist-info/top_level.txt,sha256=bjZo50aGZhXIcZYpYOX4sdAQcamxh8nwfEh7A9RD_Ag,7
-labelr-0.9.0.dist-info/RECORD,,
{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/WHEEL
File without changes

{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/entry_points.txt
File without changes

{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/licenses/LICENSE
File without changes

{labelr-0.9.0.dist-info → labelr-0.10.0.dist-info}/top_level.txt
File without changes