labelr 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- labelr/apps/datasets.py +12 -25
- labelr/apps/evaluate.py +41 -0
- labelr/apps/google_batch.py +289 -0
- labelr/apps/hugging_face.py +57 -0
- labelr/apps/{projects.py → label_studio.py} +65 -9
- labelr/apps/train.py +22 -4
- labelr/evaluate/__init__.py +0 -0
- labelr/evaluate/object_detection.py +100 -0
- labelr/export.py +64 -7
- labelr/google_genai.py +415 -0
- labelr/main.py +23 -8
- labelr/sample.py +72 -4
- labelr/utils.py +35 -0
- {labelr-0.7.0.dist-info → labelr-0.9.0.dist-info}/METADATA +17 -6
- labelr-0.9.0.dist-info/RECORD +28 -0
- labelr/apps/users.py +0 -36
- labelr-0.7.0.dist-info/RECORD +0 -23
- {labelr-0.7.0.dist-info → labelr-0.9.0.dist-info}/WHEEL +0 -0
- {labelr-0.7.0.dist-info → labelr-0.9.0.dist-info}/entry_points.txt +0 -0
- {labelr-0.7.0.dist-info → labelr-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {labelr-0.7.0.dist-info → labelr-0.9.0.dist-info}/top_level.txt +0 -0
labelr/evaluate/object_detection.py ADDED

```diff
@@ -0,0 +1,100 @@
+import tempfile
+from pathlib import Path
+
+import datasets
+import fiftyone as fo
+from huggingface_hub import hf_hub_download
+
+from labelr.dataset_features import OBJECT_DETECTION_DS_PREDICTION_FEATURES
+from labelr.utils import parse_hf_repo_id
+
+
+def convert_bbox_to_fo_format(
+    bbox: tuple[float, float, float, float],
+) -> tuple[float, float, float, float]:
+    # Bounding box coordinates should be relative values
+    # in [0, 1] in the following format:
+    # [top-left-x, top-left-y, width, height]
+    y_min, x_min, y_max, x_max = bbox
+    return (
+        x_min,
+        y_min,
+        (x_max - x_min),
+        (y_max - y_min),
+    )
+
+
+def visualize(
+    hf_repo_id: str,
+    dataset_name: str,
+    persistent: bool,
+):
+    hf_repo_id, hf_revision = parse_hf_repo_id(hf_repo_id)
+
+    file_path = hf_hub_download(
+        hf_repo_id,
+        filename="predictions.parquet",
+        revision=hf_revision,
+        repo_type="model",
+        # local_dir="./predictions/",
+    )
+    file_path = Path(file_path).absolute()
+    prediction_dataset = datasets.load_dataset(
+        "parquet",
+        data_files=str(file_path),
+        split="train",
+        features=OBJECT_DETECTION_DS_PREDICTION_FEATURES,
+    )
+    fo_dataset = fo.Dataset(name=dataset_name, persistent=persistent)
+
+    with tempfile.TemporaryDirectory() as tmpdir_str:
+        tmp_dir = Path(tmpdir_str)
+        for i, hf_sample in enumerate(prediction_dataset):
+            image = hf_sample["image"]
+            image_path = tmp_dir / f"{i}.jpg"
+            image.save(image_path)
+            split = hf_sample["split"]
+            sample = fo.Sample(
+                filepath=image_path,
+                split=split,
+                tags=[split],
+                image=hf_sample["image_id"],
+            )
+            ground_truth_detections = [
+                fo.Detection(
+                    label=hf_sample["objects"]["category_name"][i],
+                    bounding_box=convert_bbox_to_fo_format(
+                        bbox=hf_sample["objects"]["bbox"][i],
+                    ),
+                )
+                for i in range(len(hf_sample["objects"]["bbox"]))
+            ]
+            sample["ground_truth"] = fo.Detections(detections=ground_truth_detections)
+
+            if hf_sample["detected"] is not None and hf_sample["detected"]["bbox"]:
+                model_detections = [
+                    fo.Detection(
+                        label=hf_sample["detected"]["category_name"][i],
+                        bounding_box=convert_bbox_to_fo_format(
+                            bbox=hf_sample["detected"]["bbox"][i]
+                        ),
+                        confidence=hf_sample["detected"]["confidence"][i],
+                    )
+                    for i in range(len(hf_sample["detected"]["bbox"]))
+                ]
+                sample["model"] = fo.Detections(detections=model_detections)
+
+            fo_dataset.add_sample(sample)
+
+        # View summary info about the dataset
+        print(fo_dataset)
+
+        # Print the first few samples in the dataset
+        print(fo_dataset.head())
+
+        # Visualize the dataset in the FiftyOne App
+        session = fo.launch_app(fo_dataset)
+        fo_dataset.evaluate_detections(
+            "model", gt_field="ground_truth", eval_key="eval", compute_mAP=True
+        )
+        session.wait()
```
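The new module downloads a `predictions.parquet` artifact from a Hugging Face model repo, rebuilds it as a FiftyOne dataset with `ground_truth` and `model` detection fields (converting `(y_min, x_min, y_max, x_max)` boxes to FiftyOne's `[top-left-x, top-left-y, width, height]` layout), then launches the FiftyOne App and runs `evaluate_detections` with mAP. A minimal usage sketch, assuming a model repo that actually contains a `predictions.parquet` file; the names below are placeholders, not values shipped with the package:

```python
# Hypothetical invocation of the new visualize() helper.
from labelr.evaluate.object_detection import visualize

visualize(
    hf_repo_id="my-org/my-detector",  # model repo containing predictions.parquet
    dataset_name="detector-eval",     # name of the FiftyOne dataset to create
    persistent=False,                 # discard the FiftyOne dataset afterwards
)
```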
labelr/export.py CHANGED

```diff
@@ -3,6 +3,7 @@ import logging
 import pickle
 import random
 import tempfile
+from collections.abc import Iterator
 from pathlib import Path
 
 import datasets
@@ -14,10 +15,13 @@ from PIL import Image, ImageOps
 
 from labelr.sample import (
     HF_DS_CLASSIFICATION_FEATURES,
+    HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
     HF_DS_OBJECT_DETECTION_FEATURES,
+    LLMImageExtractionSample,
     format_object_detection_sample_to_hf,
 )
 from labelr.types import TaskType
+from labelr.utils import PathWithContext
 
 logger = logging.getLogger(__name__)
 
@@ -77,13 +81,7 @@ def export_from_ls_to_hf_object_detection(
         functools.partial(_pickle_sample_generator, tmp_dir),
         features=HF_DS_OBJECT_DETECTION_FEATURES,
     )
-    hf_ds.push_to_hub(
-        repo_id,
-        split=split,
-        revision=revision,
-        # Create a PR if not pushing to main branch
-        create_pr=revision != "main",
-    )
+    hf_ds.push_to_hub(repo_id, split=split, revision=revision)
 
 
 def export_from_ls_to_ultralytics_object_detection(
@@ -461,3 +459,62 @@ def export_from_ultralytics_to_hf_classification(
         features=HF_DS_CLASSIFICATION_FEATURES,
     )
     hf_ds.push_to_hub(repo_id, split=split)
+
+
+def export_to_hf_llm_image_extraction(
+    sample_iter: Iterator[LLMImageExtractionSample],
+    split: str,
+    repo_id: str,
+    revision: str = "main",
+    tmp_dir: Path | None = None,
+) -> None:
+    """Export LLM image extraction samples to a Hugging Face dataset.
+
+    Args:
+        sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
+            to export.
+        split (str): Name of the dataset split (e.g., 'train', 'val').
+        repo_id (str): Hugging Face repository ID to push the dataset to.
+        revision (str): Revision (branch, tag or commit) to use for the
+            Hugging Face Datasets repository.
+        tmp_dir (Path | None): Temporary directory to use for intermediate
+            files. If None, a temporary directory will be created
+            automatically.
+    """
+    logger.info(
+        "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s",
+        repo_id,
+        revision,
+        split,
+        tmp_dir,
+    )
+
+    tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
+    if tmp_dir:
+        tmp_dir.mkdir(parents=True, exist_ok=True)
+        tmp_dir_with_context = PathWithContext(tmp_dir)
+    else:
+        tmp_dir_with_context = tempfile.TemporaryDirectory()
+
+    with tmp_dir_with_context as tmp_dir_str:
+        tmp_dir = Path(tmp_dir_str)
+        for sample in tqdm.tqdm(sample_iter, desc="samples"):
+            image = sample.image
+            # Rotate image according to exif orientation using Pillow
+            image = ImageOps.exif_transpose(image)
+            image_id = sample.image_id
+            sample = {
+                "image_id": image_id,
+                "image": image,
+                "meta": sample.meta.model_dump(),
+                "output": sample.output,
+            }
+            # Save output as pickle
+            with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                pickle.dump(sample, f)
+
+        hf_ds = datasets.Dataset.from_generator(
+            functools.partial(_pickle_sample_generator, tmp_dir),
+            features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+        )
+        hf_ds.push_to_hub(repo_id, split=split, revision=revision)
```
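The new `export_to_hf_llm_image_extraction` mirrors the existing object-detection exporter: each sample is pickled into a (possibly caller-supplied) temporary directory, then the pickles are streamed into a `datasets.Dataset` and pushed to the Hub. A sketch of how it might be called, assuming `LLMImageExtractionSample` and `SampleMeta` are the pydantic models used elsewhere in this diff; all values and the target repo are illustrative:

```python
# Illustrative export of a single LLM-extraction sample.
from pathlib import Path

from PIL import Image

from labelr.export import export_to_hf_llm_image_extraction
from labelr.sample import LLMImageExtractionSample, SampleMeta

samples = [
    LLMImageExtractionSample(
        image_id="3017620422003_1",
        image=Image.open("images/3017620422003_1.jpg"),
        output='{"ingredients": ["water", "sugar"]}',  # model output as a JSON string
        meta=SampleMeta(barcode="3017620422003", off_image_id="1", image_url=""),
    )
]

export_to_hf_llm_image_extraction(
    sample_iter=iter(samples),
    split="train",
    repo_id="my-org/llm-image-extraction",  # hypothetical target repo
    revision="main",
    tmp_dir=Path("./export-tmp"),  # reused across runs instead of a TemporaryDirectory
)
```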
labelr/google_genai.py ADDED

```diff
@@ -0,0 +1,415 @@
+import asyncio
+import mimetypes
+from collections.abc import Iterator
+from pathlib import Path
+from typing import Literal
+from urllib.parse import urlparse
+
+import aiofiles
+import jsonschema
+import orjson
+import typer
+from gcloud.aio.storage import Storage
+from openfoodfacts import Flavor
+from openfoodfacts.images import download_image, generate_image_url
+from tqdm.asyncio import tqdm
+
+from labelr.sample import LLMImageExtractionSample, SampleMeta
+from labelr.utils import download_image_from_gcs
+
+try:
+    import google.genai  # noqa: F401
+except ImportError:
+    raise ImportError(
+        "The 'google-genai' package is required to use this module. "
+        "Please install labelr with the 'google' extra: "
+        "`pip install labelr[google]`"
+    )
+import aiohttp
+from google import genai
+from google.cloud import storage
+from google.genai.types import CreateBatchJobConfig, HttpOptions
+from google.genai.types import JSONSchema as GoogleJSONSchema
+from google.genai.types import Schema as GoogleSchema
+from openfoodfacts.types import JSONType
+from pydantic import BaseModel
+
+
+class RawBatchSamplePart(BaseModel):
+    type: Literal["text", "image"]
+    data: str
+
+
+class RawBatchSample(BaseModel):
+    key: str
+    parts: list[RawBatchSamplePart]
+    meta: JSONType = {}
+
+
+def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> JSONType:
+    """Google doesn't support natively OpenAPI schemas, so we convert them to
+    Google `Schema` (a subset of OpenAPI)."""
+    return GoogleSchema.from_json_schema(
+        json_schema=GoogleJSONSchema.model_validate(schema.model_json_schema())
+    ).model_dump(mode="json", exclude_none=True, exclude_unset=True)
+
+
+async def download_image(url: str, session: aiohttp.ClientSession) -> bytes:
+    """Download an image from a URL and return its content as bytes.
+
+    Args:
+        url (str): URL of the image to download.
+    Returns:
+        bytes: Content of the downloaded image.
+    """
+    async with session.get(url) as response:
+        response.raise_for_status()
+        return await response.read()
+
+
+async def download_image_from_filesystem(url: str, base_dir: Path) -> bytes:
+    """Download an image from the filesystem and return its content as bytes.
+
+    Args:
+        url (str): URL of the image to download.
+        base_dir (Path): Base directory where images are stored.
+    Returns:
+        bytes: Content of the downloaded image.
+    """
+    file_path = urlparse(url).path[1:]  # Remove leading '/'
+    full_file_path = base_dir / file_path
+    async with aiofiles.open(full_file_path, "rb") as f:
+        return await f.read()
+
+
+async def upload_to_gcs(
+    image_url: str,
+    bucket_name: str,
+    blob_name: str,
+    session: aiohttp.ClientSession,
+    base_image_dir: Path | None = None,
+) -> dict:
+    """Upload data to Google Cloud Storage.
+    Args:
+        bucket_name (str): Name of the GCS bucket.
+        blob_name (str): Name of the blob (object) in the bucket.
+        data (bytes): Data to upload.
+        session (aiohttp.ClientSession): HTTP session to use for downloading
+            the image.
+        base_image_dir (Path | None): If provided, images will be read from
+            the filesystem under this base directory instead of downloading
+            them from their URLs.
+    Returns:
+        dict: Status of the upload operation.
+    """
+    if base_image_dir is None:
+        image_data = await download_image(image_url, session)
+    else:
+        image_data = await download_image_from_filesystem(image_url, base_image_dir)
+
+    client = Storage(session=session)
+
+    status = await client.upload(
+        bucket_name,
+        blob_name,
+        image_data,
+    )
+    return status
+
+
+async def upload_to_gcs_format_async(
+    sample: RawBatchSample,
+    google_json_schema: JSONType,
+    instructions: str | None,
+    bucket_name: str,
+    bucket_dir_name: str,
+    session: aiohttp.ClientSession,
+    base_image_dir: Path | None = None,
+    skip_upload: bool = False,
+    thinking_level: str | None = None,
+) -> JSONType | None:
+    parts: list[JSONType] = []
+
+    if instructions:
+        parts.append({"text": instructions})
+
+    for part in sample.parts:
+        if part.type == "image":
+            mime_type, _ = mimetypes.guess_type(part.data)
+            if mime_type is None:
+                raise ValueError(f"Cannot guess mimetype of file: {part.data}")
+
+            file_uri = part.data
+            image_blob_name = f"{bucket_dir_name}/{sample.key}/{Path(file_uri).name}"
+            # Download the image from the URL
+            if not skip_upload:
+                try:
+                    await upload_to_gcs(
+                        image_url=file_uri,
+                        bucket_name=bucket_name,
+                        blob_name=image_blob_name,
+                        session=session,
+                        base_image_dir=base_image_dir,
+                    )
+                except FileNotFoundError:
+                    return None
+
+            parts.append(
+                {
+                    "file_data": {
+                        "file_uri": f"gs://{bucket_name}/{image_blob_name}",
+                        "mime_type": mime_type,
+                    }
+                }
+            )
+        else:
+            parts.append({"text": part.data})
+
+    generation_config = {
+        "responseMimeType": "application/json",
+        "response_json_schema": google_json_schema,
+    }
+
+    if thinking_level is not None:
+        generation_config["thinkingConfig"] = {"thinkingLevel": thinking_level}
+
+    return {
+        "key": f"key:{sample.key}",
+        "request": {
+            "contents": [
+                {
+                    "parts": parts,
+                    "role": "user",
+                }
+            ],
+            "generationConfig": generation_config,
+        },
+    }
+
+
+async def generate_batch_dataset(
+    data_path: Path,
+    output_path: Path,
+    google_json_schema: JSONType,
+    instructions: str | None,
+    bucket_name: str,
+    bucket_dir_name: str,
+    max_concurrent_uploads: int = 30,
+    base_image_dir: Path | None = None,
+    from_key: str | None = None,
+    skip_upload: bool = False,
+    thinking_level: str | None = None,
+):
+    limiter = asyncio.Semaphore(max_concurrent_uploads)
+    ignore = from_key is not None  # skip records until from_key is seen
+    missing_files = 0
+    async with aiohttp.ClientSession() as session:
+        async with asyncio.TaskGroup() as tg:
+            async with (
+                aiofiles.open(data_path, "r") as input_file,
+                aiofiles.open(output_path, "wb") as output_file,
+            ):
+                async with limiter:
+                    tasks = set()
+                    async for line in tqdm(input_file, desc="samples"):
+                        # print(f"line: {line}")
+                        sample = RawBatchSample.model_validate_json(line)
+                        # print(f"sample: {sample}")
+                        record_key = sample.key
+                        if from_key is not None and ignore:
+                            if record_key == from_key:
+                                ignore = False
+                            else:
+                                continue
+                        task = tg.create_task(
+                            upload_to_gcs_format_async(
+                                sample=sample,
+                                google_json_schema=google_json_schema,
+                                instructions=instructions,
+                                bucket_name=bucket_name,
+                                bucket_dir_name=bucket_dir_name,
+                                session=session,
+                                base_image_dir=base_image_dir,
+                                skip_upload=skip_upload,
+                                thinking_level=thinking_level,
+                            )
+                        )
+                        tasks.add(task)
+
+                        if len(tasks) >= max_concurrent_uploads:
+                            for task in tasks:
+                                await task
+                                updated_record = task.result()
+                                if updated_record is not None:
+                                    await output_file.write(
+                                        orjson.dumps(updated_record) + "\n".encode()
+                                    )
+                                else:
+                                    missing_files += 1
+                            tasks.clear()
+
+                    for task in tasks:
+                        await task
+                        updated_record = task.result()
+                        if updated_record is not None:
+                            await output_file.write(
+                                orjson.dumps(updated_record) + "\n".encode()
+                            )
+                        else:
+                            missing_files += 1
+
+    typer.echo(
+        f"Upload and dataset update completed. Wrote updated dataset to {output_path}. "
+        f"Missing files: {missing_files}."
+    )
+
+
+def launch_batch_job(
+    run_name: str,
+    dataset_path: Path,
+    model: str,
+    location: str,
+):
+    """Launch a Gemini Batch Inference job.
+
+    Args:
+        run_name (str): Name of the batch run.
+        dataset_path (Path): Path to the dataset file in JSONL format.
+        model (str): Model to use for the batch job. Example:
+            'gemini-2.5-flash'.
+        location (str): Location for the Vertex AI resources. Example:
+            'europe-west4'.
+    """
+    # We upload the dataset to a GCS bucket using the Gcloud
+
+    if model == "gemini-3-pro-preview" and location != "global":
+        typer.echo(
+            "Warning: only 'global' location is supported for 'gemini-3-pro-preview' model. Overriding location to 'global'."
+        )
+        location = "global"
+
+    storage_client = storage.Client()
+    bucket_name = "robotoff-batch"  # Replace with your bucket name
+    run_dir = f"gemini-batch/{run_name}"
+    input_file_blob_name = f"{run_dir}/inputs.jsonl"
+    bucket = storage_client.bucket(bucket_name)
+    blob = bucket.blob(input_file_blob_name)
+    blob.upload_from_filename(dataset_path)
+
+    client = genai.Client(
+        http_options=HttpOptions(api_version="v1"),
+        vertexai=True,
+        location=location,
+    )
+    output_uri = f"gs://{bucket_name}/{run_dir}"
+    job = client.batches.create(
+        model=model,
+        src=f"gs://{bucket_name}/{input_file_blob_name}",
+        config=CreateBatchJobConfig(dest=output_uri),
+    )
+    print(job)
+
+
+def generate_sample_iter(
+    prediction_path: Path,
+    json_schema: JSONType,
+    skip: int = 0,
+    limit: int | None = None,
+    is_openfoodfacts_dataset: bool = False,
+    openfoodfacts_flavor: Flavor = Flavor.off,
+    raise_on_invalid_sample: bool = False,
+) -> Iterator[LLMImageExtractionSample]:
+    """Generate training samples from a Gemini Batch Inference prediction
+    JSONL file.
+
+    Args:
+        prediction_path (Path): Path to the prediction JSONL file.
+        json_schema (JSONType): JSON schema to validate the predictions.
+        skip (int): Number of initial samples to skip.
+        limit (int | None): Maximum number of samples to generate.
+        is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
+            Facts.
+        openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
+    Yields:
+        Iterator[LLMImageExtractionSample]: Generated samples.
+    """
+    skipped = 0
+    invalid = 0
+    with prediction_path.open("r") as f_in:
+        for i, sample_str in enumerate(f_in):
+            if i < skip:
+                skipped += 1
+                continue
+            if limit is not None and i >= skip + limit:
+                break
+            sample = orjson.loads(sample_str)
+            try:
+                yield generate_sample_from_prediction(
+                    json_schema=json_schema,
+                    sample=sample,
+                    is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+                    openfoodfacts_flavor=openfoodfacts_flavor,
+                )
+            except Exception as e:
+                if raise_on_invalid_sample:
+                    raise
+                else:
+                    typer.echo(
+                        f"Skipping invalid sample at line {i + 1} in {prediction_path}: {e}"
+                    )
+                    invalid += 1
+                    continue
+    if skipped > 0:
+        typer.echo(f"Skipped {skipped} samples.")
+    if invalid > 0:
+        typer.echo(f"Skipped {invalid} invalid samples.")
+
+
+def generate_sample_from_prediction(
+    json_schema: JSONType,
+    sample: JSONType,
+    is_openfoodfacts_dataset: bool = False,
+    openfoodfacts_flavor: Flavor = Flavor.off,
+) -> LLMImageExtractionSample:
+    """Generate a LLMImageExtractionSample from a prediction sample.
+    Args:
+        json_schema (JSONType): JSON schema to validate the predictions.
+        sample (JSONType): Prediction sample.
+        is_openfoodfacts_dataset (bool): Whether the dataset is from Open Food
+            Facts.
+        openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
+    Returns:
+        LLMImageExtractionSample: Generated sample.
+    """
+    image_id = sample["key"][len("key:") :]
+    response_str = sample["response"]["candidates"][0]["content"]["parts"][0]["text"]
+    image_uri = sample["request"]["contents"][0]["parts"][1]["file_data"]["file_uri"]
+    image = download_image_from_gcs(image_uri=image_uri)
+    response = orjson.loads(response_str)
+    jsonschema.validate(response, json_schema)
+
+    if is_openfoodfacts_dataset:
+        image_stem_parts = image_id.split("_")
+        barcode = image_stem_parts[0]
+        off_image_id = image_stem_parts[1]
+        image_id = f"{barcode}_{off_image_id}"
+        image_url = generate_image_url(
+            barcode, off_image_id, flavor=openfoodfacts_flavor
+        )
+    else:
+        image_id = image_id
+        barcode = ""
+        off_image_id = ""
+        image_url = ""
+
+    sample_meta = SampleMeta(
+        barcode=barcode,
+        off_image_id=off_image_id,
+        image_url=image_url,
+    )
+    return LLMImageExtractionSample(
+        image_id=image_id,
+        image=image,
+        output=orjson.dumps(response).decode("utf-8"),
+        meta=sample_meta,
+    )
```
labelr/main.py CHANGED

```diff
@@ -4,9 +4,11 @@ import typer
 from openfoodfacts.utils import get_logger
 
 from labelr.apps import datasets as dataset_app
-from labelr.apps import projects as project_app
+from labelr.apps import evaluate as evaluate_app
+from labelr.apps import google_batch as google_batch_app
+from labelr.apps import hugging_face as hf_app
+from labelr.apps import label_studio as ls_app
 from labelr.apps import train as train_app
-from labelr.apps import users as user_app
 
 app = typer.Typer(pretty_exceptions_show_locals=False)
 
@@ -58,22 +60,35 @@ def predict(
     typer.echo(result)
 
 
-app.add_typer(user_app.app, name="users", help="Manage Label Studio users")
 app.add_typer(
-    project_app.app,
-    name="projects",
-    help="Manage Label Studio projects (create, import data, etc.)",
+    ls_app.app,
+    name="ls",
+    help="Manage Label Studio projects (create, import data, etc.).",
+)
+app.add_typer(
+    hf_app.app,
+    name="hf",
+    help="Manage Hugging Face Datasets repositories.",
 )
 app.add_typer(
     dataset_app.app,
     name="datasets",
     help="Manage datasets (convert, export, check, etc.)",
 )
-
 app.add_typer(
     train_app.app,
     name="train",
-    help="Train models",
+    help="Train models.",
+)
+app.add_typer(
+    evaluate_app.app,
+    name="evaluate",
+    help="Visualize and evaluate trained models.",
+)
+app.add_typer(
+    google_batch_app.app,
+    name="google-batch",
+    help="Generate datasets and launch batch jobs on Google Gemini.",
 )
 
 if __name__ == "__main__":
```
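The `users` sub-app is removed and `projects` is renamed to `label_studio`, so the top-level command groups are now `ls`, `hf`, `datasets`, `train`, `evaluate`, and `google-batch`. A quick sketch for inspecting the new surface with Typer's test runner:

```python
# Sanity-check the reorganized CLI without invoking it from a shell.
from typer.testing import CliRunner

from labelr.main import app

runner = CliRunner()
result = runner.invoke(app, ["--help"])
# Expect the sub-commands: ls, hf, datasets, train, evaluate, google-batch.
print(result.output)
```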