labelr 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- labelr/apps/datasets.py +56 -5
- labelr/apps/google_batch.py +296 -0
- labelr/apps/label_studio.py +1 -1
- labelr/export/classification.py +114 -0
- labelr/export/common.py +42 -0
- labelr/export/llm.py +91 -0
- labelr/{export.py → export/object_detection.py} +3 -138
- labelr/google_genai.py +421 -0
- labelr/main.py +6 -0
- labelr/sample/__init__.py +0 -0
- labelr/sample/classification.py +17 -0
- labelr/sample/common.py +14 -0
- labelr/sample/llm.py +75 -0
- labelr/{sample.py → sample/object_detection.py} +0 -17
- labelr/utils.py +85 -0
- {labelr-0.8.0.dist-info → labelr-0.10.0.dist-info}/METADATA +9 -1
- labelr-0.10.0.dist-info/RECORD +36 -0
- labelr-0.8.0.dist-info/RECORD +0 -27
- /labelr/{evaluate/llm.py → export/__init__.py} +0 -0
- {labelr-0.8.0.dist-info → labelr-0.10.0.dist-info}/WHEEL +0 -0
- {labelr-0.8.0.dist-info → labelr-0.10.0.dist-info}/entry_points.txt +0 -0
- {labelr-0.8.0.dist-info → labelr-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {labelr-0.8.0.dist-info → labelr-0.10.0.dist-info}/top_level.txt +0 -0
labelr/apps/datasets.py
CHANGED
|
@@ -12,7 +12,11 @@ import typer
|
|
|
12
12
|
from openfoodfacts import Flavor
|
|
13
13
|
from openfoodfacts.utils import get_logger
|
|
14
14
|
|
|
15
|
-
from labelr.export import export_from_ultralytics_to_hf
|
|
15
|
+
from labelr.export.common import export_from_ultralytics_to_hf
|
|
16
|
+
from labelr.export.object_detection import (
|
|
17
|
+
export_from_ls_to_hf_object_detection,
|
|
18
|
+
export_from_ls_to_ultralytics_object_detection,
|
|
19
|
+
)
|
|
16
20
|
|
|
17
21
|
from ..config import LABEL_STUDIO_DEFAULT_URL
|
|
18
22
|
from ..types import ExportDestination, ExportSource, TaskType
|
|
@@ -99,7 +103,9 @@ def convert_object_detection_dataset(
|
|
|
99
103
|
Studio format, and save it to a JSON file."""
|
|
100
104
|
from datasets import load_dataset
|
|
101
105
|
|
|
102
|
-
from labelr.sample import
|
|
106
|
+
from labelr.sample.object_detection import (
|
|
107
|
+
format_object_detection_sample_from_hf_to_ls,
|
|
108
|
+
)
|
|
103
109
|
|
|
104
110
|
logger.info("Loading dataset: %s", repo_id)
|
|
105
111
|
ds = load_dataset(repo_id)
|
|
@@ -207,10 +213,8 @@ def export(
|
|
|
207
213
|
local files (ultralytics format)."""
|
|
208
214
|
from label_studio_sdk.client import LabelStudio
|
|
209
215
|
|
|
210
|
-
from labelr.export import (
|
|
216
|
+
from labelr.export.object_detection import (
|
|
211
217
|
export_from_hf_to_ultralytics_object_detection,
|
|
212
|
-
export_from_ls_to_hf_object_detection,
|
|
213
|
-
export_from_ls_to_ultralytics_object_detection,
|
|
214
218
|
)
|
|
215
219
|
|
|
216
220
|
if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
|
|
@@ -303,3 +307,50 @@ def export(
|
|
|
303
307
|
is_openfoodfacts_dataset=is_openfoodfacts_dataset,
|
|
304
308
|
openfoodfacts_flavor=openfoodfacts_flavor,
|
|
305
309
|
)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
@app.command()
def export_llm_ds(
    dataset_path: Annotated[
        Path, typer.Option(..., help="Path to the JSONL dataset file")
    ],
    repo_id: Annotated[
        str, typer.Option(..., help="Hugging Face Datasets repository ID to export to")
    ],
    split: Annotated[str, typer.Option(..., help="Dataset split to export")],
    revision: Annotated[
        str,
        typer.Option(
            help="Revision (branch, tag or commit) for the Hugging Face Datasets repository."
        ),
    ] = "main",
    tmp_dir: Annotated[
        Path | None,
        typer.Option(
            help="Path to a temporary directory to use for image processing",
        ),
    ] = None,
    image_max_size: Annotated[
        int | None,
        typer.Option(
            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
        ),
    ] = None,
):
    """Export LLM image extraction dataset with images only to Hugging Face
    Datasets.
    """
    # Function-scope imports, as in the other commands of this module.
    from labelr.export.llm import export_to_hf_llm_image_extraction
    from labelr.sample.llm import load_llm_image_extraction_dataset_from_jsonl

    # The loader's result is handed straight to the exporter as its sample
    # iterator; every other CLI option is forwarded unchanged.
    samples = load_llm_image_extraction_dataset_from_jsonl(dataset_path=dataset_path)
    export_to_hf_llm_image_extraction(
        samples,
        split=split,
        repo_id=repo_id,
        revision=revision,
        tmp_dir=tmp_dir,
        image_max_size=image_max_size,
    )
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import importlib
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Annotated, Any
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from google.genai.types import JSONSchema as GoogleJSONSchema
|
|
8
|
+
from google.genai.types import Schema as GoogleSchema
|
|
9
|
+
from openfoodfacts import Flavor
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
from labelr.google_genai import generate_batch_dataset, launch_batch_job
|
|
13
|
+
|
|
14
|
+
app = typer.Typer()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
    """Convert a Pydantic model class into a Google `Schema` dictionary.

    Google doesn't natively support OpenAPI schemas, so the JSON schema
    generated by Pydantic is converted to Google `Schema` (a subset of
    OpenAPI).

    Args:
        schema: the Pydantic model class to convert.

    Returns:
        The Google `Schema` dumped in JSON mode, without `None` or unset
        fields.
    """
    pydantic_json_schema = schema.model_json_schema()
    validated_json_schema = GoogleJSONSchema.model_validate(pydantic_json_schema)
    google_schema = GoogleSchema.from_json_schema(json_schema=validated_json_schema)
    return google_schema.model_dump(mode="json", exclude_none=True, exclude_unset=True)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@app.command()
def generate_dataset(
    data_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to a JSONL file containing the raw batch samples.",
            exists=True,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
    output_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path where to write the generated dataset file.",
            exists=False,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
    config_module: Annotated[
        str,
        typer.Option(
            ...,
            help="Python module path (e.g., 'myschema') containing two variables: "
            "OUTPUT_SCHEMA (a Pydantic class representing the output schema) and "
            "INSTRUCTIONS (a str containing instructions to add before each sample).",
        ),
    ],
    bucket_name: Annotated[
        str,
        typer.Option(
            ...,
            help="Name of the GCS bucket where the images are stored.",
        ),
    ] = "robotoff-batch",
    bucket_dir_name: Annotated[
        str,
        typer.Option(
            ...,
            help="Directory name in the GCS bucket where the images are stored.",
        ),
    ] = "gemini-batch-images",
    max_concurrent_uploads: Annotated[
        int,
        typer.Option(
            ...,
            help="Maximum number of concurrent uploads to GCS.",
        ),
    ] = 30,
    base_image_dir: Annotated[
        Path | None,
        typer.Option(
            ...,
            help="Base directory to resolve local image paths from.",
        ),
    ] = None,
    from_key: Annotated[
        str | None,
        typer.Option(
            ...,
            help="If specified, resume processing from this sample key.",
        ),
    ] = None,
    skip_upload: Annotated[
        bool, typer.Option(..., help="Skip uploading images to GCS")
    ] = False,
    thinking_level: Annotated[
        str | None,
        typer.Option(
            ...,
            help="Thinking level to use for the generation config.",
        ),
    ] = None,
):
    """Generate a dataset file in JSONL format to be used for batch
    processing, using Gemini Batch Inference."""
    # Echo the effective configuration before doing any work.
    typer.echo(f"Uploading images from '{data_path}' to GCS bucket '{bucket_name}'...")
    typer.echo(f"Writing updated dataset to {output_path}...")
    typer.echo(f"Max concurrent uploads: {max_concurrent_uploads}...")
    typer.echo(f"Base image directory: {base_image_dir}...")
    typer.echo(f"From key: {from_key}...")
    typer.echo(f"Skip upload: {skip_upload}...")
    typer.echo(f"Thinking level: {thinking_level}...")

    # The user-supplied module provides the output schema and, optionally,
    # the instructions.
    module = importlib.import_module(config_module)
    base_cls = getattr(module, "OUTPUT_SCHEMA", None)

    if base_cls is None:
        # Report a missing variable as a CLI error rather than letting an
        # AttributeError traceback escape.
        typer.echo(f"Error: {config_module}.OUTPUT_SCHEMA is not defined")
        raise typer.Exit(code=1)

    # issubclass() raises TypeError when its first argument is not a class
    # (e.g. a model *instance* or a dict), so check that first to make sure
    # the friendly error below is always the one reported.
    if not (isinstance(base_cls, type) and issubclass(base_cls, BaseModel)):
        typer.echo(
            f"Error: {config_module}.OUTPUT_SCHEMA is not a subclass of pydantic.BaseModel"
        )
        raise typer.Exit(code=1)

    # A missing or empty INSTRUCTIONS is normalized to None.
    instructions = getattr(module, "INSTRUCTIONS", None) or None

    if instructions:
        typer.echo(f"Using instructions: '{instructions}'...")
    else:
        typer.echo("No instructions provided.")

    # JSON Schema is supported natively by Vertex AI and Gemini APIs,
    # but not yet on Batch Inference...
    # So we convert the JSON schema to Google internal "Schema"
    google_json_schema = convert_pydantic_model_to_google_schema(base_cls)
    asyncio.run(
        generate_batch_dataset(
            data_path=data_path,
            output_path=output_path,
            google_json_schema=google_json_schema,
            instructions=instructions,
            bucket_name=bucket_name,
            bucket_dir_name=bucket_dir_name,
            max_concurrent_uploads=max_concurrent_uploads,
            base_image_dir=base_image_dir,
            from_key=from_key,
            skip_upload=skip_upload,
            thinking_level=thinking_level,
        )
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@app.command(name="launch-batch-job")
def launch_batch_job_command(
    run_name: Annotated[str, typer.Argument(..., help="Name of the batch job run")],
    dataset_path: Annotated[Path, typer.Option(..., help="Path to the dataset file")],
    model: Annotated[str, typer.Option(..., help="Model to use for the batch job")],
    location: Annotated[
        str,
        typer.Option(..., help="GCP location where to run the batch job"),
    ] = "europe-west4",
):
    """Launch a Gemini Batch Inference job."""
    # Thin CLI wrapper: every option is forwarded as-is to
    # `labelr.google_genai.launch_batch_job`. The command is registered under
    # an explicit name because the function itself carries a `_command`
    # suffix to avoid clashing with the imported helper.
    launch_batch_job(
        run_name=run_name,
        dataset_path=dataset_path,
        model=model,
        location=location,
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@app.command()
def upload_training_dataset_from_predictions(
    prediction_path: Annotated[
        Path,
        typer.Argument(
            ...,
            help="Path to the prediction JSONL file generated by Google Inference Batch",
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    instructions_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to the file with the instruction prompt for the model",
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    json_schema_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to the file with the JSON schema to follow",
            # Same validation as the two other input files, so that a wrong
            # path is reported as a usage error instead of a traceback.
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    repo_id: Annotated[
        str, typer.Option(help="Hugging Face Datasets repository ID to push to")
    ],
    revision: Annotated[
        str,
        typer.Option(
            help="Revision (branch, tag or commit) to use for the Hugging Face Datasets repository"
        ),
    ] = "main",
    is_openfoodfacts_dataset: Annotated[
        bool, typer.Option(..., help="Whether this is an Open Food Facts dataset")
    ] = False,
    openfoodfacts_flavor: Annotated[
        Flavor,
        typer.Option(
            ...,
            help="Open Food Facts flavor of the dataset (if applicable)",
        ),
    ] = Flavor.off,
    split: Annotated[str, typer.Option(..., help="Name of the split")] = "train",
    tmp_dir: Annotated[
        Path | None,
        typer.Option(
            ...,
            help="Temporary directory to use for intermediate files, default to a temporary directory "
            "generated automatically. This is useful to relaunch the command if it fails midway.",
        ),
    ] = None,
    skip: Annotated[int, typer.Option(..., help="Number of samples to skip")] = 0,
    limit: Annotated[
        int | None,
        typer.Option(
            ..., help="Limit number of samples to process, or None for no limit"
        ),
    ] = None,
    raise_on_invalid_sample: Annotated[
        bool,
        typer.Option(
            ...,
            help="Whether to raise an error on invalid samples instead of skipping them",
        ),
    ] = False,
    image_max_size: Annotated[
        int | None,
        typer.Option(
            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
        ),
    ] = None,
):
    """Upload a training dataset to a Hugging Face Datasets repository from a
    Gemini batch prediction file."""
    import tempfile

    import orjson
    from huggingface_hub import HfApi

    from labelr.export.llm import export_to_hf_llm_image_extraction
    from labelr.google_genai import generate_sample_iter

    # Read both configuration inputs up front so that an unreadable or
    # invalid file fails before any sample is processed.
    instructions = instructions_path.read_text()
    typer.echo(f"Instructions: {instructions}")
    # JSON is parsed from raw bytes to avoid a locale-dependent decoding step.
    json_schema = orjson.loads(json_schema_path.read_bytes())

    sample_iter = generate_sample_iter(
        prediction_path=prediction_path,
        json_schema=json_schema,
        is_openfoodfacts_dataset=is_openfoodfacts_dataset,
        openfoodfacts_flavor=openfoodfacts_flavor,
        skip=skip,
        limit=limit,
        raise_on_invalid_sample=raise_on_invalid_sample,
    )
    export_to_hf_llm_image_extraction(
        sample_iter=sample_iter,
        split=split,
        repo_id=repo_id,
        revision=revision,
        tmp_dir=tmp_dir,
        image_max_size=image_max_size,
    )

    # The config (instructions + JSON schema) is stored next to the data, on
    # the SAME revision as the dataset; without `revision` it would always be
    # committed to the default branch.
    # NOTE(review): the upload is done after the export on the assumption
    # that `Dataset.push_to_hub` creates the target branch when it does not
    # exist yet - confirm against the installed `datasets` version.
    config = {
        "instructions": instructions,
        "json_schema": json_schema,
    }
    api = HfApi()
    with tempfile.TemporaryDirectory() as config_tmp_dir_str:
        config_path = Path(config_tmp_dir_str) / "config.json"
        # orjson emits UTF-8 bytes: write them verbatim rather than decoding
        # and re-encoding with the platform default encoding.
        config_path.write_bytes(orjson.dumps(config, option=orjson.OPT_INDENT_2))
        api.upload_file(
            path_or_fileobj=config_path,
            path_in_repo="config.json",
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        )
|
labelr/apps/label_studio.py
CHANGED
|
@@ -398,7 +398,7 @@ def create_dataset_file(
|
|
|
398
398
|
from openfoodfacts.images import extract_barcode_from_url, extract_source_from_url
|
|
399
399
|
from openfoodfacts.utils import get_image_from_url
|
|
400
400
|
|
|
401
|
-
from labelr.sample import format_object_detection_sample_to_ls
|
|
401
|
+
from labelr.sample.object_detection import format_object_detection_sample_to_ls
|
|
402
402
|
|
|
403
403
|
logger.info("Loading dataset: %s", input_file)
|
|
404
404
|
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import logging
|
|
3
|
+
import pickle
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import datasets
|
|
8
|
+
from openfoodfacts.images import generate_image_url
|
|
9
|
+
from openfoodfacts.types import Flavor
|
|
10
|
+
from PIL import Image, ImageOps
|
|
11
|
+
|
|
12
|
+
from labelr.export.common import _pickle_sample_generator
|
|
13
|
+
from labelr.sample.classification import HF_DS_CLASSIFICATION_FEATURES
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def export_from_ultralytics_to_hf_classification(
    dataset_dir: Path,
    repo_id: str,
    label_names: list[str],
    merge_labels: bool = False,
    is_openfoodfacts_dataset: bool = False,
    openfoodfacts_flavor: Flavor = Flavor.off,
) -> None:
    """Export an Ultralytics classification dataset to a Hugging Face dataset.

    The Ultralytics dataset directory should contain 'train', 'val' and/or
    'test' subdirectories, each containing subdirectories for each label.
    Each existing split is pushed to the Hub as a separate split; missing
    split directories are skipped.

    Args:
        dataset_dir (Path): Path to the Ultralytics dataset directory.
        repo_id (str): Hugging Face repository ID to push the dataset to.
        label_names (list[str]): List of label names. The position of a label
            in this list is used as its `category_id`.
        merge_labels (bool): Whether to merge all labels into a single label
            named 'object'.
        is_openfoodfacts_dataset (bool): Whether the dataset is from
            Open Food Facts. If True, the `off_image_id` and `image_url` will
            be generated automatically. `off_image_id` is extracted from the
            image filename (expected as `{barcode}_{off_image_id}...`).
        openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
            is ignored if `is_openfoodfacts_dataset` is False.

    Raises:
        ValueError: if `dataset_dir` contains none of the expected split
            directories, or if a label directory name is not in
            `label_names`.
    """
    logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)

    splits = ["train", "val", "test"]
    if not any((dataset_dir / split).is_dir() for split in splits):
        raise ValueError(
            f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
        )

    for split in splits:
        split_dir = dataset_dir / split

        if not split_dir.is_dir():
            logger.info("Skipping missing split directory: %s", split_dir)
            continue

        # One temporary directory per split: every sample is first saved as a
        # pickle, then the HF dataset is built from these pickles.
        with tempfile.TemporaryDirectory() as tmp_dir_str:
            tmp_dir = Path(tmp_dir_str)
            for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
                label_name = label_dir.name
                if merge_labels:
                    label_name = "object"
                if label_name not in label_names:
                    raise ValueError(
                        "Label name %s not in provided label names (label names: %s)"
                        % (label_name, label_names),
                    )
                label_id = label_names.index(label_name)

                for image_path in label_dir.glob("*"):
                    # `glob("*")` also yields sub-directories, which cannot be
                    # opened as images: only regular files are processed.
                    if not image_path.is_file():
                        continue

                    if is_openfoodfacts_dataset:
                        image_stem_parts = image_path.stem.split("_")
                        barcode = image_stem_parts[0]
                        off_image_id = image_stem_parts[1]
                        image_id = f"{barcode}_{off_image_id}"
                        image_url = generate_image_url(
                            barcode, off_image_id, flavor=openfoodfacts_flavor
                        )
                    else:
                        image_id = image_path.stem
                        barcode = ""
                        off_image_id = ""
                        image_url = ""
                    image = Image.open(image_path)
                    image.load()

                    if image.mode != "RGB":
                        image = image.convert("RGB")

                    # Rotate image according to exif orientation using Pillow.
                    # Done before reading width/height so that the stored
                    # dimensions match the stored (rotated) image.
                    ImageOps.exif_transpose(image, in_place=True)
                    sample = {
                        "image_id": image_id,
                        "image": image,
                        "width": image.width,
                        "height": image.height,
                        "meta": {
                            "barcode": barcode,
                            "off_image_id": off_image_id,
                            "image_url": image_url,
                        },
                        "category_id": label_id,
                        "category_name": label_name,
                    }
                    # Save output as pickle
                    with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
                        pickle.dump(sample, f)

            # Built and pushed inside the `with` block: the pickles must still
            # exist while the generator is consumed.
            hf_ds = datasets.Dataset.from_generator(
                functools.partial(_pickle_sample_generator, tmp_dir),
                features=HF_DS_CLASSIFICATION_FEATURES,
            )
            hf_ds.push_to_hub(repo_id, split=split)
|
labelr/export/common.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from openfoodfacts.types import Flavor
|
|
5
|
+
|
|
6
|
+
from labelr.types import TaskType
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _pickle_sample_generator(dir: Path):
    """Generator that yields samples from pickles in a directory.

    Only the `*.pkl` files located directly in `dir` are read. They are
    visited in sorted path order so that the resulting dataset has a
    reproducible row order (`Path.glob` alone gives no ordering guarantee).

    Args:
        dir (Path): directory containing the pickled samples.

    Yields:
        The unpickled content of each `*.pkl` file.
    """
    # NOTE: pickle can execute arbitrary code on load. `dir` is meant to be a
    # directory populated by labelr itself - never point it at untrusted
    # files.
    for pkl in sorted(dir.glob("*.pkl")):
        with open(pkl, "rb") as f:
            yield pickle.load(f)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def export_from_ultralytics_to_hf(
    task_type: TaskType,
    dataset_dir: Path,
    repo_id: str,
    label_names: list[str],
    merge_labels: bool = False,
    is_openfoodfacts_dataset: bool = False,
    openfoodfacts_flavor: Flavor = Flavor.off,
) -> None:
    """Export an Ultralytics dataset to Hugging Face, depending on the task.

    Only the classification task is handled: the call is delegated to
    `export_from_ultralytics_to_hf_classification` with all the other
    arguments forwarded unchanged.

    Args:
        task_type (TaskType): type of task the dataset was built for.
        dataset_dir (Path): Path to the Ultralytics dataset directory.
        repo_id (str): Hugging Face repository ID to push the dataset to.
        label_names (list[str]): List of label names.
        merge_labels (bool): forwarded to the task-specific exporter.
        is_openfoodfacts_dataset (bool): forwarded to the task-specific
            exporter.
        openfoodfacts_flavor (Flavor): forwarded to the task-specific
            exporter.

    Raises:
        NotImplementedError: if `task_type` is not `TaskType.classification`.
    """
    # Imported here and not at module level: `labelr.export.classification`
    # itself imports from this module.
    from labelr.export.classification import (
        export_from_ultralytics_to_hf_classification,
    )

    # Guard clause: past this point the task is necessarily classification.
    if task_type != TaskType.classification:
        raise NotImplementedError(
            "Only classification task is currently supported for Ultralytics to HF export"
        )

    export_from_ultralytics_to_hf_classification(
        dataset_dir=dataset_dir,
        repo_id=repo_id,
        label_names=label_names,
        merge_labels=merge_labels,
        is_openfoodfacts_dataset=is_openfoodfacts_dataset,
        openfoodfacts_flavor=openfoodfacts_flavor,
    )
|
labelr/export/llm.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import logging
|
|
3
|
+
import pickle
|
|
4
|
+
import tempfile
|
|
5
|
+
import typing
|
|
6
|
+
from collections.abc import Iterator
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import datasets
|
|
10
|
+
import tqdm
|
|
11
|
+
from PIL import Image, ImageOps
|
|
12
|
+
|
|
13
|
+
from labelr.export.common import _pickle_sample_generator
|
|
14
|
+
from labelr.sample.llm import (
|
|
15
|
+
HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
|
|
16
|
+
LLMImageExtractionSample,
|
|
17
|
+
)
|
|
18
|
+
from labelr.utils import PathWithContext
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def export_to_hf_llm_image_extraction(
    sample_iter: Iterator[LLMImageExtractionSample],
    split: str,
    repo_id: str,
    revision: str = "main",
    tmp_dir: Path | None = None,
    image_max_size: int | None = None,
) -> None:
    """Export LLM image extraction samples to a Hugging Face dataset.

    Every sample is first saved as a `{split}_{image_id}.pkl` file in a
    working directory, then the dataset is built from the pickles of this
    split and pushed to the Hub.

    Args:
        sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
            to export.
        split (str): Name of the dataset split (e.g., 'train', 'val').
        repo_id (str): Hugging Face repository ID to push the dataset to.
        revision (str): Revision (branch, tag or commit) to use for the
            Hugging Face Datasets repository.
        tmp_dir (Path | None): Temporary directory to use for intermediate
            files. If None, a temporary directory will be created
            automatically. When provided, it is created if missing and
            pickles of `split` already stored in it are exported as well.
        image_max_size (int | None): Maximum size (in pixels) for the images.
            Larger images are downscaled, keeping their aspect ratio. If
            None, no resizing is performed.
    """
    logger.info(
        "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s, image_max_size: %s",
        repo_id,
        revision,
        split,
        tmp_dir,
        image_max_size,
    )

    tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
    if tmp_dir is not None:
        tmp_dir.mkdir(parents=True, exist_ok=True)
        tmp_dir_with_context = PathWithContext(tmp_dir)
    else:
        tmp_dir_with_context = tempfile.TemporaryDirectory()

    with tmp_dir_with_context as tmp_dir_str:
        tmp_dir = Path(tmp_dir_str)
        for sample in tqdm.tqdm(sample_iter, desc="samples"):
            image = sample.image
            # Rotate image according to exif orientation using Pillow
            image = typing.cast(Image.Image, ImageOps.exif_transpose(image))

            if image_max_size is not None:
                if image.height > image_max_size or image.width > image_max_size:
                    # thumbnail() resizes in place and preserves aspect ratio
                    image.thumbnail(
                        (image_max_size, image_max_size),
                        Image.Resampling.LANCZOS,
                    )
            image_id = sample.image_id
            json_sample = {
                "image_id": image_id,
                "image": image,
                "meta": {
                    k: v for k, v in sample.meta.model_dump().items() if v is not None
                },
                "output": sample.output,
            }
            # Save output as pickle
            with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
                pickle.dump(json_sample, f)

        # Only the pickles of the current split are read back: a
        # user-provided `tmp_dir` may be shared between runs and contain
        # pickles written for other splits.
        hf_ds = datasets.Dataset.from_generator(
            functools.partial(_split_pickle_sample_generator, tmp_dir, split),
            features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
        )
        hf_ds.push_to_hub(repo_id, split=split, revision=revision)


def _split_pickle_sample_generator(directory: Path, split: str):
    """Yield the pickled samples of `split` stored in `directory`.

    Files are selected on the `{split}_` file name prefix used when writing
    them, and read in sorted path order for a reproducible row order.
    """
    prefix = f"{split}_"
    for pkl in sorted(directory.glob("*.pkl")):
        if not pkl.name.startswith(prefix):
            continue
        with open(pkl, "rb") as f:
            yield pickle.load(f)
|