labelr 0.3.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {labelr-0.3.0/src/labelr.egg-info → labelr-0.4.1}/PKG-INFO +1 -1
- {labelr-0.3.0 → labelr-0.4.1}/pyproject.toml +1 -1
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/apps/datasets.py +87 -10
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/apps/projects.py +3 -3
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/export.py +172 -21
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/sample.py +32 -10
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/types.py +1 -0
- {labelr-0.3.0 → labelr-0.4.1/src/labelr.egg-info}/PKG-INFO +1 -1
- {labelr-0.3.0 → labelr-0.4.1}/LICENSE +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/README.md +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/setup.cfg +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/__init__.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/__main__.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/annotate.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/apps/__init__.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/apps/users.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/check.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/config.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/main.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr/project_config.py +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr.egg-info/SOURCES.txt +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr.egg-info/dependency_links.txt +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr.egg-info/entry_points.txt +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr.egg-info/requires.txt +0 -0
- {labelr-0.3.0 → labelr-0.4.1}/src/labelr.egg-info/top_level.txt +0 -0
|
@@ -6,8 +6,11 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Annotated, Optional
|
|
7
7
|
|
|
8
8
|
import typer
|
|
9
|
+
from openfoodfacts import Flavor
|
|
9
10
|
from openfoodfacts.utils import get_logger
|
|
10
11
|
|
|
12
|
+
from labelr.export import export_from_ultralytics_to_hf
|
|
13
|
+
|
|
11
14
|
from ..config import LABEL_STUDIO_DEFAULT_URL
|
|
12
15
|
from ..types import ExportDestination, ExportSource, TaskType
|
|
13
16
|
|
|
@@ -130,6 +133,9 @@ def export(
|
|
|
130
133
|
from_: Annotated[ExportSource, typer.Option("--from", help="Input source to use")],
|
|
131
134
|
to: Annotated[ExportDestination, typer.Option(help="Where to export the data")],
|
|
132
135
|
api_key: Annotated[Optional[str], typer.Option(envvar="LABEL_STUDIO_API_KEY")],
|
|
136
|
+
task_type: Annotated[
|
|
137
|
+
TaskType, typer.Option(help="Type of task to export")
|
|
138
|
+
] = TaskType.object_detection,
|
|
133
139
|
repo_id: Annotated[
|
|
134
140
|
Optional[str],
|
|
135
141
|
typer.Option(
|
|
@@ -148,12 +154,33 @@ def export(
|
|
|
148
154
|
Optional[Path],
|
|
149
155
|
typer.Option(help="Path to the output directory", file_okay=False),
|
|
150
156
|
] = None,
|
|
157
|
+
dataset_dir: Annotated[
|
|
158
|
+
Optional[Path],
|
|
159
|
+
typer.Option(help="Path to the dataset directory, only for Ultralytics source"),
|
|
160
|
+
] = None,
|
|
151
161
|
download_images: Annotated[
|
|
152
162
|
bool,
|
|
153
163
|
typer.Option(
|
|
154
164
|
help="if True, don't use HF images and download images from the server"
|
|
155
165
|
),
|
|
156
166
|
] = False,
|
|
167
|
+
is_openfoodfacts_dataset: Annotated[
|
|
168
|
+
bool,
|
|
169
|
+
typer.Option(
|
|
170
|
+
help="Whether the Ultralytics dataset is an OpenFoodFacts dataset, only "
|
|
171
|
+
"for Ultralytics source. This is used to generate the correct image URLs "
|
|
172
|
+
"each image name."
|
|
173
|
+
),
|
|
174
|
+
] = True,
|
|
175
|
+
openfoodfacts_flavor: Annotated[
|
|
176
|
+
Flavor,
|
|
177
|
+
typer.Option(
|
|
178
|
+
help="Flavor of the Open Food Facts dataset to use for image URLs, only "
|
|
179
|
+
"for Ultralytics source if is_openfoodfacts_dataset is True. This is used to "
|
|
180
|
+
"generate the correct image URLs each image name. This option is ignored if "
|
|
181
|
+
"is_openfoodfacts_dataset is False."
|
|
182
|
+
),
|
|
183
|
+
] = Flavor.off,
|
|
157
184
|
train_ratio: Annotated[
|
|
158
185
|
float,
|
|
159
186
|
typer.Option(
|
|
@@ -167,20 +194,38 @@ def export(
|
|
|
167
194
|
help="Raise an error if an image download fails, only for Ultralytics"
|
|
168
195
|
),
|
|
169
196
|
] = True,
|
|
197
|
+
use_aws_cache: Annotated[
|
|
198
|
+
bool,
|
|
199
|
+
typer.Option(
|
|
200
|
+
help="Use the AWS S3 cache for image downloads instead of images.openfoodfacts.org, "
|
|
201
|
+
"it is ignored if the export format is not Ultralytics"
|
|
202
|
+
),
|
|
203
|
+
] = True,
|
|
204
|
+
merge_labels: Annotated[
|
|
205
|
+
bool,
|
|
206
|
+
typer.Option(help="Merge multiple labels into a single label"),
|
|
207
|
+
] = False,
|
|
170
208
|
):
|
|
171
209
|
"""Export Label Studio annotation, either to Hugging Face Datasets or
|
|
172
210
|
local files (ultralytics format)."""
|
|
173
211
|
from label_studio_sdk.client import LabelStudio
|
|
174
212
|
|
|
175
213
|
from labelr.export import (
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
214
|
+
export_from_hf_to_ultralytics_object_detection,
|
|
215
|
+
export_from_ls_to_hf_object_detection,
|
|
216
|
+
export_from_ls_to_ultralytics_object_detection,
|
|
179
217
|
)
|
|
180
218
|
|
|
181
219
|
if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
|
|
182
220
|
raise typer.BadParameter("Repository ID is required for export/import with HF")
|
|
183
221
|
|
|
222
|
+
if from_ == ExportSource.ultralytics and dataset_dir is None:
|
|
223
|
+
raise typer.BadParameter(
|
|
224
|
+
"Dataset directory is required for export from Ultralytics source"
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
label_names_list: list[str] | None = None
|
|
228
|
+
|
|
184
229
|
if label_names is None:
|
|
185
230
|
if to == ExportDestination.hf:
|
|
186
231
|
raise typer.BadParameter("Label names are required for HF export")
|
|
@@ -188,6 +233,9 @@ def export(
|
|
|
188
233
|
raise typer.BadParameter(
|
|
189
234
|
"Label names are required for export from LS source"
|
|
190
235
|
)
|
|
236
|
+
else:
|
|
237
|
+
label_names = typing.cast(str, label_names)
|
|
238
|
+
label_names_list = label_names.split(",")
|
|
191
239
|
|
|
192
240
|
if from_ == ExportSource.ls:
|
|
193
241
|
if project_id is None:
|
|
@@ -199,31 +247,60 @@ def export(
|
|
|
199
247
|
raise typer.BadParameter("Output directory is required for Ultralytics export")
|
|
200
248
|
|
|
201
249
|
if from_ == ExportSource.ls:
|
|
250
|
+
if task_type != TaskType.object_detection:
|
|
251
|
+
raise typer.BadParameter(
|
|
252
|
+
"Only object detection task is currently supported with LS source"
|
|
253
|
+
)
|
|
202
254
|
ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
|
|
203
|
-
label_names = typing.cast(str, label_names)
|
|
204
|
-
label_names_list = label_names.split(",")
|
|
205
255
|
if to == ExportDestination.hf:
|
|
206
256
|
repo_id = typing.cast(str, repo_id)
|
|
207
|
-
|
|
208
|
-
ls,
|
|
257
|
+
export_from_ls_to_hf_object_detection(
|
|
258
|
+
ls,
|
|
259
|
+
repo_id=repo_id,
|
|
260
|
+
label_names=typing.cast(list[str], label_names_list),
|
|
261
|
+
project_id=typing.cast(int, project_id),
|
|
262
|
+
merge_labels=merge_labels,
|
|
263
|
+
use_aws_cache=use_aws_cache,
|
|
209
264
|
)
|
|
210
265
|
elif to == ExportDestination.ultralytics:
|
|
211
|
-
|
|
266
|
+
export_from_ls_to_ultralytics_object_detection(
|
|
212
267
|
ls,
|
|
213
268
|
typing.cast(Path, output_dir),
|
|
214
|
-
label_names_list,
|
|
269
|
+
typing.cast(list[str], label_names_list),
|
|
215
270
|
typing.cast(int, project_id),
|
|
216
271
|
train_ratio=train_ratio,
|
|
217
272
|
error_raise=error_raise,
|
|
273
|
+
merge_labels=merge_labels,
|
|
274
|
+
use_aws_cache=use_aws_cache,
|
|
218
275
|
)
|
|
219
276
|
|
|
220
277
|
elif from_ == ExportSource.hf:
|
|
278
|
+
if task_type != TaskType.object_detection:
|
|
279
|
+
raise typer.BadParameter(
|
|
280
|
+
"Only object detection task is currently supported with HF source"
|
|
281
|
+
)
|
|
221
282
|
if to == ExportDestination.ultralytics:
|
|
222
|
-
|
|
283
|
+
export_from_hf_to_ultralytics_object_detection(
|
|
223
284
|
typing.cast(str, repo_id),
|
|
224
285
|
typing.cast(Path, output_dir),
|
|
225
286
|
download_images=download_images,
|
|
226
287
|
error_raise=error_raise,
|
|
288
|
+
use_aws_cache=use_aws_cache,
|
|
227
289
|
)
|
|
228
290
|
else:
|
|
229
291
|
raise typer.BadParameter("Unsupported export format")
|
|
292
|
+
elif from_ == ExportSource.ultralytics:
|
|
293
|
+
if task_type != TaskType.classification:
|
|
294
|
+
raise typer.BadParameter(
|
|
295
|
+
"Only classification task is currently supported with Ultralytics source"
|
|
296
|
+
)
|
|
297
|
+
if to == ExportDestination.hf:
|
|
298
|
+
export_from_ultralytics_to_hf(
|
|
299
|
+
task_type=task_type,
|
|
300
|
+
dataset_dir=typing.cast(Path, dataset_dir),
|
|
301
|
+
repo_id=typing.cast(str, repo_id),
|
|
302
|
+
merge_labels=merge_labels,
|
|
303
|
+
label_names=typing.cast(list[str], label_names_list),
|
|
304
|
+
is_openfoodfacts_dataset=is_openfoodfacts_dataset,
|
|
305
|
+
openfoodfacts_flavor=openfoodfacts_flavor,
|
|
306
|
+
)
|
|
@@ -90,6 +90,8 @@ def add_split(
|
|
|
90
90
|
train_split: Annotated[
|
|
91
91
|
float, typer.Option(help="fraction of samples to add in train split")
|
|
92
92
|
],
|
|
93
|
+
api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
|
|
94
|
+
project_id: Annotated[int, typer.Option(help="Label Studio project ID")],
|
|
93
95
|
split_name: Annotated[
|
|
94
96
|
Optional[str],
|
|
95
97
|
typer.Option(
|
|
@@ -97,9 +99,7 @@ def add_split(
|
|
|
97
99
|
"with the task ID file. If --task-id-file is not provided, "
|
|
98
100
|
"this field is ignored."
|
|
99
101
|
),
|
|
100
|
-
],
|
|
101
|
-
api_key: Annotated[str, typer.Option(envvar="LABEL_STUDIO_API_KEY")],
|
|
102
|
-
project_id: Annotated[int, typer.Option(help="Label Studio project ID")],
|
|
102
|
+
] = None,
|
|
103
103
|
train_split_name: Annotated[
|
|
104
104
|
str,
|
|
105
105
|
typer.Option(help="name of the train split"),
|
|
@@ -3,16 +3,21 @@ import logging
|
|
|
3
3
|
import pickle
|
|
4
4
|
import random
|
|
5
5
|
import tempfile
|
|
6
|
-
import typing
|
|
7
6
|
from pathlib import Path
|
|
8
7
|
|
|
9
8
|
import datasets
|
|
10
9
|
import tqdm
|
|
11
10
|
from label_studio_sdk.client import LabelStudio
|
|
12
|
-
from openfoodfacts.images import download_image
|
|
13
|
-
from
|
|
11
|
+
from openfoodfacts.images import download_image, generate_image_url
|
|
12
|
+
from openfoodfacts.types import Flavor
|
|
13
|
+
from PIL import Image, ImageOps
|
|
14
14
|
|
|
15
|
-
from labelr.sample import
|
|
15
|
+
from labelr.sample import (
|
|
16
|
+
HF_DS_CLASSIFICATION_FEATURES,
|
|
17
|
+
HF_DS_OBJECT_DETECTION_FEATURES,
|
|
18
|
+
format_object_detection_sample_to_hf,
|
|
19
|
+
)
|
|
20
|
+
from labelr.types import TaskType
|
|
16
21
|
|
|
17
22
|
logger = logging.getLogger(__name__)
|
|
18
23
|
|
|
@@ -24,13 +29,18 @@ def _pickle_sample_generator(dir: Path):
|
|
|
24
29
|
yield pickle.load(f)
|
|
25
30
|
|
|
26
31
|
|
|
27
|
-
def
|
|
32
|
+
def export_from_ls_to_hf_object_detection(
|
|
28
33
|
ls: LabelStudio,
|
|
29
34
|
repo_id: str,
|
|
30
|
-
|
|
35
|
+
label_names: list[str],
|
|
31
36
|
project_id: int,
|
|
37
|
+
merge_labels: bool = False,
|
|
38
|
+
use_aws_cache: bool = True,
|
|
32
39
|
):
|
|
33
|
-
|
|
40
|
+
if merge_labels:
|
|
41
|
+
label_names = ["object"]
|
|
42
|
+
|
|
43
|
+
logger.info("Project ID: %d, label names: %s", project_id, label_names)
|
|
34
44
|
|
|
35
45
|
for split in ["train", "val"]:
|
|
36
46
|
logger.info("Processing split: %s", split)
|
|
@@ -45,7 +55,11 @@ def export_from_ls_to_hf(
|
|
|
45
55
|
if task.data["split"] != split:
|
|
46
56
|
continue
|
|
47
57
|
sample = format_object_detection_sample_to_hf(
|
|
48
|
-
task.data,
|
|
58
|
+
task_data=task.data,
|
|
59
|
+
annotations=task.annotations,
|
|
60
|
+
label_names=label_names,
|
|
61
|
+
merge_labels=merge_labels,
|
|
62
|
+
use_aws_cache=use_aws_cache,
|
|
49
63
|
)
|
|
50
64
|
if sample is not None:
|
|
51
65
|
# Save output as pickle
|
|
@@ -54,18 +68,20 @@ def export_from_ls_to_hf(
|
|
|
54
68
|
|
|
55
69
|
hf_ds = datasets.Dataset.from_generator(
|
|
56
70
|
functools.partial(_pickle_sample_generator, tmp_dir),
|
|
57
|
-
features=
|
|
71
|
+
features=HF_DS_OBJECT_DETECTION_FEATURES,
|
|
58
72
|
)
|
|
59
73
|
hf_ds.push_to_hub(repo_id, split=split)
|
|
60
74
|
|
|
61
75
|
|
|
62
|
-
def
|
|
76
|
+
def export_from_ls_to_ultralytics_object_detection(
|
|
63
77
|
ls: LabelStudio,
|
|
64
78
|
output_dir: Path,
|
|
65
|
-
|
|
79
|
+
label_names: list[str],
|
|
66
80
|
project_id: int,
|
|
67
81
|
train_ratio: float = 0.8,
|
|
68
82
|
error_raise: bool = True,
|
|
83
|
+
merge_labels: bool = False,
|
|
84
|
+
use_aws_cache: bool = True,
|
|
69
85
|
):
|
|
70
86
|
"""Export annotations from a Label Studio project to the Ultralytics
|
|
71
87
|
format.
|
|
@@ -73,7 +89,9 @@ def export_from_ls_to_ultralytics(
|
|
|
73
89
|
The Label Studio project should be an object detection project with a
|
|
74
90
|
single rectanglelabels annotation result per task.
|
|
75
91
|
"""
|
|
76
|
-
|
|
92
|
+
if merge_labels:
|
|
93
|
+
label_names = ["object"]
|
|
94
|
+
logger.info("Project ID: %d, label names: %s", project_id, label_names)
|
|
77
95
|
|
|
78
96
|
data_dir = output_dir / "data"
|
|
79
97
|
data_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -146,25 +164,30 @@ def export_from_ls_to_ultralytics(
|
|
|
146
164
|
y_min = value["y"] / 100
|
|
147
165
|
width = value["width"] / 100
|
|
148
166
|
height = value["height"] / 100
|
|
149
|
-
|
|
150
|
-
|
|
167
|
+
label_name = (
|
|
168
|
+
label_names[0] if merge_labels else value["rectanglelabels"][0]
|
|
169
|
+
)
|
|
170
|
+
label_id = label_names.index(label_name)
|
|
151
171
|
|
|
152
172
|
# Save the labels in the Ultralytics format:
|
|
153
173
|
# - one label per line
|
|
154
174
|
# - each line is a list of 5 elements:
|
|
155
|
-
# -
|
|
175
|
+
# - label_id
|
|
156
176
|
# - x_center
|
|
157
177
|
# - y_center
|
|
158
178
|
# - width
|
|
159
179
|
# - height
|
|
160
180
|
x_center = x_min + width / 2
|
|
161
181
|
y_center = y_min + height / 2
|
|
162
|
-
f.write(f"{
|
|
182
|
+
f.write(f"{label_id} {x_center} {y_center} {width} {height}\n")
|
|
163
183
|
has_valid_annotation = True
|
|
164
184
|
|
|
165
185
|
if has_valid_annotation:
|
|
166
186
|
download_output = download_image(
|
|
167
|
-
image_url,
|
|
187
|
+
image_url,
|
|
188
|
+
return_struct=True,
|
|
189
|
+
error_raise=error_raise,
|
|
190
|
+
use_cache=use_aws_cache,
|
|
168
191
|
)
|
|
169
192
|
if download_output is None:
|
|
170
193
|
logger.error("Failed to download image: %s", image_url)
|
|
@@ -179,15 +202,16 @@ def export_from_ls_to_ultralytics(
|
|
|
179
202
|
f.write("val: images/val\n")
|
|
180
203
|
f.write("test:\n")
|
|
181
204
|
f.write("names:\n")
|
|
182
|
-
for i,
|
|
183
|
-
f.write(f" {i}: {
|
|
205
|
+
for i, label_name in enumerate(label_names):
|
|
206
|
+
f.write(f" {i}: {label_name}\n")
|
|
184
207
|
|
|
185
208
|
|
|
186
|
-
def
|
|
209
|
+
def export_from_hf_to_ultralytics_object_detection(
|
|
187
210
|
repo_id: str,
|
|
188
211
|
output_dir: Path,
|
|
189
212
|
download_images: bool = True,
|
|
190
213
|
error_raise: bool = True,
|
|
214
|
+
use_aws_cache: bool = True,
|
|
191
215
|
):
|
|
192
216
|
"""Export annotations from a Hugging Face dataset project to the
|
|
193
217
|
Ultralytics format.
|
|
@@ -213,7 +237,10 @@ def export_from_hf_to_ultralytics(
|
|
|
213
237
|
|
|
214
238
|
if download_images:
|
|
215
239
|
download_output = download_image(
|
|
216
|
-
image_url,
|
|
240
|
+
image_url,
|
|
241
|
+
return_struct=True,
|
|
242
|
+
error_raise=error_raise,
|
|
243
|
+
use_cache=use_aws_cache,
|
|
217
244
|
)
|
|
218
245
|
if download_output is None:
|
|
219
246
|
logger.error("Failed to download image: %s", image_url)
|
|
@@ -266,3 +293,127 @@ def export_from_hf_to_ultralytics(
|
|
|
266
293
|
f.write("names:\n")
|
|
267
294
|
for i, category_name in enumerate(category_names):
|
|
268
295
|
f.write(f" {i}: {category_name}\n")
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def export_from_ultralytics_to_hf(
    task_type: TaskType,
    dataset_dir: Path,
    repo_id: str,
    label_names: list[str],
    merge_labels: bool = False,
    is_openfoodfacts_dataset: bool = False,
    openfoodfacts_flavor: Flavor = Flavor.off,
) -> None:
    """Push an Ultralytics dataset to the Hugging Face Hub.

    This is a thin dispatcher: it selects the task-specific exporter based on
    `task_type` and forwards every other argument unchanged.

    Args:
        task_type (TaskType): Kind of dataset to export. Only
            `TaskType.classification` is handled.
        dataset_dir (Path): Path to the Ultralytics dataset directory.
        repo_id (str): Hugging Face repository ID to push the dataset to.
        label_names (list[str]): List of label names.
        merge_labels (bool): Forwarded to the task-specific exporter.
        is_openfoodfacts_dataset (bool): Forwarded to the task-specific
            exporter.
        openfoodfacts_flavor (Flavor): Forwarded to the task-specific
            exporter.

    Raises:
        NotImplementedError: If `task_type` is not
            `TaskType.classification`.
    """
    if task_type == TaskType.classification:
        export_from_ultralytics_to_hf_classification(
            dataset_dir=dataset_dir,
            repo_id=repo_id,
            label_names=label_names,
            merge_labels=merge_labels,
            is_openfoodfacts_dataset=is_openfoodfacts_dataset,
            openfoodfacts_flavor=openfoodfacts_flavor,
        )
        return

    # Any other task type has no exporter yet.
    raise NotImplementedError(
        "Only classification task is currently supported for Ultralytics to HF export"
    )
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def export_from_ultralytics_to_hf_classification(
    dataset_dir: Path,
    repo_id: str,
    label_names: list[str],
    merge_labels: bool = False,
    is_openfoodfacts_dataset: bool = False,
    openfoodfacts_flavor: Flavor = Flavor.off,
) -> None:
    """Export an Ultralytics classification dataset to a Hugging Face dataset.

    The Ultralytics dataset directory should contain 'train', 'val' and/or
    'test' subdirectories, each containing subdirectories for each label.
    Every existing split is pushed to `repo_id` as a split of the same name;
    missing split directories are skipped.

    Args:
        dataset_dir (Path): Path to the Ultralytics dataset directory.
        repo_id (str): Hugging Face repository ID to push the dataset to.
        label_names (list[str]): List of label names. The position of a name
            in this list is used as its `category_id`. Ignored (replaced by
            `["object"]`) when `merge_labels` is True.
        merge_labels (bool): Whether to merge all labels into a single label
            named 'object'.
        is_openfoodfacts_dataset (bool): Whether the dataset is from
            Open Food Facts. If True, the `off_image_id` and `image_url` will
            be generated automatically. `barcode` and `off_image_id` are
            extracted from the image filename, which must look like
            `<barcode>_<off_image_id>[_...]`.
        openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
            is ignored if `is_openfoodfacts_dataset` is False.

    Raises:
        ValueError: If `dataset_dir` contains none of the expected split
            directories, if a label directory name is not in `label_names`,
            or if an Open Food Facts image file name does not contain both a
            barcode and an image ID.
    """
    if merge_labels:
        # When merging, every sample gets the single label "object", so the
        # label list must be exactly that; otherwise the membership check
        # below would reject the merged label.
        label_names = ["object"]

    logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)

    if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
        raise ValueError(
            f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
        )

    for split in ["train", "val", "test"]:
        split_dir = dataset_dir / split

        if not split_dir.is_dir():
            logger.info("Skipping missing split directory: %s", split_dir)
            continue

        # Samples are first written as pickle files in a temporary directory,
        # then streamed into a Hugging Face dataset by the pickle generator.
        with tempfile.TemporaryDirectory() as tmp_dir_str:
            tmp_dir = Path(tmp_dir_str)
            for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
                label_name = "object" if merge_labels else label_dir.name
                if label_name not in label_names:
                    raise ValueError(
                        "Label name %s not in provided label names (label names: %s)"
                        % (label_name, label_names),
                    )
                label_id = label_names.index(label_name)

                for image_path in label_dir.glob("*"):
                    if not image_path.is_file():
                        # Nested directories are not images.
                        continue

                    if is_openfoodfacts_dataset:
                        image_stem_parts = image_path.stem.split("_")
                        if len(image_stem_parts) < 2:
                            raise ValueError(
                                "Cannot extract barcode and image ID from file "
                                f"name {image_path.name!r}: expected "
                                "'<barcode>_<off_image_id>'"
                            )
                        barcode, off_image_id = image_stem_parts[:2]
                        image_id = f"{barcode}_{off_image_id}"
                        image_url = generate_image_url(
                            barcode, off_image_id, flavor=openfoodfacts_flavor
                        )
                    else:
                        image_id = image_path.stem
                        barcode = ""
                        off_image_id = ""
                        image_url = ""

                    image = Image.open(image_path)
                    image.load()

                    if image.mode != "RGB":
                        image = image.convert("RGB")

                    # Rotate image according to exif orientation using Pillow
                    ImageOps.exif_transpose(image, in_place=True)
                    sample = {
                        "image_id": image_id,
                        "image": image,
                        # Read after the EXIF transpose so the dimensions
                        # match the stored image.
                        "width": image.width,
                        "height": image.height,
                        "meta": {
                            "barcode": barcode,
                            "off_image_id": off_image_id,
                            "image_url": image_url,
                        },
                        "category_id": label_id,
                        "category_name": label_name,
                    }
                    # The label directory and the full file name are part of
                    # the pickle name so that images sharing a stem (in two
                    # class folders, or with different extensions) do not
                    # overwrite each other.
                    pickle_path = (
                        tmp_dir / f"{split}_{label_dir.name}_{image_path.name}.pkl"
                    )
                    with open(pickle_path, "wb") as f:
                        pickle.dump(sample, f)

            hf_ds = datasets.Dataset.from_generator(
                functools.partial(_pickle_sample_generator, tmp_dir),
                features=HF_DS_CLASSIFICATION_FEATURES,
            )
            hf_ds.push_to_hub(repo_id, split=split)
|
|
@@ -145,7 +145,11 @@ def format_object_detection_sample_to_ls(
|
|
|
145
145
|
|
|
146
146
|
|
|
147
147
|
def format_object_detection_sample_to_hf(
|
|
148
|
-
task_data: dict,
|
|
148
|
+
task_data: dict,
|
|
149
|
+
annotations: list[dict],
|
|
150
|
+
label_names: list[str],
|
|
151
|
+
merge_labels: bool = False,
|
|
152
|
+
use_aws_cache: bool = True,
|
|
149
153
|
) -> dict | None:
|
|
150
154
|
if len(annotations) > 1:
|
|
151
155
|
logger.info("More than one annotation found, skipping")
|
|
@@ -156,8 +160,8 @@ def format_object_detection_sample_to_hf(
|
|
|
156
160
|
|
|
157
161
|
annotation = annotations[0]
|
|
158
162
|
bboxes = []
|
|
159
|
-
|
|
160
|
-
|
|
163
|
+
bbox_label_ids = []
|
|
164
|
+
bbox_label_names = []
|
|
161
165
|
|
|
162
166
|
for annotation_result in annotation["result"]:
|
|
163
167
|
if annotation_result["type"] != "rectanglelabels":
|
|
@@ -171,12 +175,13 @@ def format_object_detection_sample_to_hf(
|
|
|
171
175
|
x_max = x_min + width
|
|
172
176
|
y_max = y_min + height
|
|
173
177
|
bboxes.append([y_min, x_min, y_max, x_max])
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
178
|
+
|
|
179
|
+
label_name = label_names[0] if merge_labels else value["rectanglelabels"][0]
|
|
180
|
+
bbox_label_names.append(label_name)
|
|
181
|
+
bbox_label_ids.append(label_names.index(label_name))
|
|
177
182
|
|
|
178
183
|
image_url = task_data["image_url"]
|
|
179
|
-
image = download_image(image_url, error_raise=False)
|
|
184
|
+
image = download_image(image_url, error_raise=False, use_cache=use_aws_cache)
|
|
180
185
|
if image is None:
|
|
181
186
|
logger.error("Failed to download image: %s", image_url)
|
|
182
187
|
return None
|
|
@@ -193,14 +198,14 @@ def format_object_detection_sample_to_hf(
|
|
|
193
198
|
},
|
|
194
199
|
"objects": {
|
|
195
200
|
"bbox": bboxes,
|
|
196
|
-
"category_id":
|
|
197
|
-
"category_name":
|
|
201
|
+
"category_id": bbox_label_ids,
|
|
202
|
+
"category_name": bbox_label_names,
|
|
198
203
|
},
|
|
199
204
|
}
|
|
200
205
|
|
|
201
206
|
|
|
202
207
|
# The HuggingFace Dataset features
|
|
203
|
-
|
|
208
|
+
HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
|
|
204
209
|
{
|
|
205
210
|
"image_id": datasets.Value("string"),
|
|
206
211
|
"image": datasets.features.Image(),
|
|
@@ -218,3 +223,20 @@ HF_DS_FEATURES = datasets.Features(
|
|
|
218
223
|
},
|
|
219
224
|
}
|
|
220
225
|
)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# Hugging Face dataset schema for image classification samples: one image and
# a single category per row.
HF_DS_CLASSIFICATION_FEATURES = datasets.Features(
    {
        "image_id": datasets.Value("string"),
        "image": datasets.Image(),
        # Image dimensions
        "width": datasets.Value("int64"),
        "height": datasets.Value("int64"),
        # Image metadata
        "meta": {
            "barcode": datasets.Value("string"),
            "off_image_id": datasets.Value("string"),
            "image_url": datasets.Value("string"),
        },
        # The single category assigned to the image
        "category_id": datasets.Value("int64"),
        "category_name": datasets.Value("string"),
    }
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|