labelr 0.10.0__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- labelr/annotate.py +3 -54
- labelr/apps/datasets.py +140 -9
- labelr/apps/directus.py +212 -0
- labelr/apps/google_batch.py +38 -0
- labelr/apps/label_studio.py +295 -104
- labelr/apps/typer_description.py +2 -0
- labelr/check.py +68 -7
- labelr/config.py +57 -1
- labelr/export/object_detection.py +96 -18
- labelr/main.py +16 -0
- labelr/sample/object_detection.py +42 -13
- labelr-0.11.1.dist-info/METADATA +230 -0
- {labelr-0.10.0.dist-info → labelr-0.11.1.dist-info}/RECORD +17 -15
- {labelr-0.10.0.dist-info → labelr-0.11.1.dist-info}/WHEEL +1 -1
- labelr-0.10.0.dist-info/METADATA +0 -158
- {labelr-0.10.0.dist-info → labelr-0.11.1.dist-info}/entry_points.txt +0 -0
- {labelr-0.10.0.dist-info → labelr-0.11.1.dist-info}/licenses/LICENSE +0 -0
- {labelr-0.10.0.dist-info → labelr-0.11.1.dist-info}/top_level.txt +0 -0
labelr/annotate.py
CHANGED
@@ -1,66 +1,15 @@
 import random
 import string

-from openfoodfacts.types import JSONType
 from openfoodfacts.utils import get_logger
-
-
-
-def format_annotation_results_from_robotoff(
-    objects: list[JSONType],
-    image_width: int,
-    image_height: int,
-    label_mapping: dict[str, str] | None = None,
-) -> list[JSONType]:
-    """Format annotation results from Robotoff prediction endpoint into
-    Label Studio format."""
-    annotation_results = []
-    for object_ in objects:
-        bounding_box = object_["bounding_box"]
-        label_name = object_["label"]
+from ultralytics import Results

-
-        label_name = label_mapping.get(label_name, label_name)
-
-        # These are relative coordinates (between 0.0 and 1.0)
-        y_min, x_min, y_max, x_max = bounding_box
-        # Make sure the coordinates are within the image boundaries,
-        # and convert them to percentages
-        y_min = min(max(0, y_min), 1.0) * 100
-        x_min = min(max(0, x_min), 1.0) * 100
-        y_max = min(max(0, y_max), 1.0) * 100
-        x_max = min(max(0, x_max), 1.0) * 100
-        x = x_min
-        y = y_min
-        width = x_max - x_min
-        height = y_max - y_min
-
-        id_ = generate_id()
-        annotation_results.append(
-            {
-                "id": id_,
-                "type": "rectanglelabels",
-                "from_name": "label",
-                "to_name": "image",
-                "original_width": image_width,
-                "original_height": image_height,
-                "image_rotation": 0,
-                "value": {
-                    "rotation": 0,
-                    "x": x,
-                    "y": y,
-                    "width": width,
-                    "height": height,
-                    "rectanglelabels": [label_name],
-                },
-            },
-        )
-    return annotation_results
+logger = get_logger(__name__)


 def format_annotation_results_from_ultralytics(
-    results:
+    results: Results,
     labels: list[str],
     label_mapping: dict[str, str] | None = None,
 ) -> list[dict]:
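For reference, the removed `format_annotation_results_from_robotoff` helper converted Robotoff's relative `(y_min, x_min, y_max, x_max)` bounding boxes into the percentage-based rectangle values that Label Studio expects. A minimal sketch of that conversion, using a made-up prediction object (the values and dict shape below are illustrative, not taken from the package):

    # Robotoff reports relative coordinates (0.0-1.0) in (y_min, x_min, y_max, x_max) order.
    object_ = {"bounding_box": (0.1, 0.2, 0.5, 0.9), "label": "nutrition-table"}

    y_min, x_min, y_max, x_max = (
        min(max(0, coord), 1.0) * 100  # clamp to the image, then convert to percentages
        for coord in object_["bounding_box"]
    )
    value = {
        "x": x_min,               # 20.0
        "y": y_min,               # 10.0
        "width": x_max - x_min,   # 70.0
        "height": y_max - y_min,  # 40.0
        "rectanglelabels": [object_["label"]],
    }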
labelr/apps/datasets.py
CHANGED
@@ -18,7 +18,8 @@ from labelr.export.object_detection import (
     export_from_ls_to_ultralytics_object_detection,
 )

-from
+from . import typer_description
+from ..config import config
 from ..types import ExportDestination, ExportSource, TaskType

 app = typer.Typer()
@@ -125,7 +126,9 @@ def convert_object_detection_dataset(
 def export(
     from_: Annotated[ExportSource, typer.Option("--from", help="Input source to use")],
     to: Annotated[ExportDestination, typer.Option(help="Where to export the data")],
-    api_key: Annotated[
+    api_key: Annotated[
+        str | None, typer.Option(help=typer_description.LABEL_STUDIO_API_KEY)
+    ] = config.label_studio_api_key,
     task_type: Annotated[
         TaskType, typer.Option(help="Type of task to export")
     ] = TaskType.object_detection,
@@ -142,7 +145,16 @@ def export(
     project_id: Annotated[
         Optional[int], typer.Option(help="Label Studio Project ID")
     ] = None,
-
+    view_id: Annotated[
+        int | None,
+        typer.Option(
+            help="ID of the Label Studio view, if any. This option is useful "
+            "to filter the task to export."
+        ),
+    ] = None,
+    label_studio_url: Annotated[
+        str, typer.Option(help=typer_description.LABEL_STUDIO_URL)
+    ] = config.label_studio_url,
     output_dir: Annotated[
         Optional[Path],
         typer.Option(
@@ -163,11 +175,15 @@ def export(
     is_openfoodfacts_dataset: Annotated[
         bool,
         typer.Option(
-            help="Whether the Ultralytics dataset is an
-            "for Ultralytics source. This is used
-            "each image name
+            help="Whether the Ultralytics dataset is an Open Food Facts dataset, only "
+            "for Ultralytics source. This is used:\n"
+            "- to generate the correct image URLs from each image name, when exporting "
+            "from Ultralytics to Hugging Face Datasets.\n"
+            "- to include additional metadata fields specific to Open Food Facts "
+            "(`barcode` and `off_image_id`) when exporting from Label Studio to "
+            "Hugging Face Datasets."
         ),
-    ] =
+    ] = False,
     openfoodfacts_flavor: Annotated[
         Flavor,
         typer.Option(
@@ -181,9 +197,18 @@ def export(
         float,
         typer.Option(
             help="Train ratio for splitting the dataset, if the split name is not "
-            "provided
+            "provided. Only used if the source is Label Studio and the destination "
+            "is Ultralytics."
         ),
     ] = 0.8,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed."
+            "Otherwise, the longest side of the image will be resized to this value, "
+            "keeping the aspect ratio."
+        ),
+    ] = None,
     error_raise: Annotated[
         bool,
         typer.Option(
@@ -260,9 +285,12 @@ def export(
             repo_id=repo_id,
             label_names=typing.cast(list[str], label_names_list),
             project_id=typing.cast(int, project_id),
+            is_openfoodfacts_dataset=is_openfoodfacts_dataset,
             merge_labels=merge_labels,
             use_aws_cache=use_aws_cache,
             revision=revision,
+            view_id=view_id,
+            image_max_size=image_max_size,
         )
     elif to == ExportDestination.ultralytics:
         export_from_ls_to_ultralytics_object_detection(
@@ -274,6 +302,8 @@ def export(
             error_raise=error_raise,
             merge_labels=merge_labels,
             use_aws_cache=use_aws_cache,
+            view_id=view_id,
+            image_max_size=image_max_size,
         )

     elif from_ == ExportSource.hf:
@@ -289,6 +319,7 @@ def export(
             error_raise=error_raise,
             use_aws_cache=use_aws_cache,
             revision=revision,
+            image_max_size=image_max_size,
         )
     else:
         raise typer.BadParameter("Unsupported export format")
@@ -327,7 +358,8 @@ def export_llm_ds(
     tmp_dir: Annotated[
         Path | None,
         typer.Option(
-            help="Path to
+            help="Path to the temporary directory used to store intermediate sample files "
+            "created when building the HF dataset.",
         ),
     ] = None,
     image_max_size: Annotated[
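The `image_max_size` option added to `export` (and already present on `export_llm_ds`) resizes images so that their longest side does not exceed the given value while keeping the aspect ratio. The resizing code itself is not part of this diff, so the following Pillow sketch only illustrates the documented rule; the helper name and exact implementation are assumptions:

    from PIL import Image

    def resize_to_max_side(image: Image.Image, image_max_size: int) -> Image.Image:
        # Shrink only: scale the longest side down to `image_max_size`, keeping proportions.
        longest = max(image.width, image.height)
        if longest <= image_max_size:
            return image
        scale = image_max_size / longest
        return image.resize((round(image.width * scale), round(image.height * scale)))

    # A 4000x3000 image with image_max_size=1024 becomes 1024x768.
    resized = resize_to_max_side(Image.new("RGB", (4000, 3000)), 1024)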
@@ -354,3 +386,102 @@ def export_llm_ds(
         tmp_dir=tmp_dir,
         image_max_size=image_max_size,
     )
+
+
+@app.command()
+def update_llm_ds(
+    dataset_path: Annotated[
+        Path, typer.Option(help="Path to the JSONL containing the updates.")
+    ],
+    repo_id: Annotated[
+        str, typer.Option(help="Hugging Face Datasets repository ID to update")
+    ],
+    split: Annotated[str, typer.Option(help="Dataset split to use")],
+    revision: Annotated[
+        str,
+        typer.Option(
+            help="Revision (branch, tag or commit) to use when pushing the new version "
+            "of the Hugging Face Dataset."
+        ),
+    ] = "main",
+    tmp_dir: Annotated[
+        Path | None,
+        typer.Option(
+            help="Path to a temporary directory to use for image processing",
+        ),
+    ] = None,
+    show_diff: Annotated[
+        bool,
+        typer.Option(
+            help="Show the differences between the original sample and the update. If "
+            "True, the updated dataset is not pushed to the Hub. Useful to review the "
+            "updates before applying them.",
+        ),
+    ] = False,
+):
+    """Update an existing LLM image extraction dataset, by updating the
+    `output` field of each sample in the dataset.
+
+    The `--dataset_path` JSONL file should contain items with two fields:
+
+    - `image_id`: The image ID of the sample to update in the Hugging Face
+      dataset.
+    - `output`: The new output data to set for the sample.
+    """
+    import sys
+    from difflib import Differ
+
+    import orjson
+    from datasets import load_dataset
+    from diskcache import Cache
+
+    dataset = load_dataset(repo_id, split=split)
+
+    # Populate cache with the updates
+    cache = Cache(directory=tmp_dir or None)
+    with dataset_path.open("r") as f:
+        for line in map(orjson.loads, f):
+            if "image_id" not in line or "output" not in line:
+                raise ValueError(
+                    "Each item in the update JSONL file must contain `image_id` and `output` fields"
+                )
+            image_id = line["image_id"]
+            output = line["output"]
+
+            if not isinstance(output, str):
+                output = orjson.dumps(output).decode("utf-8")
+
+            cache[image_id] = output
+
+    def apply_updates(sample):
+        image_id = sample["image_id"]
+        if image_id in cache:
+            cached_item = cache[image_id]
+            sample["output"] = cached_item
+        return sample
+
+    if show_diff:
+        differ = Differ()
+        for sample in dataset:
+            image_id = sample["image_id"]
+            if image_id in cache:
+                cached_item = orjson.loads(cache[image_id])
+                original_item = orjson.loads(sample["output"])
+                cached_item_str = orjson.dumps(
+                    cached_item, option=orjson.OPT_INDENT_2
+                ).decode("utf8")
+                original_item_str = orjson.dumps(
+                    original_item, option=orjson.OPT_INDENT_2
+                ).decode("utf8")
+                diff = list(
+                    differ.compare(
+                        original_item_str.splitlines(keepends=True),
+                        cached_item_str.splitlines(keepends=True),
+                    )
+                )
+                sys.stdout.writelines(diff)
+                sys.stdout.write("\n" + "-" * 30 + "\n")
+
+    else:
+        updated_dataset = dataset.map(apply_updates, batched=False)
+        updated_dataset.push_to_hub(repo_id, split=split, revision=revision)
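The new `update_llm_ds` command reads a JSONL file in which every line carries an `image_id` and the replacement `output` for that sample; non-string `output` values are serialized to JSON before being written back. A hypothetical update file could look like this (IDs and payloads are placeholders):

    {"image_id": "3017620422003/1", "output": {"brands": "Acme", "quantity": "500 g"}}
    {"image_id": "3017620422003/2", "output": "{\"brands\": \"Other\"}"}

With `--show-diff` (option spelling assumes Typer's default underscore-to-dash conversion), the command only prints a line-by-line diff between the current and proposed `output` of each matched sample; without it, the updated dataset is pushed back to the Hub on the given revision.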
labelr/apps/directus.py
ADDED
@@ -0,0 +1,212 @@
+from pathlib import Path
+from typing import Annotated
+
+import requests
+import typer
+
+app = typer.Typer()
+
+
+DEFAULT_DIRECTUS_URL = "http://localhost:8055"
+
+
+def _list_endpoint_iter(
+    url: str,
+    session: requests.Session,
+    page_size: int,
+    method: str = "GET",
+    list_field: str | None = "data",
+    **kwargs,
+):
+    """Iterate over paginated Directus endpoint.
+
+    Args:
+        url (str): URL of the Directus endpoint.
+        session (requests.Session): Requests session to use for making HTTP
+            requests.
+        page_size (int): Number of items to fetch per page.
+        method (str, optional): HTTP method to use. Defaults to "GET".
+        list_field (str | None, optional): Field in the response JSON that
+            contains the list of items. If None, the entire response is used as
+            the list. Defaults to "data".
+        **kwargs: Additional keyword arguments to pass to the requests method.
+    Yields:
+        dict: Items from the Directus endpoint.
+    """
+    page = 0
+    next_page = True
+    params = kwargs.pop("params", {})
+
+    while next_page:
+        params["offset"] = page * page_size
+        params["limit"] = page_size
+        r = session.request(method=method, url=url, params=params, **kwargs)
+        r.raise_for_status()
+        response = r.json()
+        items = response[list_field] if list_field else response
+        if len(items) > 0:
+            yield from items
+        else:
+            next_page = False
+        page += 1
+
+
+def iter_items(
+    collection_name: str,
+    url: str,
+    session: requests.Session,
+    page_size: int = 50,
+    **kwargs,
+):
+    """Iterate over items in a Directus collection.
+
+    Args:
+        collection_name (str): Name of the Directus collection.
+        url (str): Base URL of the Directus server.
+        session (requests.Session): Requests session to use for making HTTP
+            requests.
+        page_size (int, optional): Number of items to fetch per page. Defaults
+            to 50.
+        **kwargs: Additional keyword arguments to pass to the requests method.
+    Yields:
+        dict: Items from the Directus collection.
+    """
+    yield from _list_endpoint_iter(
+        url=f"{url}/items/{collection_name}",
+        session=session,
+        page_size=page_size,
+        **kwargs,
+    )
+
+
+@app.command()
+def upload_data(
+    dataset_path: Annotated[
+        Path,
+        typer.Option(
+            help="Path to the dataset JSONL file to upload from.",
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+        ),
+    ],
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+):
+    """Upload data to a Directus collection."""
+    import orjson
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    with dataset_path.open("r") as f:
+        for item in tqdm.tqdm(map(orjson.loads, f), desc="items"):
+            r = session.post(
+                f"{directus_url}/items/{collection}",
+                json=item,
+            )
+            print(r.json())
+            r.raise_for_status()
+
+
+@app.command()
+def update_items(
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+    sort: Annotated[
+        str | None,
+        typer.Option(help="The field to sort items by, defaults to None (no sorting)."),
+    ] = None,
+    skip: Annotated[
+        int, typer.Option(help="Number of items to skip, defaults to 0.")
+    ] = 0,
+):
+    """Update items in a Directus collection.
+
+    **Warning**: This command requires you to implement the processing
+    function inside the command. It is provided as a template for batch
+    updating items in a Directus collection.
+    """
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    params = {} if sort is None else {"sort[]": sort}
+    for i, item in tqdm.tqdm(
+        enumerate(
+            iter_items(
+                collection_name=collection,
+                url=directus_url,
+                session=session,
+                params=params,
+            )
+        )
+    ):
+        if i < skip:
+            typer.echo(f"Skipping item {i}")
+            continue
+
+        item_id = item["id"]
+        # Implement your processing function here
+        # It should return a dict with the fields to update only
+        # If no update is needed, it should return None
+        patch_item = None
+
+        if patch_item is not None:
+            r = session.patch(
+                f"{directus_url}/items/{collection}/{item_id}",
+                json=patch_item,
+            )
+            r.raise_for_status()
+
+
+@app.command()
+def export_data(
+    output_path: Annotated[
+        Path, typer.Option(help="Path to the file to export to.", allow_dash=True)
+    ],
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+):
+    """Export a directus collection to a JSONL file."""
+    import sys
+
+    import orjson
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    f = sys.stdout if output_path.as_posix() == "-" else output_path.open("w")
+    with f:
+        for item in tqdm.tqdm(
+            iter_items(
+                collection_name=collection,
+                url=directus_url,
+                session=session,
+            )
+        ):
+            f.write(orjson.dumps(item).decode("utf-8") + "\n")
labelr/apps/google_batch.py
CHANGED
@@ -7,6 +7,7 @@ import typer
 from google.genai.types import JSONSchema as GoogleJSONSchema
 from google.genai.types import Schema as GoogleSchema
 from openfoodfacts import Flavor
+from openfoodfacts.types import JSONType
 from pydantic import BaseModel

 from labelr.google_genai import generate_batch_dataset, launch_batch_job
@@ -14,6 +15,40 @@ from labelr.google_genai import generate_batch_dataset, launch_batch_job
 app = typer.Typer()


+def _check_json_schema(item: JSONType) -> None:
+    if item.get("type") == "object":
+        required_fields = item.get("required", [])
+        all_fields = item.get("properties", [])
+        diff = set(all_fields) - set(required_fields)
+        if diff:
+            raise ValueError(
+                f"fields '{diff}' must be marked as required in the JSONSchema. "
+                "All fields with type 'object' must be required."
+            )
+    return None
+
+
+def check_json_schema(json_schema: JSONType) -> None:
+    """Check that for all `object`s, all fields are marked as required.
+
+    This is important to check, as otherwise the structured generation
+    backend may prevent the model to generate these fields.
+    This is the case as of vLLM 0.13 and xgrammars as backend.
+
+    To prevent this, we ask all fields to be marked as required.
+    """
+    stack = [json_schema]
+
+    for def_item in json_schema.get("$defs", {}).values():
+        stack.append(def_item)
+
+    while stack:
+        item = stack.pop()
+        _check_json_schema(item)
+        for sub_item in item.get("properties", {}).values():
+            stack.append(sub_item)
+
+
 def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
     """Google doesn't support natively OpenAPI schemas, so we convert them to
     Google `Schema` (a subset of OpenAPI)."""
@@ -260,6 +295,9 @@ def upload_training_dataset_from_predictions(
     print(f"Instructions: {instructions}")
     json_schema = orjson.loads(json_schema_path.read_text())

+    # We check that all fields are marked as required
+    check_json_schema(json_schema)
+
     api = HfApi()
     config = {
         "instructions": instructions,
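Concretely, `check_json_schema` walks the schema (including `$defs` and nested `properties`) and raises as soon as an `object` declares a property that is not listed in its `required` array. A quick illustrative call with a made-up schema:

    schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "brand": {"type": "string"},
        },
        "required": ["name"],  # "brand" is declared but not required
    }
    # Raises ValueError: fields '{'brand'}' must be marked as required in the JSONSchema. ...
    check_json_schema(schema)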