labelr 0.10.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/annotate.py CHANGED
@@ -1,66 +1,15 @@
 import random
 import string
 
-from openfoodfacts.types import JSONType
 from openfoodfacts.utils import get_logger
 
-logger = get_logger(__name__)
-
-
-def format_annotation_results_from_robotoff(
-    objects: list[JSONType],
-    image_width: int,
-    image_height: int,
-    label_mapping: dict[str, str] | None = None,
-) -> list[JSONType]:
-    """Format annotation results from Robotoff prediction endpoint into
-    Label Studio format."""
-    annotation_results = []
-    for object_ in objects:
-        bounding_box = object_["bounding_box"]
-        label_name = object_["label"]
+from ultralytics import Results
 
-        if label_mapping:
-            label_name = label_mapping.get(label_name, label_name)
-
-        # These are relative coordinates (between 0.0 and 1.0)
-        y_min, x_min, y_max, x_max = bounding_box
-        # Make sure the coordinates are within the image boundaries,
-        # and convert them to percentages
-        y_min = min(max(0, y_min), 1.0) * 100
-        x_min = min(max(0, x_min), 1.0) * 100
-        y_max = min(max(0, y_max), 1.0) * 100
-        x_max = min(max(0, x_max), 1.0) * 100
-        x = x_min
-        y = y_min
-        width = x_max - x_min
-        height = y_max - y_min
-
-        id_ = generate_id()
-        annotation_results.append(
-            {
-                "id": id_,
-                "type": "rectanglelabels",
-                "from_name": "label",
-                "to_name": "image",
-                "original_width": image_width,
-                "original_height": image_height,
-                "image_rotation": 0,
-                "value": {
-                    "rotation": 0,
-                    "x": x,
-                    "y": y,
-                    "width": width,
-                    "height": height,
-                    "rectanglelabels": [label_name],
-                },
-            },
-        )
-    return annotation_results
+logger = get_logger(__name__)
 
 
 def format_annotation_results_from_ultralytics(
-    results: "Results",
+    results: Results,
     labels: list[str],
     label_mapping: dict[str, str] | None = None,
 ) -> list[dict]:
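
Note: the removed `format_annotation_results_from_robotoff` helper clamped Robotoff's relative `[y_min, x_min, y_max, x_max]` coordinates to [0, 1] and converted them into Label Studio percentage rectangles. A minimal standalone sketch of that conversion, with hypothetical values:

    # Relative coordinates are clamped to [0, 1] and scaled to percentages,
    # as in the removed helper above.
    bounding_box = (0.10, 0.20, 0.50, 0.80)  # hypothetical [y_min, x_min, y_max, x_max]
    y_min, x_min, y_max, x_max = (min(max(0.0, v), 1.0) * 100 for v in bounding_box)
    value = {
        "x": x_min,               # 20.0
        "y": y_min,               # 10.0
        "width": x_max - x_min,   # 60.0
        "height": y_max - y_min,  # 40.0
        "rectanglelabels": ["brand"],  # hypothetical label name
    }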
labelr/apps/datasets.py CHANGED
@@ -18,7 +18,8 @@ from labelr.export.object_detection import (
     export_from_ls_to_ultralytics_object_detection,
 )
 
-from ..config import LABEL_STUDIO_DEFAULT_URL
+from . import typer_description
+from ..config import config
 from ..types import ExportDestination, ExportSource, TaskType
 
 app = typer.Typer()
@@ -125,7 +126,9 @@ def convert_object_detection_dataset(
 def export(
     from_: Annotated[ExportSource, typer.Option("--from", help="Input source to use")],
     to: Annotated[ExportDestination, typer.Option(help="Where to export the data")],
-    api_key: Annotated[Optional[str], typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+    api_key: Annotated[
+        str | None, typer.Option(help=typer_description.LABEL_STUDIO_API_KEY)
+    ] = config.label_studio_api_key,
     task_type: Annotated[
         TaskType, typer.Option(help="Type of task to export")
     ] = TaskType.object_detection,
@@ -142,7 +145,16 @@ def export(
     project_id: Annotated[
         Optional[int], typer.Option(help="Label Studio Project ID")
     ] = None,
-    label_studio_url: Optional[str] = LABEL_STUDIO_DEFAULT_URL,
+    view_id: Annotated[
+        int | None,
+        typer.Option(
+            help="ID of the Label Studio view, if any. This option is useful "
+            "to filter the task to export."
+        ),
+    ] = None,
+    label_studio_url: Annotated[
+        str, typer.Option(help=typer_description.LABEL_STUDIO_URL)
+    ] = config.label_studio_url,
     output_dir: Annotated[
         Optional[Path],
         typer.Option(
@@ -163,11 +175,15 @@ def export(
     is_openfoodfacts_dataset: Annotated[
         bool,
         typer.Option(
-            help="Whether the Ultralytics dataset is an OpenFoodFacts dataset, only "
-            "for Ultralytics source. This is used to generate the correct image URLs "
-            "each image name."
+            help="Whether the Ultralytics dataset is an Open Food Facts dataset, only "
+            "for Ultralytics source. This is used:\n"
+            "- to generate the correct image URLs from each image name, when exporting "
+            "from Ultralytics to Hugging Face Datasets.\n"
+            "- to include additional metadata fields specific to Open Food Facts "
+            "(`barcode` and `off_image_id`) when exporting from Label Studio to "
+            "Hugging Face Datasets."
         ),
-    ] = True,
+    ] = False,
     openfoodfacts_flavor: Annotated[
         Flavor,
         typer.Option(
@@ -181,9 +197,18 @@ def export(
         float,
         typer.Option(
             help="Train ratio for splitting the dataset, if the split name is not "
-            "provided (typically, if the source is Label Studio)"
+            "provided. Only used if the source is Label Studio and the destination "
+            "is Ultralytics."
         ),
     ] = 0.8,
+    image_max_size: Annotated[
+        int | None,
+        typer.Option(
+            help="Maximum size (in pixels) for the images. If None, no resizing is performed."
+            "Otherwise, the longest side of the image will be resized to this value, "
+            "keeping the aspect ratio."
+        ),
+    ] = None,
     error_raise: Annotated[
         bool,
         typer.Option(
@@ -260,9 +285,12 @@ def export(
                 repo_id=repo_id,
                 label_names=typing.cast(list[str], label_names_list),
                 project_id=typing.cast(int, project_id),
+                is_openfoodfacts_dataset=is_openfoodfacts_dataset,
                 merge_labels=merge_labels,
                 use_aws_cache=use_aws_cache,
                 revision=revision,
+                view_id=view_id,
+                image_max_size=image_max_size,
             )
         elif to == ExportDestination.ultralytics:
             export_from_ls_to_ultralytics_object_detection(
@@ -274,6 +302,8 @@ def export(
                 error_raise=error_raise,
                 merge_labels=merge_labels,
                 use_aws_cache=use_aws_cache,
+                view_id=view_id,
+                image_max_size=image_max_size,
             )
 
     elif from_ == ExportSource.hf:
@@ -289,6 +319,7 @@ def export(
                 error_raise=error_raise,
                 use_aws_cache=use_aws_cache,
                 revision=revision,
+                image_max_size=image_max_size,
             )
         else:
             raise typer.BadParameter("Unsupported export format")
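
Note: the new `image_max_size` option caps the longest side of each exported image while keeping the aspect ratio. The actual resizing code is not part of this diff; a minimal sketch of the idea, assuming Pillow:

    from PIL import Image

    def resize_to_max_size(image: Image.Image, max_size: int) -> Image.Image:
        """Downscale so the longest side is at most `max_size`, keeping the aspect ratio."""
        width, height = image.size
        longest = max(width, height)
        if longest <= max_size:
            return image  # already small enough, no resizing
        scale = max_size / longest
        return image.resize(
            (round(width * scale), round(height * scale)), Image.Resampling.LANCZOS
        )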
@@ -327,7 +358,8 @@ def export_llm_ds(
     tmp_dir: Annotated[
         Path | None,
         typer.Option(
-            help="Path to a temporary directory to use for image processing",
+            help="Path to the temporary directory used to store intermediate sample files "
+            "created when building the HF dataset.",
         ),
     ] = None,
     image_max_size: Annotated[
@@ -354,3 +386,102 @@ def export_llm_ds(
         tmp_dir=tmp_dir,
         image_max_size=image_max_size,
     )
+
+
+@app.command()
+def update_llm_ds(
+    dataset_path: Annotated[
+        Path, typer.Option(help="Path to the JSONL containing the updates.")
+    ],
+    repo_id: Annotated[
+        str, typer.Option(help="Hugging Face Datasets repository ID to update")
+    ],
+    split: Annotated[str, typer.Option(help="Dataset split to use")],
+    revision: Annotated[
+        str,
+        typer.Option(
+            help="Revision (branch, tag or commit) to use when pushing the new version "
+            "of the Hugging Face Dataset."
+        ),
+    ] = "main",
+    tmp_dir: Annotated[
+        Path | None,
+        typer.Option(
+            help="Path to a temporary directory to use for image processing",
+        ),
+    ] = None,
+    show_diff: Annotated[
+        bool,
+        typer.Option(
+            help="Show the differences between the original sample and the update. If "
+            "True, the updated dataset is not pushed to the Hub. Useful to review the "
+            "updates before applying them.",
+        ),
+    ] = False,
+):
+    """Update an existing LLM image extraction dataset, by updating the
+    `output` field of each sample in the dataset.
+
+    The `--dataset_path` JSONL file should contain items with two fields:
+
+    - `image_id`: The image ID of the sample to update in the Hugging Face
+      dataset.
+    - `output`: The new output data to set for the sample.
+    """
+    import sys
+    from difflib import Differ
+
+    import orjson
+    from datasets import load_dataset
+    from diskcache import Cache
+
+    dataset = load_dataset(repo_id, split=split)
+
+    # Populate cache with the updates
+    cache = Cache(directory=tmp_dir or None)
+    with dataset_path.open("r") as f:
+        for line in map(orjson.loads, f):
+            if "image_id" not in line or "output" not in line:
+                raise ValueError(
+                    "Each item in the update JSONL file must contain `image_id` and `output` fields"
+                )
+            image_id = line["image_id"]
+            output = line["output"]
+
+            if not isinstance(output, str):
+                output = orjson.dumps(output).decode("utf-8")
+
+            cache[image_id] = output
+
+    def apply_updates(sample):
+        image_id = sample["image_id"]
+        if image_id in cache:
+            cached_item = cache[image_id]
+            sample["output"] = cached_item
+        return sample
+
+    if show_diff:
+        differ = Differ()
+        for sample in dataset:
+            image_id = sample["image_id"]
+            if image_id in cache:
+                cached_item = orjson.loads(cache[image_id])
+                original_item = orjson.loads(sample["output"])
+                cached_item_str = orjson.dumps(
+                    cached_item, option=orjson.OPT_INDENT_2
+                ).decode("utf8")
+                original_item_str = orjson.dumps(
+                    original_item, option=orjson.OPT_INDENT_2
+                ).decode("utf8")
+                diff = list(
+                    differ.compare(
+                        original_item_str.splitlines(keepends=True),
+                        cached_item_str.splitlines(keepends=True),
+                    )
+                )
+                sys.stdout.writelines(diff)
+                sys.stdout.write("\n" + "-" * 30 + "\n")
+
+    else:
+        updated_dataset = dataset.map(apply_updates, batched=False)
+        updated_dataset.push_to_hub(repo_id, split=split, revision=revision)
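
Note: the update file consumed by the new `update_llm_ds` command is a JSONL file where each line carries an `image_id` and the new `output` (either a JSON string or an object; both are accepted by the code above). A sketch that writes such a file, with hypothetical image IDs and payloads:

    import orjson
    from pathlib import Path

    # Hypothetical updates; the real `output` structure depends on the dataset's schema.
    updates = [
        {"image_id": "1234567890123_1", "output": {"product_name": "Granola", "quantity": "500 g"}},
        {"image_id": "1234567890123_2", "output": {"product_name": "Muesli"}},
    ]
    with Path("updates.jsonl").open("wb") as f:
        for item in updates:
            f.write(orjson.dumps(item) + b"\n")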
@@ -0,0 +1,212 @@
+from pathlib import Path
+from typing import Annotated
+
+import requests
+import typer
+
+app = typer.Typer()
+
+
+DEFAULT_DIRECTUS_URL = "http://localhost:8055"
+
+
+def _list_endpoint_iter(
+    url: str,
+    session: requests.Session,
+    page_size: int,
+    method: str = "GET",
+    list_field: str | None = "data",
+    **kwargs,
+):
+    """Iterate over paginated Directus endpoint.
+
+    Args:
+        url (str): URL of the Directus endpoint.
+        session (requests.Session): Requests session to use for making HTTP
+            requests.
+        page_size (int): Number of items to fetch per page.
+        method (str, optional): HTTP method to use. Defaults to "GET".
+        list_field (str | None, optional): Field in the response JSON that
+            contains the list of items. If None, the entire response is used as
+            the list. Defaults to "data".
+        **kwargs: Additional keyword arguments to pass to the requests method.
+    Yields:
+        dict: Items from the Directus endpoint.
+    """
+    page = 0
+    next_page = True
+    params = kwargs.pop("params", {})
+
+    while next_page:
+        params["offset"] = page * page_size
+        params["limit"] = page_size
+        r = session.request(method=method, url=url, params=params, **kwargs)
+        r.raise_for_status()
+        response = r.json()
+        items = response[list_field] if list_field else response
+        if len(items) > 0:
+            yield from items
+        else:
+            next_page = False
+        page += 1
+
+
+def iter_items(
+    collection_name: str,
+    url: str,
+    session: requests.Session,
+    page_size: int = 50,
+    **kwargs,
+):
+    """Iterate over items in a Directus collection.
+
+    Args:
+        collection_name (str): Name of the Directus collection.
+        url (str): Base URL of the Directus server.
+        session (requests.Session): Requests session to use for making HTTP
+            requests.
+        page_size (int, optional): Number of items to fetch per page. Defaults
+            to 50.
+        **kwargs: Additional keyword arguments to pass to the requests method.
+    Yields:
+        dict: Items from the Directus collection.
+    """
+    yield from _list_endpoint_iter(
+        url=f"{url}/items/{collection_name}",
+        session=session,
+        page_size=page_size,
+        **kwargs,
+    )
+
+
+@app.command()
+def upload_data(
+    dataset_path: Annotated[
+        Path,
+        typer.Option(
+            help="Path to the dataset JSONL file to upload from.",
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+        ),
+    ],
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+):
+    """Upload data to a Directus collection."""
+    import orjson
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    with dataset_path.open("r") as f:
+        for item in tqdm.tqdm(map(orjson.loads, f), desc="items"):
+            r = session.post(
+                f"{directus_url}/items/{collection}",
+                json=item,
+            )
+            print(r.json())
+            r.raise_for_status()
+
+
+@app.command()
+def update_items(
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+    sort: Annotated[
+        str | None,
+        typer.Option(help="The field to sort items by, defaults to None (no sorting)."),
+    ] = None,
+    skip: Annotated[
+        int, typer.Option(help="Number of items to skip, defaults to 0.")
+    ] = 0,
+):
+    """Update items in a Directus collection.
+
+    **Warning**: This command requires you to implement the processing
+    function inside the command. It is provided as a template for batch
+    updating items in a Directus collection.
+    """
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    params = {} if sort is None else {"sort[]": sort}
+    for i, item in tqdm.tqdm(
+        enumerate(
+            iter_items(
+                collection_name=collection,
+                url=directus_url,
+                session=session,
+                params=params,
+            )
+        )
+    ):
+        if i < skip:
+            typer.echo(f"Skipping item {i}")
+            continue
+
+        item_id = item["id"]
+        # Implement your processing function here
+        # It should return a dict with the fields to update only
+        # If no update is needed, it should return None
+        patch_item = None
+
+        if patch_item is not None:
+            r = session.patch(
+                f"{directus_url}/items/{collection}/{item_id}",
+                json=patch_item,
+            )
+            r.raise_for_status()
+
+
+@app.command()
+def export_data(
+    output_path: Annotated[
+        Path, typer.Option(help="Path to the file to export to.", allow_dash=True)
+    ],
+    collection: Annotated[
+        str, typer.Option(help="Name of the collection to upload the items to.")
+    ],
+    directus_url: Annotated[
+        str,
+        typer.Option(
+            help="Base URL of the Directus server.",
+        ),
+    ] = DEFAULT_DIRECTUS_URL,
+):
+    """Export a directus collection to a JSONL file."""
+    import sys
+
+    import orjson
+    import requests
+    import tqdm
+
+    session = requests.Session()
+
+    f = sys.stdout if output_path.as_posix() == "-" else output_path.open("w")
+    with f:
+        for item in tqdm.tqdm(
+            iter_items(
+                collection_name=collection,
+                url=directus_url,
+                session=session,
+            )
+        ):
+            f.write(orjson.dumps(item).decode("utf-8") + "\n")
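
Note: the new Directus module above exposes `iter_items`, a paginated iterator over a collection. A minimal usage sketch outside the CLI; the import path, collection name, and server URL are assumptions, not shown in this diff:

    import requests

    # Hypothetical import path for the new module added in this release.
    from labelr.apps.directus import iter_items

    session = requests.Session()
    # Iterate over every item of a hypothetical "products" collection, 100 per page.
    for item in iter_items(
        collection_name="products",
        url="http://localhost:8055",
        session=session,
        page_size=100,
    ):
        print(item["id"])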
@@ -7,6 +7,7 @@ import typer
 from google.genai.types import JSONSchema as GoogleJSONSchema
 from google.genai.types import Schema as GoogleSchema
 from openfoodfacts import Flavor
+from openfoodfacts.types import JSONType
 from pydantic import BaseModel
 
 from labelr.google_genai import generate_batch_dataset, launch_batch_job
@@ -14,6 +15,40 @@ from labelr.google_genai import generate_batch_dataset, launch_batch_job
 app = typer.Typer()
 
 
+def _check_json_schema(item: JSONType) -> None:
+    if item.get("type") == "object":
+        required_fields = item.get("required", [])
+        all_fields = item.get("properties", [])
+        diff = set(all_fields) - set(required_fields)
+        if diff:
+            raise ValueError(
+                f"fields '{diff}' must be marked as required in the JSONSchema. "
+                "All fields with type 'object' must be required."
+            )
+    return None
+
+
+def check_json_schema(json_schema: JSONType) -> None:
+    """Check that for all `object`s, all fields are marked as required.
+
+    This is important to check, as otherwise the structured generation
+    backend may prevent the model to generate these fields.
+    This is the case as of vLLM 0.13 and xgrammars as backend.
+
+    To prevent this, we ask all fields to be marked as required.
+    """
+    stack = [json_schema]
+
+    for def_item in json_schema.get("$defs", {}).values():
+        stack.append(def_item)
+
+    while stack:
+        item = stack.pop()
+        _check_json_schema(item)
+        for sub_item in item.get("properties", {}).values():
+            stack.append(sub_item)
+
+
 def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
     """Google doesn't support natively OpenAPI schemas, so we convert them to
     Google `Schema` (a subset of OpenAPI)."""
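
Note: the new `check_json_schema` helper rejects any `object` whose properties are not all listed under `required`. A short illustration (assumes `check_json_schema` from the hunk above is in scope; the schema contents are hypothetical):

    # Fails: "brand" is a property but is not listed under "required", so the
    # structured-generation backend could silently drop it (see the docstring above).
    bad_schema = {
        "type": "object",
        "properties": {"name": {"type": "string"}, "brand": {"type": "string"}},
        "required": ["name"],
    }

    # Passes: every property of the object is marked as required.
    good_schema = {
        "type": "object",
        "properties": {"name": {"type": "string"}, "brand": {"type": "string"}},
        "required": ["name", "brand"],
    }

    check_json_schema(good_schema)  # returns None
    check_json_schema(bad_schema)   # raises ValueError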
@@ -260,6 +295,9 @@ def upload_training_dataset_from_predictions(
     print(f"Instructions: {instructions}")
     json_schema = orjson.loads(json_schema_path.read_text())
 
+    # We check that all fields are marked as required
+    check_json_schema(json_schema)
+
     api = HfApi()
     config = {
         "instructions": instructions,