labelr 0.10.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/check.py CHANGED
@@ -1,30 +1,64 @@
1
+ import typing
1
2
  from collections import defaultdict
2
3
  from pathlib import Path
3
4
 
4
5
  import imagehash
5
6
  import tqdm
6
7
  from label_studio_sdk.client import LabelStudio
7
- from openfoodfacts.utils import get_image_from_url, get_logger
8
+ from openfoodfacts.types import JSONType
9
+ from openfoodfacts.utils import ImageDownloadItem, get_image_from_url, get_logger
8
10
  from PIL import Image
9
11
 
10
12
  logger = get_logger(__name__)
11
13
 
12
14
 
13
- def check_ls_dataset(ls: LabelStudio, project_id: int):
15
+ def check_ls_dataset(
16
+ ls: LabelStudio,
17
+ project_id: int,
18
+ view_id: int | None = None,
19
+ delete_missing_images: bool = False,
20
+ delete_duplicate_images: bool = False,
21
+ ):
22
+ """Perform sanity checks of a Label Studio dataset.
23
+
24
+ This function checks for:
25
+ - Tasks with missing images (404)
26
+ - Duplicate images based on perceptual hash (pHash)
27
+ - Tasks with multiple annotations
28
+
29
+ This function doesn't modify the dataset, except for optionally
30
+ deleting tasks with missing images (if `delete_missing_images` is True)
31
+ or tasks with duplicate images (if `delete_duplicate_images` is True).
32
+
33
+ Args:
34
+ ls (LabelStudio): Label Studio client instance.
35
+ project_id (int): ID of the Label Studio project to check.
36
+ view_id (int | None): ID of the Label Studio view to check. If None,
37
+ no filtering is done.
38
+ delete_missing_images (bool): Whether to delete tasks with missing
39
+ images.
40
+ delete_duplicate_images (bool): Whether to delete tasks with duplicate
41
+ images. If one task has annotations and the other doesn't, the task
42
+ with annotations will be kept. Otherwise, the most recent task will
43
+ be kept.
44
+ """
14
45
  skipped = 0
15
46
  not_annotated = 0
16
47
  annotated = 0
48
+ deleted = 0
49
+ multiple_annotations = 0
17
50
  hash_map = defaultdict(list)
18
51
  for task in tqdm.tqdm(
19
- ls.tasks.list(project=project_id, fields="all"), desc="tasks"
52
+ ls.tasks.list(project=project_id, fields="all", view=view_id), desc="tasks"
20
53
  ):
21
- annotations = task.annotations
54
+ annotations = typing.cast(list[JSONType], task.annotations)
22
55
 
23
56
  if len(annotations) == 0:
24
57
  not_annotated += 1
25
58
  continue
26
59
  elif len(annotations) > 1:
27
60
  logger.warning("Task has multiple annotations: %s", task.id)
61
+ multiple_annotations += 1
28
62
  continue
29
63
 
30
64
  annotation = annotations[0]
@@ -34,20 +68,47 @@ def check_ls_dataset(ls: LabelStudio, project_id: int):
34
68
 
35
69
  annotated += 1
36
70
  image_url = task.data["image_url"]
37
- image = get_image_from_url(image_url)
38
- image_hash = str(imagehash.phash(image))
71
+ image_struct = typing.cast(
72
+ ImageDownloadItem,
73
+ get_image_from_url(image_url, return_struct=True, error_raise=False),
74
+ )
75
+
76
+ if image_struct.response.status_code == 404:
77
+ logger.warning("Image not found (404): %s", image_url)
78
+
79
+ if delete_missing_images:
80
+ ls.tasks.delete(task.id)
81
+ deleted += 1
82
+ logger.info("Deleted task with missing image: %s", task.id)
83
+ continue
84
+
85
+ if image_struct.image is None:
86
+ logger.warning("Could not open image: %s", image_url)
87
+ continue
88
+
89
+ image_hash = str(imagehash.phash(image_struct.image))
39
90
  hash_map[image_hash].append(task.id)
40
91
 
41
92
  for image_hash, task_ids in hash_map.items():
42
93
  if len(task_ids) > 1:
43
94
  logger.warning("Duplicate images: %s", task_ids)
95
+ if delete_duplicate_images:
96
+ tasks = [ls.tasks.get(id=task_id) for task_id in task_ids]
97
+ # We sort the tasks by the number of annotations, so that we keep the
98
+ # one with at least one annotation.
99
+ for task in sorted(tasks, key=lambda x: len(x.annotations) > 0)[:-1]:
100
+ logger.info("Deleting duplicate task: %s", task.id)
101
+ ls.tasks.delete(task.id)
102
+ deleted += 1
44
103
 
45
104
  logger.info(
46
- "Tasks - annotated: %d, skipped: %d, not annotated: %d",
105
+ "Tasks - annotated: %d, skipped: %d, not annotated: %d, multiple annotations: %d",
47
106
  annotated,
48
107
  skipped,
49
108
  not_annotated,
109
+ multiple_annotations,
50
110
  )
111
+ logger.info("Deleted tasks with missing images: %d", deleted)
51
112
 
52
113
 
53
114
  def check_local_dataset(dataset_dir: Path, remove: bool = False):
labelr/config.py CHANGED
@@ -1 +1,57 @@
1
- LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
1
+ from pathlib import Path
2
+
3
+ from pydantic import BaseModel, Field
4
+ import os
5
+
6
+ CONFIG_PATH = Path("~").expanduser() / ".config/labelr/config.json"
7
+
8
+
9
+ # validate_assignment makes pydantic re-validate the model every time it is updated
10
+ class LabelrConfig(BaseModel, validate_assignment=True):
11
+ label_studio_url: str = Field(
12
+ default="http://127.0.0.1:8080",
13
+ description="URL of the Label Studio instance to use. Defaults to http://127.0.0.1:8080.",
14
+ )
15
+ label_studio_api_key: str | None = Field(
16
+ default=None,
17
+ description="API key for Label Studio.",
18
+ )
19
+
20
+
21
+ def get_config() -> LabelrConfig:
22
+ """Get labelr configuration.
23
+
24
+ The configuration can come from (by order of precedence):
25
+ - Environment variables
26
+ - JSON file (see below)
27
+
28
+ The configuration is stored in a JSON file at ~/.config/labelr/config.json.
29
+
30
+ The following environment variables are supported:
31
+ - LABELR_LABEL_STUDIO_URL
32
+ - LABELR_LABEL_STUDIO_API_KEY
33
+ """
34
+ if CONFIG_PATH.exists():
35
+ config = LabelrConfig.model_validate_json(CONFIG_PATH.read_bytes())
36
+
37
+ if "LABELR_LABEL_STUDIO_URL" in os.environ:
38
+ config.label_studio_url = os.environ["LABELR_LABEL_STUDIO_URL"]
39
+ if "LABELR_LABEL_STUDIO_API_KEY" in os.environ:
40
+ config.label_studio_api_key = os.environ["LABELR_LABEL_STUDIO_API_KEY"]
41
+ return config
42
+ else:
43
+ return LabelrConfig()
44
+
45
+
46
+ def set_file_config(key: str, value: str):
47
+ """Update the labelr configuration.
48
+
49
+ The configuration is stored in a JSON file at ~/.config/labelr/config.json.
50
+ """
51
+ config = get_config()
52
+ setattr(config, key, value)
53
+ CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
54
+ CONFIG_PATH.write_text(config.model_dump_json(indent=2))
55
+
56
+
57
+ config = get_config()
@@ -1,18 +1,21 @@
1
1
  import functools
2
2
  import logging
3
3
  import pickle
4
+ import random
4
5
  import tempfile
6
+ import typing
5
7
  from pathlib import Path
6
8
 
7
9
  import datasets
8
10
  import tqdm
9
11
  from label_studio_sdk.client import LabelStudio
10
12
  from openfoodfacts.images import download_image
13
+ from PIL import Image, ImageOps
11
14
 
12
15
  from labelr.export.common import _pickle_sample_generator
13
16
  from labelr.sample.object_detection import (
14
- HF_DS_OBJECT_DETECTION_FEATURES,
15
17
  format_object_detection_sample_to_hf,
18
+ get_hf_object_detection_features,
16
19
  )
17
20
 
18
21
  logger = logging.getLogger(__name__)
@@ -23,19 +26,47 @@ def export_from_ls_to_hf_object_detection(
23
26
  repo_id: str,
24
27
  label_names: list[str],
25
28
  project_id: int,
29
+ is_openfoodfacts_dataset: bool,
30
+ image_max_size: int | None = None,
31
+ view_id: int | None = None,
26
32
  merge_labels: bool = False,
27
33
  use_aws_cache: bool = True,
28
34
  revision: str = "main",
29
- ):
35
+ ) -> None:
36
+ """Export annotations from a Label Studio project to a Hugging Face
37
+ dataset.
38
+
39
+ The Label Studio project should be an object detection project.
40
+
41
+ Args:
42
+ ls (LabelStudio): Label Studio client instance.
43
+ repo_id (str): Hugging Face repository ID to push the dataset to.
44
+ label_names (list[str]): List of label names in the project.
45
+ project_id (int): Label Studio project ID to export from.
46
+ is_openfoodfacts_dataset (bool): Whether the dataset is an Open Food
47
+ Facts dataset. If True, the dataset will include additional
48
+ metadata fields specific to Open Food Facts (`barcode` and
49
+ `off_image_id`).
50
+ image_max_size (int | None): Maximum size (in pixels) for the images.
51
+ If None, no resizing is performed. Defaults to None.
52
+ view_id (int | None): Label Studio view ID to export from. If None,
53
+ all tasks are exported. Defaults to None.
54
+ merge_labels (bool): Whether to merge all labels into a single label
55
+ named "object". Defaults to False.
56
+ use_aws_cache (bool): Whether to use the AWS image cache when
57
+ downloading images. Defaults to True.
58
+ revision (str): The dataset revision to push to. Defaults to 'main'.
59
+ """
30
60
  if merge_labels:
31
61
  label_names = ["object"]
32
62
 
33
63
  logger.info(
34
- "Project ID: %d, label names: %s, repo_id: %s, revision: %s",
64
+ "Project ID: %d, label names: %s, repo_id: %s, revision: %s, view ID: %s",
35
65
  project_id,
36
66
  label_names,
37
67
  repo_id,
38
68
  revision,
69
+ view_id,
39
70
  )
40
71
 
41
72
  for split in ["train", "val"]:
@@ -45,7 +76,9 @@ def export_from_ls_to_hf_object_detection(
45
76
  tmp_dir = Path(tmp_dir_str)
46
77
  logger.info("Saving samples to temporary directory: %s", tmp_dir)
47
78
  for i, task in tqdm.tqdm(
48
- enumerate(ls.tasks.list(project=project_id, fields="all")),
79
+ enumerate(
80
+ ls.tasks.list(project=project_id, fields="all", view=view_id)
81
+ ),
49
82
  desc="tasks",
50
83
  ):
51
84
  if task.data["split"] != split:
@@ -56,15 +89,17 @@ def export_from_ls_to_hf_object_detection(
56
89
  label_names=label_names,
57
90
  merge_labels=merge_labels,
58
91
  use_aws_cache=use_aws_cache,
92
+ image_max_size=image_max_size,
59
93
  )
60
94
  if sample is not None:
61
95
  # Save output as pickle
62
96
  with open(tmp_dir / f"{split}_{i:05}.pkl", "wb") as f:
63
97
  pickle.dump(sample, f)
64
98
 
99
+ features = get_hf_object_detection_features(is_openfoodfacts_dataset)
65
100
  hf_ds = datasets.Dataset.from_generator(
66
101
  functools.partial(_pickle_sample_generator, tmp_dir),
67
- features=HF_DS_OBJECT_DETECTION_FEATURES,
102
+ features=features,
68
103
  )
69
104
  hf_ds.push_to_hub(repo_id, split=split, revision=revision)
70
105
 
@@ -78,12 +113,32 @@ def export_from_ls_to_ultralytics_object_detection(
78
113
  error_raise: bool = True,
79
114
  merge_labels: bool = False,
80
115
  use_aws_cache: bool = True,
116
+ view_id: int | None = None,
117
+ image_max_size: int | None = None,
81
118
  ):
82
119
  """Export annotations from a Label Studio project to the Ultralytics
83
120
  format.
84
121
 
85
122
  The Label Studio project should be an object detection project with a
86
123
  single rectanglelabels annotation result per task.
124
+
125
+ Args:
126
+ ls (LabelStudio): Label Studio client instance.
127
+ output_dir (Path): Path to the output directory.
128
+ label_names (list[str]): List of label names in the project.
129
+ project_id (int): Label Studio project ID to export from.
130
+ train_ratio (float): Ratio of training samples. The rest will be used
131
+ for validation. Defaults to 0.8.
132
+ error_raise (bool): Whether to raise an error if an image fails to
133
+ download. If False, the image will be skipped. Defaults to True.
134
+ merge_labels (bool): Whether to merge all labels into a single label
135
+ named "object". Defaults to False.
136
+ use_aws_cache (bool): Whether to use the AWS image cache when
137
+ downloading images. Defaults to True.
138
+ view_id (int | None): Label Studio view ID to export from. If None,
139
+ all tasks are exported. Defaults to None.
140
+ image_max_size (int | None): Maximum size (in pixels) for the images.
141
+ If None, no resizing is performed. Defaults to None.
87
142
  """
88
143
  if merge_labels:
89
144
  label_names = ["object"]
@@ -101,7 +156,7 @@ def export_from_ls_to_ultralytics_object_detection(
101
156
  (images_dir / split).mkdir(parents=True, exist_ok=True)
102
157
 
103
158
  for task in tqdm.tqdm(
104
- ls.tasks.list(project=project_id, fields="all"),
159
+ ls.tasks.list(project=project_id, fields="all", view=view_id),
105
160
  desc="tasks",
106
161
  ):
107
162
  split = task.data.get("split")
@@ -179,18 +234,28 @@ def export_from_ls_to_ultralytics_object_detection(
179
234
  has_valid_annotation = True
180
235
 
181
236
  if has_valid_annotation:
182
- download_output = download_image(
237
+ image = download_image(
183
238
  image_url,
184
- return_struct=True,
239
+ return_struct=False,
185
240
  error_raise=error_raise,
186
241
  use_cache=use_aws_cache,
187
242
  )
188
- if download_output is None:
243
+ if image is None:
189
244
  logger.error("Failed to download image: %s", image_url)
190
245
  continue
191
246
 
192
- with (images_dir / split / f"{image_id}.jpg").open("wb") as f:
193
- f.write(download_output.image_bytes)
247
+ image = typing.cast(Image.Image, image)
248
+
249
+ # Rotate image according to exif orientation using Pillow
250
+ ImageOps.exif_transpose(image, in_place=True)
251
+ # Resize image if larger than max size
252
+ if image_max_size is not None and (
253
+ image.width > image_max_size or image.height > image_max_size
254
+ ):
255
+ image.thumbnail(
256
+ (image_max_size, image_max_size), Image.Resampling.LANCZOS
257
+ )
258
+ image.save(images_dir / split / f"{image_id}.jpg", format="JPEG")
194
259
 
195
260
  with (output_dir / "data.yaml").open("w") as f:
196
261
  f.write("path: data\n")
@@ -208,6 +273,7 @@ def export_from_hf_to_ultralytics_object_detection(
208
273
  download_images: bool = True,
209
274
  error_raise: bool = True,
210
275
  use_aws_cache: bool = True,
276
+ image_max_size: int | None = None,
211
277
  revision: str = "main",
212
278
  ):
213
279
  """Export annotations from a Hugging Face dataset project to the
@@ -228,6 +294,8 @@ def export_from_hf_to_ultralytics_object_detection(
228
294
  use_aws_cache (bool): Whether to use the AWS image cache when
229
295
  downloading images. This option is only used if `download_images`
230
296
  is True. Defaults to True.
297
+ image_max_size (int | None): Maximum size (in pixels) for the images.
298
+ If None, no resizing is performed. Defaults to None.
231
299
  revision (str): The dataset revision to load. Defaults to 'main'.
232
300
  """
233
301
  logger.info("Repo ID: %s, revision: %s", repo_id, revision)
@@ -263,21 +331,31 @@ def export_from_hf_to_ultralytics_object_detection(
263
331
  "`download_images` to False."
264
332
  )
265
333
  image_url = sample["meta"]["image_url"]
266
- download_output = download_image(
334
+ image = download_image(
267
335
  image_url,
268
- return_struct=True,
336
+ return_struct=False,
269
337
  error_raise=error_raise,
270
338
  use_cache=use_aws_cache,
271
339
  )
272
- if download_output is None:
340
+ if image is None:
273
341
  logger.error("Failed to download image: %s", image_url)
274
342
  continue
275
-
276
- with (split_images_dir / f"{image_id}.jpg").open("wb") as f:
277
- f.write(download_output.image_bytes)
278
343
  else:
279
344
  image = sample["image"]
280
- image.save(split_images_dir / f"{image_id}.jpg")
345
+
346
+ image = typing.cast(Image.Image, image)
347
+ # Rotate image according to exif orientation using Pillow
348
+ # If the image source is Hugging Face, EXIF data is not preserved,
349
+ # so this step is only useful when downloading images.
350
+ ImageOps.exif_transpose(image, in_place=True)
351
+ # Resize image if larger than max size
352
+ if image_max_size is not None and (
353
+ image.width > image_max_size or image.height > image_max_size
354
+ ):
355
+ image.thumbnail(
356
+ (image_max_size, image_max_size), Image.Resampling.LANCZOS
357
+ )
358
+ image.save(split_images_dir / f"{image_id}.jpg")
281
359
 
282
360
  objects = sample["objects"]
283
361
  bboxes = objects["bbox"]
labelr/main.py CHANGED
@@ -4,11 +4,13 @@ import typer
4
4
  from openfoodfacts.utils import get_logger
5
5
 
6
6
  from labelr.apps import datasets as dataset_app
7
+ from labelr.apps import directus as directus_app
7
8
  from labelr.apps import evaluate as evaluate_app
8
9
  from labelr.apps import google_batch as google_batch_app
9
10
  from labelr.apps import hugging_face as hf_app
10
11
  from labelr.apps import label_studio as ls_app
11
12
  from labelr.apps import train as train_app
13
+ from labelr import config as _config
12
14
 
13
15
  app = typer.Typer(pretty_exceptions_show_locals=False)
14
16
 
@@ -60,6 +62,17 @@ def predict(
60
62
  typer.echo(result)
61
63
 
62
64
 
65
+ @app.command()
66
+ def config(name: str, value: str):
67
+ """Set a Labelr configuration value.
68
+
69
+ The configuration is stored in a JSON file at ~/.config/labelr/config.json.
70
+ """
71
+ typer.echo(f"Set '{name}' to '{value}'")
72
+ _config.set_file_config(name, value)
73
+ typer.echo(f"Configuration saved to {_config.CONFIG_PATH}")
74
+
75
+
63
76
  app.add_typer(
64
77
  ls_app.app,
65
78
  name="ls",
@@ -90,6 +103,9 @@ app.add_typer(
90
103
  name="google-batch",
91
104
  help="Generate datasets and launch batch jobs on Google Gemini.",
92
105
  )
106
+ app.add_typer(
107
+ directus_app.app, name="directus", help="Manage directus collections and items."
108
+ )
93
109
 
94
110
  if __name__ == "__main__":
95
111
  app()
@@ -8,7 +8,7 @@ import PIL
8
8
  from openfoodfacts import Flavor
9
9
  from openfoodfacts.barcode import normalize_barcode
10
10
  from openfoodfacts.images import download_image, generate_image_url
11
- from PIL import ImageOps
11
+ from PIL import Image, ImageOps
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
@@ -153,6 +153,7 @@ def format_object_detection_sample_to_hf(
153
153
  label_names: list[str],
154
154
  merge_labels: bool = False,
155
155
  use_aws_cache: bool = False,
156
+ image_max_size: int | None = None,
156
157
  ) -> dict | None:
157
158
  """Format a Label Studio object detection sample to Hugging Face format.
158
159
 
@@ -163,6 +164,8 @@ def format_object_detection_sample_to_hf(
163
164
  merge_labels: Whether to merge all labels into a single label (the
164
165
  first label in `label_names`).
165
166
  use_aws_cache: Whether to use AWS cache when downloading images.
167
+ image_max_size: Maximum size (in pixels) for the images.
168
+ If None, no resizing is performed. Defaults to None.
166
169
 
167
170
  Returns:
168
171
  The formatted sample, or None in the following cases:
@@ -184,7 +187,8 @@ def format_object_detection_sample_to_hf(
184
187
 
185
188
  for annotation_result in annotation["result"]:
186
189
  if annotation_result["type"] != "rectanglelabels":
187
- raise ValueError("Invalid annotation type: %s" % annotation_result["type"])
190
+ continue
191
+ # raise ValueError("Invalid annotation type: %s" % annotation_result["type"])
188
192
 
189
193
  value = annotation_result["value"]
190
194
  x_min = value["x"] / 100
@@ -205,21 +209,34 @@ def format_object_detection_sample_to_hf(
205
209
  logger.error("Failed to download image: %s", image_url)
206
210
  return None
207
211
 
212
+ image = typing.cast(Image.Image, image)
208
213
  # Correct image orientation using EXIF data
209
214
  # Label Studio provides bounding boxes based on the displayed image (after
210
215
  # eventual EXIF rotation), so we need to apply the same transformation to
211
216
  # the image.
212
217
  # Indeed, Hugging Face stores images without applying EXIF rotation, and
213
218
  # EXIF data is not preserved in the dataset.
214
- ImageOps.exif_transpose(typing.cast(PIL.Image.Image, image), in_place=True)
219
+ ImageOps.exif_transpose(image, in_place=True)
220
+
221
+ # Resize image if larger than max size
222
+ if image_max_size is not None and (
223
+ image.width > image_max_size or image.height > image_max_size
224
+ ):
225
+ image.thumbnail((image_max_size, image_max_size), Image.Resampling.LANCZOS)
226
+
227
+ meta = task_data.get("meta", {})
228
+ barcode = meta.get("barcode", None)
229
+ off_image_id = meta.get("off_image_id", None)
230
+ width = image.width
231
+ height = image.height
215
232
  return {
216
233
  "image_id": task_data["image_id"],
217
234
  "image": image,
218
- "width": task_data["meta"]["width"],
219
- "height": task_data["meta"]["height"],
235
+ "width": width,
236
+ "height": height,
220
237
  "meta": {
221
- "barcode": task_data["meta"]["barcode"],
222
- "off_image_id": task_data["meta"]["off_image_id"],
238
+ "barcode": barcode,
239
+ "off_image_id": off_image_id,
223
240
  "image_url": image_url,
224
241
  },
225
242
  "objects": {
@@ -230,16 +247,23 @@ def format_object_detection_sample_to_hf(
230
247
  }
231
248
 
232
249
 
233
- # The HuggingFace Dataset features
234
- HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
235
- {
250
+ def get_hf_object_detection_features(
251
+ is_openfoodfacts_dataset: bool,
252
+ ) -> datasets.Features:
253
+ """Get the HuggingFace Dataset features for object detection.
254
+
255
+ Args:
256
+ is_openfoodfacts_dataset (bool): Whether the dataset is an Open Food
257
+ Facts dataset. If True, the dataset will include additional
258
+ metadata fields specific to Open Food Facts (`barcode` and
259
+ `off_image_id`).
260
+ """
261
+ features_dict = {
236
262
  "image_id": datasets.Value("string"),
237
263
  "image": datasets.features.Image(),
238
264
  "width": datasets.Value("int64"),
239
265
  "height": datasets.Value("int64"),
240
266
  "meta": {
241
- "barcode": datasets.Value("string"),
242
- "off_image_id": datasets.Value("string"),
243
267
  "image_url": datasets.Value("string"),
244
268
  },
245
269
  "objects": {
@@ -248,4 +272,9 @@ HF_DS_OBJECT_DETECTION_FEATURES = datasets.Features(
248
272
  "category_name": datasets.Sequence(datasets.Value("string")),
249
273
  },
250
274
  }
251
- )
275
+
276
+ if is_openfoodfacts_dataset:
277
+ features_dict["meta"]["barcode"] = datasets.Value("string")
278
+ features_dict["meta"]["off_image_id"] = datasets.Value("string")
279
+
280
+ return datasets.Features(features_dict)