labelr 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/check.py CHANGED
@@ -1,30 +1,64 @@
+ import typing
  from collections import defaultdict
  from pathlib import Path

  import imagehash
  import tqdm
  from label_studio_sdk.client import LabelStudio
- from openfoodfacts.utils import get_image_from_url, get_logger
+ from openfoodfacts.types import JSONType
+ from openfoodfacts.utils import ImageDownloadItem, get_image_from_url, get_logger
  from PIL import Image

  logger = get_logger(__name__)

- def check_ls_dataset(ls: LabelStudio, project_id: int):
+ def check_ls_dataset(
+     ls: LabelStudio,
+     project_id: int,
+     view_id: int | None = None,
+     delete_missing_images: bool = False,
+     delete_duplicate_images: bool = False,
+ ):
+     """Perform sanity checks on a Label Studio dataset.
+
+     This function checks for:
+     - Tasks with missing images (404)
+     - Duplicate images based on perceptual hash (pHash)
+     - Tasks with multiple annotations
+
+     This function doesn't modify the dataset, except for optionally deleting
+     tasks with missing or duplicate images when `delete_missing_images` or
+     `delete_duplicate_images` is set to True.
+
+     Args:
+         ls (LabelStudio): Label Studio client instance.
+         project_id (int): ID of the Label Studio project to check.
+         view_id (int | None): ID of the Label Studio view to check. If None,
+             no filtering is done.
+         delete_missing_images (bool): Whether to delete tasks with missing
+             images.
+         delete_duplicate_images (bool): Whether to delete tasks with
+             duplicate images. If one task has annotations and the other
+             doesn't, the task with annotations is kept. Otherwise, the most
+             recent task is kept.
+     """
      skipped = 0
      not_annotated = 0
      annotated = 0
+     deleted = 0
+     multiple_annotations = 0
      hash_map = defaultdict(list)
      for task in tqdm.tqdm(
-         ls.tasks.list(project=project_id, fields="all"), desc="tasks"
+         ls.tasks.list(project=project_id, fields="all", view=view_id), desc="tasks"
      ):
-         annotations = task.annotations
+         annotations = typing.cast(list[JSONType], task.annotations)

          if len(annotations) == 0:
              not_annotated += 1
              continue
          elif len(annotations) > 1:
              logger.warning("Task has multiple annotations: %s", task.id)
+             multiple_annotations += 1
              continue

          annotation = annotations[0]
@@ -34,20 +68,47 @@ def check_ls_dataset(ls: LabelStudio, project_id: int):

          annotated += 1
          image_url = task.data["image_url"]
-         image = get_image_from_url(image_url)
-         image_hash = str(imagehash.phash(image))
+         image_struct = typing.cast(
+             ImageDownloadItem,
+             get_image_from_url(image_url, return_struct=True, error_raise=False),
+         )
+
+         if image_struct.response.status_code == 404:
+             logger.warning("Image not found (404): %s", image_url)
+
+             if delete_missing_images:
+                 ls.tasks.delete(task.id)
+                 deleted += 1
+                 logger.info("Deleted task with missing image: %s", task.id)
+             continue
+
+         if image_struct.image is None:
+             logger.warning("Could not open image: %s", image_url)
+             continue
+
+         image_hash = str(imagehash.phash(image_struct.image))
          hash_map[image_hash].append(task.id)

      for image_hash, task_ids in hash_map.items():
          if len(task_ids) > 1:
              logger.warning("Duplicate images: %s", task_ids)
+             if delete_duplicate_images:
+                 tasks = [ls.tasks.get(id=task_id) for task_id in task_ids]
+                 # We sort the tasks by whether they have annotations, so that
+                 # we keep one with at least one annotation.
+                 for task in sorted(tasks, key=lambda x: len(x.annotations) > 0)[:-1]:
+                     logger.info("Deleting duplicate task: %s", task.id)
+                     ls.tasks.delete(task.id)
+                     deleted += 1

      logger.info(
-         "Tasks - annotated: %d, skipped: %d, not annotated: %d",
+         "Tasks - annotated: %d, skipped: %d, not annotated: %d, multiple annotations: %d",
          annotated,
          skipped,
          not_annotated,
+         multiple_annotations,
      )
+     logger.info("Deleted tasks with missing images: %d", deleted)


  def check_local_dataset(dataset_dir: Path, remove: bool = False):
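
As a usage sketch (not part of the package), here is how the extended `check_ls_dataset` might be called; the base URL and API key are placeholders for your own Label Studio instance:

    from label_studio_sdk.client import LabelStudio

    from labelr.check import check_ls_dataset

    # Placeholder credentials; point these at your own instance.
    ls = LabelStudio(base_url="http://127.0.0.1:8080", api_key="<your-api-key>")

    # Read-only pass: report 404 images, pHash duplicates and
    # tasks carrying more than one annotation.
    check_ls_dataset(ls, project_id=1)

    # Destructive pass: additionally delete tasks whose image is
    # missing or duplicated (annotated duplicates are kept).
    check_ls_dataset(
        ls,
        project_id=1,
        delete_missing_images=True,
        delete_duplicate_images=True,
    )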
labelr/config.py CHANGED
@@ -1 +1,57 @@
- LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
+ from pathlib import Path
+
+ from pydantic import BaseModel, Field
+ import os
+
+ CONFIG_PATH = Path("~").expanduser() / ".config/.labelr/config.json"
+
+
+ # validate_assignment makes the model validate every time it is updated
+ class LabelrConfig(BaseModel, validate_assignment=True):
+     label_studio_url: str = Field(
+         default="http://127.0.0.1:8080",
+         description="URL of the Label Studio instance to use. Defaults to http://127.0.0.1:8080.",
+     )
+     label_studio_api_key: str | None = Field(
+         default=None,
+         description="API key for Label Studio.",
+     )
+
+
+ def get_config() -> LabelrConfig:
+     """Get the labelr configuration.
+
+     The configuration can come from (in order of precedence):
+     - Environment variables
+     - A JSON file (see below)
+
+     The configuration is stored in a JSON file at ~/.config/.labelr/config.json.
+
+     The following environment variables are supported:
+     - LABELR_LABEL_STUDIO_URL
+     - LABELR_LABEL_STUDIO_API_KEY
+     """
+     if CONFIG_PATH.exists():
+         config = LabelrConfig.model_validate_json(CONFIG_PATH.read_bytes())
+
+         if "LABELR_LABEL_STUDIO_URL" in os.environ:
+             config.label_studio_url = os.environ["LABELR_LABEL_STUDIO_URL"]
+         if "LABELR_LABEL_STUDIO_API_KEY" in os.environ:
+             config.label_studio_api_key = os.environ["LABELR_LABEL_STUDIO_API_KEY"]
+         return config
+     else:
+         return LabelrConfig()
+
+
+ def set_file_config(key: str, value: str):
+     """Update the labelr configuration file.
+
+     The configuration is stored in a JSON file at ~/.config/.labelr/config.json.
+     """
+     config = get_config()
+     setattr(config, key, value)
+     CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
+     CONFIG_PATH.write_text(config.model_dump_json(indent=2))
+
+
+ config = get_config()
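
A usage sketch of the precedence described in the docstring above (the key and URL values are placeholders): once a config file exists, environment variables override its values for the current process.

    import os

    from labelr.config import get_config, set_file_config

    # Persist a value to ~/.config/.labelr/config.json ...
    set_file_config("label_studio_api_key", "<file-key>")

    # ... then override the URL for this process via the environment.
    os.environ["LABELR_LABEL_STUDIO_URL"] = "https://annotate.example.org"

    config = get_config()
    print(config.label_studio_url)      # from the environment variable
    print(config.label_studio_api_key)  # from the config file

Note that the environment variables are only applied when the config file exists; with no file, `get_config()` returns the defaults.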
labelr/export/classification.py ADDED
@@ -0,0 +1,114 @@
+ import functools
+ import logging
+ import pickle
+ import tempfile
+ from pathlib import Path
+
+ import datasets
+ from openfoodfacts.images import generate_image_url
+ from openfoodfacts.types import Flavor
+ from PIL import Image, ImageOps
+
+ from labelr.export.common import _pickle_sample_generator
+ from labelr.sample.classification import HF_DS_CLASSIFICATION_FEATURES
+
+ logger = logging.getLogger(__name__)
+
+
+ def export_from_ultralytics_to_hf_classification(
+     dataset_dir: Path,
+     repo_id: str,
+     label_names: list[str],
+     merge_labels: bool = False,
+     is_openfoodfacts_dataset: bool = False,
+     openfoodfacts_flavor: Flavor = Flavor.off,
+ ) -> None:
+     """Export an Ultralytics classification dataset to a Hugging Face dataset.
+
+     The Ultralytics dataset directory should contain 'train', 'val' and/or
+     'test' subdirectories, each containing one subdirectory per label.
+
+     Args:
+         dataset_dir (Path): Path to the Ultralytics dataset directory.
+         repo_id (str): Hugging Face repository ID to push the dataset to.
+         label_names (list[str]): List of label names.
+         merge_labels (bool): Whether to merge all labels into a single label
+             named 'object'.
+         is_openfoodfacts_dataset (bool): Whether the dataset is from
+             Open Food Facts. If True, `off_image_id` and `image_url` are
+             generated automatically; `off_image_id` is extracted from the
+             image filename.
+         openfoodfacts_flavor (Flavor): Flavor of the Open Food Facts dataset.
+             Ignored if `is_openfoodfacts_dataset` is False.
+     """
+     logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)
+
+     if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
+         raise ValueError(
+             f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
+         )
+
+     for split in ["train", "val", "test"]:
+         split_dir = dataset_dir / split
+
+         if not split_dir.is_dir():
+             logger.info("Skipping missing split directory: %s", split_dir)
+             continue
+
+         with tempfile.TemporaryDirectory() as tmp_dir_str:
+             tmp_dir = Path(tmp_dir_str)
+             for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
+                 label_name = label_dir.name
+                 if merge_labels:
+                     label_name = "object"
+                 if label_name not in label_names:
+                     raise ValueError(
+                         "Label name %s not in provided label names (label names: %s)"
+                         % (label_name, label_names),
+                     )
+                 label_id = label_names.index(label_name)
+
+                 for image_path in label_dir.glob("*"):
+                     if is_openfoodfacts_dataset:
+                         image_stem_parts = image_path.stem.split("_")
+                         barcode = image_stem_parts[0]
+                         off_image_id = image_stem_parts[1]
+                         image_id = f"{barcode}_{off_image_id}"
+                         image_url = generate_image_url(
+                             barcode, off_image_id, flavor=openfoodfacts_flavor
+                         )
+                     else:
+                         image_id = image_path.stem
+                         barcode = ""
+                         off_image_id = ""
+                         image_url = ""
+                     image = Image.open(image_path)
+                     image.load()
+
+                     if image.mode != "RGB":
+                         image = image.convert("RGB")
+
+                     # Rotate image according to exif orientation using Pillow
+                     ImageOps.exif_transpose(image, in_place=True)
+                     sample = {
+                         "image_id": image_id,
+                         "image": image,
+                         "width": image.width,
+                         "height": image.height,
+                         "meta": {
+                             "barcode": barcode,
+                             "off_image_id": off_image_id,
+                             "image_url": image_url,
+                         },
+                         "category_id": label_id,
+                         "category_name": label_name,
+                     }
+                     # Save output as pickle
+                     with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                         pickle.dump(sample, f)
+
+             hf_ds = datasets.Dataset.from_generator(
+                 functools.partial(_pickle_sample_generator, tmp_dir),
+                 features=HF_DS_CLASSIFICATION_FEATURES,
+             )
+             hf_ds.push_to_hub(repo_id, split=split)
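
A minimal usage sketch, assuming an Ultralytics-style classification layout on disk; the directory, repository ID and label names below are placeholders:

    from pathlib import Path

    from labelr.export.classification import (
        export_from_ultralytics_to_hf_classification,
    )

    # Expected layout: dataset/{train,val,test}/<label_name>/*.jpg
    export_from_ultralytics_to_hf_classification(
        dataset_dir=Path("dataset"),
        repo_id="my-org/my-classification-dataset",
        label_names=["nutrition_table", "other"],
    )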
labelr/export/common.py ADDED
@@ -0,0 +1,42 @@
+ import pickle
+ from pathlib import Path
+
+ from openfoodfacts.types import Flavor
+
+ from labelr.types import TaskType
+
+
+ def _pickle_sample_generator(dir: Path):
+     """Generator that yields samples from pickles in a directory."""
+     for pkl in dir.glob("*.pkl"):
+         with open(pkl, "rb") as f:
+             yield pickle.load(f)
+
+
+ def export_from_ultralytics_to_hf(
+     task_type: TaskType,
+     dataset_dir: Path,
+     repo_id: str,
+     label_names: list[str],
+     merge_labels: bool = False,
+     is_openfoodfacts_dataset: bool = False,
+     openfoodfacts_flavor: Flavor = Flavor.off,
+ ) -> None:
+     from labelr.export.classification import (
+         export_from_ultralytics_to_hf_classification,
+     )
+
+     if task_type != TaskType.classification:
+         raise NotImplementedError(
+             "Only classification task is currently supported for Ultralytics to HF export"
+         )
+
+     if task_type == TaskType.classification:
+         export_from_ultralytics_to_hf_classification(
+             dataset_dir=dataset_dir,
+             repo_id=repo_id,
+             label_names=label_names,
+             merge_labels=merge_labels,
+             is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+             openfoodfacts_flavor=openfoodfacts_flavor,
+         )
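
The dispatcher can be called the same way; classification is the only `TaskType` it currently accepts (placeholder paths and names again):

    from pathlib import Path

    from labelr.export.common import export_from_ultralytics_to_hf
    from labelr.types import TaskType

    export_from_ultralytics_to_hf(
        task_type=TaskType.classification,  # anything else raises NotImplementedError
        dataset_dir=Path("dataset"),
        repo_id="my-org/my-classification-dataset",
        label_names=["nutrition_table", "other"],
    )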
labelr/export/llm.py ADDED
@@ -0,0 +1,91 @@
+ import functools
+ import logging
+ import pickle
+ import tempfile
+ import typing
+ from collections.abc import Iterator
+ from pathlib import Path
+
+ import datasets
+ import tqdm
+ from PIL import Image, ImageOps
+
+ from labelr.export.common import _pickle_sample_generator
+ from labelr.sample.llm import (
+     HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+     LLMImageExtractionSample,
+ )
+ from labelr.utils import PathWithContext
+
+ logger = logging.getLogger(__name__)
+
+
+ def export_to_hf_llm_image_extraction(
+     sample_iter: Iterator[LLMImageExtractionSample],
+     split: str,
+     repo_id: str,
+     revision: str = "main",
+     tmp_dir: Path | None = None,
+     image_max_size: int | None = None,
+ ) -> None:
+     """Export LLM image extraction samples to a Hugging Face dataset.
+
+     Args:
+         sample_iter (Iterator[LLMImageExtractionSample]): Iterator of samples
+             to export.
+         split (str): Name of the dataset split (e.g., 'train', 'val').
+         repo_id (str): Hugging Face repository ID to push the dataset to.
+         revision (str): Revision (branch, tag or commit) to use for the
+             Hugging Face Datasets repository.
+         tmp_dir (Path | None): Temporary directory to use for intermediate
+             files. If None, a temporary directory will be created
+             automatically.
+         image_max_size (int | None): Maximum size (in pixels) for the images.
+     """
+     logger.info(
+         "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s, image_max_size: %s",
+         repo_id,
+         revision,
+         split,
+         tmp_dir,
+         image_max_size,
+     )
+
+     tmp_dir_with_context: PathWithContext | tempfile.TemporaryDirectory
+     if tmp_dir:
+         tmp_dir.mkdir(parents=True, exist_ok=True)
+         tmp_dir_with_context = PathWithContext(tmp_dir)
+     else:
+         tmp_dir_with_context = tempfile.TemporaryDirectory()
+
+     with tmp_dir_with_context as tmp_dir_str:
+         tmp_dir = Path(tmp_dir_str)
+         for sample in tqdm.tqdm(sample_iter, desc="samples"):
+             image = sample.image
+             # Rotate image according to exif orientation using Pillow
+             image = typing.cast(Image.Image, ImageOps.exif_transpose(image))
+
+             if image_max_size is not None:
+                 if image.height > image_max_size or image.width > image_max_size:
+                     image.thumbnail(
+                         (image_max_size, image_max_size),
+                         Image.Resampling.LANCZOS,
+                     )
+             image_id = sample.image_id
+             json_sample = {
+                 "image_id": image_id,
+                 "image": image,
+                 "meta": {
+                     k: v for k, v in sample.meta.model_dump().items() if v is not None
+                 },
+                 "output": sample.output,
+             }
+             # Save output as pickle
+             with open(tmp_dir / f"{split}_{image_id}.pkl", "wb") as f:
+                 pickle.dump(json_sample, f)
+
+         hf_ds = datasets.Dataset.from_generator(
+             functools.partial(_pickle_sample_generator, tmp_dir),
+             features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
+         )
+         hf_ds.push_to_hub(repo_id, split=split, revision=revision)
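
A usage sketch under stated assumptions: `samples` stands in for an iterator of `LLMImageExtractionSample` objects built elsewhere, and the repository ID is a placeholder.

    from labelr.export.llm import export_to_hf_llm_image_extraction

    # `samples` is assumed to be an Iterator[LLMImageExtractionSample]
    # produced by upstream code (e.g. from Label Studio annotations).
    export_to_hf_llm_image_extraction(
        sample_iter=samples,
        split="train",
        repo_id="my-org/my-llm-extraction-dataset",
        image_max_size=1024,  # downscale anything larger than 1024 px
    )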