labelr 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {labelr-0.4.0/src/labelr.egg-info → labelr-0.5.0}/PKG-INFO +6 -1
  2. {labelr-0.4.0 → labelr-0.5.0}/README.md +5 -1
  3. {labelr-0.4.0 → labelr-0.5.0}/pyproject.toml +2 -1
  4. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/apps/datasets.py +6 -6
  5. labelr-0.5.0/src/labelr/apps/train.py +158 -0
  6. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/export.py +64 -5
  7. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/main.py +7 -0
  8. {labelr-0.4.0 → labelr-0.5.0/src/labelr.egg-info}/PKG-INFO +6 -1
  9. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/SOURCES.txt +1 -0
  10. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/requires.txt +1 -0
  11. {labelr-0.4.0 → labelr-0.5.0}/LICENSE +0 -0
  12. {labelr-0.4.0 → labelr-0.5.0}/setup.cfg +0 -0
  13. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/__init__.py +0 -0
  14. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/__main__.py +0 -0
  15. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/annotate.py +0 -0
  16. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/apps/__init__.py +0 -0
  17. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/apps/projects.py +0 -0
  18. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/apps/users.py +0 -0
  19. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/check.py +0 -0
  20. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/config.py +0 -0
  21. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/project_config.py +0 -0
  22. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/sample.py +0 -0
  23. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/types.py +0 -0
  24. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/dependency_links.txt +0 -0
  25. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/entry_points.txt +0 -0
  26. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: labelr
- Version: 0.4.0
+ Version: 0.5.0
  Summary: A command-line tool to manage labeling tasks with Label Studio.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -11,6 +11,7 @@ Requires-Dist: label-studio-sdk>=1.0.8
  Requires-Dist: more-itertools>=10.5.0
  Requires-Dist: openfoodfacts>=2.9.0
  Requires-Dist: typer>=0.15.1
+ Requires-Dist: google-cloud-batch==0.18.0
  Provides-Extra: ultralytics
  Requires-Dist: ultralytics>=8.3.49; extra == "ultralytics"
  Dynamic: license-file
@@ -138,3 +139,7 @@ labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo
  ```

  where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+ ### Launch training jobs
+
+ You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -120,4 +120,8 @@ To export the data to a Hugging Face dataset, use the following command:
  labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo-id REPO_ID --label-names 'product,price-tag'
  ```

- where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+ where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+ ### Launch training jobs
+
+ You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -1,6 +1,6 @@
  [project]
  name = "labelr"
- version = "0.4.0"
+ version = "0.5.0"
  description = "A command-line tool to manage labeling tasks with Label Studio."
  readme = "README.md"
  requires-python = ">=3.10"
@@ -11,6 +11,7 @@ dependencies = [
  "more-itertools>=10.5.0",
  "openfoodfacts>=2.9.0",
  "typer>=0.15.1",
+ "google-cloud-batch==0.18.0",
  ]

  [project.scripts]
@@ -211,9 +211,9 @@ def export(
  from label_studio_sdk.client import LabelStudio

  from labelr.export import (
- export_from_hf_to_ultralytics,
- export_from_ls_to_hf,
- export_from_ls_to_ultralytics,
+ export_from_hf_to_ultralytics_object_detection,
+ export_from_ls_to_hf_object_detection,
+ export_from_ls_to_ultralytics_object_detection,
  )

  if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
@@ -254,7 +254,7 @@ def export(
  ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
  if to == ExportDestination.hf:
  repo_id = typing.cast(str, repo_id)
- export_from_ls_to_hf(
+ export_from_ls_to_hf_object_detection(
  ls,
  repo_id=repo_id,
  label_names=typing.cast(list[str], label_names_list),
@@ -263,7 +263,7 @@ def export(
  use_aws_cache=use_aws_cache,
  )
  elif to == ExportDestination.ultralytics:
- export_from_ls_to_ultralytics(
+ export_from_ls_to_ultralytics_object_detection(
  ls,
  typing.cast(Path, output_dir),
  typing.cast(list[str], label_names_list),
@@ -280,7 +280,7 @@ def export(
  "Only object detection task is currently supported with HF source"
  )
  if to == ExportDestination.ultralytics:
- export_from_hf_to_ultralytics(
+ export_from_hf_to_ultralytics_object_detection(
  typing.cast(str, repo_id),
  typing.cast(Path, output_dir),
  download_images=download_images,
@@ -0,0 +1,158 @@
+ import datetime
+
+ import typer
+ from google.cloud import batch_v1
+
+ app = typer.Typer()
+
+
+ @app.command()
+ def train_object_detection(
+ wandb_project: str = typer.Option(
+ "train-yolo", help="The Weights & Biases project name."
+ ),
+ wandb_api_key: str = typer.Option(..., envvar="WANDB_API_KEY"),
+ hf_token: str = typer.Option(
+ ...,
+ help="The Hugging Face token, used to push the trained model to Hugging Face Hub.",
+ ),
+ run_name: str = typer.Option(..., help="A name for the training run."),
+ hf_repo_id: str = typer.Option(
+ ..., help="The Hugging Face dataset repository ID to use to train."
+ ),
+ hf_trained_model_repo_id: str = typer.Option(
+ ..., help="The Hugging Face repository ID where to push the trained model."
+ ),
+ epochs: int = typer.Option(100, help="Number of training epochs."),
+ imgsz: int = typer.Option(640, help="Size of the image during training."),
+ batch_size: int = typer.Option(64, help="Batch size for training."),
+ ):
+ """Train an object detection model."""
+ env_variables = {
+ "HF_REPO_ID": hf_repo_id,
+ "HF_TRAINED_MODEL_REPO_ID": hf_trained_model_repo_id,
+ "HF_TOKEN": hf_token,
+ "WANDB_PROJECT": wandb_project,
+ "RUN_NAME": run_name,
+ "WANDB_API_KEY": wandb_api_key,
+ "EPOCHS": str(epochs),
+ "IMGSZ": str(imgsz),
+ "BATCH_SIZE": str(batch_size),
+ "USE_AWS_IMAGE_CACHE": "False",
+ }
+ job_name = "train-yolo-job"
+ job_name = job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+ job = launch_job(
+ job_name=job_name,
+ container_image_uri="europe-west9-docker.pkg.dev/robotoff/gcf-artifacts/train-yolo",
+ env_variables=env_variables,
+ )
+ typer.echo("Job launched")
+ typer.echo(job)
+
+
+ def launch_job(
+ job_name: str = typer.Argument(
+ ...,
+ help="The name of the Google Batch job that will be created. "
+ "It needs to be unique for each project and region pair.",
+ ),
+ container_image_uri: str = typer.Argument(
+ ..., help="The URI of the container image that will be run as part of the job."
+ ),
+ commands: str | None = None,
+ env_variables: dict[str, str] | None = None,
+ entrypoint: str | None = None,
+ cpu_milli: int = 4000, # in milli-CPU units (4000 = 4 CPUs). This means the task requires 4 whole CPUs.
+ memory_mib: int = 16000, # Make sure to have enough memory for the 2GB of shared memory set below.
+ boot_disk_mib: int = 100000,
+ max_retry_count: int = 1,
+ max_run_duration: str = "86400s", # 24 hours
+ task_count: int = 1,
+ accelerators_type: str = "nvidia-tesla-t4",
+ machine_type: str = "n1-standard-8",
+ google_project_id: str = "robotoff",
+ accelerators_count: int = 1,
+ region: str = "europe-west4",
+ install_gpu_drivers: bool = True,
+ ) -> batch_v1.Job:
+ """This method creates a Batch Job on GCP.
+
+ Sources:
+ * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create
+ * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types # noqa
+
+ :param google_batch_launch_config: Config to run a job on Google Batch.
+ :param batch_job_config: Config to run a specific job on Google Batch.
+ :return: Batch job information.
+
+ Returns:
+ Batch job information.
+ """
+ client = batch_v1.BatchServiceClient()
+
+ # Define what will be done as part of the job.
+ runnable = batch_v1.Runnable()
+ runnable.container = batch_v1.Runnable.Container()
+ runnable.container.image_uri = container_image_uri
+ runnable.container.entrypoint = entrypoint # type: ignore
+ # By default, /dev/shm is 64MB which is not enough for Pytorch
+ runnable.container.options = "--shm-size=2048m"
+ runnable.container.commands = commands
+
+ # Jobs can be divided into tasks. In this case, we have only one task.
+ task = batch_v1.TaskSpec()
+ task.runnables = [runnable]
+
+ # Environment variables.
+ envable = batch_v1.Environment()
+ envable.variables = env_variables or {}
+ task.environment = envable
+
+ # We can specify what resources are requested by each task.
+ resources = batch_v1.ComputeResource()
+ resources.cpu_milli = cpu_milli
+ resources.memory_mib = memory_mib
+ resources.boot_disk_mib = boot_disk_mib # type: ignore
+ task.compute_resource = resources
+
+ task.max_retry_count = max_retry_count
+ task.max_run_duration = max_run_duration # type: ignore
+
+ # Tasks are grouped inside a job using TaskGroups.
+ group = batch_v1.TaskGroup()
+ group.task_count = task_count # type: ignore
+ group.task_spec = task
+
+ # Policies are used to define on what kind of virtual machines the tasks
+ # will run on.
+ policy = batch_v1.AllocationPolicy.InstancePolicy()
+ # See list of machine types here:
+ # https://docs.cloud.google.com/compute/docs/gpus#l4-gpus
+ policy.machine_type = machine_type
+
+ accelerator = batch_v1.AllocationPolicy.Accelerator()
+ accelerator.type_ = accelerators_type
+ accelerator.count = accelerators_count
+
+ policy.accelerators = [accelerator]
+ instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate()
+ instances.policy = policy
+ instances.install_gpu_drivers = install_gpu_drivers
+ allocation_policy = batch_v1.AllocationPolicy()
+ allocation_policy.instances = [instances]
+
+ job = batch_v1.Job()
+ job.task_groups = [group]
+ job.allocation_policy = allocation_policy
+ # We use Cloud Logging as it's an out of the box available option
+ job.logs_policy = batch_v1.LogsPolicy()
+ job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING # type: ignore
+
+ create_request = batch_v1.CreateJobRequest()
+ create_request.job = job
+ create_request.job_id = job_name
+ # The job's parent is the region in which the job will run
+ create_request.parent = f"projects/{google_project_id}/locations/{region}"
+
+ return client.create_job(create_request)
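For orientation, the sketch below shows how the new `launch_job` helper could be driven directly from Python, mirroring what the Typer command above does. The two Hugging Face repository IDs, the run name, and both tokens are placeholders, and submitting a real job requires Google Cloud credentials with Batch permissions on the target project.

```python
# Hypothetical driver for the launch_job() helper added in this release.
# Placeholder values: both Hugging Face repo IDs, the run name, and both secrets.
import datetime

from labelr.apps.train import launch_job

env_variables = {
    "HF_REPO_ID": "openfoodfacts/food-detection",  # dataset to train on (placeholder)
    "HF_TRAINED_MODEL_REPO_ID": "my-org/yolo-food-detection",  # model destination (placeholder)
    "HF_TOKEN": "hf_xxx",  # placeholder secret
    "WANDB_PROJECT": "train-yolo",
    "RUN_NAME": "food-detection-run-1",  # placeholder run name
    "WANDB_API_KEY": "wandb_xxx",  # placeholder secret
    "EPOCHS": "100",
    "IMGSZ": "640",
    "BATCH_SIZE": "64",
    "USE_AWS_IMAGE_CACHE": "False",
}

# Same job-name scheme as the CLI command: a fixed prefix plus a timestamp.
job_name = "train-yolo-job-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")

job = launch_job(
    job_name=job_name,
    container_image_uri="europe-west9-docker.pkg.dev/robotoff/gcf-artifacts/train-yolo",
    env_variables=env_variables,
)
print(job.name)  # projects/<project>/locations/<region>/jobs/<job_name>
```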
@@ -29,7 +29,7 @@ def _pickle_sample_generator(dir: Path):
  yield pickle.load(f)


- def export_from_ls_to_hf(
+ def export_from_ls_to_hf_object_detection(
  ls: LabelStudio,
  repo_id: str,
  label_names: list[str],
@@ -73,7 +73,7 @@ def export_from_ls_to_hf(
  hf_ds.push_to_hub(repo_id, split=split)


- def export_from_ls_to_ultralytics(
+ def export_from_ls_to_ultralytics_object_detection(
  ls: LabelStudio,
  output_dir: Path,
  label_names: list[str],
@@ -206,21 +206,36 @@ def export_from_ls_to_ultralytics(
  f.write(f" {i}: {label_name}\n")


- def export_from_hf_to_ultralytics(
+ def export_from_hf_to_ultralytics_object_detection(
  repo_id: str,
  output_dir: Path,
  download_images: bool = True,
  error_raise: bool = True,
  use_aws_cache: bool = True,
+ revision: str = "main",
  ):
  """Export annotations from a Hugging Face dataset project to the
  Ultralytics format.

  The Label Studio project should be an object detection project with a
  single rectanglelabels annotation result per task.
+
+ Args:
+ repo_id (str): Hugging Face repository ID to load the dataset from.
+ output_dir (Path): Path to the output directory.
+ download_images (bool): Whether to download images from URLs in the
+ dataset. If False, the dataset is expected to contain an `image`
+ field with the image data.
+ error_raise (bool): Whether to raise an error if an image fails to
+ download. If False, the image will be skipped. This option is only
+ used if `download_images` is True. Defaults to True.
+ use_aws_cache (bool): Whether to use the AWS image cache when
+ downloading images. This option is only used if `download_images`
+ is True. Defaults to True.
+ revision (str): The dataset revision to load. Defaults to 'main'.
  """
  logger.info("Repo ID: %s", repo_id)
- ds = datasets.load_dataset(repo_id)
+ ds = datasets.load_dataset(repo_id, revision=revision)
  data_dir = output_dir / "data"
  data_dir.mkdir(parents=True, exist_ok=True)
  category_id_to_name = {}
@@ -233,9 +248,16 @@ def export_from_hf_to_ultralytics(
  for sample in tqdm.tqdm(ds[split], desc="samples"):
  image_id = sample["image_id"]
- image_url = sample["meta"]["image_url"]

  if download_images:
+ if "meta" not in sample or "image_url" not in sample["meta"]:
+ raise ValueError(
+ "`meta.image_url` field not found in sample. "
+ "Make sure the dataset contains the `meta.image_url` "
+ "field, which should be the URL of the image, or set "
+ "`download_images` to False."
+ )
+ image_url = sample["meta"]["image_url"]
  download_output = download_image(
  image_url,
  return_struct=True,
@@ -309,6 +331,43 @@ export_from_ultralytics_to_hf(
  "Only classification task is currently supported for Ultralytics to HF export"
  )

+ if task_type == TaskType.classification:
+ export_from_ultralytics_to_hf_classification(
+ dataset_dir=dataset_dir,
+ repo_id=repo_id,
+ label_names=label_names,
+ merge_labels=merge_labels,
+ is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+ openfoodfacts_flavor=openfoodfacts_flavor,
+ )
+
+
+ def export_from_ultralytics_to_hf_classification(
+ dataset_dir: Path,
+ repo_id: str,
+ label_names: list[str],
+ merge_labels: bool = False,
+ is_openfoodfacts_dataset: bool = False,
+ openfoodfacts_flavor: Flavor = Flavor.off,
+ ) -> None:
+ """Export an Ultralytics classification dataset to a Hugging Face dataset.
+
+ The Ultralytics dataset directory should contain 'train', 'val' and/or
+ 'test' subdirectories, each containing subdirectories for each label.
+
+ Args:
+ dataset_dir (Path): Path to the Ultralytics dataset directory.
+ repo_id (str): Hugging Face repository ID to push the dataset to.
+ label_names (list[str]): List of label names.
+ merge_labels (bool): Whether to merge all labels into a single label
+ named 'object'.
+ is_openfoodfacts_dataset (bool): Whether the dataset is from
+ Open Food Facts. If True, the `off_image_id` and `image_url` will
+ be generated automatically. `off_image_id` is extracted from the
+ image filename.
+ openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
+ is ignored if `is_openfoodfacts_dataset` is False.
+ """
  logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)

  if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
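As a usage note, the renamed Hugging Face → Ultralytics exporter now accepts a `revision` argument, so a specific dataset revision can be exported instead of always reading the default branch. A minimal sketch follows; the repository ID and output path are placeholders.

```python
from pathlib import Path

from labelr.export import export_from_hf_to_ultralytics_object_detection

export_from_hf_to_ultralytics_object_detection(
    repo_id="openfoodfacts/food-detection",           # placeholder dataset repo
    output_dir=Path("datasets/food-detection-yolo"),  # placeholder output directory
    download_images=True,   # requires a `meta.image_url` field in each sample
    error_raise=False,      # skip samples whose image fails to download
    use_aws_cache=True,
    revision="main",        # new in 0.5.0: pin a dataset revision
)
```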
@@ -5,6 +5,7 @@ from openfoodfacts.utils import get_logger

  from labelr.apps import datasets as dataset_app
  from labelr.apps import projects as project_app
+ from labelr.apps import train as train_app
  from labelr.apps import users as user_app

  app = typer.Typer(pretty_exceptions_show_locals=False)
@@ -69,5 +70,11 @@ app.add_typer(
  help="Manage datasets (convert, export, check, etc.)",
  )

+ app.add_typer(
+ train_app.app,
+ name="train",
+ help="Train models",
+ )
+
  if __name__ == "__main__":
  app()
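A quick way to confirm the wiring is to invoke the new sub-app through Typer's test runner. The snippet below is a hedged sketch: the hyphenated command name `train-object-detection` assumes Typer's default conversion of the `train_object_detection` function name.

```python
from typer.testing import CliRunner

from labelr.main import app

runner = CliRunner()
result = runner.invoke(app, ["train", "--help"])
assert result.exit_code == 0
print(result.output)  # should list the train-object-detection command
```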
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: labelr
- Version: 0.4.0
+ Version: 0.5.0
  Summary: A command-line tool to manage labeling tasks with Label Studio.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -11,6 +11,7 @@ Requires-Dist: label-studio-sdk>=1.0.8
  Requires-Dist: more-itertools>=10.5.0
  Requires-Dist: openfoodfacts>=2.9.0
  Requires-Dist: typer>=0.15.1
+ Requires-Dist: google-cloud-batch==0.18.0
  Provides-Extra: ultralytics
  Requires-Dist: ultralytics>=8.3.49; extra == "ultralytics"
  Dynamic: license-file
@@ -138,3 +139,7 @@ labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo
  ```

  where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+ ### Launch training jobs
+
+ You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -20,4 +20,5 @@ src/labelr.egg-info/top_level.txt
  src/labelr/apps/__init__.py
  src/labelr/apps/datasets.py
  src/labelr/apps/projects.py
+ src/labelr/apps/train.py
  src/labelr/apps/users.py
@@ -4,6 +4,7 @@ label-studio-sdk>=1.0.8
  more-itertools>=10.5.0
  openfoodfacts>=2.9.0
  typer>=0.15.1
+ google-cloud-batch==0.18.0

  [ultralytics]
  ultralytics>=8.3.49