labelr 0.4.1__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {labelr-0.4.1/src/labelr.egg-info → labelr-0.5.0}/PKG-INFO +6 -1
  2. {labelr-0.4.1 → labelr-0.5.0}/README.md +5 -1
  3. {labelr-0.4.1 → labelr-0.5.0}/pyproject.toml +2 -1
  4. labelr-0.5.0/src/labelr/apps/train.py +158 -0
  5. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/export.py +24 -2
  6. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/main.py +7 -0
  7. {labelr-0.4.1 → labelr-0.5.0/src/labelr.egg-info}/PKG-INFO +6 -1
  8. {labelr-0.4.1 → labelr-0.5.0}/src/labelr.egg-info/SOURCES.txt +1 -0
  9. {labelr-0.4.1 → labelr-0.5.0}/src/labelr.egg-info/requires.txt +1 -0
  10. {labelr-0.4.1 → labelr-0.5.0}/LICENSE +0 -0
  11. {labelr-0.4.1 → labelr-0.5.0}/setup.cfg +0 -0
  12. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/__init__.py +0 -0
  13. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/__main__.py +0 -0
  14. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/annotate.py +0 -0
  15. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/apps/__init__.py +0 -0
  16. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/apps/datasets.py +0 -0
  17. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/apps/projects.py +0 -0
  18. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/apps/users.py +0 -0
  19. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/check.py +0 -0
  20. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/config.py +0 -0
  21. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/project_config.py +0 -0
  22. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/sample.py +0 -0
  23. {labelr-0.4.1 → labelr-0.5.0}/src/labelr/types.py +0 -0
  24. {labelr-0.4.1 → labelr-0.5.0}/src/labelr.egg-info/dependency_links.txt +0 -0
  25. {labelr-0.4.1 → labelr-0.5.0}/src/labelr.egg-info/entry_points.txt +0 -0
  26. {labelr-0.4.1 → labelr-0.5.0}/src/labelr.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: labelr
-Version: 0.4.1
+Version: 0.5.0
 Summary: A command-line tool to manage labeling tasks with Label Studio.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -11,6 +11,7 @@ Requires-Dist: label-studio-sdk>=1.0.8
 Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: openfoodfacts>=2.9.0
 Requires-Dist: typer>=0.15.1
+Requires-Dist: google-cloud-batch==0.18.0
 Provides-Extra: ultralytics
 Requires-Dist: ultralytics>=8.3.49; extra == "ultralytics"
 Dynamic: license-file
@@ -138,3 +139,7 @@ labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo
 ```
 
 where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+### Launch training jobs
+
+You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -120,4 +120,8 @@ To export the data to a Hugging Face dataset, use the following command:
 labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo-id REPO_ID --label-names 'product,price-tag'
 ```
 
-where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+### Launch training jobs
+
+You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -1,6 +1,6 @@
 [project]
 name = "labelr"
-version = "0.4.1"
+version = "0.5.0"
 description = "A command-line tool to manage labeling tasks with Label Studio."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -11,6 +11,7 @@ dependencies = [
     "more-itertools>=10.5.0",
     "openfoodfacts>=2.9.0",
     "typer>=0.15.1",
+    "google-cloud-batch==0.18.0",
 ]
 
 [project.scripts]
@@ -0,0 +1,158 @@
+import datetime
+
+import typer
+from google.cloud import batch_v1
+
+app = typer.Typer()
+
+
+@app.command()
+def train_object_detection(
+    wandb_project: str = typer.Option(
+        "train-yolo", help="The Weights & Biases project name."
+    ),
+    wandb_api_key: str = typer.Option(..., envvar="WANDB_API_KEY"),
+    hf_token: str = typer.Option(
+        ...,
+        help="The Hugging Face token, used to push the trained model to Hugging Face Hub.",
+    ),
+    run_name: str = typer.Option(..., help="A name for the training run."),
+    hf_repo_id: str = typer.Option(
+        ..., help="The Hugging Face dataset repository ID to use to train."
+    ),
+    hf_trained_model_repo_id: str = typer.Option(
+        ..., help="The Hugging Face repository ID where to push the trained model."
+    ),
+    epochs: int = typer.Option(100, help="Number of training epochs."),
+    imgsz: int = typer.Option(640, help="Size of the image during training."),
+    batch_size: int = typer.Option(64, help="Batch size for training."),
+):
+    """Train an object detection model."""
+    env_variables = {
+        "HF_REPO_ID": hf_repo_id,
+        "HF_TRAINED_MODEL_REPO_ID": hf_trained_model_repo_id,
+        "HF_TOKEN": hf_token,
+        "WANDB_PROJECT": wandb_project,
+        "RUN_NAME": run_name,
+        "WANDB_API_KEY": wandb_api_key,
+        "EPOCHS": str(epochs),
+        "IMGSZ": str(imgsz),
+        "BATCH_SIZE": str(batch_size),
+        "USE_AWS_IMAGE_CACHE": "False",
+    }
+    job_name = "train-yolo-job"
+    job_name = job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    job = launch_job(
+        job_name=job_name,
+        container_image_uri="europe-west9-docker.pkg.dev/robotoff/gcf-artifacts/train-yolo",
+        env_variables=env_variables,
+    )
+    typer.echo("Job launched")
+    typer.echo(job)
+
+
+def launch_job(
+    job_name: str = typer.Argument(
+        ...,
+        help="The name of the Google Batch job that will be created. "
+        "It needs to be unique for each project and region pair.",
+    ),
+    container_image_uri: str = typer.Argument(
+        ..., help="The URI of the container image that will be run as part of the job."
+    ),
+    commands: str | None = None,
+    env_variables: dict[str, str] | None = None,
+    entrypoint: str | None = None,
+    cpu_milli: int = 4000,  # in milli-CPU units (4000 = 4 CPUs). This means the task requires 4 whole CPUs.
+    memory_mib: int = 16000,  # Make sure to have enough memory for the 2GB of shared memory set below.
+    boot_disk_mib: int = 100000,
+    max_retry_count: int = 1,
+    max_run_duration: str = "86400s",  # 24 hours
+    task_count: int = 1,
+    accelerators_type: str = "nvidia-tesla-t4",
+    machine_type: str = "n1-standard-8",
+    google_project_id: str = "robotoff",
+    accelerators_count: int = 1,
+    region: str = "europe-west4",
+    install_gpu_drivers: bool = True,
+) -> batch_v1.Job:
+    """This method creates a Batch Job on GCP.
+
+    Sources:
+    * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create
+    * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types # noqa
+
+    :param google_batch_launch_config: Config to run a job on Google Batch.
+    :param batch_job_config: Config to run a specific job on Google Batch.
+    :return: Batch job information.
+
+    Returns:
+        Batch job information.
+    """
+    client = batch_v1.BatchServiceClient()
+
+    # Define what will be done as part of the job.
+    runnable = batch_v1.Runnable()
+    runnable.container = batch_v1.Runnable.Container()
+    runnable.container.image_uri = container_image_uri
+    runnable.container.entrypoint = entrypoint  # type: ignore
+    # By default, /dev/shm is 64MB which is not enough for Pytorch
+    runnable.container.options = "--shm-size=2048m"
+    runnable.container.commands = commands
+
+    # Jobs can be divided into tasks. In this case, we have only one task.
+    task = batch_v1.TaskSpec()
+    task.runnables = [runnable]
+
+    # Environment variables.
+    envable = batch_v1.Environment()
+    envable.variables = env_variables or {}
+    task.environment = envable
+
+    # We can specify what resources are requested by each task.
+    resources = batch_v1.ComputeResource()
+    resources.cpu_milli = cpu_milli
+    resources.memory_mib = memory_mib
+    resources.boot_disk_mib = boot_disk_mib  # type: ignore
+    task.compute_resource = resources
+
+    task.max_retry_count = max_retry_count
+    task.max_run_duration = max_run_duration  # type: ignore
+
+    # Tasks are grouped inside a job using TaskGroups.
+    group = batch_v1.TaskGroup()
+    group.task_count = task_count  # type: ignore
+    group.task_spec = task
+
+    # Policies are used to define on what kind of virtual machines the tasks
+    # will run on.
+    policy = batch_v1.AllocationPolicy.InstancePolicy()
+    # See list of machine types here:
+    # https://docs.cloud.google.com/compute/docs/gpus#l4-gpus
+    policy.machine_type = machine_type
+
+    accelerator = batch_v1.AllocationPolicy.Accelerator()
+    accelerator.type_ = accelerators_type
+    accelerator.count = accelerators_count
+
+    policy.accelerators = [accelerator]
+    instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate()
+    instances.policy = policy
+    instances.install_gpu_drivers = install_gpu_drivers
+    allocation_policy = batch_v1.AllocationPolicy()
+    allocation_policy.instances = [instances]
+
+    job = batch_v1.Job()
+    job.task_groups = [group]
+    job.allocation_policy = allocation_policy
+    # We use Cloud Logging as it's an out of the box available option
+    job.logs_policy = batch_v1.LogsPolicy()
+    job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING  # type: ignore
+
+    create_request = batch_v1.CreateJobRequest()
+    create_request.job = job
+    create_request.job_id = job_name
+    # The job's parent is the region in which the job will run
+    create_request.parent = f"projects/{google_project_id}/locations/{region}"
+
+    return client.create_job(create_request)
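The new module exposes `launch_job` as a plain function, so it can also be driven programmatically rather than through the CLI. A minimal sketch, assuming `google-cloud-batch` is installed and GCP application-default credentials are configured; the project ID below is a placeholder, and running this submits a real Batch job:

```python
# Minimal sketch: submit a Batch job via launch_job() from the new module.
# Assumes GCP credentials are set up (e.g. GOOGLE_APPLICATION_CREDENTIALS);
# "my-gcp-project" is a placeholder, not a value from the diff.
from labelr.apps.train import launch_job

job = launch_job(
    job_name="train-yolo-job-manual-001",  # must be unique per project/region pair
    container_image_uri="europe-west9-docker.pkg.dev/robotoff/gcf-artifacts/train-yolo",
    env_variables={"EPOCHS": "100", "IMGSZ": "640", "BATCH_SIZE": "64"},
    google_project_id="my-gcp-project",
    region="europe-west4",
)
print(job.name)  # fully qualified resource name of the created job
```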
@@ -212,15 +212,30 @@ def export_from_hf_to_ultralytics_object_detection(
     download_images: bool = True,
     error_raise: bool = True,
     use_aws_cache: bool = True,
+    revision: str = "main",
 ):
     """Export annotations from a Hugging Face dataset project to the
     Ultralytics format.
 
     The Label Studio project should be an object detection project with a
     single rectanglelabels annotation result per task.
+
+    Args:
+        repo_id (str): Hugging Face repository ID to load the dataset from.
+        output_dir (Path): Path to the output directory.
+        download_images (bool): Whether to download images from URLs in the
+            dataset. If False, the dataset is expected to contain an `image`
+            field with the image data.
+        error_raise (bool): Whether to raise an error if an image fails to
+            download. If False, the image will be skipped. This option is only
+            used if `download_images` is True. Defaults to True.
+        use_aws_cache (bool): Whether to use the AWS image cache when
+            downloading images. This option is only used if `download_images`
+            is True. Defaults to True.
+        revision (str): The dataset revision to load. Defaults to 'main'.
     """
     logger.info("Repo ID: %s", repo_id)
-    ds = datasets.load_dataset(repo_id)
+    ds = datasets.load_dataset(repo_id, revision=revision)
     data_dir = output_dir / "data"
     data_dir.mkdir(parents=True, exist_ok=True)
     category_id_to_name = {}
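The new `revision` parameter is forwarded straight to `datasets.load_dataset`, which accepts a branch name, tag, or commit SHA. A short sketch of what pinning a revision looks like, reusing the example repository ID from the README (assumes the `datasets` library and network access):

```python
# Sketch: pin a Hugging Face dataset to a fixed revision so exports are
# reproducible. "main" is the default; a tag or commit SHA also works.
import datasets

ds = datasets.load_dataset("openfoodfacts/food-detection", revision="main")
print(ds)
```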
@@ -233,9 +248,16 @@ def export_from_hf_to_ultralytics_object_detection(
 
         for sample in tqdm.tqdm(ds[split], desc="samples"):
             image_id = sample["image_id"]
-            image_url = sample["meta"]["image_url"]
 
             if download_images:
+                if "meta" not in sample or "image_url" not in sample["meta"]:
+                    raise ValueError(
+                        "`meta.image_url` field not found in sample. "
+                        "Make sure the dataset contains the `meta.image_url` "
+                        "field, which should be the URL of the image, or set "
+                        "`download_images` to False."
+                    )
+                image_url = sample["meta"]["image_url"]
             download_output = download_image(
                 image_url,
                 return_struct=True,
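The new guard makes the expected dataset schema explicit: when `download_images` is enabled, each sample must carry a `meta.image_url` field. A hypothetical illustration of the shape the check accepts (the values are invented, not from the diff):

```python
# Mirror of the new check in export.py, with an invented sample for clarity.
def has_image_url(sample: dict) -> bool:
    return "meta" in sample and "image_url" in sample["meta"]

assert has_image_url({"image_id": 42, "meta": {"image_url": "https://example.com/img.jpg"}})
assert not has_image_url({"image_id": 42, "meta": {}})  # would raise ValueError
assert not has_image_url({"image_id": 42})              # would raise ValueError
```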
@@ -5,6 +5,7 @@ from openfoodfacts.utils import get_logger
 
 from labelr.apps import datasets as dataset_app
 from labelr.apps import projects as project_app
+from labelr.apps import train as train_app
 from labelr.apps import users as user_app
 
 app = typer.Typer(pretty_exceptions_show_locals=False)
@@ -69,5 +70,11 @@ app.add_typer(
     help="Manage datasets (convert, export, check, etc.)",
 )
 
+app.add_typer(
+    train_app.app,
+    name="train",
+    help="Train models",
+)
+
 if __name__ == "__main__":
     app()
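With the sub-app registered, the training commands become reachable under `labelr train`. One way to inspect the generated CLI without touching GCP is Typer's test runner; a sketch (note that `--help` is eager, so required options such as the `WANDB_API_KEY`-backed one are not validated):

```python
# Sketch: inspect the new train sub-app's CLI surface via Typer's test runner.
from typer.testing import CliRunner

from labelr.apps.train import app

result = CliRunner().invoke(app, ["--help"])
print(result.output)  # lists options such as --run-name and --hf-repo-id
```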
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: labelr
-Version: 0.4.1
+Version: 0.5.0
 Summary: A command-line tool to manage labeling tasks with Label Studio.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -11,6 +11,7 @@ Requires-Dist: label-studio-sdk>=1.0.8
 Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: openfoodfacts>=2.9.0
 Requires-Dist: typer>=0.15.1
+Requires-Dist: google-cloud-batch==0.18.0
 Provides-Extra: ultralytics
 Requires-Dist: ultralytics>=8.3.49; extra == "ultralytics"
 Dynamic: license-file
@@ -138,3 +139,7 @@ labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo
 ```
 
 where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+### Launch training jobs
+
+You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -20,4 +20,5 @@ src/labelr.egg-info/top_level.txt
 src/labelr/apps/__init__.py
 src/labelr/apps/datasets.py
 src/labelr/apps/projects.py
+src/labelr/apps/train.py
 src/labelr/apps/users.py
@@ -4,6 +4,7 @@ label-studio-sdk>=1.0.8
 more-itertools>=10.5.0
 openfoodfacts>=2.9.0
 typer>=0.15.1
+google-cloud-batch==0.18.0
 
 [ultralytics]
 ultralytics>=8.3.49