labelr 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {labelr-0.4.0/src/labelr.egg-info → labelr-0.5.0}/PKG-INFO +6 -1
  2. {labelr-0.4.0 → labelr-0.5.0}/README.md +5 -1
  3. {labelr-0.4.0 → labelr-0.5.0}/pyproject.toml +2 -1
  4. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/apps/datasets.py +6 -6
  5. labelr-0.5.0/src/labelr/apps/train.py +158 -0
  6. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/export.py +64 -5
  7. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/main.py +7 -0
  8. {labelr-0.4.0 → labelr-0.5.0/src/labelr.egg-info}/PKG-INFO +6 -1
  9. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/SOURCES.txt +1 -0
  10. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/requires.txt +1 -0
  11. {labelr-0.4.0 → labelr-0.5.0}/LICENSE +0 -0
  12. {labelr-0.4.0 → labelr-0.5.0}/setup.cfg +0 -0
  13. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/__init__.py +0 -0
  14. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/__main__.py +0 -0
  15. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/annotate.py +0 -0
  16. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/apps/__init__.py +0 -0
  17. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/apps/projects.py +0 -0
  18. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/apps/users.py +0 -0
  19. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/check.py +0 -0
  20. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/config.py +0 -0
  21. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/project_config.py +0 -0
  22. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/sample.py +0 -0
  23. {labelr-0.4.0 → labelr-0.5.0}/src/labelr/types.py +0 -0
  24. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/dependency_links.txt +0 -0
  25. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/entry_points.txt +0 -0
  26. {labelr-0.4.0 → labelr-0.5.0}/src/labelr.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: labelr
- Version: 0.4.0
+ Version: 0.5.0
  Summary: A command-line tool to manage labeling tasks with Label Studio.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -11,6 +11,7 @@ Requires-Dist: label-studio-sdk>=1.0.8
  Requires-Dist: more-itertools>=10.5.0
  Requires-Dist: openfoodfacts>=2.9.0
  Requires-Dist: typer>=0.15.1
+ Requires-Dist: google-cloud-batch==0.18.0
  Provides-Extra: ultralytics
  Requires-Dist: ultralytics>=8.3.49; extra == "ultralytics"
  Dynamic: license-file
@@ -138,3 +139,7 @@ labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo
  ```

  where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+ ### Launch training jobs
+
+ You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -120,4 +120,8 @@ To export the data to a Hugging Face dataset, use the following command:
  labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo-id REPO_ID --label-names 'product,price-tag'
  ```

- where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+ where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+ ### Launch training jobs
+
+ You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -1,6 +1,6 @@
  [project]
  name = "labelr"
- version = "0.4.0"
+ version = "0.5.0"
  description = "A command-line tool to manage labeling tasks with Label Studio."
  readme = "README.md"
  requires-python = ">=3.10"
@@ -11,6 +11,7 @@ dependencies = [
  "more-itertools>=10.5.0",
  "openfoodfacts>=2.9.0",
  "typer>=0.15.1",
+ "google-cloud-batch==0.18.0",
  ]

  [project.scripts]
@@ -211,9 +211,9 @@ def export(
  from label_studio_sdk.client import LabelStudio

  from labelr.export import (
- export_from_hf_to_ultralytics,
- export_from_ls_to_hf,
- export_from_ls_to_ultralytics,
+ export_from_hf_to_ultralytics_object_detection,
+ export_from_ls_to_hf_object_detection,
+ export_from_ls_to_ultralytics_object_detection,
  )

  if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
@@ -254,7 +254,7 @@ def export(
  ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
  if to == ExportDestination.hf:
  repo_id = typing.cast(str, repo_id)
- export_from_ls_to_hf(
+ export_from_ls_to_hf_object_detection(
  ls,
  repo_id=repo_id,
  label_names=typing.cast(list[str], label_names_list),
@@ -263,7 +263,7 @@ def export(
  use_aws_cache=use_aws_cache,
  )
  elif to == ExportDestination.ultralytics:
- export_from_ls_to_ultralytics(
+ export_from_ls_to_ultralytics_object_detection(
  ls,
  typing.cast(Path, output_dir),
  typing.cast(list[str], label_names_list),
@@ -280,7 +280,7 @@ def export(
  "Only object detection task is currently supported with HF source"
  )
  if to == ExportDestination.ultralytics:
- export_from_hf_to_ultralytics(
+ export_from_hf_to_ultralytics_object_detection(
  typing.cast(str, repo_id),
  typing.cast(Path, output_dir),
  download_images=download_images,
@@ -0,0 +1,158 @@
+ import datetime
+
+ import typer
+ from google.cloud import batch_v1
+
+ app = typer.Typer()
+
+
+ @app.command()
+ def train_object_detection(
+ wandb_project: str = typer.Option(
+ "train-yolo", help="The Weights & Biases project name."
+ ),
+ wandb_api_key: str = typer.Option(..., envvar="WANDB_API_KEY"),
+ hf_token: str = typer.Option(
+ ...,
+ help="The Hugging Face token, used to push the trained model to Hugging Face Hub.",
+ ),
+ run_name: str = typer.Option(..., help="A name for the training run."),
+ hf_repo_id: str = typer.Option(
+ ..., help="The Hugging Face dataset repository ID to use to train."
+ ),
+ hf_trained_model_repo_id: str = typer.Option(
+ ..., help="The Hugging Face repository ID where to push the trained model."
+ ),
+ epochs: int = typer.Option(100, help="Number of training epochs."),
+ imgsz: int = typer.Option(640, help="Size of the image during training."),
+ batch_size: int = typer.Option(64, help="Batch size for training."),
+ ):
+ """Train an object detection model."""
+ env_variables = {
+ "HF_REPO_ID": hf_repo_id,
+ "HF_TRAINED_MODEL_REPO_ID": hf_trained_model_repo_id,
+ "HF_TOKEN": hf_token,
+ "WANDB_PROJECT": wandb_project,
+ "RUN_NAME": run_name,
+ "WANDB_API_KEY": wandb_api_key,
+ "EPOCHS": str(epochs),
+ "IMGSZ": str(imgsz),
+ "BATCH_SIZE": str(batch_size),
+ "USE_AWS_IMAGE_CACHE": "False",
+ }
+ job_name = "train-yolo-job"
+ job_name = job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+ job = launch_job(
+ job_name=job_name,
+ container_image_uri="europe-west9-docker.pkg.dev/robotoff/gcf-artifacts/train-yolo",
+ env_variables=env_variables,
+ )
+ typer.echo("Job launched")
+ typer.echo(job)
+
+
+ def launch_job(
+ job_name: str = typer.Argument(
+ ...,
+ help="The name of the Google Batch job that will be created. "
+ "It needs to be unique for each project and region pair.",
+ ),
+ container_image_uri: str = typer.Argument(
+ ..., help="The URI of the container image that will be run as part of the job."
+ ),
+ commands: str | None = None,
+ env_variables: dict[str, str] | None = None,
+ entrypoint: str | None = None,
+ cpu_milli: int = 4000, # in milli-CPU units (4000 = 4 CPUs). This means the task requires 4 whole CPUs.
+ memory_mib: int = 16000, # Make sure to have enough memory for the 2GB of shared memory set below.
+ boot_disk_mib: int = 100000,
+ max_retry_count: int = 1,
+ max_run_duration: str = "86400s", # 24 hours
+ task_count: int = 1,
+ accelerators_type: str = "nvidia-tesla-t4",
+ machine_type: str = "n1-standard-8",
+ google_project_id: str = "robotoff",
+ accelerators_count: int = 1,
+ region: str = "europe-west4",
+ install_gpu_drivers: bool = True,
+ ) -> batch_v1.Job:
+ """This method creates a Batch Job on GCP.
+
+ Sources:
+ * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create
+ * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types # noqa
+
+ :param google_batch_launch_config: Config to run a job on Google Batch.
+ :param batch_job_config: Config to run a specific job on Google Batch.
+ :return: Batch job information.
+
+ Returns:
+ Batch job information.
+ """
+ client = batch_v1.BatchServiceClient()
+
+ # Define what will be done as part of the job.
+ runnable = batch_v1.Runnable()
+ runnable.container = batch_v1.Runnable.Container()
+ runnable.container.image_uri = container_image_uri
+ runnable.container.entrypoint = entrypoint # type: ignore
+ # By default, /dev/shm is 64MB which is not enough for Pytorch
+ runnable.container.options = "--shm-size=2048m"
+ runnable.container.commands = commands
+
+ # Jobs can be divided into tasks. In this case, we have only one task.
+ task = batch_v1.TaskSpec()
+ task.runnables = [runnable]
+
+ # Environment variables.
+ envable = batch_v1.Environment()
+ envable.variables = env_variables or {}
+ task.environment = envable
+
+ # We can specify what resources are requested by each task.
+ resources = batch_v1.ComputeResource()
+ resources.cpu_milli = cpu_milli
+ resources.memory_mib = memory_mib
+ resources.boot_disk_mib = boot_disk_mib # type: ignore
+ task.compute_resource = resources
+
+ task.max_retry_count = max_retry_count
+ task.max_run_duration = max_run_duration # type: ignore
+
+ # Tasks are grouped inside a job using TaskGroups.
+ group = batch_v1.TaskGroup()
+ group.task_count = task_count # type: ignore
+ group.task_spec = task
+
+ # Policies are used to define on what kind of virtual machines the tasks
+ # will run on.
+ policy = batch_v1.AllocationPolicy.InstancePolicy()
+ # See list of machine types here:
+ # https://docs.cloud.google.com/compute/docs/gpus#l4-gpus
+ policy.machine_type = machine_type
+
+ accelerator = batch_v1.AllocationPolicy.Accelerator()
+ accelerator.type_ = accelerators_type
+ accelerator.count = accelerators_count
+
+ policy.accelerators = [accelerator]
+ instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate()
+ instances.policy = policy
+ instances.install_gpu_drivers = install_gpu_drivers
+ allocation_policy = batch_v1.AllocationPolicy()
+ allocation_policy.instances = [instances]
+
+ job = batch_v1.Job()
+ job.task_groups = [group]
+ job.allocation_policy = allocation_policy
+ # We use Cloud Logging as it's an out of the box available option
+ job.logs_policy = batch_v1.LogsPolicy()
+ job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING # type: ignore
+
+ create_request = batch_v1.CreateJobRequest()
+ create_request.job = job
+ create_request.job_id = job_name
+ # The job's parent is the region in which the job will run
+ create_request.parent = f"projects/{google_project_id}/locations/{region}"
+
+ return client.create_job(create_request)
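For orientation, the sketch below shows how the new `launch_job` helper could be driven directly from Python, mirroring what the Typer command above does. The two Hugging Face repository IDs, the run name, and both tokens are placeholders, and submitting a real job requires Google Cloud credentials with Batch permissions on the target project.

```python
# Hypothetical driver for the launch_job() helper added in this release.
# Placeholder values: both Hugging Face repo IDs, the run name, and both secrets.
import datetime

from labelr.apps.train import launch_job

env_variables = {
    "HF_REPO_ID": "openfoodfacts/food-detection",  # dataset to train on (placeholder)
    "HF_TRAINED_MODEL_REPO_ID": "my-org/yolo-food-detection",  # model destination (placeholder)
    "HF_TOKEN": "hf_xxx",  # placeholder secret
    "WANDB_PROJECT": "train-yolo",
    "RUN_NAME": "food-detection-run-1",  # placeholder run name
    "WANDB_API_KEY": "wandb_xxx",  # placeholder secret
    "EPOCHS": "100",
    "IMGSZ": "640",
    "BATCH_SIZE": "64",
    "USE_AWS_IMAGE_CACHE": "False",
}

# Same job-name scheme as the CLI command: a fixed prefix plus a timestamp.
job_name = "train-yolo-job-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")

job = launch_job(
    job_name=job_name,
    container_image_uri="europe-west9-docker.pkg.dev/robotoff/gcf-artifacts/train-yolo",
    env_variables=env_variables,
)
print(job.name)  # projects/<project>/locations/<region>/jobs/<job_name>
```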
@@ -29,7 +29,7 @@ def _pickle_sample_generator(dir: Path):
  yield pickle.load(f)


- def export_from_ls_to_hf(
+ def export_from_ls_to_hf_object_detection(
  ls: LabelStudio,
  repo_id: str,
  label_names: list[str],
@@ -73,7 +73,7 @@ def export_from_ls_to_hf(
  hf_ds.push_to_hub(repo_id, split=split)


- def export_from_ls_to_ultralytics(
+ def export_from_ls_to_ultralytics_object_detection(
  ls: LabelStudio,
  output_dir: Path,
  label_names: list[str],
@@ -206,21 +206,36 @@ def export_from_ls_to_ultralytics(
  f.write(f" {i}: {label_name}\n")


- def export_from_hf_to_ultralytics(
+ def export_from_hf_to_ultralytics_object_detection(
  repo_id: str,
  output_dir: Path,
  download_images: bool = True,
  error_raise: bool = True,
  use_aws_cache: bool = True,
+ revision: str = "main",
  ):
  """Export annotations from a Hugging Face dataset project to the
  Ultralytics format.

  The Label Studio project should be an object detection project with a
  single rectanglelabels annotation result per task.
+
+ Args:
+ repo_id (str): Hugging Face repository ID to load the dataset from.
+ output_dir (Path): Path to the output directory.
+ download_images (bool): Whether to download images from URLs in the
+ dataset. If False, the dataset is expected to contain an `image`
+ field with the image data.
+ error_raise (bool): Whether to raise an error if an image fails to
+ download. If False, the image will be skipped. This option is only
+ used if `download_images` is True. Defaults to True.
+ use_aws_cache (bool): Whether to use the AWS image cache when
+ downloading images. This option is only used if `download_images`
+ is True. Defaults to True.
+ revision (str): The dataset revision to load. Defaults to 'main'.
  """
  logger.info("Repo ID: %s", repo_id)
- ds = datasets.load_dataset(repo_id)
+ ds = datasets.load_dataset(repo_id, revision=revision)
  data_dir = output_dir / "data"
  data_dir.mkdir(parents=True, exist_ok=True)
  category_id_to_name = {}
@@ -233,9 +248,16 @@ def export_from_hf_to_ultralytics(
  for sample in tqdm.tqdm(ds[split], desc="samples"):
  image_id = sample["image_id"]
- image_url = sample["meta"]["image_url"]

  if download_images:
+ if "meta" not in sample or "image_url" not in sample["meta"]:
+ raise ValueError(
+ "`meta.image_url` field not found in sample. "
+ "Make sure the dataset contains the `meta.image_url` "
+ "field, which should be the URL of the image, or set "
+ "`download_images` to False."
+ )
+ image_url = sample["meta"]["image_url"]
  download_output = download_image(
  image_url,
  return_struct=True,
@@ -309,6 +331,43 @@ export_from_ultralytics_to_hf(
  "Only classification task is currently supported for Ultralytics to HF export"
  )

+ if task_type == TaskType.classification:
+ export_from_ultralytics_to_hf_classification(
+ dataset_dir=dataset_dir,
+ repo_id=repo_id,
+ label_names=label_names,
+ merge_labels=merge_labels,
+ is_openfoodfacts_dataset=is_openfoodfacts_dataset,
+ openfoodfacts_flavor=openfoodfacts_flavor,
+ )
+
+
+ def export_from_ultralytics_to_hf_classification(
+ dataset_dir: Path,
+ repo_id: str,
+ label_names: list[str],
+ merge_labels: bool = False,
+ is_openfoodfacts_dataset: bool = False,
+ openfoodfacts_flavor: Flavor = Flavor.off,
+ ) -> None:
+ """Export an Ultralytics classification dataset to a Hugging Face dataset.
+
+ The Ultralytics dataset directory should contain 'train', 'val' and/or
+ 'test' subdirectories, each containing subdirectories for each label.
+
+ Args:
+ dataset_dir (Path): Path to the Ultralytics dataset directory.
+ repo_id (str): Hugging Face repository ID to push the dataset to.
+ label_names (list[str]): List of label names.
+ merge_labels (bool): Whether to merge all labels into a single label
+ named 'object'.
+ is_openfoodfacts_dataset (bool): Whether the dataset is from
+ Open Food Facts. If True, the `off_image_id` and `image_url` will
+ be generated automatically. `off_image_id` is extracted from the
+ image filename.
+ openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
+ is ignored if `is_openfoodfacts_dataset` is False.
+ """
  logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)

  if not any((dataset_dir / split).is_dir() for split in ["train", "val", "test"]):
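As a usage note, the renamed Hugging Face → Ultralytics exporter now accepts a `revision` argument, so a specific dataset revision can be exported instead of always reading the default branch. A minimal sketch follows; the repository ID and output path are placeholders.

```python
from pathlib import Path

from labelr.export import export_from_hf_to_ultralytics_object_detection

export_from_hf_to_ultralytics_object_detection(
    repo_id="openfoodfacts/food-detection",           # placeholder dataset repo
    output_dir=Path("datasets/food-detection-yolo"),  # placeholder output directory
    download_images=True,   # requires a `meta.image_url` field in each sample
    error_raise=False,      # skip samples whose image fails to download
    use_aws_cache=True,
    revision="main",        # new in 0.5.0: pin a dataset revision
)
```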
@@ -5,6 +5,7 @@ from openfoodfacts.utils import get_logger

  from labelr.apps import datasets as dataset_app
  from labelr.apps import projects as project_app
+ from labelr.apps import train as train_app
  from labelr.apps import users as user_app

  app = typer.Typer(pretty_exceptions_show_locals=False)
@@ -69,5 +70,11 @@ app.add_typer(
  help="Manage datasets (convert, export, check, etc.)",
  )

+ app.add_typer(
+ train_app.app,
+ name="train",
+ help="Train models",
+ )
+
  if __name__ == "__main__":
  app()
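A quick way to confirm the wiring is to invoke the new sub-app through Typer's test runner. The snippet below is a hedged sketch: the hyphenated command name `train-object-detection` assumes Typer's default conversion of the `train_object_detection` function name.

```python
from typer.testing import CliRunner

from labelr.main import app

runner = CliRunner()
result = runner.invoke(app, ["train", "--help"])
assert result.exit_code == 0
print(result.output)  # should list the train-object-detection command
```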
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: labelr
- Version: 0.4.0
+ Version: 0.5.0
  Summary: A command-line tool to manage labeling tasks with Label Studio.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -11,6 +11,7 @@ Requires-Dist: label-studio-sdk>=1.0.8
  Requires-Dist: more-itertools>=10.5.0
  Requires-Dist: openfoodfacts>=2.9.0
  Requires-Dist: typer>=0.15.1
+ Requires-Dist: google-cloud-batch==0.18.0
  Provides-Extra: ultralytics
  Requires-Dist: ultralytics>=8.3.49; extra == "ultralytics"
  Dynamic: license-file
@@ -138,3 +139,7 @@ labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo
  ```

  where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (ex: `openfoodfacts/food-detection`).
+
+ ### Launch training jobs
+
+ You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
@@ -20,4 +20,5 @@ src/labelr.egg-info/top_level.txt
  src/labelr/apps/__init__.py
  src/labelr/apps/datasets.py
  src/labelr/apps/projects.py
+ src/labelr/apps/train.py
  src/labelr/apps/users.py
@@ -4,6 +4,7 @@ label-studio-sdk>=1.0.8
  more-itertools>=10.5.0
  openfoodfacts>=2.9.0
  typer>=0.15.1
+ google-cloud-batch==0.18.0

  [ultralytics]
  ultralytics>=8.3.49