labelr 0.10.0__tar.gz → 0.11.1__tar.gz

Files changed (49)
  1. labelr-0.11.1/PKG-INFO +230 -0
  2. labelr-0.11.1/README.md +200 -0
  3. {labelr-0.10.0 → labelr-0.11.1}/pyproject.toml +3 -2
  4. labelr-0.11.1/src/labelr/annotate.py +57 -0
  5. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/apps/datasets.py +140 -9
  6. labelr-0.11.1/src/labelr/apps/directus.py +212 -0
  7. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/apps/google_batch.py +38 -0
  8. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/apps/label_studio.py +295 -104
  9. labelr-0.11.1/src/labelr/apps/typer_description.py +2 -0
  10. labelr-0.11.1/src/labelr/check.py +147 -0
  11. labelr-0.11.1/src/labelr/config.py +57 -0
  12. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/export/object_detection.py +96 -18
  13. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/main.py +16 -0
  14. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/sample/object_detection.py +42 -13
  15. labelr-0.11.1/src/labelr.egg-info/PKG-INFO +230 -0
  16. {labelr-0.10.0 → labelr-0.11.1}/src/labelr.egg-info/SOURCES.txt +2 -0
  17. {labelr-0.10.0 → labelr-0.11.1}/src/labelr.egg-info/requires.txt +2 -1
  18. labelr-0.10.0/PKG-INFO +0 -158
  19. labelr-0.10.0/README.md +0 -129
  20. labelr-0.10.0/src/labelr/annotate.py +0 -108
  21. labelr-0.10.0/src/labelr/check.py +0 -86
  22. labelr-0.10.0/src/labelr/config.py +0 -1
  23. labelr-0.10.0/src/labelr.egg-info/PKG-INFO +0 -158
  24. {labelr-0.10.0 → labelr-0.11.1}/LICENSE +0 -0
  25. {labelr-0.10.0 → labelr-0.11.1}/setup.cfg +0 -0
  26. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/__init__.py +0 -0
  27. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/__main__.py +0 -0
  28. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/apps/__init__.py +0 -0
  29. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/apps/evaluate.py +0 -0
  30. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/apps/hugging_face.py +0 -0
  31. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/apps/train.py +0 -0
  32. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/dataset_features.py +0 -0
  33. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/evaluate/__init__.py +0 -0
  34. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/evaluate/object_detection.py +0 -0
  35. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/export/__init__.py +0 -0
  36. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/export/classification.py +0 -0
  37. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/export/common.py +0 -0
  38. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/export/llm.py +0 -0
  39. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/google_genai.py +0 -0
  40. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/project_config.py +0 -0
  41. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/sample/__init__.py +0 -0
  42. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/sample/classification.py +0 -0
  43. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/sample/common.py +0 -0
  44. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/sample/llm.py +0 -0
  45. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/types.py +0 -0
  46. {labelr-0.10.0 → labelr-0.11.1}/src/labelr/utils.py +0 -0
  47. {labelr-0.10.0 → labelr-0.11.1}/src/labelr.egg-info/dependency_links.txt +0 -0
  48. {labelr-0.10.0 → labelr-0.11.1}/src/labelr.egg-info/entry_points.txt +0 -0
  49. {labelr-0.10.0 → labelr-0.11.1}/src/labelr.egg-info/top_level.txt +0 -0
labelr-0.11.1/PKG-INFO ADDED
@@ -0,0 +1,230 @@
Metadata-Version: 2.4
Name: labelr
Version: 0.11.1
Summary: A command-line tool to manage labeling tasks with Label Studio.
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: datasets>=3.2.0
Requires-Dist: imagehash>=4.3.1
Requires-Dist: label-studio-sdk>=1.0.8
Requires-Dist: more-itertools>=10.5.0
Requires-Dist: openfoodfacts>=2.9.0
Requires-Dist: typer>=0.15.1
Requires-Dist: google-cloud-batch==0.18.0
Requires-Dist: huggingface-hub
Requires-Dist: deepdiff>=8.6.1
Requires-Dist: rapidfuzz>=3.14.3
Requires-Dist: aiohttp
Requires-Dist: aiofiles
Requires-Dist: orjson
Requires-Dist: google-cloud-storage
Requires-Dist: gcloud-aio-storage
Requires-Dist: google-genai>=1.56.0
Requires-Dist: diskcache>=5.6.3
Provides-Extra: ultralytics
Requires-Dist: ultralytics==8.4.8; extra == "ultralytics"
Provides-Extra: fiftyone
Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
Dynamic: license-file

# Labelr

Labelr is a command-line interface that provides a set of tools to help data scientists and machine learning engineers with ML data annotation, data preprocessing, and format conversion.

This project started as a way to automate some of the tasks we do at Open Food Facts to manage data at different stages of the machine learning pipeline.

The CLI is currently integrated with Label Studio (for data annotation), Ultralytics (for object detection), Google Cloud Batch (for training) and Hugging Face (for model and dataset storage). It only works with a few specific tasks for now (object detection, image classification and image extraction using LVLMs), but it's meant to be extended to other tasks in the future.

For object detection and image classification models, it currently allows you to:

- create Label Studio projects
- upload images to Label Studio
- pre-annotate the tasks either with an existing object detection model, or with a zero-shot model (Yolo-World or SAM), using Ultralytics
- perform data quality checks on Label Studio datasets
- export the data to Hugging Face or to local disk
- train the model on Google Batch (for object detection only)
- visualize the model predictions and compare them with the ground truth, using [Fiftyone](https://docs.voxel51.com/user_guide/index.html).

Labelr also supports managing datasets for fine-tuning large visual language models. It currently supports only a single task: structured extraction (JSON) from a single image.
The following features are supported:

- creating training datasets using Google Gemini Batch, from a list of images, textual instructions and a JSON schema
- uploading the dataset to Hugging Face
- fixing the model output manually or automatically using [Directus](https://directus.io/), a headless CMS used to manage the structured output
- exporting the dataset to Hugging Face

In addition, Labelr comes with two scripts that can be used to train ML models:

- in `packages/train-yolo`: the `main.py` script can be used to train an object detection model using Ultralytics. The training can be fully automated on Google Batch, and Labelr provides a CLI to launch Google Batch jobs.
- in `packages/train-unsloth`: the `main.py` script can be used to train a visual language model using Unsloth. The training is not yet automated on Google Batch, but the script can be used to train the model locally.

## Installation

Python 3.10 or higher is required to run this CLI.

To install the CLI, simply run:

```bash
pip install labelr
```

We recommend installing the CLI in a virtual environment. You can use either pip or conda for that.

There are two optional dependencies that you can install to use the CLI:

- `ultralytics`: pre-annotate object detection datasets with an Ultralytics model (YOLO, Yolo-World)
- `fiftyone`: visualize the model predictions and compare them with the ground truth, using FiftyOne.

To install the `ultralytics` optional dependency, you can run:

```bash
pip install labelr[ultralytics]
```
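
If you want both extras, pip lets you combine them in a single install (the quotes keep the shell from interpreting the brackets):

```bash
pip install "labelr[ultralytics,fiftyone]"
```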

## Usage

### Label Studio integration

To create a Label Studio project, you need a running Label Studio instance. Launching a Label Studio instance is out of the scope of this project, but you can follow the instructions in the [Label Studio documentation](https://labelstud.io/guide/install.html).

By default, the CLI assumes you're running Label Studio locally (URL: http://127.0.0.1:8080). You can change the URL by setting the `--label-studio-url` CLI option or by updating the configuration (see the [Configuration](#configuration) section below for more information).

For all the commands that interact with Label Studio, you need to provide an API key using the `--api-key` option, or through configuration.

#### Create a project

Once you have a Label Studio instance running, you can create a project easily. First, you need to create a configuration file for the project. The configuration file is an XML file that defines the labeling interface and the labels to use for the project. You can find an example of a configuration file in the [Label Studio documentation](https://labelstud.io/guide/setup).

For an object detection task, a command allows you to create the configuration file automatically:

```bash
labelr ls create-config-file --labels 'label1' --labels 'label2' --output-file label_config.xml
```

where `label1` and `label2` are the labels you want to use for the object detection task, and `label_config.xml` is the output file that will contain the configuration.
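
For reference, Label Studio object-detection configs of this kind generally look like the sketch below. The `name` attributes match the `from_name`/`to_name` values used in `src/labelr/annotate.py`, but the exact markup labelr generates (and the `$image_url` variable name) are assumptions:

```xml
<!-- Hypothetical sketch of a generated label_config.xml -->
<View>
  <Image name="image" value="$image_url"/>
  <RectangleLabels name="label" toName="image">
    <Label value="label1"/>
    <Label value="label2"/>
  </RectangleLabels>
</View>
```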

Then, you can create a project on Label Studio with the following command:

```bash
labelr ls create --title my_project --api-key API_KEY --config-file label_config.xml
```

where `API_KEY` is the API key of the Label Studio instance (available on the Account page), and `label_config.xml` is the configuration file of the project.

`ls` stands for Label Studio in the CLI.

#### Create a dataset file

If you have a list of images for an object detection task, you can quickly create a dataset file with the following command:

```bash
labelr ls create-dataset-file --input-file image_urls.txt --output-file dataset.json
```

where `image_urls.txt` is a file containing the URLs of the images, one per line, and `dataset.json` is the output file.
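
For illustration, the output contains one Label Studio task per input URL. A minimal sketch of the generated file (the `image_url` field name is an assumption about labelr's exact schema):

```json
[
  {"data": {"image_url": "https://example.com/image-001.jpg"}},
  {"data": {"image_url": "https://example.com/image-002.jpg"}}
]
```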

#### Import data

Next, import the generated data into a project with the following command:

```bash
labelr ls import-data --project-id PROJECT_ID --dataset-path dataset.json
```

where `PROJECT_ID` is the ID of the project you created.

#### Pre-annotate the data

To accelerate annotation, you can pre-annotate the images with an object detection model. We support three pre-annotation backends:

- `ultralytics`: use your own model or [Yolo-World](https://docs.ultralytics.com/models/yolo-world/), a zero-shot model that can detect any object from a text description. You can specify the path or the name of the model with the `--model-name` option. If no model name is provided, the `yolov8x-worldv2.pt` model (Yolo-World) is used.
- `ultralytics_sam3`: use [SAM3](https://docs.ultralytics.com/models/sam-3/), another zero-shot model. We advise using this backend, as it's the most accurate. The `--model` option is ignored when this backend is used.
- `robotoff`: the ML backend of Open Food Facts (specific to Open Food Facts projects).

When using `ultralytics` or `ultralytics_sam3`, make sure you installed the labelr package with the `ultralytics` extra.

To pre-annotate the data with Ultralytics, use the following command:

```bash
labelr ls add-prediction --project-id PROJECT_ID --backend ultralytics_sam3 --labels 'product' --labels 'price tag' --label-mapping '{"price tag": "price-tag"}'
```

The SAM3 model is downloaded automatically from Hugging Face. [SAM3](https://huggingface.co/facebook/sam3) is a gated model: you must request and be granted access before you can download it, so make sure you have access before launching the command.

In the command above, `labels` is the list of labels to use for the object detection task (you can add as many labels as you want). You can also provide a `--label-mapping` option in case the label names of the model you use for pre-annotation differ from the names configured in your Label Studio project.

#### Add `train` and `val` splits

In most machine learning projects, you need to split your data into a training set and a validation set. Assigning each sample to a split is required before exporting the dataset. To do so, you can use the following command:

```bash
labelr ls add-split --train-split 0.8 --project-id PROJECT_ID
```

For each task in the dataset, it randomly assigns 80% of the samples to the `train` split and 20% to the `val` split. The split is saved in the task `data`, in the `split` field.
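
The assignment is equivalent in spirit to the following sketch (hypothetical code, not labelr's actual implementation):

```python
import random

# Assign each Label Studio task to a split; the result is stored in the
# task's `data` dictionary under the `split` key, as described above.
def assign_splits(tasks: list[dict], train_split: float = 0.8) -> None:
    for task in tasks:
        task["data"]["split"] = "train" if random.random() < train_split else "val"
```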

You can change the train/val ratio with the `--train-split` option. You can also assign specific samples to a split. For example, you can assign the `train` split to specific tasks by storing their task IDs in a file `task_ids.txt` and running the following command:

```bash
labelr ls add-split --split-name train --task-id-file task_ids.txt --project-id PROJECT_ID
```

#### Perform sanity checks on the dataset

Labelr can automatically detect some common data quality issues:

- broken image URLs
- duplicate tasks (based on the image hash)
- multiple annotations

To perform a check, run:

```bash
labelr ls check-dataset --project-id PROJECT_ID
```

The command reports the issues it finds. It is non-destructive by default, but you can use the `--delete-missing-images` and `--delete-duplicate-images` options to delete the tasks with missing images or duplicate images, respectively.

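To actually remove the offending tasks, re-run the check with the deletion options (shown together here, though they can also be used separately; this permanently deletes tasks on the Label Studio instance, so review the report first):

```bash
labelr ls check-dataset --project-id PROJECT_ID --delete-missing-images --delete-duplicate-images
```
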
#### Export the data

Once the data is annotated, you can export it to a Hugging Face dataset or to local disk (Ultralytics format). To export it to disk, use the following command:

```bash
labelr datasets export --project-id PROJECT_ID --from ls --to ultralytics --output-dir output --label-names 'product,price-tag'
```

where `output` is the directory where the data will be exported. Currently, label names must be provided, as the CLI does not yet support exporting label names from Label Studio.

To export the data to a Hugging Face dataset, use the following command:

```bash
labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo-id REPO_ID --label-names 'product,price-tag'
```

where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (e.g. `openfoodfacts/food-detection`).

### Launch training jobs

You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.

## Configuration

Some Labelr settings can be configured through a configuration file or environment variables. The configuration file is located at `~/.config/labelr/config.json`.

In order of precedence, the configuration is loaded from:

- the CLI command option
- the environment variable
- the configuration file

The following variables are currently supported:

- `label_studio_url`: URL of the Label Studio server. Can also be set with the `LABELR_LABEL_STUDIO_URL` environment variable.
- `label_studio_api_key`: API key for Label Studio. Can also be set with the `LABELR_LABEL_STUDIO_API_KEY` environment variable.

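For example, both settings can be provided through environment variables:

```bash
export LABELR_LABEL_STUDIO_URL=http://127.0.0.1:8080
export LABELR_LABEL_STUDIO_API_KEY=YOUR_API_KEY
```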

Labelr supports configuring settings in the config file through the `config` command. For example, to set the Label Studio URL, you can run:

```bash
labelr config label_studio_url http://127.0.0.1:8080
```
labelr-0.11.1/README.md ADDED
@@ -0,0 +1,200 @@
(Content identical to the package description embedded in the PKG-INFO above.)
{labelr-0.10.0 → labelr-0.11.1}/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
  [project]
  name = "labelr"
- version = "0.10.0"
+ version = "0.11.1"
  description = "A command-line tool to manage labeling tasks with Label Studio."
  readme = "README.md"
  requires-python = ">=3.10"
@@ -21,6 +21,7 @@ dependencies = [
  "google-cloud-storage",
  "gcloud-aio-storage",
  "google-genai >= 1.56.0",
+ "diskcache>=5.6.3",
  ]

  [project.scripts]
@@ -28,7 +29,7 @@ labelr = "labelr.main:app"

  [project.optional-dependencies]
  ultralytics = [
- "ultralytics==8.3.223",
+ "ultralytics==8.4.8",
  ]
  fiftyone = [
  "fiftyone~=1.10.0"
labelr-0.11.1/src/labelr/annotate.py ADDED
@@ -0,0 +1,57 @@

```python
import random
import string

from openfoodfacts.utils import get_logger

from ultralytics import Results

logger = get_logger(__name__)


def format_annotation_results_from_ultralytics(
    results: Results,
    labels: list[str],
    label_mapping: dict[str, str] | None = None,
) -> list[dict]:
    annotation_results = []
    orig_height, orig_width = results.orig_shape
    boxes = results.boxes
    classes = boxes.cls.tolist()
    for i, xyxyn in enumerate(boxes.xyxyn):
        # Boxes found.
        if len(xyxyn) > 0:
            xyxyn = xyxyn.tolist()
            x1 = xyxyn[0] * 100
            y1 = xyxyn[1] * 100
            x2 = xyxyn[2] * 100
            y2 = xyxyn[3] * 100
            width = x2 - x1
            height = y2 - y1
            label_id = int(classes[i])
            label_name = labels[label_id]
            if label_mapping:
                label_name = label_mapping.get(label_name, label_name)
            annotation_results.append(
                {
                    "id": generate_id(),
                    "type": "rectanglelabels",
                    "from_name": "label",
                    "to_name": "image",
                    "original_width": orig_width,
                    "original_height": orig_height,
                    "image_rotation": 0,
                    "value": {
                        "rotation": 0,
                        "x": x1,
                        "y": y1,
                        "width": width,
                        "height": height,
                        "rectanglelabels": [label_name],
                    },
                },
            )
    return annotation_results


def generate_id(length: int = 10) -> str:
    return "".join(random.choices(string.ascii_letters + string.digits, k=length))
```