labelr 0.10.0__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. labelr-0.11.0/PKG-INFO +230 -0
  2. labelr-0.11.0/README.md +200 -0
  3. {labelr-0.10.0 → labelr-0.11.0}/pyproject.toml +3 -2
  4. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/apps/datasets.py +140 -9
  5. labelr-0.11.0/src/labelr/apps/directus.py +212 -0
  6. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/apps/google_batch.py +38 -0
  7. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/apps/label_studio.py +260 -63
  8. labelr-0.11.0/src/labelr/apps/typer_description.py +2 -0
  9. labelr-0.11.0/src/labelr/check.py +147 -0
  10. labelr-0.11.0/src/labelr/config.py +57 -0
  11. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/export/object_detection.py +96 -18
  12. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/main.py +16 -0
  13. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/sample/object_detection.py +42 -13
  14. labelr-0.11.0/src/labelr.egg-info/PKG-INFO +230 -0
  15. {labelr-0.10.0 → labelr-0.11.0}/src/labelr.egg-info/SOURCES.txt +2 -0
  16. {labelr-0.10.0 → labelr-0.11.0}/src/labelr.egg-info/requires.txt +2 -1
  17. labelr-0.10.0/PKG-INFO +0 -158
  18. labelr-0.10.0/README.md +0 -129
  19. labelr-0.10.0/src/labelr/check.py +0 -86
  20. labelr-0.10.0/src/labelr/config.py +0 -1
  21. labelr-0.10.0/src/labelr.egg-info/PKG-INFO +0 -158
  22. {labelr-0.10.0 → labelr-0.11.0}/LICENSE +0 -0
  23. {labelr-0.10.0 → labelr-0.11.0}/setup.cfg +0 -0
  24. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/__init__.py +0 -0
  25. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/__main__.py +0 -0
  26. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/annotate.py +0 -0
  27. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/apps/__init__.py +0 -0
  28. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/apps/evaluate.py +0 -0
  29. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/apps/hugging_face.py +0 -0
  30. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/apps/train.py +0 -0
  31. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/dataset_features.py +0 -0
  32. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/evaluate/__init__.py +0 -0
  33. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/evaluate/object_detection.py +0 -0
  34. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/export/__init__.py +0 -0
  35. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/export/classification.py +0 -0
  36. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/export/common.py +0 -0
  37. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/export/llm.py +0 -0
  38. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/google_genai.py +0 -0
  39. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/project_config.py +0 -0
  40. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/sample/__init__.py +0 -0
  41. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/sample/classification.py +0 -0
  42. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/sample/common.py +0 -0
  43. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/sample/llm.py +0 -0
  44. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/types.py +0 -0
  45. {labelr-0.10.0 → labelr-0.11.0}/src/labelr/utils.py +0 -0
  46. {labelr-0.10.0 → labelr-0.11.0}/src/labelr.egg-info/dependency_links.txt +0 -0
  47. {labelr-0.10.0 → labelr-0.11.0}/src/labelr.egg-info/entry_points.txt +0 -0
  48. {labelr-0.10.0 → labelr-0.11.0}/src/labelr.egg-info/top_level.txt +0 -0
labelr-0.11.0/PKG-INFO ADDED
@@ -0,0 +1,230 @@
+ Metadata-Version: 2.4
+ Name: labelr
+ Version: 0.11.0
+ Summary: A command-line tool to manage labeling tasks with Label Studio.
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: datasets>=3.2.0
+ Requires-Dist: imagehash>=4.3.1
+ Requires-Dist: label-studio-sdk>=1.0.8
+ Requires-Dist: more-itertools>=10.5.0
+ Requires-Dist: openfoodfacts>=2.9.0
+ Requires-Dist: typer>=0.15.1
+ Requires-Dist: google-cloud-batch==0.18.0
+ Requires-Dist: huggingface-hub
+ Requires-Dist: deepdiff>=8.6.1
+ Requires-Dist: rapidfuzz>=3.14.3
+ Requires-Dist: aiohttp
+ Requires-Dist: aiofiles
+ Requires-Dist: orjson
+ Requires-Dist: google-cloud-storage
+ Requires-Dist: gcloud-aio-storage
+ Requires-Dist: google-genai>=1.56.0
+ Requires-Dist: diskcache>=5.6.3
+ Provides-Extra: ultralytics
+ Requires-Dist: ultralytics==8.4.8; extra == "ultralytics"
+ Provides-Extra: fiftyone
+ Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
+ Dynamic: license-file
+
+ # Labelr
+
+ Labelr is a command-line interface that provides a set of tools to help data scientists and machine learning engineers deal with ML data annotation, data preprocessing and format conversion.
+
+ This project started as a way to automate some of the tasks we do at Open Food Facts to manage data at different stages of the machine learning pipeline.
+
+ The CLI is currently integrated with Label Studio (for data annotation), Ultralytics (for object detection), Google Cloud Batch (for training) and Hugging Face (for model and dataset storage). It only works with a few specific tasks for now (object detection, image classification and image extraction using an LVLM), but it's meant to be extended to other tasks in the future.
+
+ For object detection and image classification models, it currently allows you to:
+
+ - create Label Studio projects
+ - upload images to Label Studio
+ - pre-annotate the tasks either with an existing object detection model or with a zero-shot model (Yolo-World or SAM), using Ultralytics
+ - perform data quality checks on Label Studio datasets
+ - export the data to Hugging Face or to local disk
+ - train the model on Google Batch (for object detection only)
+ - visualize the model predictions and compare them with the ground truth, using [Fiftyone](https://docs.voxel51.com/user_guide/index.html).
+
+ Labelr also supports managing datasets for fine-tuning large visual language models. It currently supports only a single task: structured extraction (JSON) from a single image.
+ The following features are supported:
+
+ - creating training datasets using Google Gemini Batch, from a list of images, textual instructions and a JSON schema
+ - uploading the dataset to Hugging Face
+ - manually or automatically fixing the model output using [Directus](https://directus.io/), a headless CMS used to manage the structured output
+ - exporting the dataset to Hugging Face
+
+ In addition, Labelr comes with two scripts that can be used to train ML models:
+
+ - in `packages/train-yolo`: the `main.py` script can be used to train an object detection model using Ultralytics. The training can be fully automated on Google Batch, and Labelr provides a CLI to launch Google Batch jobs.
+ - in `packages/train-unsloth`: the `main.py` script can be used to train a visual language model using Unsloth. The training is not yet automated on Google Batch, but the script can be used to train the model locally.
+
+ ## Installation
+
+ Python 3.10 or higher is required to run this CLI.
+
+ To install the CLI, simply run:
+
+ ```bash
+ pip install labelr
+ ```
+
+ We recommend installing the CLI in a virtual environment. You can use either pip or conda for that.
+
+ There are two optional dependencies that you can install to use the CLI:
+
+ - `ultralytics`: pre-annotate object detection datasets with an Ultralytics model (YOLO, Yolo-World)
+ - `fiftyone`: visualize the model predictions and compare them with the ground truth, using FiftyOne.
+
+ To install the ultralytics optional dependency, you can run:
+
+ ```bash
+ pip install labelr[ultralytics]
+ ```
+
+ ## Usage
+
+ ### Label Studio integration
+
+ To create a Label Studio project, you need to have a Label Studio instance running. Launching a Label Studio instance is out of the scope of this project, but you can follow the instructions in the [Label Studio documentation](https://labelstud.io/guide/install.html).
+
+ By default, the CLI assumes you're running Label Studio locally (URL: http://127.0.0.1:8080). You can change the URL by setting the `--label-studio-url` CLI option or by updating the configuration (see the [Configuration](#configuration) section below for more information).
+
+ For all the commands that interact with Label Studio, you need to provide an API key using the `--api-key` option, or through the configuration.
+
+ #### Create a project
+
+ Once you have a Label Studio instance running, you can create a project easily. First, you need to create a configuration file for the project. The configuration file is an XML file that defines the labeling interface and the labels to use for the project. You can find an example of a configuration file in the [Label Studio documentation](https://labelstud.io/guide/setup).
+
+ For an object detection task, a command allows you to create the configuration file automatically:
+
+ ```bash
+ labelr ls create-config-file --labels 'label1' --labels 'label2' --output-file label_config.xml
+ ```
+
+ where `label1` and `label2` are the labels you want to use for the object detection task, and `label_config.xml` is the output file that will contain the configuration.
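For illustration, the generated `label_config.xml` should look roughly like the following sketch, based on the generic Label Studio object detection template (the exact tag and attribute names emitted by `create-config-file` are not shown in this diff, so treat them as assumptions):

```xml
<View>
  <!-- The image to annotate; the value is read from the task data -->
  <Image name="image" value="$image_url"/>
  <!-- One bounding-box label per --labels option -->
  <RectangleLabels name="label" toName="image">
    <Label value="label1"/>
    <Label value="label2"/>
  </RectangleLabels>
</View>
```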
+
+ Then, you can create a project on Label Studio with the following command:
+
+ ```bash
+ labelr ls create --title my_project --api-key API_KEY --config-file label_config.xml
+ ```
+
+ where `API_KEY` is the API key of the Label Studio instance (the API key is available on the Account page), and `label_config.xml` is the configuration file of the project.
+
+ `ls` stands for Label Studio in the CLI.
+
+ #### Create a dataset file
+
+ For an object detection task, if you have a list of images, you can quickly create a dataset file with the following command:
+
+ ```bash
+ labelr ls create-dataset-file --input-file image_urls.txt --output-file dataset.json
+ ```
+
+ where `image_urls.txt` is a file containing the URLs of the images, one per line, and `dataset.json` is the output file.
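For example, `image_urls.txt` could look like this (placeholder URLs):

```text
https://example.com/images/product_001.jpg
https://example.com/images/product_002.jpg
https://example.com/images/product_003.jpg
```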
+
+ #### Import data
+
+ Next, import the generated data into a project with the following command:
+
+ ```bash
+ labelr ls import-data --project-id PROJECT_ID --dataset-path dataset.json
+ ```
+
+ where `PROJECT_ID` is the ID of the project you created.
+
+ #### Pre-annotate the data
+
+ To accelerate annotation, you can pre-annotate the images with an object detection model. We support three pre-annotation backends:
+
+ - `ultralytics`: use your own model or [Yolo-World](https://docs.ultralytics.com/models/yolo-world/), a zero-shot model that can detect any object using a text description of the object. You can specify the path or the name of the model with the `--model-name` option. If no model name is provided, the `yolov8x-worldv2.pt` model (Yolo-World) is used.
+ - `ultralytics_sam3`: use [SAM3](https://docs.ultralytics.com/models/sam-3/), another zero-shot model. We advise using this backend, as it's the most accurate. The `--model-name` option is ignored when this backend is used.
+ - `robotoff`: the ML backend of Open Food Facts (specific to Open Food Facts projects).
+
+ When using `ultralytics` or `ultralytics_sam3`, make sure you installed the labelr package with the `ultralytics` extra.
+
+ To pre-annotate the data with Ultralytics, use the following command:
+
+ ```bash
+ labelr ls add-prediction --project-id PROJECT_ID --backend ultralytics_sam3 --labels 'product' --labels 'price tag' --label-mapping '{"price tag": "price-tag"}'
+ ```
+
+ The SAM3 model will be automatically downloaded from Hugging Face. [SAM3](https://huggingface.co/facebook/sam3) is a gated model: you need to request permission to access it. Make sure you have been granted access before launching the command.
+
+ In the command above, `labels` is the list of labels to use for the object detection task (you can add as many labels as you want). You can also provide a `--label-mapping` option in case the label names of the model you use for pre-annotation differ from the names configured in your Label Studio project.
+
+ #### Add `train` and `val` splits
+
+ In most machine learning projects, you need to split your data into a training and a validation set. Assigning each sample to a split is required before exporting the dataset. To do so, you can use the following command:
+
+ ```bash
+ labelr ls add-split --train-split 0.8 --project-id PROJECT_ID
+ ```
+
+ For each task in the dataset, it randomly assigns 80% of the samples to the `train` split and 20% to the `val` split. The split is saved in the `split` field of the task `data`.
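For instance, a task's `data` payload might then look like the following (a hypothetical example; the `image_url` field name is illustrative, only the `split` field is documented here):

```json
{
  "image_url": "https://example.com/images/product_001.jpg",
  "split": "train"
}
```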
+
+ You can change the train/val ratio with the `--train-split` option. You can also assign specific samples to a split. For example, you can assign the `train` split to specific tasks by storing the task IDs in a file `task_ids.txt` and running the following command:
+
+ ```bash
+ labelr ls add-split --split-name train --task-id-file task_ids.txt --project-id PROJECT_ID
+ ```
+
+ #### Performing sanity checks on the dataset
+
+ Labelr can automatically detect some common data quality issues:
+
+ - broken image URLs
+ - duplicate tasks (based on the image hash)
+ - tasks with multiple annotations
+
+ To perform a check, run:
+
+ ```bash
+ labelr ls check-dataset --project-id PROJECT_ID
+ ```
+
+ The command will report the issues found. It is non-destructive by default, but you can use the `--delete-missing-images` and `--delete-duplicate-images` options to delete the tasks with missing images or duplicate images, respectively.
+
+ #### Export the data
+
+ Once the data is annotated, you can export it to a Hugging Face dataset or to local disk (Ultralytics format). To export it to disk, use the following command:
+
+ ```bash
+ labelr datasets export --project-id PROJECT_ID --from ls --to ultralytics --output-dir output --label-names 'product,price-tag'
+ ```
+
+ where `output` is the directory the data will be exported to. Currently, label names must be provided, as the CLI does not support fetching label names from Label Studio yet.
+
+ To export the data to a Hugging Face dataset, use the following command:
+
+ ```bash
+ labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo-id REPO_ID --label-names 'product,price-tag'
+ ```
+
+ where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (e.g. `openfoodfacts/food-detection`).
+
+ ### Launch training jobs
+
+ You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
+
+ ## Configuration
+
+ Some Labelr settings can be configured using a configuration file or through environment variables. The configuration file is located at `~/.config/labelr/config.json`.
+
+ In order of precedence, the configuration is loaded from:
+
+ - CLI command options
+ - environment variables
+ - the configuration file
+
+ The following variables are currently supported:
+
+ - `label_studio_url`: URL of the Label Studio server. Can also be set with the `LABELR_LABEL_STUDIO_URL` environment variable.
+ - `label_studio_api_key`: API key for Label Studio. Can also be set with the `LABELR_LABEL_STUDIO_API_KEY` environment variable.
+
+ Labelr supports setting configuration values in the config file through the `config` command. For example, to set the Label Studio URL, you can run:
+
+ ```bash
+ labelr config label_studio_url http://127.0.0.1:8080
+ ```
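Equivalently, the same settings can be provided through the environment variables listed above, for example:

```bash
# Both variable names are documented in the Configuration section above
export LABELR_LABEL_STUDIO_URL=http://127.0.0.1:8080
export LABELR_LABEL_STUDIO_API_KEY=API_KEY
labelr ls check-dataset --project-id PROJECT_ID
```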
labelr-0.11.0/README.md ADDED
@@ -0,0 +1,200 @@
+ # Labelr
+
+ Labelr is a command-line interface that provides a set of tools to help data scientists and machine learning engineers deal with ML data annotation, data preprocessing and format conversion.
+
+ This project started as a way to automate some of the tasks we do at Open Food Facts to manage data at different stages of the machine learning pipeline.
+
+ The CLI is currently integrated with Label Studio (for data annotation), Ultralytics (for object detection), Google Cloud Batch (for training) and Hugging Face (for model and dataset storage). It only works with a few specific tasks for now (object detection, image classification and image extraction using an LVLM), but it's meant to be extended to other tasks in the future.
+
+ For object detection and image classification models, it currently allows you to:
+
+ - create Label Studio projects
+ - upload images to Label Studio
+ - pre-annotate the tasks either with an existing object detection model or with a zero-shot model (Yolo-World or SAM), using Ultralytics
+ - perform data quality checks on Label Studio datasets
+ - export the data to Hugging Face or to local disk
+ - train the model on Google Batch (for object detection only)
+ - visualize the model predictions and compare them with the ground truth, using [Fiftyone](https://docs.voxel51.com/user_guide/index.html).
+
+ Labelr also supports managing datasets for fine-tuning large visual language models. It currently supports only a single task: structured extraction (JSON) from a single image.
+ The following features are supported:
+
+ - creating training datasets using Google Gemini Batch, from a list of images, textual instructions and a JSON schema
+ - uploading the dataset to Hugging Face
+ - manually or automatically fixing the model output using [Directus](https://directus.io/), a headless CMS used to manage the structured output
+ - exporting the dataset to Hugging Face
+
+ In addition, Labelr comes with two scripts that can be used to train ML models:
+
+ - in `packages/train-yolo`: the `main.py` script can be used to train an object detection model using Ultralytics. The training can be fully automated on Google Batch, and Labelr provides a CLI to launch Google Batch jobs.
+ - in `packages/train-unsloth`: the `main.py` script can be used to train a visual language model using Unsloth. The training is not yet automated on Google Batch, but the script can be used to train the model locally.
+
+ ## Installation
+
+ Python 3.10 or higher is required to run this CLI.
+
+ To install the CLI, simply run:
+
+ ```bash
+ pip install labelr
+ ```
+
+ We recommend installing the CLI in a virtual environment. You can use either pip or conda for that.
+
+ There are two optional dependencies that you can install to use the CLI:
+
+ - `ultralytics`: pre-annotate object detection datasets with an Ultralytics model (YOLO, Yolo-World)
+ - `fiftyone`: visualize the model predictions and compare them with the ground truth, using FiftyOne.
+
+ To install the ultralytics optional dependency, you can run:
+
+ ```bash
+ pip install labelr[ultralytics]
+ ```
+
+ ## Usage
+
+ ### Label Studio integration
+
+ To create a Label Studio project, you need to have a Label Studio instance running. Launching a Label Studio instance is out of the scope of this project, but you can follow the instructions in the [Label Studio documentation](https://labelstud.io/guide/install.html).
+
+ By default, the CLI assumes you're running Label Studio locally (URL: http://127.0.0.1:8080). You can change the URL by setting the `--label-studio-url` CLI option or by updating the configuration (see the [Configuration](#configuration) section below for more information).
+
+ For all the commands that interact with Label Studio, you need to provide an API key using the `--api-key` option, or through the configuration.
+
+ #### Create a project
+
+ Once you have a Label Studio instance running, you can create a project easily. First, you need to create a configuration file for the project. The configuration file is an XML file that defines the labeling interface and the labels to use for the project. You can find an example of a configuration file in the [Label Studio documentation](https://labelstud.io/guide/setup).
+
+ For an object detection task, a command allows you to create the configuration file automatically:
+
+ ```bash
+ labelr ls create-config-file --labels 'label1' --labels 'label2' --output-file label_config.xml
+ ```
+
+ where `label1` and `label2` are the labels you want to use for the object detection task, and `label_config.xml` is the output file that will contain the configuration.
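As above, the generated `label_config.xml` should look roughly like this sketch, based on the generic Label Studio object detection template (the exact tags and attributes produced by `create-config-file` are not shown in this diff, so treat them as assumptions):

```xml
<View>
  <!-- The image to annotate; the value is read from the task data -->
  <Image name="image" value="$image_url"/>
  <!-- One bounding-box label per --labels option -->
  <RectangleLabels name="label" toName="image">
    <Label value="label1"/>
    <Label value="label2"/>
  </RectangleLabels>
</View>
```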
+
+ Then, you can create a project on Label Studio with the following command:
+
+ ```bash
+ labelr ls create --title my_project --api-key API_KEY --config-file label_config.xml
+ ```
+
+ where `API_KEY` is the API key of the Label Studio instance (the API key is available on the Account page), and `label_config.xml` is the configuration file of the project.
+
+ `ls` stands for Label Studio in the CLI.
+
+ #### Create a dataset file
+
+ For an object detection task, if you have a list of images, you can quickly create a dataset file with the following command:
+
+ ```bash
+ labelr ls create-dataset-file --input-file image_urls.txt --output-file dataset.json
+ ```
+
+ where `image_urls.txt` is a file containing the URLs of the images, one per line, and `dataset.json` is the output file.
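A minimal `image_urls.txt` might contain (placeholder URLs):

```text
https://example.com/images/product_001.jpg
https://example.com/images/product_002.jpg
https://example.com/images/product_003.jpg
```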
+
+ #### Import data
+
+ Next, import the generated data into a project with the following command:
+
+ ```bash
+ labelr ls import-data --project-id PROJECT_ID --dataset-path dataset.json
+ ```
+
+ where `PROJECT_ID` is the ID of the project you created.
+
+ #### Pre-annotate the data
+
+ To accelerate annotation, you can pre-annotate the images with an object detection model. We support three pre-annotation backends:
+
+ - `ultralytics`: use your own model or [Yolo-World](https://docs.ultralytics.com/models/yolo-world/), a zero-shot model that can detect any object using a text description of the object. You can specify the path or the name of the model with the `--model-name` option. If no model name is provided, the `yolov8x-worldv2.pt` model (Yolo-World) is used.
+ - `ultralytics_sam3`: use [SAM3](https://docs.ultralytics.com/models/sam-3/), another zero-shot model. We advise using this backend, as it's the most accurate. The `--model-name` option is ignored when this backend is used.
+ - `robotoff`: the ML backend of Open Food Facts (specific to Open Food Facts projects).
+
+ When using `ultralytics` or `ultralytics_sam3`, make sure you installed the labelr package with the `ultralytics` extra.
+
+ To pre-annotate the data with Ultralytics, use the following command:
+
+ ```bash
+ labelr ls add-prediction --project-id PROJECT_ID --backend ultralytics_sam3 --labels 'product' --labels 'price tag' --label-mapping '{"price tag": "price-tag"}'
+ ```
+
+ The SAM3 model will be automatically downloaded from Hugging Face. [SAM3](https://huggingface.co/facebook/sam3) is a gated model: you need to request permission to access it. Make sure you have been granted access before launching the command.
+
+ In the command above, `labels` is the list of labels to use for the object detection task (you can add as many labels as you want). You can also provide a `--label-mapping` option in case the label names of the model you use for pre-annotation differ from the names configured in your Label Studio project.
+
+ #### Add `train` and `val` splits
+
+ In most machine learning projects, you need to split your data into a training and a validation set. Assigning each sample to a split is required before exporting the dataset. To do so, you can use the following command:
+
+ ```bash
+ labelr ls add-split --train-split 0.8 --project-id PROJECT_ID
+ ```
+
+ For each task in the dataset, it randomly assigns 80% of the samples to the `train` split and 20% to the `val` split. The split is saved in the `split` field of the task `data`.
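After the command runs, a task's `data` payload might then look like this (hypothetical; the `image_url` field name is illustrative, only the `split` field is documented here):

```json
{
  "image_url": "https://example.com/images/product_001.jpg",
  "split": "train"
}
```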
+
+ You can change the train/val ratio with the `--train-split` option. You can also assign specific samples to a split. For example, you can assign the `train` split to specific tasks by storing the task IDs in a file `task_ids.txt` and running the following command:
+
+ ```bash
+ labelr ls add-split --split-name train --task-id-file task_ids.txt --project-id PROJECT_ID
+ ```
+
+ #### Performing sanity checks on the dataset
+
+ Labelr can automatically detect some common data quality issues:
+
+ - broken image URLs
+ - duplicate tasks (based on the image hash)
+ - tasks with multiple annotations
+
+ To perform a check, run:
+
+ ```bash
+ labelr ls check-dataset --project-id PROJECT_ID
+ ```
+
+ The command will report the issues found. It is non-destructive by default, but you can use the `--delete-missing-images` and `--delete-duplicate-images` options to delete the tasks with missing images or duplicate images, respectively.
+
+ #### Export the data
+
+ Once the data is annotated, you can export it to a Hugging Face dataset or to local disk (Ultralytics format). To export it to disk, use the following command:
+
+ ```bash
+ labelr datasets export --project-id PROJECT_ID --from ls --to ultralytics --output-dir output --label-names 'product,price-tag'
+ ```
+
+ where `output` is the directory the data will be exported to. Currently, label names must be provided, as the CLI does not support fetching label names from Label Studio yet.
+
+ To export the data to a Hugging Face dataset, use the following command:
+
+ ```bash
+ labelr datasets export --project-id PROJECT_ID --from ls --to huggingface --repo-id REPO_ID --label-names 'product,price-tag'
+ ```
+
+ where `REPO_ID` is the ID of the Hugging Face repository where the dataset will be uploaded (e.g. `openfoodfacts/food-detection`).
+
+ ### Launch training jobs
+
+ You can also launch training jobs for YOLO object detection models using datasets hosted on Hugging Face. Please refer to the [train-yolo package README](packages/train-yolo/README.md) for more details on how to use this feature.
+
+ ## Configuration
+
+ Some Labelr settings can be configured using a configuration file or through environment variables. The configuration file is located at `~/.config/labelr/config.json`.
+
+ In order of precedence, the configuration is loaded from:
+
+ - CLI command options
+ - environment variables
+ - the configuration file
+
+ The following variables are currently supported:
+
+ - `label_studio_url`: URL of the Label Studio server. Can also be set with the `LABELR_LABEL_STUDIO_URL` environment variable.
+ - `label_studio_api_key`: API key for Label Studio. Can also be set with the `LABELR_LABEL_STUDIO_API_KEY` environment variable.
+
+ Labelr supports setting configuration values in the config file through the `config` command. For example, to set the Label Studio URL, you can run:
+
+ ```bash
+ labelr config label_studio_url http://127.0.0.1:8080
+ ```
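The same settings can also be supplied via the environment variables listed above, for example:

```bash
# Both variable names are documented in the Configuration section above
export LABELR_LABEL_STUDIO_URL=http://127.0.0.1:8080
export LABELR_LABEL_STUDIO_API_KEY=API_KEY
labelr ls check-dataset --project-id PROJECT_ID
```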
{labelr-0.10.0 → labelr-0.11.0}/pyproject.toml RENAMED
@@ -1,6 +1,6 @@
  [project]
  name = "labelr"
- version = "0.10.0"
+ version = "0.11.0"
  description = "A command-line tool to manage labeling tasks with Label Studio."
  readme = "README.md"
  requires-python = ">=3.10"
@@ -21,6 +21,7 @@ dependencies = [
      "google-cloud-storage",
      "gcloud-aio-storage",
      "google-genai >= 1.56.0",
+     "diskcache>=5.6.3",
  ]
 
  [project.scripts]
@@ -28,7 +29,7 @@ labelr = "labelr.main:app"
 
  [project.optional-dependencies]
  ultralytics = [
-     "ultralytics==8.3.223",
+     "ultralytics==8.4.8",
  ]
  fiftyone = [
      "fiftyone~=1.10.0"
{labelr-0.10.0 → labelr-0.11.0}/src/labelr/apps/datasets.py RENAMED
@@ -18,7 +18,8 @@ from labelr.export.object_detection import (
      export_from_ls_to_ultralytics_object_detection,
  )
 
- from ..config import LABEL_STUDIO_DEFAULT_URL
+ from . import typer_description
+ from ..config import config
  from ..types import ExportDestination, ExportSource, TaskType
 
  app = typer.Typer()
@@ -125,7 +126,9 @@ def convert_object_detection_dataset(
  def export(
      from_: Annotated[ExportSource, typer.Option("--from", help="Input source to use")],
      to: Annotated[ExportDestination, typer.Option(help="Where to export the data")],
-     api_key: Annotated[Optional[str], typer.Option(envvar="LABEL_STUDIO_API_KEY")],
+     api_key: Annotated[
+         str | None, typer.Option(help=typer_description.LABEL_STUDIO_API_KEY)
+     ] = config.label_studio_api_key,
      task_type: Annotated[
          TaskType, typer.Option(help="Type of task to export")
      ] = TaskType.object_detection,
@@ -142,7 +145,16 @@ def export(
      project_id: Annotated[
          Optional[int], typer.Option(help="Label Studio Project ID")
      ] = None,
-     label_studio_url: Optional[str] = LABEL_STUDIO_DEFAULT_URL,
+     view_id: Annotated[
+         int | None,
+         typer.Option(
+             help="ID of the Label Studio view, if any. This option is useful "
+             "to filter the tasks to export."
+         ),
+     ] = None,
+     label_studio_url: Annotated[
+         str, typer.Option(help=typer_description.LABEL_STUDIO_URL)
+     ] = config.label_studio_url,
      output_dir: Annotated[
          Optional[Path],
          typer.Option(
@@ -163,11 +175,15 @@ def export(
      is_openfoodfacts_dataset: Annotated[
          bool,
          typer.Option(
-             help="Whether the Ultralytics dataset is an OpenFoodFacts dataset, only "
-             "for Ultralytics source. This is used to generate the correct image URLs "
-             "each image name."
+             help="Whether the Ultralytics dataset is an Open Food Facts dataset, only "
+             "for Ultralytics source. This is used:\n"
+             "- to generate the correct image URLs from each image name, when exporting "
+             "from Ultralytics to Hugging Face Datasets.\n"
+             "- to include additional metadata fields specific to Open Food Facts "
+             "(`barcode` and `off_image_id`) when exporting from Label Studio to "
+             "Hugging Face Datasets."
          ),
-     ] = True,
+     ] = False,
      openfoodfacts_flavor: Annotated[
          Flavor,
          typer.Option(
@@ -181,9 +197,18 @@ def export(
          float,
          typer.Option(
              help="Train ratio for splitting the dataset, if the split name is not "
-             "provided (typically, if the source is Label Studio)"
+             "provided. Only used if the source is Label Studio and the destination "
+             "is Ultralytics."
          ),
      ] = 0.8,
+     image_max_size: Annotated[
+         int | None,
+         typer.Option(
+             help="Maximum size (in pixels) for the images. If None, no resizing is "
+             "performed. Otherwise, the longest side of the image will be resized to "
+             "this value, keeping the aspect ratio."
+         ),
+     ] = None,
      error_raise: Annotated[
          bool,
          typer.Option(
@@ -260,9 +285,12 @@ def export(
              repo_id=repo_id,
              label_names=typing.cast(list[str], label_names_list),
              project_id=typing.cast(int, project_id),
+             is_openfoodfacts_dataset=is_openfoodfacts_dataset,
              merge_labels=merge_labels,
              use_aws_cache=use_aws_cache,
              revision=revision,
+             view_id=view_id,
+             image_max_size=image_max_size,
          )
      elif to == ExportDestination.ultralytics:
          export_from_ls_to_ultralytics_object_detection(
@@ -274,6 +302,8 @@ def export(
              error_raise=error_raise,
              merge_labels=merge_labels,
              use_aws_cache=use_aws_cache,
+             view_id=view_id,
+             image_max_size=image_max_size,
          )
 
      elif from_ == ExportSource.hf:
@@ -289,6 +319,7 @@ def export(
              error_raise=error_raise,
              use_aws_cache=use_aws_cache,
              revision=revision,
+             image_max_size=image_max_size,
          )
      else:
          raise typer.BadParameter("Unsupported export format")
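As a usage sketch, the two options added to `export` in the hunks above would be passed on the command line as follows (Typer derives `--view-id` and `--image-max-size` from the parameter names; the ID values are placeholders):

```bash
labelr datasets export --project-id 42 --from ls --to ultralytics \
  --output-dir output --label-names 'product,price-tag' \
  --view-id 7 --image-max-size 1024
```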
@@ -327,7 +358,8 @@ def export_llm_ds(
      tmp_dir: Annotated[
          Path | None,
          typer.Option(
-             help="Path to a temporary directory to use for image processing",
+             help="Path to the temporary directory used to store intermediate sample files "
+             "created when building the HF dataset.",
          ),
      ] = None,
      image_max_size: Annotated[
@@ -354,3 +386,102 @@ def export_llm_ds(
          tmp_dir=tmp_dir,
          image_max_size=image_max_size,
      )
+
+
+ @app.command()
+ def update_llm_ds(
+     dataset_path: Annotated[
+         Path, typer.Option(help="Path to the JSONL containing the updates.")
+     ],
+     repo_id: Annotated[
+         str, typer.Option(help="Hugging Face Datasets repository ID to update")
+     ],
+     split: Annotated[str, typer.Option(help="Dataset split to use")],
+     revision: Annotated[
+         str,
+         typer.Option(
+             help="Revision (branch, tag or commit) to use when pushing the new version "
+             "of the Hugging Face Dataset."
+         ),
+     ] = "main",
+     tmp_dir: Annotated[
+         Path | None,
+         typer.Option(
+             help="Path to a temporary directory to use for image processing",
+         ),
+     ] = None,
+     show_diff: Annotated[
+         bool,
+         typer.Option(
+             help="Show the differences between the original sample and the update. If "
+             "True, the updated dataset is not pushed to the Hub. Useful to review the "
+             "updates before applying them.",
+         ),
+     ] = False,
+ ):
+     """Update an existing LLM image extraction dataset, by updating the
+     `output` field of each sample in the dataset.
+
+     The `--dataset-path` JSONL file should contain items with two fields:
+
+     - `image_id`: The image ID of the sample to update in the Hugging Face
+       dataset.
+     - `output`: The new output data to set for the sample.
+     """
+     import sys
+     from difflib import Differ
+
+     import orjson
+     from datasets import load_dataset
+     from diskcache import Cache
+
+     dataset = load_dataset(repo_id, split=split)
+
+     # Populate cache with the updates
+     cache = Cache(directory=tmp_dir or None)
+     with dataset_path.open("r") as f:
+         for line in map(orjson.loads, f):
+             if "image_id" not in line or "output" not in line:
+                 raise ValueError(
+                     "Each item in the update JSONL file must contain `image_id` and `output` fields"
+                 )
+             image_id = line["image_id"]
+             output = line["output"]
+
+             if not isinstance(output, str):
+                 output = orjson.dumps(output).decode("utf-8")
+
+             cache[image_id] = output
+
+     def apply_updates(sample):
+         image_id = sample["image_id"]
+         if image_id in cache:
+             cached_item = cache[image_id]
+             sample["output"] = cached_item
+         return sample
+
+     if show_diff:
+         differ = Differ()
+         for sample in dataset:
+             image_id = sample["image_id"]
+             if image_id in cache:
+                 cached_item = orjson.loads(cache[image_id])
+                 original_item = orjson.loads(sample["output"])
+                 cached_item_str = orjson.dumps(
+                     cached_item, option=orjson.OPT_INDENT_2
+                 ).decode("utf8")
+                 original_item_str = orjson.dumps(
+                     original_item, option=orjson.OPT_INDENT_2
+                 ).decode("utf8")
+                 diff = list(
+                     differ.compare(
+                         original_item_str.splitlines(keepends=True),
+                         cached_item_str.splitlines(keepends=True),
+                     )
+                 )
+                 sys.stdout.writelines(diff)
+                 sys.stdout.write("\n" + "-" * 30 + "\n")
+
+     else:
+         updated_dataset = dataset.map(apply_updates, batched=False)
+         updated_dataset.push_to_hub(repo_id, split=split, revision=revision)
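For reference, an update file passed via `--dataset-path` contains one JSON object per line with the `image_id` and `output` fields the code above enforces; the values below are made up, and `output` may be either a JSON object or an already-serialized string (both are accepted, as shown by the `isinstance` check):

```json
{"image_id": "1234567890123_1", "output": {"product_name": "Organic oat milk", "quantity": "1 L"}}
{"image_id": "1234567890123_2", "output": "{\"product_name\": \"Dark chocolate\"}"}
```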