labelr 0.7.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {labelr-0.7.0/src/labelr.egg-info → labelr-0.9.0}/PKG-INFO +17 -6
- {labelr-0.7.0 → labelr-0.9.0}/README.md +7 -5
- {labelr-0.7.0 → labelr-0.9.0}/pyproject.toml +8 -2
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/apps/datasets.py +12 -25
- labelr-0.9.0/src/labelr/apps/evaluate.py +41 -0
- labelr-0.9.0/src/labelr/apps/google_batch.py +289 -0
- labelr-0.9.0/src/labelr/apps/hugging_face.py +57 -0
- labelr-0.7.0/src/labelr/apps/projects.py → labelr-0.9.0/src/labelr/apps/label_studio.py +65 -9
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/apps/train.py +22 -4
- labelr-0.9.0/src/labelr/evaluate/__init__.py +0 -0
- labelr-0.9.0/src/labelr/evaluate/object_detection.py +100 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/export.py +64 -7
- labelr-0.9.0/src/labelr/google_genai.py +415 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/main.py +23 -8
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/sample.py +72 -4
- labelr-0.9.0/src/labelr/utils.py +48 -0
- {labelr-0.7.0 → labelr-0.9.0/src/labelr.egg-info}/PKG-INFO +17 -6
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr.egg-info/SOURCES.txt +7 -2
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr.egg-info/requires.txt +10 -0
- labelr-0.7.0/src/labelr/apps/users.py +0 -36
- labelr-0.7.0/src/labelr/utils.py +0 -13
- {labelr-0.7.0 → labelr-0.9.0}/LICENSE +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/setup.cfg +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/__init__.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/__main__.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/annotate.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/apps/__init__.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/check.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/config.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/dataset_features.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/project_config.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr/types.py +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr.egg-info/dependency_links.txt +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr.egg-info/entry_points.txt +0 -0
- {labelr-0.7.0 → labelr-0.9.0}/src/labelr.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: labelr
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.9.0
|
|
4
4
|
Summary: A command-line tool to manage labeling tasks with Label Studio.
|
|
5
5
|
Requires-Python: >=3.10
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -13,10 +13,19 @@ Requires-Dist: openfoodfacts>=2.9.0
|
|
|
13
13
|
Requires-Dist: typer>=0.15.1
|
|
14
14
|
Requires-Dist: google-cloud-batch==0.18.0
|
|
15
15
|
Requires-Dist: huggingface-hub
|
|
16
|
+
Requires-Dist: deepdiff>=8.6.1
|
|
17
|
+
Requires-Dist: rapidfuzz>=3.14.3
|
|
18
|
+
Requires-Dist: aiohttp
|
|
19
|
+
Requires-Dist: aiofiles
|
|
20
|
+
Requires-Dist: orjson
|
|
16
21
|
Provides-Extra: ultralytics
|
|
17
22
|
Requires-Dist: ultralytics==8.3.223; extra == "ultralytics"
|
|
18
23
|
Provides-Extra: fiftyone
|
|
19
24
|
Requires-Dist: fiftyone~=1.10.0; extra == "fiftyone"
|
|
25
|
+
Provides-Extra: google
|
|
26
|
+
Requires-Dist: google-genai>=1.56.0; extra == "google"
|
|
27
|
+
Requires-Dist: gcloud-aio-storage; extra == "google"
|
|
28
|
+
Requires-Dist: google-cloud-storage; extra == "google"
|
|
20
29
|
Dynamic: license-file
|
|
21
30
|
|
|
22
31
|
# Labelr
|
|
@@ -73,7 +82,7 @@ Once you have a Label Studio instance running, you can create a project easily.
|
|
|
73
82
|
For an object detection task, a command allows you to create the configuration file automatically:
|
|
74
83
|
|
|
75
84
|
```bash
|
|
76
|
-
labelr
|
|
85
|
+
labelr ls create-config --labels 'label1' --labels 'label2' --output-file label_config.xml
|
|
77
86
|
```
|
|
78
87
|
|
|
79
88
|
where `label1` and `label2` are the labels you want to use for the object detection task, and `label_config.xml` is the output file that will contain the configuration.
|
|
@@ -81,17 +90,19 @@ where `label1` and `label2` are the labels you want to use for the object detect
|
|
|
81
90
|
Then, you can create a project on Label Studio with the following command:
|
|
82
91
|
|
|
83
92
|
```bash
|
|
84
|
-
labelr
|
|
93
|
+
labelr ls create --title my_project --api-key API_KEY --config-file label_config.xml
|
|
85
94
|
```
|
|
86
95
|
|
|
87
96
|
where `API_KEY` is the API key of the Label Studio instance (API key is available at Account page), and `label_config.xml` is the configuration file of the project.
|
|
88
97
|
|
|
98
|
+
`ls` stands for Label Studio in the CLI.
|
|
99
|
+
|
|
89
100
|
#### Create a dataset file
|
|
90
101
|
|
|
91
102
|
If you have a list of images, for an object detection task, you can quickly create a dataset file with the following command:
|
|
92
103
|
|
|
93
104
|
```bash
|
|
94
|
-
labelr
|
|
105
|
+
labelr ls create-dataset-file --input-file image_urls.txt --output-file dataset.json
|
|
95
106
|
```
|
|
96
107
|
|
|
97
108
|
where `image_urls.txt` is a file containing the URLs of the images, one per line, and `dataset.json` is the output file.
|
|
@@ -101,7 +112,7 @@ where `image_urls.txt` is a file containing the URLs of the images, one per line
|
|
|
101
112
|
Next, import the generated data to a project with the following command:
|
|
102
113
|
|
|
103
114
|
```bash
|
|
104
|
-
labelr
|
|
115
|
+
labelr ls import-data --project-id PROJECT_ID --dataset-path dataset.json
|
|
105
116
|
```
|
|
106
117
|
|
|
107
118
|
where `PROJECT_ID` is the ID of the project you created.
|
|
@@ -117,7 +128,7 @@ To accelerate annotation, you can pre-annotate the images with an object detecti
|
|
|
117
128
|
To pre-annotate the data with Triton, use the following command:
|
|
118
129
|
|
|
119
130
|
```bash
|
|
120
|
-
labelr
|
|
131
|
+
labelr ls add-prediction --project-id PROJECT_ID --backend ultralytics --labels 'product' --labels 'price tag' --label-mapping '{"price tag": "price-tag"}'
|
|
121
132
|
```
|
|
122
133
|
|
|
123
134
|
where `labels` is the list of labels to use for the object detection task (you can add as many labels as you want).
|
|
@@ -52,7 +52,7 @@ Once you have a Label Studio instance running, you can create a project easily.
|
|
|
52
52
|
For an object detection task, a command allows you to create the configuration file automatically:
|
|
53
53
|
|
|
54
54
|
```bash
|
|
55
|
-
labelr
|
|
55
|
+
labelr ls create-config --labels 'label1' --labels 'label2' --output-file label_config.xml
|
|
56
56
|
```
|
|
57
57
|
|
|
58
58
|
where `label1` and `label2` are the labels you want to use for the object detection task, and `label_config.xml` is the output file that will contain the configuration.
|
|
@@ -60,17 +60,19 @@ where `label1` and `label2` are the labels you want to use for the object detect
|
|
|
60
60
|
Then, you can create a project on Label Studio with the following command:
|
|
61
61
|
|
|
62
62
|
```bash
|
|
63
|
-
labelr
|
|
63
|
+
labelr ls create --title my_project --api-key API_KEY --config-file label_config.xml
|
|
64
64
|
```
|
|
65
65
|
|
|
66
66
|
where `API_KEY` is the API key of the Label Studio instance (API key is available at Account page), and `label_config.xml` is the configuration file of the project.
|
|
67
67
|
|
|
68
|
+
`ls` stands for Label Studio in the CLI.
|
|
69
|
+
|
|
68
70
|
#### Create a dataset file
|
|
69
71
|
|
|
70
72
|
If you have a list of images, for an object detection task, you can quickly create a dataset file with the following command:
|
|
71
73
|
|
|
72
74
|
```bash
|
|
73
|
-
labelr
|
|
75
|
+
labelr ls create-dataset-file --input-file image_urls.txt --output-file dataset.json
|
|
74
76
|
```
|
|
75
77
|
|
|
76
78
|
where `image_urls.txt` is a file containing the URLs of the images, one per line, and `dataset.json` is the output file.
|
|
@@ -80,7 +82,7 @@ where `image_urls.txt` is a file containing the URLs of the images, one per line
|
|
|
80
82
|
Next, import the generated data to a project with the following command:
|
|
81
83
|
|
|
82
84
|
```bash
|
|
83
|
-
labelr
|
|
85
|
+
labelr ls import-data --project-id PROJECT_ID --dataset-path dataset.json
|
|
84
86
|
```
|
|
85
87
|
|
|
86
88
|
where `PROJECT_ID` is the ID of the project you created.
|
|
@@ -96,7 +98,7 @@ To accelerate annotation, you can pre-annotate the images with an object detecti
|
|
|
96
98
|
To pre-annotate the data with Triton, use the following command:
|
|
97
99
|
|
|
98
100
|
```bash
|
|
99
|
-
labelr
|
|
101
|
+
labelr ls add-prediction --project-id PROJECT_ID --backend ultralytics --labels 'product' --labels 'price tag' --label-mapping '{"price tag": "price-tag"}'
|
|
100
102
|
```
|
|
101
103
|
|
|
102
104
|
where `labels` is the list of labels to use for the object detection task (you can add as many labels as you want).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "labelr"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.9.0"
|
|
4
4
|
description = "A command-line tool to manage labeling tasks with Label Studio."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -12,7 +12,12 @@ dependencies = [
|
|
|
12
12
|
"openfoodfacts>=2.9.0",
|
|
13
13
|
"typer>=0.15.1",
|
|
14
14
|
"google-cloud-batch==0.18.0",
|
|
15
|
-
"huggingface-hub"
|
|
15
|
+
"huggingface-hub",
|
|
16
|
+
"deepdiff>=8.6.1",
|
|
17
|
+
"rapidfuzz>=3.14.3",
|
|
18
|
+
"aiohttp",
|
|
19
|
+
"aiofiles",
|
|
20
|
+
"orjson",
|
|
16
21
|
]
|
|
17
22
|
|
|
18
23
|
[project.scripts]
|
|
@@ -25,6 +30,7 @@ ultralytics = [
|
|
|
25
30
|
fiftyone = [
|
|
26
31
|
"fiftyone~=1.10.0"
|
|
27
32
|
]
|
|
33
|
+
google = ["google-genai >= 1.56.0", "gcloud-aio-storage", "google-cloud-storage"]
|
|
28
34
|
|
|
29
35
|
[tool.uv]
|
|
30
36
|
package = true
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
"""Commands to manage local datasets and export between platforms
|
|
2
|
+
(Label Studio, HuggingFace Hub, local dataset,...)."""
|
|
3
|
+
|
|
1
4
|
import json
|
|
2
5
|
import random
|
|
3
6
|
import shutil
|
|
@@ -21,45 +24,29 @@ logger = get_logger(__name__)
|
|
|
21
24
|
|
|
22
25
|
@app.command()
|
|
23
26
|
def check(
|
|
24
|
-
api_key: Annotated[
|
|
25
|
-
Optional[str], typer.Option(envvar="LABEL_STUDIO_API_KEY")
|
|
26
|
-
] = None,
|
|
27
|
-
project_id: Annotated[
|
|
28
|
-
Optional[int], typer.Option(help="Label Studio Project ID")
|
|
29
|
-
] = None,
|
|
30
|
-
label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
|
|
31
27
|
dataset_dir: Annotated[
|
|
32
|
-
|
|
28
|
+
Path,
|
|
33
29
|
typer.Option(
|
|
34
30
|
help="Path to the dataset directory", exists=True, file_okay=False
|
|
35
31
|
),
|
|
36
|
-
]
|
|
32
|
+
],
|
|
37
33
|
remove: Annotated[
|
|
38
34
|
bool,
|
|
39
|
-
typer.Option(
|
|
40
|
-
help="Remove duplicate images from the dataset, only for local datasets"
|
|
41
|
-
),
|
|
35
|
+
typer.Option(help="Remove duplicate images from the dataset"),
|
|
42
36
|
] = False,
|
|
43
37
|
):
|
|
44
|
-
"""Check a dataset for duplicate images."""
|
|
45
|
-
from label_studio_sdk.client import LabelStudio
|
|
38
|
+
"""Check a local dataset in Ultralytics format for duplicate images."""
|
|
46
39
|
|
|
47
|
-
from ..check import check_local_dataset
|
|
40
|
+
from ..check import check_local_dataset
|
|
48
41
|
|
|
49
|
-
|
|
50
|
-
ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
|
|
51
|
-
check_ls_dataset(ls, project_id)
|
|
52
|
-
elif dataset_dir is not None:
|
|
53
|
-
check_local_dataset(dataset_dir, remove=remove)
|
|
54
|
-
else:
|
|
55
|
-
raise typer.BadParameter("Either project ID or dataset directory is required")
|
|
42
|
+
check_local_dataset(dataset_dir, remove=remove)
|
|
56
43
|
|
|
57
44
|
|
|
58
45
|
@app.command()
|
|
59
46
|
def split_train_test(
|
|
60
47
|
task_type: TaskType, dataset_dir: Path, output_dir: Path, train_ratio: float = 0.8
|
|
61
48
|
):
|
|
62
|
-
"""Split a dataset into training and test sets.
|
|
49
|
+
"""Split a local dataset into training and test sets.
|
|
63
50
|
|
|
64
51
|
Only classification tasks are supported.
|
|
65
52
|
"""
|
|
@@ -112,7 +99,7 @@ def convert_object_detection_dataset(
|
|
|
112
99
|
Studio format, and save it to a JSON file."""
|
|
113
100
|
from datasets import load_dataset
|
|
114
101
|
|
|
115
|
-
from labelr.sample import
|
|
102
|
+
from labelr.sample import format_object_detection_sample_from_hf_to_ls
|
|
116
103
|
|
|
117
104
|
logger.info("Loading dataset: %s", repo_id)
|
|
118
105
|
ds = load_dataset(repo_id)
|
|
@@ -122,7 +109,7 @@ def convert_object_detection_dataset(
|
|
|
122
109
|
for split in ds.keys():
|
|
123
110
|
logger.info("Processing split: %s", split)
|
|
124
111
|
for sample in ds[split]:
|
|
125
|
-
label_studio_sample =
|
|
112
|
+
label_studio_sample = format_object_detection_sample_from_hf_to_ls(
|
|
126
113
|
sample, split=split
|
|
127
114
|
)
|
|
128
115
|
f.write(json.dumps(label_studio_sample) + "\n")
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
3
|
+
import typer
|
|
4
|
+
|
|
5
|
+
app = typer.Typer()
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@app.command()
def visualize_object_detection(
    hf_repo_id: Annotated[
        str,
        typer.Option(
            help="Hugging Face repository ID of the trained model. "
            "A `predictions.parquet` file is expected in the repo. Revision can be specified "
            "by appending `@<revision>` to the repo ID.",
        ),
    ],
    dataset_name: Annotated[
        str | None, typer.Option(help="Name of the FiftyOne dataset to create.")
    ] = None,
    persistent: Annotated[
        bool,
        typer.Option(
            help="Whether to make the FiftyOne dataset persistent (i.e., saved to disk).",
        ),
    ] = False,
):
    """Visualize object detection model predictions stored in a Hugging Face
    repository using FiftyOne.

    When no dataset name is given, one is derived from the repository ID by
    replacing every `/` and `@` with `-`.
    """
    # Imported inside the command body, like the other commands of the CLI.
    from labelr.evaluate import object_detection

    fiftyone_name = dataset_name
    if fiftyone_name is None:
        # Single pass over the repo ID: both separators map to a dash.
        fiftyone_name = hf_repo_id.translate(str.maketrans("/@", "--"))

    object_detection.visualize(
        hf_repo_id=hf_repo_id,
        dataset_name=fiftyone_name,
        persistent=persistent,
    )
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import importlib
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Annotated, Any
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from google.genai.types import JSONSchema as GoogleJSONSchema
|
|
8
|
+
from google.genai.types import Schema as GoogleSchema
|
|
9
|
+
from openfoodfacts import Flavor
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
from labelr.google_genai import generate_batch_dataset, launch_batch_job
|
|
13
|
+
|
|
14
|
+
app = typer.Typer()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
    """Convert a Pydantic model class into a Google `Schema` dictionary.

    Google doesn't natively support OpenAPI schemas, so the JSON schema of
    the model is converted to a Google `Schema` (a subset of OpenAPI) and
    dumped in JSON mode, leaving out unset fields and `None` values.
    """
    pydantic_json_schema = schema.model_json_schema()
    validated_json_schema = GoogleJSONSchema.model_validate(pydantic_json_schema)
    google_schema = GoogleSchema.from_json_schema(json_schema=validated_json_schema)
    return google_schema.model_dump(mode="json", exclude_none=True, exclude_unset=True)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@app.command()
def generate_dataset(
    data_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to a JSONL file containing the raw batch samples.",
            exists=True,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
    output_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path where to write the generated dataset file.",
            exists=False,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
    config_module: Annotated[
        str,
        typer.Option(
            ...,
            help="Python module path (e.g., 'myschema') containing two variables: "
            "OUTPUT_SCHEMA (a Pydantic class representing the output schema) and "
            "INSTRUCTIONS (a str containing instructions to add before each sample).",
        ),
    ],
    bucket_name: Annotated[
        str,
        typer.Option(
            ...,
            help="Name of the GCS bucket where the images are stored.",
        ),
    ] = "robotoff-batch",
    bucket_dir_name: Annotated[
        str,
        typer.Option(
            ...,
            help="Directory name in the GCS bucket where the images are stored.",
        ),
    ] = "gemini-batch-images",
    max_concurrent_uploads: Annotated[
        int,
        typer.Option(
            ...,
            help="Maximum number of concurrent uploads to GCS.",
        ),
    ] = 30,
    base_image_dir: Annotated[
        Path | None,
        typer.Option(
            ...,
            help="Base directory to resolve local image paths from.",
        ),
    ] = None,
    from_key: Annotated[
        str | None,
        typer.Option(
            ...,
            help="If specified, resume processing from this sample key.",
        ),
    ] = None,
    skip_upload: Annotated[
        bool, typer.Option(..., help="Skip uploading images to GCS")
    ] = False,
    thinking_level: Annotated[
        str | None,
        typer.Option(
            ...,
            help="Thinking level to use for the generation config.",
        ),
    ] = None,
):
    """Generate a dataset file in JSONL format to be used for batch
    processing, using Gemini Batch Inference.

    The module named by `config_module` is imported and must expose
    `OUTPUT_SCHEMA` (a `pydantic.BaseModel` subclass); `INSTRUCTIONS` is
    optional. The command exits with code 1 when `OUTPUT_SCHEMA` is missing
    or is not a `BaseModel` subclass.
    """
    typer.echo(f"Uploading images from '{data_path}' to GCS bucket '{bucket_name}'...")
    typer.echo(f"Writing updated dataset to {output_path}...")
    typer.echo(f"Max concurrent uploads: {max_concurrent_uploads}...")
    typer.echo(f"Base image directory: {base_image_dir}...")
    typer.echo(f"From key: {from_key}...")
    typer.echo(f"Skip upload: {skip_upload}...")
    typer.echo(f"Thinking level: {thinking_level}...")

    module = importlib.import_module(config_module)
    # Default to None so that a module without OUTPUT_SCHEMA is reported
    # through the error message below instead of an AttributeError traceback.
    base_cls = getattr(module, "OUTPUT_SCHEMA", None)

    # `issubclass` raises TypeError when its first argument is not a class,
    # so make sure we hold a class before asking whether it is a BaseModel.
    if not (isinstance(base_cls, type) and issubclass(base_cls, BaseModel)):
        typer.echo(
            f"Error: {config_module}.OUTPUT_SCHEMA is not a subclass of pydantic.BaseModel"
        )
        raise typer.Exit(code=1)

    # An empty INSTRUCTIONS value is normalized to None.
    instructions = getattr(module, "INSTRUCTIONS", None) or None

    if instructions:
        typer.echo(f"Using instructions: '{instructions}'...")
    else:
        typer.echo("No instructions provided.")

    # JSON Schema is supported natively by Vertex AI and Gemini APIs,
    # but not yet on Batch Inference...
    # So we convert the JSON schema to Google internal "Schema"
    google_json_schema = convert_pydantic_model_to_google_schema(base_cls)
    asyncio.run(
        generate_batch_dataset(
            data_path=data_path,
            output_path=output_path,
            google_json_schema=google_json_schema,
            instructions=instructions,
            bucket_name=bucket_name,
            bucket_dir_name=bucket_dir_name,
            max_concurrent_uploads=max_concurrent_uploads,
            base_image_dir=base_image_dir,
            from_key=from_key,
            skip_upload=skip_upload,
            thinking_level=thinking_level,
        )
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@app.command(name="launch-batch-job")
def launch_batch_job_command(
    run_name: Annotated[str, typer.Argument(help="Name of the batch job run")],
    dataset_path: Annotated[Path, typer.Option(help="Path to the dataset file")],
    model: Annotated[str, typer.Option(help="Model to use for the batch job")],
    location: Annotated[
        str, typer.Option(help="GCP location where to run the batch job")
    ] = "europe-west4",
):
    """Launch a Gemini Batch Inference job.

    Thin CLI wrapper: every argument is forwarded unchanged to
    `labelr.google_genai.launch_batch_job`.
    """
    launch_batch_job(
        run_name=run_name, dataset_path=dataset_path, model=model, location=location
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@app.command()
def upload_training_dataset_from_predictions(
    prediction_path: Annotated[
        Path,
        typer.Argument(
            ...,
            help="Path to the prediction JSONL file generated by Google Inference Batch",
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    instructions_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to the file with the instruction prompt for the model",
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    json_schema_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to the file with the JSON schema to follow",
            # Validated by the CLI like the two other input paths, so that a
            # missing file is reported as a usage error, not as a traceback.
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    repo_id: Annotated[
        str, typer.Option(help="Hugging Face Datasets repository ID to push to")
    ],
    revision: Annotated[
        str,
        typer.Option(
            help="Revision (branch, tag or commit) to use for the Hugging Face Datasets repository"
        ),
    ] = "main",
    is_openfoodfacts_dataset: Annotated[
        bool, typer.Option(..., help="Whether this is an Open Food Facts dataset")
    ] = False,
    openfoodfacts_flavor: Annotated[
        Flavor,
        typer.Option(
            ...,
            help="Open Food Facts flavor of the dataset (if applicable)",
        ),
    ] = Flavor.off,
    split: Annotated[str, typer.Option(..., help="Name of the split")] = "train",
    tmp_dir: Annotated[
        Path | None,
        typer.Option(
            ...,
            help="Temporary directory to use for intermediate files, default to a temporary directory "
            "generated automatically. This is useful to relaunch the command if it fails midway.",
        ),
    ] = None,
    skip: Annotated[int, typer.Option(..., help="Number of samples to skip")] = 0,
    limit: Annotated[
        int | None,
        typer.Option(
            ..., help="Limit number of samples to process, or None for no limit"
        ),
    ] = None,
    raise_on_invalid_sample: Annotated[
        bool,
        typer.Option(
            ...,
            help="Whether to raise an error on invalid samples instead of skipping them",
        ),
    ] = False,
):
    """Upload a training dataset to a Hugging Face Datasets repository from a
    Gemini batch prediction file.

    A `config.json` file holding the instructions and the JSON schema is
    uploaded to the dataset repository first, then the samples built from
    the prediction file are exported to the same repository.
    """
    import tempfile

    import orjson
    from huggingface_hub import HfApi

    from labelr.export import export_to_hf_llm_image_extraction
    from labelr.google_genai import generate_sample_iter

    # Decode explicitly as UTF-8 instead of relying on the locale encoding.
    instructions = instructions_path.read_text(encoding="utf-8")
    typer.echo(f"Instructions: {instructions}")
    # orjson parses bytes directly, so no text decoding step is needed.
    json_schema = orjson.loads(json_schema_path.read_bytes())

    api = HfApi()
    config = {
        "instructions": instructions,
        "json_schema": json_schema,
    }
    with tempfile.TemporaryDirectory() as config_tmp_dir_str:
        config_path = Path(config_tmp_dir_str) / "config.json"
        # orjson emits UTF-8 bytes: write them as-is rather than decoding and
        # re-encoding them with the platform default encoding.
        config_path.write_bytes(orjson.dumps(config, option=orjson.OPT_INDENT_2))
        # NOTE(review): config.json is uploaded without `revision`, while the
        # samples below are exported with `revision=revision` -- confirm that
        # this is intended.
        api.upload_file(
            path_or_fileobj=config_path,
            path_in_repo="config.json",
            repo_id=repo_id,
            repo_type="dataset",
        )

    sample_iter = generate_sample_iter(
        prediction_path=prediction_path,
        json_schema=json_schema,
        is_openfoodfacts_dataset=is_openfoodfacts_dataset,
        openfoodfacts_flavor=openfoodfacts_flavor,
        skip=skip,
        limit=limit,
        raise_on_invalid_sample=raise_on_invalid_sample,
    )
    export_to_hf_llm_image_extraction(
        sample_iter=sample_iter,
        split=split,
        repo_id=repo_id,
        revision=revision,
        tmp_dir=tmp_dir,
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
import typer
|
|
5
|
+
|
|
6
|
+
app = typer.Typer()
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@app.command()
def show_hf_sample(
    repo_id: Annotated[
        str,
        typer.Argument(
            help="Hugging Face Datasets repo ID. The revision can be specified by "
            "appending `@<revision>` to the repo ID.",
        ),
    ],
    image_id: Annotated[
        str,
        typer.Argument(
            help="ID of the image associated with the sample to display (field: `image_id`)",
        ),
    ],
    output_image_path: Annotated[
        Path | None,
        typer.Option(help="Path to save the sample image (optional)", exists=False),
    ] = None,
):
    """Display a sample from a Hugging Face Datasets repository by image ID.

    Splits are searched in order and the first sample whose `image_id`
    matches is printed field by field. The command exits with code 1 when no
    split contains the image ID.
    """
    from labelr.utils import parse_hf_repo_id

    repo_id, revision = parse_hf_repo_id(repo_id)

    from datasets import load_dataset

    dataset_dict = load_dataset(repo_id, revision=revision)

    found = None
    for split_name in dataset_dict.keys():
        matches = dataset_dict[split_name].filter(
            lambda value: value == image_id, input_columns="image_id"
        )
        if len(matches) > 0:
            found = matches[0]
            break

    # Guard clause: nothing to display when no split holds the image ID.
    if found is None:
        typer.echo(f"Sample with image ID {image_id} not found in dataset {repo_id}")
        raise typer.Exit(code=1)

    for field_name, field_value in found.items():
        typer.echo(f"{field_name}: {field_value}")

    if output_image_path is not None:
        found["image"].save(output_image_path)
        typer.echo(f"Image saved to {output_image_path}")
|