labelr 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
labelr/apps/datasets.py CHANGED
@@ -12,7 +12,11 @@ import typer
12
12
  from openfoodfacts import Flavor
13
13
  from openfoodfacts.utils import get_logger
14
14
 
15
- from labelr.export import export_from_ultralytics_to_hf
15
+ from labelr.export.common import export_from_ultralytics_to_hf
16
+ from labelr.export.object_detection import (
17
+ export_from_ls_to_hf_object_detection,
18
+ export_from_ls_to_ultralytics_object_detection,
19
+ )
16
20
 
17
21
  from ..config import LABEL_STUDIO_DEFAULT_URL
18
22
  from ..types import ExportDestination, ExportSource, TaskType
@@ -99,7 +103,9 @@ def convert_object_detection_dataset(
99
103
  Studio format, and save it to a JSON file."""
100
104
  from datasets import load_dataset
101
105
 
102
- from labelr.sample import format_object_detection_sample_from_hf_to_ls
106
+ from labelr.sample.object_detection import (
107
+ format_object_detection_sample_from_hf_to_ls,
108
+ )
103
109
 
104
110
  logger.info("Loading dataset: %s", repo_id)
105
111
  ds = load_dataset(repo_id)
@@ -207,10 +213,8 @@ def export(
207
213
  local files (ultralytics format)."""
208
214
  from label_studio_sdk.client import LabelStudio
209
215
 
210
- from labelr.export import (
216
+ from labelr.export.object_detection import (
211
217
  export_from_hf_to_ultralytics_object_detection,
212
- export_from_ls_to_hf_object_detection,
213
- export_from_ls_to_ultralytics_object_detection,
214
218
  )
215
219
 
216
220
  if (to == ExportDestination.hf or from_ == ExportSource.hf) and repo_id is None:
@@ -303,3 +307,50 @@ def export(
303
307
  is_openfoodfacts_dataset=is_openfoodfacts_dataset,
304
308
  openfoodfacts_flavor=openfoodfacts_flavor,
305
309
  )
310
+
311
+
312
@app.command()
def export_llm_ds(
    dataset_path: Annotated[
        Path, typer.Option(..., help="Path to the JSONL dataset file")
    ],
    repo_id: Annotated[
        str, typer.Option(..., help="Hugging Face Datasets repository ID to export to")
    ],
    split: Annotated[str, typer.Option(..., help="Dataset split to export")],
    revision: Annotated[
        str,
        typer.Option(
            help="Revision (branch, tag or commit) for the Hugging Face Datasets repository."
        ),
    ] = "main",
    tmp_dir: Annotated[
        Path | None,
        typer.Option(
            help="Path to a temporary directory to use for image processing",
        ),
    ] = None,
    image_max_size: Annotated[
        int | None,
        typer.Option(
            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
        ),
    ] = None,
):
    """Export LLM image extraction dataset with images only to Hugging Face
    Datasets.
    """
    # NOTE: the docstring above is what Typer displays as the command help,
    # so maintainer-facing notes are kept in comments instead.

    # Imported lazily so that the CLI starts without loading the export and
    # sample modules for unrelated commands.
    from labelr.export.llm import export_to_hf_llm_image_extraction
    from labelr.sample.llm import load_llm_image_extraction_dataset_from_jsonl

    # Load the samples from the JSONL file, then hand them to the exporter
    # together with the destination settings received from the CLI.
    samples = load_llm_image_extraction_dataset_from_jsonl(dataset_path=dataset_path)
    export_to_hf_llm_image_extraction(
        sample_iter=samples,
        split=split,
        repo_id=repo_id,
        revision=revision,
        tmp_dir=tmp_dir,
        image_max_size=image_max_size,
    )
@@ -0,0 +1,296 @@
1
+ import asyncio
2
+ import importlib
3
+ from pathlib import Path
4
+ from typing import Annotated, Any
5
+
6
+ import typer
7
+ from google.genai.types import JSONSchema as GoogleJSONSchema
8
+ from google.genai.types import Schema as GoogleSchema
9
+ from openfoodfacts import Flavor
10
+ from pydantic import BaseModel
11
+
12
+ from labelr.google_genai import generate_batch_dataset, launch_batch_job
13
+
14
+ app = typer.Typer()
15
+
16
+
17
def convert_pydantic_model_to_google_schema(schema: type[BaseModel]) -> dict[str, Any]:
    """Convert a Pydantic model class into a Google `Schema` dictionary.

    Google does not natively support OpenAPI schemas, so the JSON schema
    generated by Pydantic is converted to a Google `Schema` (a subset of
    OpenAPI).

    Args:
        schema (type[BaseModel]): Pydantic model class to convert.

    Returns:
        dict[str, Any]: JSON-mode dump of the Google `Schema`, without the
            fields that are None or unset.
    """
    # Step 1: Pydantic model -> JSON schema, validated as a Google JSONSchema.
    validated_json_schema = GoogleJSONSchema.model_validate(
        schema.model_json_schema()
    )
    # Step 2: JSON schema -> Google `Schema` object.
    google_schema = GoogleSchema.from_json_schema(json_schema=validated_json_schema)
    # Step 3: dump to a plain dictionary, dropping None and unset fields.
    return google_schema.model_dump(
        mode="json", exclude_none=True, exclude_unset=True
    )
23
+
24
+
25
@app.command()
def generate_dataset(
    data_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to a JSONL file containing the raw batch samples.",
            exists=True,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
    output_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path where to write the generated dataset file.",
            exists=False,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
    config_module: Annotated[
        str,
        typer.Option(
            ...,
            help="Python module path (e.g., 'myschema') containing two variables: "
            "OUTPUT_SCHEMA (a Pydantic class representing the output schema) and "
            "INSTRUCTIONS (a str containing instructions to add before each sample).",
        ),
    ],
    bucket_name: Annotated[
        str,
        typer.Option(
            ...,
            help="Name of the GCS bucket where the images are stored.",
        ),
    ] = "robotoff-batch",
    bucket_dir_name: Annotated[
        str,
        typer.Option(
            ...,
            help="Directory name in the GCS bucket where the images are stored.",
        ),
    ] = "gemini-batch-images",
    max_concurrent_uploads: Annotated[
        int,
        typer.Option(
            ...,
            help="Maximum number of concurrent uploads to GCS.",
        ),
    ] = 30,
    base_image_dir: Annotated[
        Path | None,
        typer.Option(
            ...,
            help="Base directory to resolve local image paths from.",
        ),
    ] = None,
    from_key: Annotated[
        str | None,
        typer.Option(
            ...,
            help="If specified, resume processing from this sample key.",
        ),
    ] = None,
    skip_upload: Annotated[
        bool, typer.Option(..., help="Skip uploading images to GCS")
    ] = False,
    thinking_level: Annotated[
        str | None,
        typer.Option(
            ...,
            help="Thinking level to use for the generation config.",
        ),
    ] = None,
):
    """Generate a dataset file in JSONL format to be used for batch
    processing, using Gemini Batch Inference."""
    # NOTE: the docstring above is what Typer displays as the command help,
    # so maintainer-facing notes are kept in comments instead.
    #
    # Flow: echo the settings, import `config_module`, validate its
    # OUTPUT_SCHEMA, convert it to a Google `Schema` and run the asynchronous
    # `generate_batch_dataset` coroutine to completion.
    # Exits with code 1 when OUTPUT_SCHEMA is missing or is not a
    # pydantic.BaseModel subclass.
    typer.echo(f"Uploading images from '{data_path}' to GCS bucket '{bucket_name}'...")
    typer.echo(f"Writing updated dataset to {output_path}...")
    typer.echo(f"Max concurrent uploads: {max_concurrent_uploads}...")
    typer.echo(f"Base image directory: {base_image_dir}...")
    typer.echo(f"From key: {from_key}...")
    typer.echo(f"Skip upload: {skip_upload}...")
    typer.echo(f"Thinking level: {thinking_level}...")

    module = importlib.import_module(config_module)
    # A missing OUTPUT_SCHEMA is handled like an invalid one, so the user gets
    # the clean error below rather than an AttributeError traceback.
    base_cls = getattr(module, "OUTPUT_SCHEMA", None)

    # issubclass() raises TypeError when its first argument is not a class
    # (e.g. a model *instance* or a generic alias), so check that first.
    if not (isinstance(base_cls, type) and issubclass(base_cls, BaseModel)):
        typer.echo(
            f"Error: {config_module}.OUTPUT_SCHEMA is not a subclass of pydantic.BaseModel"
        )
        raise typer.Exit(code=1)

    # INSTRUCTIONS is optional; an empty value is normalized to None.
    instructions = getattr(module, "INSTRUCTIONS", None) or None

    if instructions:
        typer.echo(f"Using instructions: '{instructions}'...")
    else:
        typer.echo("No instructions provided.")

    # JSON Schema is supported natively by Vertex AI and Gemini APIs,
    # but not yet on Batch Inference...
    # So we convert the JSON schema to Google internal "Schema"
    # google_json_schema = base_cls.model_json_schema()
    google_json_schema = convert_pydantic_model_to_google_schema(base_cls)
    asyncio.run(
        generate_batch_dataset(
            data_path=data_path,
            output_path=output_path,
            google_json_schema=google_json_schema,
            instructions=instructions,
            bucket_name=bucket_name,
            bucket_dir_name=bucket_dir_name,
            max_concurrent_uploads=max_concurrent_uploads,
            base_image_dir=base_image_dir,
            from_key=from_key,
            skip_upload=skip_upload,
            thinking_level=thinking_level,
        )
    )
148
+
149
+
150
@app.command(name="launch-batch-job")
def launch_batch_job_command(
    run_name: Annotated[str, typer.Argument(..., help="Name of the batch job run")],
    dataset_path: Annotated[Path, typer.Option(..., help="Path to the dataset file")],
    model: Annotated[str, typer.Option(..., help="Model to use for the batch job")],
    location: Annotated[
        str,
        typer.Option(..., help="GCP location where to run the batch job"),
    ] = "europe-west4",
):
    """Launch a Gemini Batch Inference job."""
    # Thin CLI adapter: the command is registered as `launch-batch-job` and
    # forwards every value unchanged to `labelr.google_genai.launch_batch_job`.
    # The function carries a `_command` suffix so that it does not shadow the
    # imported helper of the same name.
    launch_batch_job(
        run_name=run_name, dataset_path=dataset_path, model=model, location=location
    )
167
+
168
+
169
@app.command()
def upload_training_dataset_from_predictions(
    prediction_path: Annotated[
        Path,
        typer.Argument(
            ...,
            help="Path to the prediction JSONL file generated by Google Inference Batch",
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    instructions_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to the file with the instruction prompt for the model",
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    json_schema_path: Annotated[
        Path,
        typer.Option(
            ...,
            help="Path to the file with the JSON schema to follow",
            # Same validation as the other input files: fail at argument
            # parsing time instead of with a traceback when reading the file.
            exists=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    repo_id: Annotated[
        str, typer.Option(help="Hugging Face Datasets repository ID to push to")
    ],
    revision: Annotated[
        str,
        typer.Option(
            help="Revision (branch, tag or commit) to use for the Hugging Face Datasets repository"
        ),
    ] = "main",
    is_openfoodfacts_dataset: Annotated[
        bool, typer.Option(..., help="Whether this is an Open Food Facts dataset")
    ] = False,
    openfoodfacts_flavor: Annotated[
        Flavor,
        typer.Option(
            ...,
            help="Open Food Facts flavor of the dataset (if applicable)",
        ),
    ] = Flavor.off,
    split: Annotated[str, typer.Option(..., help="Name of the split")] = "train",
    tmp_dir: Annotated[
        Path | None,
        typer.Option(
            ...,
            help="Temporary directory to use for intermediate files, default to a temporary directory "
            "generated automatically. This is useful to relaunch the command if it fails midway.",
        ),
    ] = None,
    skip: Annotated[int, typer.Option(..., help="Number of samples to skip")] = 0,
    limit: Annotated[
        int | None,
        typer.Option(
            ..., help="Limit number of samples to process, or None for no limit"
        ),
    ] = None,
    raise_on_invalid_sample: Annotated[
        bool,
        typer.Option(
            ...,
            help="Whether to raise an error on invalid samples instead of skipping them",
        ),
    ] = False,
    image_max_size: Annotated[
        int | None,
        typer.Option(
            help="Maximum size (in pixels) for the images. If None, no resizing is performed.",
        ),
    ] = None,
):
    """Upload a training dataset to a Hugging Face Datasets repository from a
    Gemini batch prediction file."""
    # NOTE: the docstring above is what Typer displays as the command help,
    # so maintainer-facing notes are kept in comments instead.
    #
    # Flow:
    #   1. read the instruction prompt and the JSON schema,
    #   2. upload both as `config.json` to the dataset repository,
    #   3. build the samples from the prediction file and export them.
    import tempfile

    import orjson
    from huggingface_hub import HfApi

    from labelr.export.llm import export_to_hf_llm_image_extraction
    from labelr.google_genai import generate_sample_iter

    # Read as UTF-8 explicitly: the default encoding of read_text() depends on
    # the platform locale, which can corrupt non-ASCII prompts.
    instructions = instructions_path.read_text(encoding="utf-8")
    typer.echo(f"Instructions: {instructions}")
    # orjson parses bytes directly, so no text decoding step is involved.
    json_schema = orjson.loads(json_schema_path.read_bytes())

    api = HfApi()
    config = {
        "instructions": instructions,
        "json_schema": json_schema,
    }
    with tempfile.TemporaryDirectory() as config_tmp_dir_str:
        config_path = Path(config_tmp_dir_str) / "config.json"
        # orjson.dumps() returns UTF-8 bytes; writing them as-is avoids a
        # decode followed by a locale-dependent re-encode.
        config_path.write_bytes(orjson.dumps(config, option=orjson.OPT_INDENT_2))
        # NOTE(review): `revision` is not forwarded here, so config.json
        # presumably lands on the repository default branch even when the
        # dataset is pushed to another revision - confirm this is intended.
        api.upload_file(
            path_or_fileobj=config_path,
            path_in_repo="config.json",
            repo_id=repo_id,
            repo_type="dataset",
        )

    sample_iter = generate_sample_iter(
        prediction_path=prediction_path,
        json_schema=json_schema,
        is_openfoodfacts_dataset=is_openfoodfacts_dataset,
        openfoodfacts_flavor=openfoodfacts_flavor,
        skip=skip,
        limit=limit,
        raise_on_invalid_sample=raise_on_invalid_sample,
    )
    export_to_hf_llm_image_extraction(
        sample_iter=sample_iter,
        split=split,
        repo_id=repo_id,
        revision=revision,
        tmp_dir=tmp_dir,
        image_max_size=image_max_size,
    )
@@ -398,7 +398,7 @@ def create_dataset_file(
398
398
  from openfoodfacts.images import extract_barcode_from_url, extract_source_from_url
399
399
  from openfoodfacts.utils import get_image_from_url
400
400
 
401
- from labelr.sample import format_object_detection_sample_to_ls
401
+ from labelr.sample.object_detection import format_object_detection_sample_to_ls
402
402
 
403
403
  logger.info("Loading dataset: %s", input_file)
404
404
 
@@ -0,0 +1,114 @@
1
+ import functools
2
+ import logging
3
+ import pickle
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import datasets
8
+ from openfoodfacts.images import generate_image_url
9
+ from openfoodfacts.types import Flavor
10
+ from PIL import Image, ImageOps
11
+
12
+ from labelr.export.common import _pickle_sample_generator
13
+ from labelr.sample.classification import HF_DS_CLASSIFICATION_FEATURES
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def export_from_ultralytics_to_hf_classification(
    dataset_dir: Path,
    repo_id: str,
    label_names: list[str],
    merge_labels: bool = False,
    is_openfoodfacts_dataset: bool = False,
    openfoodfacts_flavor: Flavor = Flavor.off,
) -> None:
    """Export an Ultralytics classification dataset to a Hugging Face dataset.

    The Ultralytics dataset directory should contain 'train', 'val' and/or
    'test' subdirectories, each containing subdirectories for each label.
    Each available split is pushed to the Hub separately.

    Args:
        dataset_dir (Path): Path to the Ultralytics dataset directory.
        repo_id (str): Hugging Face repository ID to push the dataset to.
        label_names (list[str]): List of label names. The position of a label
            in this list is used as its `category_id`.
        merge_labels (bool): Whether to merge all labels into a single label
            named 'object'.
        is_openfoodfacts_dataset (bool): Whether the dataset is from
            Open Food Facts. If True, the `off_image_id` and `image_url` will
            be generated automatically. The barcode and `off_image_id` are
            the first two underscore-separated parts of the image filename
            stem.
        openfoodfacts_flavor (Flavor): Flavor of Open Food Facts dataset. This
            is ignored if `is_openfoodfacts_dataset` is False.

    Raises:
        ValueError: If `dataset_dir` contains none of the 'train', 'val' or
            'test' subdirectories, or if a label is not in `label_names`.
    """
    logger.info("Repo ID: %s, dataset_dir: %s", repo_id, dataset_dir)

    splits = ("train", "val", "test")
    if not any((dataset_dir / split).is_dir() for split in splits):
        raise ValueError(
            f"Dataset directory {dataset_dir} does not contain 'train', 'val' or 'test' subdirectories"
        )

    for split in splits:
        split_dir = dataset_dir / split

        if not split_dir.is_dir():
            logger.info("Skipping missing split directory: %s", split_dir)
            continue

        # Samples are first saved as pickles in a per-split temporary
        # directory, then streamed from there into the Hugging Face dataset.
        with tempfile.TemporaryDirectory() as tmp_dir_str:
            tmp_dir = Path(tmp_dir_str)
            for label_dir in (d for d in split_dir.iterdir() if d.is_dir()):
                label_name = "object" if merge_labels else label_dir.name
                if label_name not in label_names:
                    raise ValueError(
                        f"Label name {label_name} not in provided label names (label names: {label_names})"
                    )
                label_id = label_names.index(label_name)

                for image_path in label_dir.glob("*"):
                    if is_openfoodfacts_dataset:
                        image_stem_parts = image_path.stem.split("_")
                        barcode = image_stem_parts[0]
                        off_image_id = image_stem_parts[1]
                        image_id = f"{barcode}_{off_image_id}"
                        image_url = generate_image_url(
                            barcode, off_image_id, flavor=openfoodfacts_flavor
                        )
                    else:
                        image_id = image_path.stem
                        barcode = ""
                        off_image_id = ""
                        image_url = ""
                    image = Image.open(image_path)
                    image.load()

                    if image.mode != "RGB":
                        image = image.convert("RGB")

                    # Rotate image according to exif orientation using Pillow
                    ImageOps.exif_transpose(image, in_place=True)
                    sample = {
                        "image_id": image_id,
                        "image": image,
                        # Dimensions are read after the EXIF rotation so they
                        # describe the image as it is stored.
                        "width": image.width,
                        "height": image.height,
                        "meta": {
                            "barcode": barcode,
                            "off_image_id": off_image_id,
                            "image_url": image_url,
                        },
                        "category_id": label_id,
                        "category_name": label_name,
                    }
                    # The pickle name includes the label directory and the
                    # full file name: image IDs alone are not unique (e.g.
                    # `cat/0001.jpg` and `dog/0001.jpg`), and a name clash
                    # would silently overwrite an earlier sample.
                    pickle_name = f"{split}_{label_dir.name}_{image_path.name}.pkl"
                    with open(tmp_dir / pickle_name, "wb") as f:
                        pickle.dump(sample, f)

            # Must run inside the `with` block: the generator reads the
            # pickles from the temporary directory.
            hf_ds = datasets.Dataset.from_generator(
                functools.partial(_pickle_sample_generator, tmp_dir),
                features=HF_DS_CLASSIFICATION_FEATURES,
            )
            hf_ds.push_to_hub(repo_id, split=split)
@@ -0,0 +1,42 @@
1
+ import pickle
2
+ from pathlib import Path
3
+
4
+ from openfoodfacts.types import Flavor
5
+
6
+ from labelr.types import TaskType
7
+
8
+
9
def _pickle_sample_generator(dir: Path):
    """Generator that yields samples from pickles in a directory.

    Only the `*.pkl` files located directly in `dir` are read. Files are
    visited in sorted path order, because directory listing order is
    otherwise arbitrary and would make the resulting dataset order differ
    from one run (or machine) to another.

    Args:
        dir (Path): Directory containing the pickled samples.

    Yields:
        The object stored in each pickle file, one per file.
    """
    # SECURITY: unpickling can execute arbitrary code. Only use this on
    # directories whose pickle files were written by this package.
    for pkl in sorted(dir.glob("*.pkl")):
        with open(pkl, "rb") as f:
            yield pickle.load(f)
14
+
15
+
16
def export_from_ultralytics_to_hf(
    task_type: TaskType,
    dataset_dir: Path,
    repo_id: str,
    label_names: list[str],
    merge_labels: bool = False,
    is_openfoodfacts_dataset: bool = False,
    openfoodfacts_flavor: Flavor = Flavor.off,
) -> None:
    """Export an Ultralytics dataset to a Hugging Face dataset.

    Dispatches on `task_type`; only the classification task is currently
    implemented. All other arguments are forwarded unchanged to
    `export_from_ultralytics_to_hf_classification`.

    Raises:
        NotImplementedError: If `task_type` is not `TaskType.classification`.
    """
    # Imported inside the function rather than at module level.
    # NOTE(review): presumably to avoid a circular import, since
    # `labelr.export.classification` imports from this module - confirm.
    from labelr.export.classification import (
        export_from_ultralytics_to_hf_classification,
    )

    if task_type != TaskType.classification:
        raise NotImplementedError(
            "Only classification task is currently supported for Ultralytics to HF export"
        )

    # Past the guard above, the task type can only be classification.
    export_from_ultralytics_to_hf_classification(
        dataset_dir=dataset_dir,
        repo_id=repo_id,
        label_names=label_names,
        merge_labels=merge_labels,
        is_openfoodfacts_dataset=is_openfoodfacts_dataset,
        openfoodfacts_flavor=openfoodfacts_flavor,
    )
labelr/export/llm.py ADDED
@@ -0,0 +1,91 @@
1
+ import functools
2
+ import logging
3
+ import pickle
4
+ import tempfile
5
+ import typing
6
+ from collections.abc import Iterator
7
+ from pathlib import Path
8
+
9
+ import datasets
10
+ import tqdm
11
+ from PIL import Image, ImageOps
12
+
13
+ from labelr.export.common import _pickle_sample_generator
14
+ from labelr.sample.llm import (
15
+ HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
16
+ LLMImageExtractionSample,
17
+ )
18
+ from labelr.utils import PathWithContext
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def export_to_hf_llm_image_extraction(
    sample_iter: Iterator[LLMImageExtractionSample],
    split: str,
    repo_id: str,
    revision: str = "main",
    tmp_dir: Path | None = None,
    image_max_size: int | None = None,
) -> None:
    """Push LLM image extraction samples to a Hugging Face dataset.

    Every sample is first saved as a pickle file in a working directory; the
    dataset is then built from all the pickle files found in that directory
    and pushed to the Hub.

    Args:
        sample_iter (Iterator[LLMImageExtractionSample]): Samples to export.
        split (str): Name of the dataset split (e.g., 'train', 'val').
        repo_id (str): Hugging Face repository ID to push the dataset to.
        revision (str): Revision (branch, tag or commit) of the Hugging Face
            Datasets repository to push to.
        tmp_dir (Path | None): Working directory for the intermediate pickle
            files. It is created if needed. If None, a temporary directory
            is created automatically.
        image_max_size (int | None): Maximum size (in pixels) for the images.
            Images with a larger width or height are downscaled with
            `Image.thumbnail`. If None, no resizing is performed.
    """
    logger.info(
        "Repo ID: %s, revision: %s, split: %s, tmp_dir: %s, image_max_size: %s",
        repo_id,
        revision,
        split,
        tmp_dir,
        image_max_size,
    )

    # Both branches produce a context manager, so the code below does not
    # need to know which kind of working directory is in use.
    workspace: PathWithContext | tempfile.TemporaryDirectory
    if not tmp_dir:
        workspace = tempfile.TemporaryDirectory()
    else:
        tmp_dir.mkdir(parents=True, exist_ok=True)
        workspace = PathWithContext(tmp_dir)

    with workspace as workspace_location:
        pickle_dir = Path(workspace_location)
        for sample in tqdm.tqdm(sample_iter, desc="samples"):
            # Rotate image according to exif orientation using Pillow
            image = typing.cast(Image.Image, ImageOps.exif_transpose(sample.image))

            too_large = image_max_size is not None and (
                image.height > image_max_size or image.width > image_max_size
            )
            if too_large:
                # thumbnail() resizes the image in place.
                image.thumbnail(
                    (image_max_size, image_max_size),
                    Image.Resampling.LANCZOS,
                )

            image_id = sample.image_id
            # Metadata fields set to None are left out of the record.
            meta = {
                key: value
                for key, value in sample.meta.model_dump().items()
                if value is not None
            }
            record = {
                "image_id": image_id,
                "image": image,
                "meta": meta,
                "output": sample.output,
            }
            # Save output as pickle
            with open(pickle_dir / f"{split}_{image_id}.pkl", "wb") as pickle_file:
                pickle.dump(record, pickle_file)

        # Build and push the dataset while the working directory still
        # exists: the generator reads the pickle files from it.
        hf_ds = datasets.Dataset.from_generator(
            functools.partial(_pickle_sample_generator, pickle_dir),
            features=HF_DS_LLM_IMAGE_EXTRACTION_FEATURES,
        )
        hf_ds.push_to_hub(repo_id, split=split, revision=revision)