datago 2025.8.1__tar.gz → 2025.10.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {datago-2025.8.1 → datago-2025.10.2}/Cargo.lock +1 -1
  2. {datago-2025.8.1 → datago-2025.10.2}/Cargo.toml +1 -1
  3. {datago-2025.8.1 → datago-2025.10.2}/PKG-INFO +61 -11
  4. {datago-2025.8.1 → datago-2025.10.2}/README.md +60 -10
  5. datago-2025.10.2/assets/epyc_vast.png +0 -0
  6. datago-2025.10.2/assets/epyc_wds.png +0 -0
  7. datago-2025.10.2/assets/zen3_ssd.png +0 -0
  8. {datago-2025.8.1 → datago-2025.10.2}/python/benchmark_db.py +15 -12
  9. datago-2025.10.2/python/benchmark_defaults.py +8 -0
  10. {datago-2025.8.1 → datago-2025.10.2}/python/benchmark_filesystem.py +39 -33
  11. {datago-2025.8.1 → datago-2025.10.2}/python/benchmark_webdataset.py +34 -20
  12. {datago-2025.8.1 → datago-2025.10.2}/python/dataset.py +3 -0
  13. {datago-2025.8.1 → datago-2025.10.2}/python/raw_types.py +26 -4
  14. {datago-2025.8.1 → datago-2025.10.2}/python/test_datago_db.py +59 -13
  15. {datago-2025.8.1 → datago-2025.10.2}/src/client.rs +22 -7
  16. {datago-2025.8.1 → datago-2025.10.2}/src/generator_files.rs +7 -4
  17. {datago-2025.8.1 → datago-2025.10.2}/src/generator_http.rs +7 -4
  18. {datago-2025.8.1 → datago-2025.10.2}/src/generator_wds.rs +25 -9
  19. {datago-2025.8.1 → datago-2025.10.2}/src/image_processing.rs +111 -27
  20. {datago-2025.8.1 → datago-2025.10.2}/src/main.rs +2 -0
  21. {datago-2025.8.1 → datago-2025.10.2}/src/worker_files.rs +86 -45
  22. {datago-2025.8.1 → datago-2025.10.2}/src/worker_http.rs +33 -30
  23. {datago-2025.8.1 → datago-2025.10.2}/src/worker_wds.rs +12 -13
  24. {datago-2025.8.1 → datago-2025.10.2}/.github/workflows/ci-cd.yml +0 -0
  25. {datago-2025.8.1 → datago-2025.10.2}/.github/workflows/rust.yml +0 -0
  26. {datago-2025.8.1 → datago-2025.10.2}/.gitignore +0 -0
  27. {datago-2025.8.1 → datago-2025.10.2}/.pre-commit-config.yaml +0 -0
  28. {datago-2025.8.1 → datago-2025.10.2}/LICENSE +0 -0
  29. {datago-2025.8.1 → datago-2025.10.2}/assets/447175851-2277afcb-8abf-4d17-b2db-dae27c6056d0.png +0 -0
  30. {datago-2025.8.1 → datago-2025.10.2}/pyproject.toml +0 -0
  31. {datago-2025.8.1 → datago-2025.10.2}/python/test_datago_client.py +0 -0
  32. {datago-2025.8.1 → datago-2025.10.2}/python/test_datago_edge_cases.py +0 -0
  33. {datago-2025.8.1 → datago-2025.10.2}/python/test_datago_filesystem.py +0 -0
  34. {datago-2025.8.1 → datago-2025.10.2}/requirements-tests.txt +0 -0
  35. {datago-2025.8.1 → datago-2025.10.2}/requirements.txt +0 -0
  36. {datago-2025.8.1 → datago-2025.10.2}/src/lib.rs +0 -0
  37. {datago-2025.8.1 → datago-2025.10.2}/src/structs.rs +0 -0
@@ -613,7 +613,7 @@ dependencies = [
613
613
 
614
614
  [[package]]
615
615
  name = "datago"
616
- version = "2025.8.1"
616
+ version = "2025.10.2"
617
617
  dependencies = [
618
618
  "async-compression",
619
619
  "async-tar",
@@ -1,7 +1,7 @@
1
1
  [package]
2
2
  name = "datago"
3
3
  edition = "2021"
4
- version = "2025.8.1"
4
+ version = "2025.10.2"
5
5
  readme = "README.md"
6
6
 
7
7
  [lib]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datago
3
- Version: 2025.8.1
3
+ Version: 2025.10.2
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -97,7 +97,7 @@ config = {
97
97
  "source_config": {
98
98
  "root_path": "myPath",
99
99
  "random_sampling": False, # True if used directly for training
100
- "rank": 0,
100
+ "rank": 0, # Optional, distributed workloads are possible
101
101
  "world_size": 1,
102
102
  },
103
103
  "limit": 200,
@@ -137,15 +137,6 @@ client_config = {
137
137
  "rank": 0,
138
138
  "world_size": 1,
139
139
  },
140
- # Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
141
- "image_config": {
142
- "crop_and_resize": True, # False to turn it off, or just omit this part of the config
143
- "default_image_size": 1024,
144
- "downsampling_ratio": 32,
145
- "min_aspect_ratio": 0.5,
146
- "max_aspect_ratio": 2.0,
147
- "pre_encode_images": False,
148
- },
149
140
  "prefetch_buffer_size": 128,
150
141
  "samples_buffer_size": 64,
151
142
  "limit": 1_000_000, # Dummy example, max number of samples you would like to serve
@@ -159,6 +150,38 @@ for _ in range(10):
159
150
 
160
151
  </details>
161
152
 
153
+ ## Process images on the fly
154
+
155
+ Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration. The following example shows how to align different image payloads.
156
+
157
+ Processing can be very CPU heavy, but it will be distributed over all CPU cores without requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
158
+
159
+ There are three main processing topics that you can choose from:
160
+
161
+ - crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
162
+ - resize the images (setting here will be related to the square aspect ratio bucket, other buckets will differ of course)
163
+ - pre-encode the images to a specific format (jpg, png, ...)
164
+
165
+ ```python
166
+ config = {
167
+ "source_type": "file",
168
+ "source_config": {
169
+ "root_path": "myPath",
170
+ "random_sampling": False, # True if used directly for training
171
+ },
172
+ # Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
173
+ "image_config": {
174
+ "crop_and_resize": True, # False to turn it off, or just omit this part of the config
175
+ "default_image_size": 1024,
176
+ "downsampling_ratio": 32,
177
+ "min_aspect_ratio": 0.5,
178
+ "max_aspect_ratio": 2.0,
179
+ "pre_encode_images": False,
180
+ },
181
+ "limit": 200,
182
+ "samples_buffer_size": 32,
183
+ }
184
+ ```
162
185
 
163
186
  ## Match the raw exported buffers with typical python types
164
187
 
@@ -171,6 +194,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
171
194
 
172
195
  When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also an `initialize_logging` function in the `datago` module, which if called before using a client, allows you to customize the log level. This only works if RUST_LOG is not set.
173
196
 
197
+ ## Env variables
198
+
199
+ There are a couple of env variables which will change the behavior of the library, for settings which felt too low level to be exposed in the config.
200
+
201
+ - `DATAGO_MAX_TASKS`: refers to the number of threads which will be used to load the samples. Defaults to a multiple of the CPU cores.
202
+ - `RUST_LOG`: see above, will change the level of logging for the whole library, could be useful for debugging or to report an issue here.
203
+ - `DATAGO_MAX_RETRIES`: number of retries for a failed sample load, defaults to 3.
204
+
174
205
  </details><details> <summary><strong>Build it</strong></summary>
175
206
 
176
207
  ## Preamble
@@ -233,6 +264,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
233
264
 
234
265
  </details>
235
266
 
267
+ <details> <summary><strong>Benchmarks</strong></summary>
268
+ As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
269
+
270
+ In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
271
+
272
+ ### AMD Zen3 laptop - IN1k - disk
273
+ ![AMD Zen3 laptop & M2 SSD](assets/zen3_ssd.png)
274
+
275
+ ### AMD EPYC 9454 - IN1k - disk
276
+ ![AMD EPYC 9454](assets/epyc_vast.png)
277
+
278
+ This benchmark uses the PD12M dataset, a 12M-image dataset with a lot of high resolution images. It is accessed through the webdataset front end, and datago is compared with the popular python webdataset library. Note that datago will start streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
279
+
280
+ ### AMD EPYC 9454 - pd12m - webdataset
281
+ ![AMD EPYC 9454](assets/epyc_wds.png)
282
+
283
+ </details>
284
+
285
+
236
286
  ## License
237
287
 
238
288
  MIT License
@@ -80,7 +80,7 @@ config = {
80
80
  "source_config": {
81
81
  "root_path": "myPath",
82
82
  "random_sampling": False, # True if used directly for training
83
- "rank": 0,
83
+ "rank": 0, # Optional, distributed workloads are possible
84
84
  "world_size": 1,
85
85
  },
86
86
  "limit": 200,
@@ -120,15 +120,6 @@ client_config = {
120
120
  "rank": 0,
121
121
  "world_size": 1,
122
122
  },
123
- # Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
124
- "image_config": {
125
- "crop_and_resize": True, # False to turn it off, or just omit this part of the config
126
- "default_image_size": 1024,
127
- "downsampling_ratio": 32,
128
- "min_aspect_ratio": 0.5,
129
- "max_aspect_ratio": 2.0,
130
- "pre_encode_images": False,
131
- },
132
123
  "prefetch_buffer_size": 128,
133
124
  "samples_buffer_size": 64,
134
125
  "limit": 1_000_000, # Dummy example, max number of samples you would like to serve
@@ -142,6 +133,38 @@ for _ in range(10):
142
133
 
143
134
  </details>
144
135
 
136
+ ## Process images on the fly
137
+
138
+ Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration. The following example shows how to align different image payloads.
139
+
140
+ Processing can be very CPU heavy, but it will be distributed over all CPU cores without requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
141
+
142
+ There are three main processing topics that you can choose from:
143
+
144
+ - crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
145
+ - resize the images (setting here will be related to the square aspect ratio bucket, other buckets will differ of course)
146
+ - pre-encode the images to a specific format (jpg, png, ...)
147
+
148
+ ```python
149
+ config = {
150
+ "source_type": "file",
151
+ "source_config": {
152
+ "root_path": "myPath",
153
+ "random_sampling": False, # True if used directly for training
154
+ },
155
+ # Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
156
+ "image_config": {
157
+ "crop_and_resize": True, # False to turn it off, or just omit this part of the config
158
+ "default_image_size": 1024,
159
+ "downsampling_ratio": 32,
160
+ "min_aspect_ratio": 0.5,
161
+ "max_aspect_ratio": 2.0,
162
+ "pre_encode_images": False,
163
+ },
164
+ "limit": 200,
165
+ "samples_buffer_size": 32,
166
+ }
167
+ ```
145
168
 
146
169
  ## Match the raw exported buffers with typical python types
147
170
 
@@ -154,6 +177,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
154
177
 
155
178
  When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also an `initialize_logging` function in the `datago` module, which if called before using a client, allows you to customize the log level. This only works if RUST_LOG is not set.
156
179
 
180
+ ## Env variables
181
+
182
+ There are a couple of env variables which will change the behavior of the library, for settings which felt too low level to be exposed in the config.
183
+
184
+ - `DATAGO_MAX_TASKS`: refers to the number of threads which will be used to load the samples. Defaults to a multiple of the CPU cores.
185
+ - `RUST_LOG`: see above, will change the level of logging for the whole library, could be useful for debugging or to report an issue here.
186
+ - `DATAGO_MAX_RETRIES`: number of retries for a failed sample load, defaults to 3.
187
+
157
188
  </details><details> <summary><strong>Build it</strong></summary>
158
189
 
159
190
  ## Preamble
@@ -216,6 +247,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
216
247
 
217
248
  </details>
218
249
 
250
+ <details> <summary><strong>Benchmarks</strong></summary>
251
+ As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
252
+
253
+ In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
254
+
255
+ ### AMD Zen3 laptop - IN1k - disk
256
+ ![AMD Zen3 laptop & M2 SSD](assets/zen3_ssd.png)
257
+
258
+ ### AMD EPYC 9454 - IN1k - disk
259
+ ![AMD EPYC 9454](assets/epyc_vast.png)
260
+
261
+ This benchmark uses the PD12M dataset, a 12M-image dataset with a lot of high resolution images. It is accessed through the webdataset front end, and datago is compared with the popular python webdataset library. Note that datago will start streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
262
+
263
+ ### AMD EPYC 9454 - pd12m - webdataset
264
+ ![AMD EPYC 9454](assets/epyc_wds.png)
265
+
266
+ </details>
267
+
268
+
219
269
  ## License
220
270
 
221
271
  MIT License
Binary file
Binary file
Binary file
@@ -1,11 +1,13 @@
1
- from datago import DatagoClient # type: ignore
1
+ import json
2
2
  import time
3
- from tqdm import tqdm
3
+
4
4
  import numpy as np
5
- from raw_types import raw_array_to_pil_image, raw_array_to_numpy
6
5
  import typer
7
- import json
6
+ from benchmark_defaults import IMAGE_CONFIG
7
+ from datago import DatagoClient # type: ignore
8
8
  from PIL import Image
9
+ from raw_types import raw_array_to_numpy, raw_array_to_pil_image
10
+ from tqdm import tqdm
9
11
 
10
12
 
11
13
  def benchmark(
@@ -31,19 +33,20 @@ def benchmark(
31
33
  "rank": 0,
32
34
  "world_size": 1,
33
35
  },
34
- "image_config": {
35
- "crop_and_resize": crop_and_resize,
36
- "default_image_size": 1024,
37
- "downsampling_ratio": 32,
38
- "min_aspect_ratio": 0.5,
39
- "max_aspect_ratio": 2.0,
40
- "pre_encode_images": encode_images,
41
- },
42
36
  "prefetch_buffer_size": 128,
43
37
  "samples_buffer_size": 64,
44
38
  "limit": limit,
45
39
  }
46
40
 
41
+ if crop_and_resize or encode_images:
42
+ client_config["image_config"] = IMAGE_CONFIG
43
+
44
+ if encode_images:
45
+ client_config["image_config"]["crop_and_resize"] = ( # type: ignore
46
+ crop_and_resize # You may want to encode images without resizing them
47
+ )
48
+ client_config["image_config"]["pre_encode_images"] = True # type: ignore
49
+
47
50
  client = DatagoClient(json.dumps(client_config))
48
51
  client.start() # Optional, but good practice to start the client to reduce latency to first sample (while you're instantiating models for instance)
49
52
  start = time.time()
@@ -0,0 +1,8 @@
1
+ IMAGE_CONFIG = {
2
+ "crop_and_resize": True,
3
+ "default_image_size": 1024,
4
+ "downsampling_ratio": 32,
5
+ "min_aspect_ratio": 0.5,
6
+ "max_aspect_ratio": 2.0,
7
+ "pre_encode_images": False,
8
+ }
@@ -1,24 +1,37 @@
1
- import time
2
- from tqdm import tqdm
1
+ import json
3
2
  import os
3
+ import time
4
+
4
5
  import typer
6
+ from benchmark_defaults import IMAGE_CONFIG
5
7
  from dataset import DatagoIterDataset
8
+ from tqdm import tqdm
6
9
 
7
10
 
8
11
  def benchmark(
9
- root_path: str = typer.Option(
10
- os.getenv("DATAGO_TEST_FILESYSTEM", ""), help="The source to test out"
11
- ),
12
+ root_path: str = typer.Option(os.getenv("DATAGO_TEST_FILESYSTEM", ""), help="The source to test out"),
12
13
  limit: int = typer.Option(2000, help="The number of samples to test on"),
13
- crop_and_resize: bool = typer.Option(
14
- False, help="Crop and resize the images on the fly"
15
- ),
14
+ crop_and_resize: bool = typer.Option(False, help="Crop and resize the images on the fly"),
16
15
  compare_torch: bool = typer.Option(True, help="Compare against torch dataloader"),
16
+ num_workers: int = typer.Option(os.cpu_count(), help="Number of workers to use"),
17
+ sweep: bool = typer.Option(False, help="Sweep over the number of workers"),
17
18
  ):
18
- print(f"Running benchmark for {root_path} - {limit} samples")
19
- print(
20
- "Please run the benchmark twice if you want to compare against torch dataloader, so that file caching affects both paths"
21
- )
19
+ if sweep:
20
+ results = {}
21
+ for num_workers in range(2, (os.cpu_count() or 2), 16):
22
+ results[num_workers] = benchmark(root_path, limit, crop_and_resize, compare_torch, num_workers, False)
23
+
24
+ # Save results to a json file
25
+
26
+ with open("benchmark_results_filesystem.json", "w") as f:
27
+ json.dump(results, f, indent=2)
28
+
29
+ return results
30
+
31
+ print(f"Running benchmark for {root_path} - {limit} samples - {num_workers} workers")
32
+
33
+ # This setting is not exposed in the config, but an env variable can be used instead
34
+ os.environ["DATAGO_MAX_TASKS"] = str(num_workers)
22
35
 
23
36
  client_config = {
24
37
  "source_type": "file",
@@ -27,19 +40,14 @@ def benchmark(
27
40
  "rank": 0,
28
41
  "world_size": 1,
29
42
  },
30
- "image_config": {
31
- "crop_and_resize": crop_and_resize,
32
- "default_image_size": 1024,
33
- "downsampling_ratio": 32,
34
- "min_aspect_ratio": 0.5,
35
- "max_aspect_ratio": 2.0,
36
- "pre_encode_images": False,
37
- },
38
43
  "prefetch_buffer_size": 256,
39
44
  "samples_buffer_size": 256,
40
45
  "limit": limit,
41
46
  }
42
47
 
48
+ if crop_and_resize:
49
+ client_config["image_config"] = IMAGE_CONFIG
50
+
43
51
  # Make sure in the following that we compare apples to apples, meaning in that case
44
52
  # that we materialize the payloads in the python scope in the expected format
45
53
  # (PIL.Image for images and masks for instance, numpy arrays for latents)
@@ -48,14 +56,15 @@ def benchmark(
48
56
 
49
57
  img = None
50
58
  count = 0
51
- for sample in tqdm(datago_dataset, dynamic_ncols=True):
59
+ for sample in tqdm(datago_dataset, desc="Datago", dynamic_ncols=True):
52
60
  assert sample["id"] != ""
53
61
  img = sample["image"]
54
62
  count += 1
55
63
 
56
64
  assert count == limit, f"Expected {limit} samples, got {count}"
57
65
  fps = limit / (time.time() - start)
58
- print(f"Datago FPS {fps:.2f}")
66
+ results = {"datago": {"fps": fps, "count": count}}
67
+ print(f"Datago - FPS {fps:.2f} - workers {num_workers}")
59
68
  del datago_dataset
60
69
 
61
70
  # Save the last image as a test
@@ -64,17 +73,14 @@ def benchmark(
64
73
 
65
74
  # Let's compare against a classic pytorch dataloader
66
75
  if compare_torch:
67
- from torchvision import datasets, transforms # type: ignore
68
76
  from torch.utils.data import DataLoader
77
+ from torchvision import datasets, transforms # type: ignore
69
78
 
70
- print("Benchmarking torch dataloader")
71
79
  # Define the transformations to apply to each image
72
80
  transform = (
73
81
  transforms.Compose(
74
82
  [
75
- transforms.Resize(
76
- (1024, 1024), interpolation=transforms.InterpolationMode.LANCZOS
77
- ),
83
+ transforms.Resize((1024, 1024), interpolation=transforms.InterpolationMode.LANCZOS),
78
84
  ]
79
85
  )
80
86
  if crop_and_resize
@@ -82,13 +88,10 @@ def benchmark(
82
88
  )
83
89
 
84
90
  # Create the ImageFolder dataset
85
- dataset = datasets.ImageFolder(
86
- root=root_path, transform=transform, allow_empty=True
87
- )
91
+ dataset = datasets.ImageFolder(root=root_path, transform=transform, allow_empty=True)
88
92
 
89
93
  # Create a DataLoader to allow for multiple workers
90
94
  # Use available CPU count for num_workers
91
- num_workers = os.cpu_count() or 8 # Default to 8 if cpu_count returns None
92
95
  dataloader = DataLoader(
93
96
  dataset,
94
97
  batch_size=1,
@@ -100,12 +103,15 @@ def benchmark(
100
103
  # Iterate over the DataLoader
101
104
  start = time.time()
102
105
  n_images = 0
103
- for batch in tqdm(dataloader, dynamic_ncols=True):
106
+ for batch in tqdm(dataloader, desc="Torch", dynamic_ncols=True):
104
107
  n_images += len(batch)
105
108
  if n_images > limit:
106
109
  break
107
110
  fps = n_images / (time.time() - start)
108
- print(f"Torch FPS {fps:.2f}")
111
+ results["torch"] = {"fps": fps, "count": n_images}
112
+ print(f"Torch - FPS {fps:.2f} - workers {num_workers}")
113
+
114
+ return results
109
115
 
110
116
 
111
117
  if __name__ == "__main__":
@@ -1,8 +1,11 @@
1
+ import json
2
+ import os
1
3
  import time
2
- from tqdm import tqdm
4
+
3
5
  import typer
6
+ from benchmark_defaults import IMAGE_CONFIG
4
7
  from dataset import DatagoIterDataset
5
- import os
8
+ from tqdm import tqdm
6
9
 
7
10
 
8
11
  def benchmark(
@@ -11,11 +14,23 @@ def benchmark(
11
14
  True, help="Crop and resize the images on the fly"
12
15
  ),
13
16
  compare_wds: bool = typer.Option(True, help="Compare against torch dataloader"),
14
- n_processes_wds: int = typer.Option(
17
+ num_workers: int = typer.Option(
15
18
  16,
16
- help="Number of processes to use for the torch dataloader - used only if compare_wds is True",
19
+ help="Number of processes to use",
17
20
  ),
21
+ sweep: bool = typer.Option(False, help="Sweep over the number of processes"),
18
22
  ):
23
+ if sweep:
24
+ results = {}
25
+ for num_workers in range(2, max(64, (os.cpu_count() or 1)), 8):
26
+ results[num_workers] = benchmark(limit, crop_and_resize, compare_wds, num_workers, False)
27
+
28
+ # Save results to a json file
29
+ with open("benchmark_results_wds.json", "w") as f:
30
+ json.dump(results, f, indent=2)
31
+
32
+ return results
33
+
19
34
  # URL of the test bucket
20
35
  # bucket = "https://storage.googleapis.com/webdataset/fake-imagenet"
21
36
  # dataset = "/imagenet-train-{000000..001281}.tar"
@@ -32,22 +47,18 @@ def benchmark(
32
47
  "source_config": {
33
48
  "url": url,
34
49
  "shuffle": True,
35
- "max_concurrency": 8, # Number of concurrent TarballSample downloads and dispatch
50
+ "max_concurrency": num_workers, # Number of concurrent TarballSample downloads and dispatch
36
51
  "auth_token": os.environ.get("HF_TOKEN", default=""),
37
52
  },
38
- "image_config": {
39
- "crop_and_resize": crop_and_resize,
40
- "default_image_size": 1024,
41
- "downsampling_ratio": 32,
42
- "min_aspect_ratio": 0.5,
43
- "max_aspect_ratio": 2.0,
44
- "pre_encode_images": False,
45
- },
46
53
  "prefetch_buffer_size": 256,
47
54
  "samples_buffer_size": 256,
48
55
  "limit": limit,
49
56
  }
50
57
 
58
+ if crop_and_resize:
59
+ # Optionally add a custom image config to crop and resize the images on the fly
60
+ client_config["image_config"] = IMAGE_CONFIG
61
+
51
62
  # # Make sure in the following that we compare apples to apples, meaning in that case
52
63
  # # that we materialize the payloads in the python scope in the expected format
53
64
  # # (PIL.Image for images and masks for instance, numpy arrays for latents)
@@ -55,14 +66,15 @@ def benchmark(
55
66
  start = time.time() # Note that the datago dataset will start preparing samples (up to the requested buffer size) at construction time
56
67
 
57
68
  img, count = None, 0
58
- for sample in tqdm(datago_dataset, dynamic_ncols=True):
69
+ for sample in tqdm(datago_dataset, desc="Datago", dynamic_ncols=True):
59
70
  assert sample["id"] != ""
60
71
  img = sample["image"]
61
72
  count += 1
62
73
 
63
74
  assert count == limit, f"Expected {limit} samples, got {count}"
64
75
  fps = limit / (time.time() - start)
65
- print(f"-- Datago WDS FPS {fps:.2f}")
76
+ print(f"-- Datago WDS FPS {fps:.2f} - workers {num_workers}")
77
+ results = {"datago": {"fps": fps, "count": count}}
66
78
  del datago_dataset
67
79
 
68
80
  # Save the last image as a test
@@ -71,9 +83,9 @@ def benchmark(
71
83
 
72
84
  # Let's compare against a classic webdataset dataloader
73
85
  if compare_wds:
74
- from torchvision import transforms
75
- from torch.utils.data import DataLoader
76
86
  import webdataset as wds
87
+ from torch.utils.data import DataLoader
88
+ from torchvision import transforms
77
89
 
78
90
  print("\nBenchmarking webdataset library dataloader")
79
91
  # Define the transformations to apply to each image
@@ -108,19 +120,21 @@ def benchmark(
108
120
  dataloader = DataLoader(
109
121
  dataset,
110
122
  batch_size=1,
111
- num_workers=n_processes_wds,
123
+ num_workers=num_workers,
112
124
  prefetch_factor=2,
113
125
  collate_fn=lambda x: x,
114
126
  )
115
127
 
116
128
  # Iterate over the DataLoader
117
129
  start = time.time()
118
- for n_images, _ in enumerate(tqdm(dataloader, dynamic_ncols=True)):
130
+ for n_images, _ in enumerate(tqdm(dataloader, desc="WDS", dynamic_ncols=True)):
119
131
  if n_images > limit:
120
132
  break
121
133
  fps = n_images / (time.time() - start)
122
- print(f"-- Webdataset lib FPS ({n_processes_wds} processes) {fps:.2f}")
134
+ print(f"-- Webdataset lib FPS ({num_workers} processes) {fps:.2f}")
123
135
 
136
+ results["webdataset"] = {"fps": fps, "count": n_images}
137
+ return results
124
138
 
125
139
  if __name__ == "__main__":
126
140
  typer.run(benchmark)
@@ -85,6 +85,9 @@ if __name__ == "__main__":
85
85
  "min_aspect_ratio": 0.5,
86
86
  "max_aspect_ratio": 2.0,
87
87
  "pre_encode_images": False,
88
+ # Optional: Use JPEG encoding instead of PNG (defaults to PNG if not specified)
89
+ # "encode_format": "jpeg", # or "png"
90
+ # "jpeg_quality": 92, # 0-100, only used when encode_format is "jpeg"
88
91
  },
89
92
  "prefetch_buffer_size": 64,
90
93
  "samples_buffer_size": 128,
@@ -1,9 +1,9 @@
1
1
  from PIL import Image
2
- from typing import Optional
2
+ from typing import Optional, Union
3
3
  import numpy as np
4
4
 
5
5
 
6
- def uint8_array_to_numpy(raw_array):
6
+ def uint8_array_to_numpy(raw_array: 'ImagePayload') -> Optional[np.ndarray]:
7
7
  if len(raw_array.data) == 0:
8
8
  return None
9
9
 
@@ -29,7 +29,7 @@ def uint8_array_to_numpy(raw_array):
29
29
  return np.frombuffer(raw_array.data, dtype=np.uint8).reshape(shape)
30
30
 
31
31
 
32
- def raw_array_to_numpy(raw_array) -> Optional[np.ndarray]:
32
+ def raw_array_to_numpy(raw_array: 'ImagePayload') -> Optional[np.ndarray]:
33
33
  if len(raw_array.data) == 0:
34
34
  return None
35
35
 
@@ -42,7 +42,7 @@ def raw_array_to_numpy(raw_array) -> Optional[np.ndarray]:
42
42
  return None
43
43
 
44
44
 
45
- def raw_array_to_pil_image(raw_array) -> Optional[Image.Image]:
45
+ def raw_array_to_pil_image(raw_array: 'ImagePayload') -> Union[Optional[Image.Image], 'ImagePayload']:
46
46
  if len(raw_array.data) == 0:
47
47
  return None
48
48
 
@@ -63,3 +63,25 @@ def raw_array_to_pil_image(raw_array) -> Optional[Image.Image]:
63
63
 
64
64
  assert c == 3, f"Expected 3 channels, got {c}"
65
65
  return Image.fromarray(np_array)
66
+
67
+
68
+ def decode_image_payload(payload: 'ImagePayload') -> Image.Image:
69
+ """
70
+ Decode an ImagePayload (encoded image) into a PIL Image.
71
+ This is the proper way to decode encoded images for API users.
72
+ """
73
+ import io
74
+ return Image.open(io.BytesIO(payload.data))
75
+
76
+
77
+ def get_image_mode(image_or_payload) -> str:
78
+ """
79
+ Helper function to get the mode of an image, whether it's a PIL Image or ImagePayload.
80
+ For ImagePayload objects (encoded images), we need to decode them first.
81
+ """
82
+ if hasattr(image_or_payload, 'mode'):
83
+ # It's a PIL Image
84
+ return image_or_payload.mode
85
+ else:
86
+ # It's an ImagePayload (encoded image), decode it first
87
+ return decode_image_payload(image_or_payload).mode