datago 2025.8.1__tar.gz → 2025.10.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datago-2025.8.1 → datago-2025.10.2}/Cargo.lock +1 -1
- {datago-2025.8.1 → datago-2025.10.2}/Cargo.toml +1 -1
- {datago-2025.8.1 → datago-2025.10.2}/PKG-INFO +61 -11
- {datago-2025.8.1 → datago-2025.10.2}/README.md +60 -10
- datago-2025.10.2/assets/epyc_vast.png +0 -0
- datago-2025.10.2/assets/epyc_wds.png +0 -0
- datago-2025.10.2/assets/zen3_ssd.png +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/python/benchmark_db.py +15 -12
- datago-2025.10.2/python/benchmark_defaults.py +8 -0
- {datago-2025.8.1 → datago-2025.10.2}/python/benchmark_filesystem.py +39 -33
- {datago-2025.8.1 → datago-2025.10.2}/python/benchmark_webdataset.py +34 -20
- {datago-2025.8.1 → datago-2025.10.2}/python/dataset.py +3 -0
- {datago-2025.8.1 → datago-2025.10.2}/python/raw_types.py +26 -4
- {datago-2025.8.1 → datago-2025.10.2}/python/test_datago_db.py +59 -13
- {datago-2025.8.1 → datago-2025.10.2}/src/client.rs +22 -7
- {datago-2025.8.1 → datago-2025.10.2}/src/generator_files.rs +7 -4
- {datago-2025.8.1 → datago-2025.10.2}/src/generator_http.rs +7 -4
- {datago-2025.8.1 → datago-2025.10.2}/src/generator_wds.rs +25 -9
- {datago-2025.8.1 → datago-2025.10.2}/src/image_processing.rs +111 -27
- {datago-2025.8.1 → datago-2025.10.2}/src/main.rs +2 -0
- {datago-2025.8.1 → datago-2025.10.2}/src/worker_files.rs +86 -45
- {datago-2025.8.1 → datago-2025.10.2}/src/worker_http.rs +33 -30
- {datago-2025.8.1 → datago-2025.10.2}/src/worker_wds.rs +12 -13
- {datago-2025.8.1 → datago-2025.10.2}/.github/workflows/ci-cd.yml +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/.github/workflows/rust.yml +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/.gitignore +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/.pre-commit-config.yaml +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/LICENSE +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/assets/447175851-2277afcb-8abf-4d17-b2db-dae27c6056d0.png +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/pyproject.toml +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/python/test_datago_client.py +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/python/test_datago_edge_cases.py +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/python/test_datago_filesystem.py +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/requirements-tests.txt +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/requirements.txt +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/src/lib.rs +0 -0
- {datago-2025.8.1 → datago-2025.10.2}/src/structs.rs +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datago
|
|
3
|
-
Version: 2025.8.1
|
|
3
|
+
Version: 2025.10.2
|
|
4
4
|
Classifier: Programming Language :: Rust
|
|
5
5
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
6
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
@@ -97,7 +97,7 @@ config = {
|
|
|
97
97
|
"source_config": {
|
|
98
98
|
"root_path": "myPath",
|
|
99
99
|
"random_sampling": False, # True if used directly for training
|
|
100
|
-
"rank": 0,
|
|
100
|
+
"rank": 0, # Optional, distributed workloads are possible
|
|
101
101
|
"world_size": 1,
|
|
102
102
|
},
|
|
103
103
|
"limit": 200,
|
|
@@ -137,15 +137,6 @@ client_config = {
|
|
|
137
137
|
"rank": 0,
|
|
138
138
|
"world_size": 1,
|
|
139
139
|
},
|
|
140
|
-
# Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
|
|
141
|
-
"image_config": {
|
|
142
|
-
"crop_and_resize": True, # False to turn it off, or just omit this part of the config
|
|
143
|
-
"default_image_size": 1024,
|
|
144
|
-
"downsampling_ratio": 32,
|
|
145
|
-
"min_aspect_ratio": 0.5,
|
|
146
|
-
"max_aspect_ratio": 2.0,
|
|
147
|
-
"pre_encode_images": False,
|
|
148
|
-
},
|
|
149
140
|
"prefetch_buffer_size": 128,
|
|
150
141
|
"samples_buffer_size": 64,
|
|
151
142
|
"limit": 1_000_000, # Dummy example, max number of samples you would like to serve
|
|
@@ -159,6 +150,38 @@ for _ in range(10):
|
|
|
159
150
|
|
|
160
151
|
</details>
|
|
161
152
|
|
|
153
|
+
## Process images on the fly
|
|
154
|
+
|
|
155
|
+
Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration. The following example shows how to align different image payloads.
|
|
156
|
+
|
|
157
|
+
Processing can be very CPU heavy, but it will be distributed over all CPU cores without requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
|
|
158
|
+
|
|
159
|
+
There are three main processing topics that you can choose from:
|
|
160
|
+
|
|
161
|
+
- crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
|
|
162
|
+
- resize the images (setting here will be related to the square aspect ratio bucket, other buckets will differ of course)
|
|
163
|
+
- pre-encode the images to a specific format (jpg, png, ...)
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
config = {
|
|
167
|
+
"source_type": "file",
|
|
168
|
+
"source_config": {
|
|
169
|
+
"root_path": "myPath",
|
|
170
|
+
"random_sampling": False, # True if used directly for training
|
|
171
|
+
},
|
|
172
|
+
# Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
|
|
173
|
+
"image_config": {
|
|
174
|
+
"crop_and_resize": True, # False to turn it off, or just omit this part of the config
|
|
175
|
+
"default_image_size": 1024,
|
|
176
|
+
"downsampling_ratio": 32,
|
|
177
|
+
"min_aspect_ratio": 0.5,
|
|
178
|
+
"max_aspect_ratio": 2.0,
|
|
179
|
+
"pre_encode_images": False,
|
|
180
|
+
},
|
|
181
|
+
"limit": 200,
|
|
182
|
+
"samples_buffer_size": 32,
|
|
183
|
+
}
|
|
184
|
+
```
|
|
162
185
|
|
|
163
186
|
## Match the raw exported buffers with typical python types
|
|
164
187
|
|
|
@@ -171,6 +194,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
|
|
|
171
194
|
|
|
172
195
|
When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also an `initialize_logging` function in the `datago` module which, if called before using a client, allows customizing the log level. This only works if RUST_LOG is not set.
|
|
173
196
|
|
|
197
|
+
## Env variables
|
|
198
|
+
|
|
199
|
+
There are a couple of env variables which will change the behavior of the library, for settings which felt too low level to be exposed in the config.
|
|
200
|
+
|
|
201
|
+
- `DATAGO_MAX_TASKS`: refers to the number of threads which will be used to load the samples. Defaults to a multiple of the CPU cores.
|
|
202
|
+
- `RUST_LOG`: see above, will change the level of logging for the whole library, could be useful for debugging or to report an issue here.
|
|
203
|
+
- `DATAGO_MAX_RETRIES`: number of retries for a failed sample load, defaults to 3.
|
|
204
|
+
|
|
174
205
|
</details><details> <summary><strong>Build it</strong></summary>
|
|
175
206
|
|
|
176
207
|
## Preamble
|
|
@@ -233,6 +264,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
|
|
|
233
264
|
|
|
234
265
|
</details>
|
|
235
266
|
|
|
267
|
+
<details> <summary><strong>Benchmarks</strong></summary>
|
|
268
|
+
As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
|
|
269
|
+
|
|
270
|
+
In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
|
|
271
|
+
|
|
272
|
+
### AMD Zen3 laptop - IN1k - disk
|
|
273
|
+

|
|
274
|
+
|
|
275
|
+
### AMD EPYC 9454 - IN1k - disk
|
|
276
|
+

|
|
277
|
+
|
|
278
|
+
This benchmark is using the PD12M dataset, which is a 12M images dataset, with a lot of high resolution images. It's accessed through the webdataset front end, and datago is compared with the popular python webdataset library. Note that datago will start streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
|
|
279
|
+
|
|
280
|
+
### AMD EPYC 9454 - pd12m - webdataset
|
|
281
|
+

|
|
282
|
+
|
|
283
|
+
</details>
|
|
284
|
+
|
|
285
|
+
|
|
236
286
|
## License
|
|
237
287
|
|
|
238
288
|
MIT License
|
|
@@ -80,7 +80,7 @@ config = {
|
|
|
80
80
|
"source_config": {
|
|
81
81
|
"root_path": "myPath",
|
|
82
82
|
"random_sampling": False, # True if used directly for training
|
|
83
|
-
"rank": 0,
|
|
83
|
+
"rank": 0, # Optional, distributed workloads are possible
|
|
84
84
|
"world_size": 1,
|
|
85
85
|
},
|
|
86
86
|
"limit": 200,
|
|
@@ -120,15 +120,6 @@ client_config = {
|
|
|
120
120
|
"rank": 0,
|
|
121
121
|
"world_size": 1,
|
|
122
122
|
},
|
|
123
|
-
# Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
|
|
124
|
-
"image_config": {
|
|
125
|
-
"crop_and_resize": True, # False to turn it off, or just omit this part of the config
|
|
126
|
-
"default_image_size": 1024,
|
|
127
|
-
"downsampling_ratio": 32,
|
|
128
|
-
"min_aspect_ratio": 0.5,
|
|
129
|
-
"max_aspect_ratio": 2.0,
|
|
130
|
-
"pre_encode_images": False,
|
|
131
|
-
},
|
|
132
123
|
"prefetch_buffer_size": 128,
|
|
133
124
|
"samples_buffer_size": 64,
|
|
134
125
|
"limit": 1_000_000, # Dummy example, max number of samples you would like to serve
|
|
@@ -142,6 +133,38 @@ for _ in range(10):
|
|
|
142
133
|
|
|
143
134
|
</details>
|
|
144
135
|
|
|
136
|
+
## Process images on the fly
|
|
137
|
+
|
|
138
|
+
Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration. The following example shows how to align different image payloads.
|
|
139
|
+
|
|
140
|
+
Processing can be very CPU heavy, but it will be distributed over all CPU cores without requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
|
|
141
|
+
|
|
142
|
+
There are three main processing topics that you can choose from:
|
|
143
|
+
|
|
144
|
+
- crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
|
|
145
|
+
- resize the images (setting here will be related to the square aspect ratio bucket, other buckets will differ of course)
|
|
146
|
+
- pre-encode the images to a specific format (jpg, png, ...)
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
config = {
|
|
150
|
+
"source_type": "file",
|
|
151
|
+
"source_config": {
|
|
152
|
+
"root_path": "myPath",
|
|
153
|
+
"random_sampling": False, # True if used directly for training
|
|
154
|
+
},
|
|
155
|
+
# Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
|
|
156
|
+
"image_config": {
|
|
157
|
+
"crop_and_resize": True, # False to turn it off, or just omit this part of the config
|
|
158
|
+
"default_image_size": 1024,
|
|
159
|
+
"downsampling_ratio": 32,
|
|
160
|
+
"min_aspect_ratio": 0.5,
|
|
161
|
+
"max_aspect_ratio": 2.0,
|
|
162
|
+
"pre_encode_images": False,
|
|
163
|
+
},
|
|
164
|
+
"limit": 200,
|
|
165
|
+
"samples_buffer_size": 32,
|
|
166
|
+
}
|
|
167
|
+
```
|
|
145
168
|
|
|
146
169
|
## Match the raw exported buffers with typical python types
|
|
147
170
|
|
|
@@ -154,6 +177,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
|
|
|
154
177
|
|
|
155
178
|
When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also an `initialize_logging` function in the `datago` module which, if called before using a client, allows customizing the log level. This only works if RUST_LOG is not set.
|
|
156
179
|
|
|
180
|
+
## Env variables
|
|
181
|
+
|
|
182
|
+
There are a couple of env variables which will change the behavior of the library, for settings which felt too low level to be exposed in the config.
|
|
183
|
+
|
|
184
|
+
- `DATAGO_MAX_TASKS`: refers to the number of threads which will be used to load the samples. Defaults to a multiple of the CPU cores.
|
|
185
|
+
- `RUST_LOG`: see above, will change the level of logging for the whole library, could be useful for debugging or to report an issue here.
|
|
186
|
+
- `DATAGO_MAX_RETRIES`: number of retries for a failed sample load, defaults to 3.
|
|
187
|
+
|
|
157
188
|
</details><details> <summary><strong>Build it</strong></summary>
|
|
158
189
|
|
|
159
190
|
## Preamble
|
|
@@ -216,6 +247,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
|
|
|
216
247
|
|
|
217
248
|
</details>
|
|
218
249
|
|
|
250
|
+
<details> <summary><strong>Benchmarks</strong></summary>
|
|
251
|
+
As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
|
|
252
|
+
|
|
253
|
+
In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
|
|
254
|
+
|
|
255
|
+
### AMD Zen3 laptop - IN1k - disk
|
|
256
|
+

|
|
257
|
+
|
|
258
|
+
### AMD EPYC 9454 - IN1k - disk
|
|
259
|
+

|
|
260
|
+
|
|
261
|
+
This benchmark is using the PD12M dataset, which is a 12M images dataset, with a lot of high resolution images. It's accessed through the webdataset front end, and datago is compared with the popular python webdataset library. Note that datago will start streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
|
|
262
|
+
|
|
263
|
+
### AMD EPYC 9454 - pd12m - webdataset
|
|
264
|
+

|
|
265
|
+
|
|
266
|
+
</details>
|
|
267
|
+
|
|
268
|
+
|
|
219
269
|
## License
|
|
220
270
|
|
|
221
271
|
MIT License
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1,11 +1,13 @@
|
|
|
1
|
-
|
|
1
|
+
import json
|
|
2
2
|
import time
|
|
3
|
-
|
|
3
|
+
|
|
4
4
|
import numpy as np
|
|
5
|
-
from raw_types import raw_array_to_pil_image, raw_array_to_numpy
|
|
6
5
|
import typer
|
|
7
|
-
import
|
|
6
|
+
from benchmark_defaults import IMAGE_CONFIG
|
|
7
|
+
from datago import DatagoClient # type: ignore
|
|
8
8
|
from PIL import Image
|
|
9
|
+
from raw_types import raw_array_to_numpy, raw_array_to_pil_image
|
|
10
|
+
from tqdm import tqdm
|
|
9
11
|
|
|
10
12
|
|
|
11
13
|
def benchmark(
|
|
@@ -31,19 +33,20 @@ def benchmark(
|
|
|
31
33
|
"rank": 0,
|
|
32
34
|
"world_size": 1,
|
|
33
35
|
},
|
|
34
|
-
"image_config": {
|
|
35
|
-
"crop_and_resize": crop_and_resize,
|
|
36
|
-
"default_image_size": 1024,
|
|
37
|
-
"downsampling_ratio": 32,
|
|
38
|
-
"min_aspect_ratio": 0.5,
|
|
39
|
-
"max_aspect_ratio": 2.0,
|
|
40
|
-
"pre_encode_images": encode_images,
|
|
41
|
-
},
|
|
42
36
|
"prefetch_buffer_size": 128,
|
|
43
37
|
"samples_buffer_size": 64,
|
|
44
38
|
"limit": limit,
|
|
45
39
|
}
|
|
46
40
|
|
|
41
|
+
if crop_and_resize or encode_images:
|
|
42
|
+
client_config["image_config"] = IMAGE_CONFIG
|
|
43
|
+
|
|
44
|
+
if encode_images:
|
|
45
|
+
client_config["image_config"]["crop_and_resize"] = ( # type: ignore
|
|
46
|
+
crop_and_resize # You may want to encode images without resizing them
|
|
47
|
+
)
|
|
48
|
+
client_config["image_config"]["pre_encode_images"] = True # type: ignore
|
|
49
|
+
|
|
47
50
|
client = DatagoClient(json.dumps(client_config))
|
|
48
51
|
client.start() # Optional, but good practice to start the client to reduce latency to first sample (while you're instantiating models for instance)
|
|
49
52
|
start = time.time()
|
|
@@ -1,24 +1,37 @@
|
|
|
1
|
-
import
|
|
2
|
-
from tqdm import tqdm
|
|
1
|
+
import json
|
|
3
2
|
import os
|
|
3
|
+
import time
|
|
4
|
+
|
|
4
5
|
import typer
|
|
6
|
+
from benchmark_defaults import IMAGE_CONFIG
|
|
5
7
|
from dataset import DatagoIterDataset
|
|
8
|
+
from tqdm import tqdm
|
|
6
9
|
|
|
7
10
|
|
|
8
11
|
def benchmark(
|
|
9
|
-
root_path: str = typer.Option(
|
|
10
|
-
os.getenv("DATAGO_TEST_FILESYSTEM", ""), help="The source to test out"
|
|
11
|
-
),
|
|
12
|
+
root_path: str = typer.Option(os.getenv("DATAGO_TEST_FILESYSTEM", ""), help="The source to test out"),
|
|
12
13
|
limit: int = typer.Option(2000, help="The number of samples to test on"),
|
|
13
|
-
crop_and_resize: bool = typer.Option(
|
|
14
|
-
False, help="Crop and resize the images on the fly"
|
|
15
|
-
),
|
|
14
|
+
crop_and_resize: bool = typer.Option(False, help="Crop and resize the images on the fly"),
|
|
16
15
|
compare_torch: bool = typer.Option(True, help="Compare against torch dataloader"),
|
|
16
|
+
num_workers: int = typer.Option(os.cpu_count(), help="Number of workers to use"),
|
|
17
|
+
sweep: bool = typer.Option(False, help="Sweep over the number of workers"),
|
|
17
18
|
):
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
19
|
+
if sweep:
|
|
20
|
+
results = {}
|
|
21
|
+
for num_workers in range(2, (os.cpu_count() or 2), 16):
|
|
22
|
+
results[num_workers] = benchmark(root_path, limit, crop_and_resize, compare_torch, num_workers, False)
|
|
23
|
+
|
|
24
|
+
# Save results to a json file
|
|
25
|
+
|
|
26
|
+
with open("benchmark_results_filesystem.json", "w") as f:
|
|
27
|
+
json.dump(results, f, indent=2)
|
|
28
|
+
|
|
29
|
+
return results
|
|
30
|
+
|
|
31
|
+
print(f"Running benchmark for {root_path} - {limit} samples - {num_workers} workers")
|
|
32
|
+
|
|
33
|
+
# This setting is not exposed in the config, but an env variable can be used instead
|
|
34
|
+
os.environ["DATAGO_MAX_TASKS"] = str(num_workers)
|
|
22
35
|
|
|
23
36
|
client_config = {
|
|
24
37
|
"source_type": "file",
|
|
@@ -27,19 +40,14 @@ def benchmark(
|
|
|
27
40
|
"rank": 0,
|
|
28
41
|
"world_size": 1,
|
|
29
42
|
},
|
|
30
|
-
"image_config": {
|
|
31
|
-
"crop_and_resize": crop_and_resize,
|
|
32
|
-
"default_image_size": 1024,
|
|
33
|
-
"downsampling_ratio": 32,
|
|
34
|
-
"min_aspect_ratio": 0.5,
|
|
35
|
-
"max_aspect_ratio": 2.0,
|
|
36
|
-
"pre_encode_images": False,
|
|
37
|
-
},
|
|
38
43
|
"prefetch_buffer_size": 256,
|
|
39
44
|
"samples_buffer_size": 256,
|
|
40
45
|
"limit": limit,
|
|
41
46
|
}
|
|
42
47
|
|
|
48
|
+
if crop_and_resize:
|
|
49
|
+
client_config["image_config"] = IMAGE_CONFIG
|
|
50
|
+
|
|
43
51
|
# Make sure in the following that we compare apples to apples, meaning in that case
|
|
44
52
|
# that we materialize the payloads in the python scope in the expected format
|
|
45
53
|
# (PIL.Image for images and masks for instance, numpy arrays for latents)
|
|
@@ -48,14 +56,15 @@ def benchmark(
|
|
|
48
56
|
|
|
49
57
|
img = None
|
|
50
58
|
count = 0
|
|
51
|
-
for sample in tqdm(datago_dataset, dynamic_ncols=True):
|
|
59
|
+
for sample in tqdm(datago_dataset, desc="Datago", dynamic_ncols=True):
|
|
52
60
|
assert sample["id"] != ""
|
|
53
61
|
img = sample["image"]
|
|
54
62
|
count += 1
|
|
55
63
|
|
|
56
64
|
assert count == limit, f"Expected {limit} samples, got {count}"
|
|
57
65
|
fps = limit / (time.time() - start)
|
|
58
|
-
|
|
66
|
+
results = {"datago": {"fps": fps, "count": count}}
|
|
67
|
+
print(f"Datago - FPS {fps:.2f} - workers {num_workers}")
|
|
59
68
|
del datago_dataset
|
|
60
69
|
|
|
61
70
|
# Save the last image as a test
|
|
@@ -64,17 +73,14 @@ def benchmark(
|
|
|
64
73
|
|
|
65
74
|
# Let's compare against a classic pytorch dataloader
|
|
66
75
|
if compare_torch:
|
|
67
|
-
from torchvision import datasets, transforms # type: ignore
|
|
68
76
|
from torch.utils.data import DataLoader
|
|
77
|
+
from torchvision import datasets, transforms # type: ignore
|
|
69
78
|
|
|
70
|
-
print("Benchmarking torch dataloader")
|
|
71
79
|
# Define the transformations to apply to each image
|
|
72
80
|
transform = (
|
|
73
81
|
transforms.Compose(
|
|
74
82
|
[
|
|
75
|
-
transforms.Resize(
|
|
76
|
-
(1024, 1024), interpolation=transforms.InterpolationMode.LANCZOS
|
|
77
|
-
),
|
|
83
|
+
transforms.Resize((1024, 1024), interpolation=transforms.InterpolationMode.LANCZOS),
|
|
78
84
|
]
|
|
79
85
|
)
|
|
80
86
|
if crop_and_resize
|
|
@@ -82,13 +88,10 @@ def benchmark(
|
|
|
82
88
|
)
|
|
83
89
|
|
|
84
90
|
# Create the ImageFolder dataset
|
|
85
|
-
dataset = datasets.ImageFolder(
|
|
86
|
-
root=root_path, transform=transform, allow_empty=True
|
|
87
|
-
)
|
|
91
|
+
dataset = datasets.ImageFolder(root=root_path, transform=transform, allow_empty=True)
|
|
88
92
|
|
|
89
93
|
# Create a DataLoader to allow for multiple workers
|
|
90
94
|
# Use available CPU count for num_workers
|
|
91
|
-
num_workers = os.cpu_count() or 8 # Default to 8 if cpu_count returns None
|
|
92
95
|
dataloader = DataLoader(
|
|
93
96
|
dataset,
|
|
94
97
|
batch_size=1,
|
|
@@ -100,12 +103,15 @@ def benchmark(
|
|
|
100
103
|
# Iterate over the DataLoader
|
|
101
104
|
start = time.time()
|
|
102
105
|
n_images = 0
|
|
103
|
-
for batch in tqdm(dataloader, dynamic_ncols=True):
|
|
106
|
+
for batch in tqdm(dataloader, desc="Torch", dynamic_ncols=True):
|
|
104
107
|
n_images += len(batch)
|
|
105
108
|
if n_images > limit:
|
|
106
109
|
break
|
|
107
110
|
fps = n_images / (time.time() - start)
|
|
108
|
-
|
|
111
|
+
results["torch"] = {"fps": fps, "count": n_images}
|
|
112
|
+
print(f"Torch - FPS {fps:.2f} - workers {num_workers}")
|
|
113
|
+
|
|
114
|
+
return results
|
|
109
115
|
|
|
110
116
|
|
|
111
117
|
if __name__ == "__main__":
|
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
1
3
|
import time
|
|
2
|
-
|
|
4
|
+
|
|
3
5
|
import typer
|
|
6
|
+
from benchmark_defaults import IMAGE_CONFIG
|
|
4
7
|
from dataset import DatagoIterDataset
|
|
5
|
-
import
|
|
8
|
+
from tqdm import tqdm
|
|
6
9
|
|
|
7
10
|
|
|
8
11
|
def benchmark(
|
|
@@ -11,11 +14,23 @@ def benchmark(
|
|
|
11
14
|
True, help="Crop and resize the images on the fly"
|
|
12
15
|
),
|
|
13
16
|
compare_wds: bool = typer.Option(True, help="Compare against torch dataloader"),
|
|
14
|
-
|
|
17
|
+
num_workers: int = typer.Option(
|
|
15
18
|
16,
|
|
16
|
-
help="Number of processes to use
|
|
19
|
+
help="Number of processes to use",
|
|
17
20
|
),
|
|
21
|
+
sweep: bool = typer.Option(False, help="Sweep over the number of processes"),
|
|
18
22
|
):
|
|
23
|
+
if sweep:
|
|
24
|
+
results = {}
|
|
25
|
+
for num_workers in range(2, max(64, (os.cpu_count() or 1)), 8):
|
|
26
|
+
results[num_workers] = benchmark(limit, crop_and_resize, compare_wds, num_workers, False)
|
|
27
|
+
|
|
28
|
+
# Save results to a json file
|
|
29
|
+
with open("benchmark_results_wds.json", "w") as f:
|
|
30
|
+
json.dump(results, f, indent=2)
|
|
31
|
+
|
|
32
|
+
return results
|
|
33
|
+
|
|
19
34
|
# URL of the test bucket
|
|
20
35
|
# bucket = "https://storage.googleapis.com/webdataset/fake-imagenet"
|
|
21
36
|
# dataset = "/imagenet-train-{000000..001281}.tar"
|
|
@@ -32,22 +47,18 @@ def benchmark(
|
|
|
32
47
|
"source_config": {
|
|
33
48
|
"url": url,
|
|
34
49
|
"shuffle": True,
|
|
35
|
-
"max_concurrency":
|
|
50
|
+
"max_concurrency": num_workers, # Number of concurrent TarballSample downloads and dispatch
|
|
36
51
|
"auth_token": os.environ.get("HF_TOKEN", default=""),
|
|
37
52
|
},
|
|
38
|
-
"image_config": {
|
|
39
|
-
"crop_and_resize": crop_and_resize,
|
|
40
|
-
"default_image_size": 1024,
|
|
41
|
-
"downsampling_ratio": 32,
|
|
42
|
-
"min_aspect_ratio": 0.5,
|
|
43
|
-
"max_aspect_ratio": 2.0,
|
|
44
|
-
"pre_encode_images": False,
|
|
45
|
-
},
|
|
46
53
|
"prefetch_buffer_size": 256,
|
|
47
54
|
"samples_buffer_size": 256,
|
|
48
55
|
"limit": limit,
|
|
49
56
|
}
|
|
50
57
|
|
|
58
|
+
if crop_and_resize:
|
|
59
|
+
# Optionally add a custom image config to crop and resize the images on the fly
|
|
60
|
+
client_config["image_config"] = IMAGE_CONFIG
|
|
61
|
+
|
|
51
62
|
# # Make sure in the following that we compare apples to apples, meaning in that case
|
|
52
63
|
# # that we materialize the payloads in the python scope in the expected format
|
|
53
64
|
# # (PIL.Image for images and masks for instance, numpy arrays for latents)
|
|
@@ -55,14 +66,15 @@ def benchmark(
|
|
|
55
66
|
start = time.time() # Note that the datago dataset will start preparing samples (up to the requested buffer size) at construction time
|
|
56
67
|
|
|
57
68
|
img, count = None, 0
|
|
58
|
-
for sample in tqdm(datago_dataset, dynamic_ncols=True):
|
|
69
|
+
for sample in tqdm(datago_dataset, desc="Datago", dynamic_ncols=True):
|
|
59
70
|
assert sample["id"] != ""
|
|
60
71
|
img = sample["image"]
|
|
61
72
|
count += 1
|
|
62
73
|
|
|
63
74
|
assert count == limit, f"Expected {limit} samples, got {count}"
|
|
64
75
|
fps = limit / (time.time() - start)
|
|
65
|
-
print(f"-- Datago WDS FPS {fps:.2f}")
|
|
76
|
+
print(f"-- Datago WDS FPS {fps:.2f} - workers {num_workers}")
|
|
77
|
+
results = {"datago": {"fps": fps, "count": count}}
|
|
66
78
|
del datago_dataset
|
|
67
79
|
|
|
68
80
|
# Save the last image as a test
|
|
@@ -71,9 +83,9 @@ def benchmark(
|
|
|
71
83
|
|
|
72
84
|
# Let's compare against a classic webdataset dataloader
|
|
73
85
|
if compare_wds:
|
|
74
|
-
from torchvision import transforms
|
|
75
|
-
from torch.utils.data import DataLoader
|
|
76
86
|
import webdataset as wds
|
|
87
|
+
from torch.utils.data import DataLoader
|
|
88
|
+
from torchvision import transforms
|
|
77
89
|
|
|
78
90
|
print("\nBenchmarking webdataset library dataloader")
|
|
79
91
|
# Define the transformations to apply to each image
|
|
@@ -108,19 +120,21 @@ def benchmark(
|
|
|
108
120
|
dataloader = DataLoader(
|
|
109
121
|
dataset,
|
|
110
122
|
batch_size=1,
|
|
111
|
-
num_workers=
|
|
123
|
+
num_workers=num_workers,
|
|
112
124
|
prefetch_factor=2,
|
|
113
125
|
collate_fn=lambda x: x,
|
|
114
126
|
)
|
|
115
127
|
|
|
116
128
|
# Iterate over the DataLoader
|
|
117
129
|
start = time.time()
|
|
118
|
-
for n_images, _ in enumerate(tqdm(dataloader, dynamic_ncols=True)):
|
|
130
|
+
for n_images, _ in enumerate(tqdm(dataloader, desc="WDS", dynamic_ncols=True)):
|
|
119
131
|
if n_images > limit:
|
|
120
132
|
break
|
|
121
133
|
fps = n_images / (time.time() - start)
|
|
122
|
-
print(f"-- Webdataset lib FPS ({
|
|
134
|
+
print(f"-- Webdataset lib FPS ({num_workers} processes) {fps:.2f}")
|
|
123
135
|
|
|
136
|
+
results["webdataset"] = {"fps": fps, "count": n_images}
|
|
137
|
+
return results
|
|
124
138
|
|
|
125
139
|
if __name__ == "__main__":
|
|
126
140
|
typer.run(benchmark)
|
|
@@ -85,6 +85,9 @@ if __name__ == "__main__":
|
|
|
85
85
|
"min_aspect_ratio": 0.5,
|
|
86
86
|
"max_aspect_ratio": 2.0,
|
|
87
87
|
"pre_encode_images": False,
|
|
88
|
+
# Optional: Use JPEG encoding instead of PNG (defaults to PNG if not specified)
|
|
89
|
+
# "encode_format": "jpeg", # or "png"
|
|
90
|
+
# "jpeg_quality": 92, # 0-100, only used when encode_format is "jpeg"
|
|
88
91
|
},
|
|
89
92
|
"prefetch_buffer_size": 64,
|
|
90
93
|
"samples_buffer_size": 128,
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from PIL import Image
|
|
2
|
-
from typing import Optional
|
|
2
|
+
from typing import Optional, Union
|
|
3
3
|
import numpy as np
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
def uint8_array_to_numpy(raw_array):
|
|
6
|
+
def uint8_array_to_numpy(raw_array: 'ImagePayload') -> Optional[np.ndarray]:
|
|
7
7
|
if len(raw_array.data) == 0:
|
|
8
8
|
return None
|
|
9
9
|
|
|
@@ -29,7 +29,7 @@ def uint8_array_to_numpy(raw_array):
|
|
|
29
29
|
return np.frombuffer(raw_array.data, dtype=np.uint8).reshape(shape)
|
|
30
30
|
|
|
31
31
|
|
|
32
|
-
def raw_array_to_numpy(raw_array) -> Optional[np.ndarray]:
|
|
32
|
+
def raw_array_to_numpy(raw_array: 'ImagePayload') -> Optional[np.ndarray]:
|
|
33
33
|
if len(raw_array.data) == 0:
|
|
34
34
|
return None
|
|
35
35
|
|
|
@@ -42,7 +42,7 @@ def raw_array_to_numpy(raw_array) -> Optional[np.ndarray]:
|
|
|
42
42
|
return None
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
def raw_array_to_pil_image(raw_array) -> Optional[Image.Image]:
|
|
45
|
+
def raw_array_to_pil_image(raw_array: 'ImagePayload') -> Union[Optional[Image.Image], 'ImagePayload']:
|
|
46
46
|
if len(raw_array.data) == 0:
|
|
47
47
|
return None
|
|
48
48
|
|
|
@@ -63,3 +63,25 @@ def raw_array_to_pil_image(raw_array) -> Optional[Image.Image]:
|
|
|
63
63
|
|
|
64
64
|
assert c == 3, f"Expected 3 channels, got {c}"
|
|
65
65
|
return Image.fromarray(np_array)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def decode_image_payload(payload: 'ImagePayload') -> Image.Image:
|
|
69
|
+
"""
|
|
70
|
+
Decode an ImagePayload (encoded image) into a PIL Image.
|
|
71
|
+
This is the proper way to decode encoded images for API users.
|
|
72
|
+
"""
|
|
73
|
+
import io
|
|
74
|
+
return Image.open(io.BytesIO(payload.data))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_image_mode(image_or_payload) -> str:
|
|
78
|
+
"""
|
|
79
|
+
Helper function to get the mode of an image, whether it's a PIL Image or ImagePayload.
|
|
80
|
+
For ImagePayload objects (encoded images), we need to decode them first.
|
|
81
|
+
"""
|
|
82
|
+
if hasattr(image_or_payload, 'mode'):
|
|
83
|
+
# It's a PIL Image
|
|
84
|
+
return image_or_payload.mode
|
|
85
|
+
else:
|
|
86
|
+
# It's an ImagePayload (encoded image), decode it first
|
|
87
|
+
return decode_image_payload(image_or_payload).mode
|