datago 2025.8.1.tar.gz → 2025.12.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datago-2025.8.1 → datago-2025.12.1}/Cargo.lock +73 -19
- {datago-2025.8.1 → datago-2025.12.1}/Cargo.toml +3 -2
- {datago-2025.8.1 → datago-2025.12.1}/PKG-INFO +62 -12
- {datago-2025.8.1 → datago-2025.12.1}/README.md +60 -10
- datago-2025.12.1/assets/epyc_vast.png +0 -0
- datago-2025.12.1/assets/epyc_wds.png +0 -0
- datago-2025.12.1/assets/zen3_ssd.png +0 -0
- {datago-2025.8.1 → datago-2025.12.1}/pyproject.toml +1 -1
- {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_db.py +16 -27
- datago-2025.12.1/python/benchmark_defaults.py +8 -0
- {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_filesystem.py +44 -19
- {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_webdataset.py +34 -20
- {datago-2025.8.1 → datago-2025.12.1}/python/dataset.py +8 -3
- datago-2025.12.1/python/raw_types.py +40 -0
- {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_client.py +9 -4
- {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_db.py +39 -32
- {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_edge_cases.py +6 -4
- {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_filesystem.py +7 -5
- datago-2025.12.1/python/test_pil_implicit_conversion.py +80 -0
- {datago-2025.8.1 → datago-2025.12.1}/src/client.rs +43 -23
- {datago-2025.8.1 → datago-2025.12.1}/src/generator_files.rs +7 -4
- {datago-2025.8.1 → datago-2025.12.1}/src/generator_http.rs +9 -6
- {datago-2025.8.1 → datago-2025.12.1}/src/generator_wds.rs +38 -21
- {datago-2025.8.1 → datago-2025.12.1}/src/image_processing.rs +111 -27
- {datago-2025.8.1 → datago-2025.12.1}/src/lib.rs +3 -1
- {datago-2025.8.1 → datago-2025.12.1}/src/main.rs +6 -2
- {datago-2025.8.1 → datago-2025.12.1}/src/structs.rs +182 -10
- {datago-2025.8.1 → datago-2025.12.1}/src/worker_files.rs +97 -53
- {datago-2025.8.1 → datago-2025.12.1}/src/worker_http.rs +149 -36
- {datago-2025.8.1 → datago-2025.12.1}/src/worker_wds.rs +30 -25
- datago-2025.8.1/python/raw_types.py +0 -65
- {datago-2025.8.1 → datago-2025.12.1}/.github/workflows/ci-cd.yml +0 -0
- {datago-2025.8.1 → datago-2025.12.1}/.github/workflows/rust.yml +0 -0
- {datago-2025.8.1 → datago-2025.12.1}/.gitignore +0 -0
- {datago-2025.8.1 → datago-2025.12.1}/.pre-commit-config.yaml +0 -0
- {datago-2025.8.1 → datago-2025.12.1}/LICENSE +0 -0
- {datago-2025.8.1 → datago-2025.12.1}/assets/447175851-2277afcb-8abf-4d17-b2db-dae27c6056d0.png +0 -0
- {datago-2025.8.1 → datago-2025.12.1}/requirements-tests.txt +0 -0
- {datago-2025.8.1 → datago-2025.12.1}/requirements.txt +0 -0

{datago-2025.8.1 → datago-2025.12.1}/Cargo.lock

@@ -111,6 +111,16 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
+[[package]]
+name = "assert-json-diff"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "async-channel"
 version = "1.9.0"
@@ -464,7 +474,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02"
 dependencies = [
  "smallvec",
- "target-lexicon
+ "target-lexicon",
 ]
 
 [[package]]
@@ -613,7 +623,7 @@ dependencies = [
 
 [[package]]
 name = "datago"
-version = "2025.8.1"
+version = "2025.12.1"
 dependencies = [
  "async-compression",
  "async-tar",
@@ -644,8 +654,27 @@ dependencies = [
  "tokio-util",
  "url",
  "walkdir",
+ "wiremock",
+]
+
+[[package]]
+name = "deadpool"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb84100978c1c7b37f09ed3ce3e5f843af02c2a2c431bae5b19230dad2c1b490"
+dependencies = [
+ "async-trait",
+ "deadpool-runtime",
+ "num_cpus",
+ "tokio",
 ]
 
+[[package]]
+name = "deadpool-runtime"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b"
+
 [[package]]
 name = "dirs-next"
 version = "2.0.0"
@@ -1118,6 +1147,12 @@ version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
 
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
 [[package]]
 name = "hyper"
 version = "1.6.0"
@@ -1131,6 +1166,7 @@ dependencies = [
  "http",
  "http-body",
  "httparse",
+ "httpdate",
  "itoa",
  "pin-project-lite",
  "smallvec",
@@ -2026,9 +2062,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3"
-version = "0.
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
 dependencies = [
  "cfg-if",
  "indoc",
@@ -2044,19 +2080,19 @@ dependencies = [
 
 [[package]]
 name = "pyo3-build-config"
-version = "0.
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
 dependencies = [
  "once_cell",
- "target-lexicon
+ "target-lexicon",
 ]
 
 [[package]]
 name = "pyo3-ffi"
-version = "0.
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
 dependencies = [
  "libc",
  "pyo3-build-config",
@@ -2064,9 +2100,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3-macros"
-version = "0.
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
 dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
@@ -2076,9 +2112,9 @@ dependencies = [
 
 [[package]]
 name = "pyo3-macros-backend"
-version = "0.
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
 dependencies = [
  "heck",
  "proc-macro2",
@@ -2753,12 +2789,6 @@ version = "0.12.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
 
-[[package]]
-name = "target-lexicon"
-version = "0.13.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
-
 [[package]]
 name = "tempfile"
 version = "3.20.0"
@@ -3454,6 +3484,30 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "wiremock"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2b8b99d4cdbf36b239a9532e31fe4fb8acc38d1897c1761e161550a7dc78e6a"
+dependencies = [
+ "assert-json-diff",
+ "async-trait",
+ "base64",
+ "deadpool",
+ "futures",
+ "http",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "log",
+ "once_cell",
+ "regex",
+ "serde",
+ "serde_json",
+ "tokio",
+ "url",
+]
+
 [[package]]
 name = "wit-bindgen-rt"
 version = "0.33.0"

{datago-2025.8.1 → datago-2025.12.1}/Cargo.toml

@@ -1,7 +1,7 @@
 [package]
 name = "datago"
 edition = "2021"
-version = "2025.8.1"
+version = "2025.12.1"
 readme = "README.md"
 
 [lib]
@@ -24,7 +24,7 @@ kanal = "0.1"
 clap = { version = "4.5.27", features = ["derive"] }
 tokio = { version = "1.43.1", features = ["rt-multi-thread", "macros"] }
 prettytable-rs = "0.10.0"
-pyo3 = { version = "0.
+pyo3 = { version = "0.22", features = ["extension-module"] }
 threadpool = "1.8.1"
 openssl = { version = "0.10", features = ["vendored"] }
 walkdir = "2.5.0"
@@ -46,6 +46,7 @@ fast_image_resize = { version ="5.1.3", features=["image"]}
 
 [dev-dependencies]
 tempfile = "3.13.0"
+wiremock = "0.6.0"
 
 [profile.release]
 opt-level = 3

{datago-2025.8.1 → datago-2025.12.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datago
-Version: 2025.8.1
+Version: 2025.12.1
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -8,7 +8,7 @@ Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 License-File: LICENSE
 Summary: A high performance dataloader for Python, written in Rust
-Author: Benjamin Lefaudeux
+Author: Benjamin Lefaudeux, Roman Frigg
 Author-email: Photoroom <team@photoroom.com>
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
@@ -97,7 +97,7 @@ config = {
     "source_config": {
         "root_path": "myPath",
         "random_sampling": False,  # True if used directly for training
-        "rank": 0,
+        "rank": 0,  # Optional, distributed workloads are possible
         "world_size": 1,
     },
     "limit": 200,
@@ -137,15 +137,6 @@ client_config = {
         "rank": 0,
         "world_size": 1,
     },
-    # Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
-    "image_config": {
-        "crop_and_resize": True,  # False to turn it off, or just omit this part of the config
-        "default_image_size": 1024,
-        "downsampling_ratio": 32,
-        "min_aspect_ratio": 0.5,
-        "max_aspect_ratio": 2.0,
-        "pre_encode_images": False,
-    },
     "prefetch_buffer_size": 128,
     "samples_buffer_size": 64,
     "limit": 1_000_000,  # Dummy example, max number of samples you would like to serve
@@ -159,6 +150,38 @@ for _ in range(10):
 
 </details>
 
+## Process images on the fly
+
+Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration. The following example shows how to align different image payloads.
+
+Processing can be very CPU heavy, but it will be distributed over all CPU cores wihout requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
+
+There are three main processing topics that you can choose from:
+
+- crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
+- resize the images (setting here will be related to the square aspect ratio bucket, other buckets will differ of course)
+- pre-encode the images to a specific format (jpg, png, ...)
+
+```python
+config = {
+    "source_type": "file",
+    "source_config": {
+        "root_path": "myPath",
+        "random_sampling": False,  # True if used directly for training
+    },
+    # Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
+    "image_config": {
+        "crop_and_resize": True,  # False to turn it off, or just omit this part of the config
+        "default_image_size": 1024,
+        "downsampling_ratio": 32,
+        "min_aspect_ratio": 0.5,
+        "max_aspect_ratio": 2.0,
+        "pre_encode_images": False,
+    },
+    "limit": 200,
+    "samples_buffer_size": 32,
+}
+```
 
 
 ## Match the raw exported buffers with typical python types
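
For orientation, here is a minimal sketch of consuming a client configured with the `image_config` added above. It assumes the `DatagoClient` JSON-config API shown elsewhere in this README diff and the new implicit PIL conversion (see `python/test_pil_implicit_conversion.py` in the file list); `myPath` is a placeholder.

```python
import json

from datago import DatagoClient  # the wheel shipped by this release

config = {
    "source_type": "file",
    "source_config": {"root_path": "myPath", "random_sampling": False},
    "image_config": {
        "crop_and_resize": True,     # bucket by aspect ratio, then crop/resize
        "default_image_size": 1024,  # size of the square bucket
        "downsampling_ratio": 32,    # keep bucket dimensions divisible by this
        "min_aspect_ratio": 0.5,
        "max_aspect_ratio": 2.0,
        "pre_encode_images": False,  # True pre-encodes to jpg/png instead
    },
    "limit": 200,
    "samples_buffer_size": 32,
}

client = DatagoClient(json.dumps(config))
client.start()  # optional, warms up the pipeline early

for _ in range(10):
    sample = client.get_sample()
    if sample.id:
        # Images arrive PIL-ready in 2025.12.1, per the benchmark changes below
        print(sample.id, sample.image.size)
```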
@@ -171,6 +194,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
 
 When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also a `initialize_logging` function in the `datago` module, which if called before using a client, allows to customize the log level. This only works if RUST_LOG is not set.
 
+## Env variables
+
+There are a couple of env variables which will change the behavior of the library, for settings which felt too low level to be exposed in the config.
+
+- `DATAGO_MAX_TASKS`: refers to the number of threads which will be used to load the samples. Defaults to a multiple of the CPU cores.
+- `RUST_LOG`: see above, will change the level of logging for the whole library, could be useful for debugging or to report an issue here.
+- `DATAGO_MAX_RETRIES`: number of retries for a failed sample load, defaults to 3.
+
 </details><details> <summary><strong>Build it</strong></summary>
 
 ## Preamble
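
A short sketch of these env variables in practice: `DATAGO_MAX_TASKS` is set exactly this way in `python/benchmark_filesystem.py` further down in this diff, while the `DATAGO_MAX_RETRIES` line is only inferred from the README text above.

```python
import json
import os

# Set before the first client is created; the values are presumably read when
# the worker pool spins up.
os.environ["DATAGO_MAX_TASKS"] = "16"   # loader threads (default: multiple of CPU cores)
os.environ["DATAGO_MAX_RETRIES"] = "5"  # retries for a failed sample load (default: 3)

from datago import DatagoClient  # imported after the env is set, for clarity

client = DatagoClient(json.dumps({
    "source_type": "file",
    "source_config": {"root_path": "myPath"},
    "limit": 100,
}))
```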
@@ -233,6 +264,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
 
 </details>
 
+<details> <summary><strong>Benchmarks</strong></summary>
+As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
+
+In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisury pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
+
+### AMD Zen3 laptop - IN1k - disk
+
+
+### AMD EPYC 9454 - IN1k - disk
+
+
+This benchmark is using the PD12M dataset, which is a 12M images dataset, with a lot of high resolution images. It's accessed through the webdataset front end, datago is compared with the popular python webdataset library. Note that datago will start streaming the images faster here (almost instantly !), so given enough time the two results would look closer.
+
+### AMD EPYC 9454 - pd12m - webdataset
+
+
+</details>
+
+
 ## License
 
 MIT License

{datago-2025.8.1 → datago-2025.12.1}/README.md

@@ -80,7 +80,7 @@ config = {
     "source_config": {
         "root_path": "myPath",
         "random_sampling": False,  # True if used directly for training
-        "rank": 0,
+        "rank": 0,  # Optional, distributed workloads are possible
        "world_size": 1,
     },
     "limit": 200,
@@ -120,15 +120,6 @@ client_config = {
         "rank": 0,
         "world_size": 1,
     },
-    # Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
-    "image_config": {
-        "crop_and_resize": True,  # False to turn it off, or just omit this part of the config
-        "default_image_size": 1024,
-        "downsampling_ratio": 32,
-        "min_aspect_ratio": 0.5,
-        "max_aspect_ratio": 2.0,
-        "pre_encode_images": False,
-    },
     "prefetch_buffer_size": 128,
     "samples_buffer_size": 64,
     "limit": 1_000_000,  # Dummy example, max number of samples you would like to serve
@@ -142,6 +133,38 @@ for _ in range(10):
 
 </details>
 
+## Process images on the fly
+
+Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration. The following example shows how to align different image payloads.
+
+Processing can be very CPU heavy, but it will be distributed over all CPU cores wihout requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
+
+There are three main processing topics that you can choose from:
+
+- crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
+- resize the images (setting here will be related to the square aspect ratio bucket, other buckets will differ of course)
+- pre-encode the images to a specific format (jpg, png, ...)
+
+```python
+config = {
+    "source_type": "file",
+    "source_config": {
+        "root_path": "myPath",
+        "random_sampling": False,  # True if used directly for training
+    },
+    # Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
+    "image_config": {
+        "crop_and_resize": True,  # False to turn it off, or just omit this part of the config
+        "default_image_size": 1024,
+        "downsampling_ratio": 32,
+        "min_aspect_ratio": 0.5,
+        "max_aspect_ratio": 2.0,
+        "pre_encode_images": False,
+    },
+    "limit": 200,
+    "samples_buffer_size": 32,
+}
+```
 
 
 ## Match the raw exported buffers with typical python types
@@ -154,6 +177,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
 
 When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also a `initialize_logging` function in the `datago` module, which if called before using a client, allows to customize the log level. This only works if RUST_LOG is not set.
 
+## Env variables
+
+There are a couple of env variables which will change the behavior of the library, for settings which felt too low level to be exposed in the config.
+
+- `DATAGO_MAX_TASKS`: refers to the number of threads which will be used to load the samples. Defaults to a multiple of the CPU cores.
+- `RUST_LOG`: see above, will change the level of logging for the whole library, could be useful for debugging or to report an issue here.
+- `DATAGO_MAX_RETRIES`: number of retries for a failed sample load, defaults to 3.
+
 </details><details> <summary><strong>Build it</strong></summary>
 
 ## Preamble
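
The context above also mentions `initialize_logging`; a minimal sketch, assuming it accepts a level string (the function name comes from the README text, the signature is a guess):

```python
import datago

# Must run before any DatagoClient is created, and only applies when RUST_LOG
# is not already set (per the README text above). Signature assumed.
datago.initialize_logging("debug")
```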
@@ -216,6 +247,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
 
 </details>
 
+<details> <summary><strong>Benchmarks</strong></summary>
+As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
+
+In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisury pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
+
+### AMD Zen3 laptop - IN1k - disk
+
+
+### AMD EPYC 9454 - IN1k - disk
+
+
+This benchmark is using the PD12M dataset, which is a 12M images dataset, with a lot of high resolution images. It's accessed through the webdataset front end, datago is compared with the popular python webdataset library. Note that datago will start streaming the images faster here (almost instantly !), so given enough time the two results would look closer.
+
+### AMD EPYC 9454 - pd12m - webdataset
+
+
+</details>
+
+
 ## License
 
 MIT License

datago-2025.12.1/assets/epyc_vast.png: Binary file (not rendered)
datago-2025.12.1/assets/epyc_wds.png: Binary file (not rendered)
datago-2025.12.1/assets/zen3_ssd.png: Binary file (not rendered)

{datago-2025.8.1 → datago-2025.12.1}/pyproject.toml

@@ -2,7 +2,7 @@
 name = "datago"
 dynamic = ["version"]
 authors = [
-    { name = "Benjamin Lefaudeux" },
+    { name = "Benjamin Lefaudeux, Roman Frigg" },
     { name = "Photoroom", email = "team@photoroom.com" }
 ]
 description = "A high performance dataloader for Python, written in Rust"

{datago-2025.8.1 → datago-2025.12.1}/python/benchmark_db.py

@@ -1,11 +1,13 @@
-
+import json
 import time
-
+
 import numpy as np
-from raw_types import raw_array_to_pil_image, raw_array_to_numpy
 import typer
-import
+from benchmark_defaults import IMAGE_CONFIG
+from datago import DatagoClient  # type: ignore
 from PIL import Image
+from raw_types import raw_array_to_numpy
+from tqdm import tqdm
 
 
 def benchmark(
@@ -31,19 +33,20 @@ def benchmark(
             "rank": 0,
             "world_size": 1,
         },
-        "image_config": {
-            "crop_and_resize": crop_and_resize,
-            "default_image_size": 1024,
-            "downsampling_ratio": 32,
-            "min_aspect_ratio": 0.5,
-            "max_aspect_ratio": 2.0,
-            "pre_encode_images": encode_images,
-        },
         "prefetch_buffer_size": 128,
         "samples_buffer_size": 64,
         "limit": limit,
     }
 
+    if crop_and_resize or encode_images:
+        client_config["image_config"] = IMAGE_CONFIG
+
+        if encode_images:
+            client_config["image_config"]["crop_and_resize"] = (  # type: ignore
+                crop_and_resize  # You may want to encode images without resizing them
+            )
+            client_config["image_config"]["pre_encode_images"] = True  # type: ignore
+
     client = DatagoClient(json.dumps(client_config))
     client.start()  # Optional, but good practice to start the client to reduce latency to first sample (while you're instantiating models for instance)
     start = time.time()
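
The `IMAGE_CONFIG` used here lives in the new `python/benchmark_defaults.py` (+8 lines in the file list), whose body is not shown in this diff. A plausible reconstruction, assuming it simply centralizes the dict that the removed lines above used to inline:

```python
# python/benchmark_defaults.py (hypothetical reconstruction from the removed
# inline dicts; field names and values come from this diff, the file's exact
# content does not)
IMAGE_CONFIG = {
    "crop_and_resize": True,
    "default_image_size": 1024,
    "downsampling_ratio": 32,
    "min_aspect_ratio": 0.5,
    "max_aspect_ratio": 2.0,
    "pre_encode_images": False,
}
```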
@@ -55,21 +58,7 @@
     for _ in tqdm(range(limit), dynamic_ncols=True):
         sample = client.get_sample()
         if sample.id:
-            #
-            if hasattr(sample, "image"):
-                img = raw_array_to_pil_image(sample.image)
-
-            if hasattr(sample, "masks"):
-                for _, mask_buffer in sample.masks.items():
-                    mask = raw_array_to_pil_image(mask_buffer)
-
-            if (
-                hasattr(sample, "additional_images")
-                and "masked_image" in sample.additional_images
-            ):
-                masked_image = raw_array_to_pil_image(
-                    sample.AdditionalImages["masked_image"]
-                )
+            # Images are already PIL by default
 
             # Bring the latents to numpy
             if hasattr(sample, "latents"):

{datago-2025.8.1 → datago-2025.12.1}/python/benchmark_filesystem.py

@@ -1,8 +1,11 @@
-import
-from tqdm import tqdm
+import json
 import os
+import time
+
 import typer
+from benchmark_defaults import IMAGE_CONFIG
 from dataset import DatagoIterDataset
+from tqdm import tqdm
 
 
 def benchmark(
@@ -14,12 +17,30 @@
         False, help="Crop and resize the images on the fly"
     ),
     compare_torch: bool = typer.Option(True, help="Compare against torch dataloader"),
+    num_workers: int = typer.Option(os.cpu_count(), help="Number of workers to use"),
+    sweep: bool = typer.Option(False, help="Sweep over the number of workers"),
 ):
-
+    if sweep:
+        results_sweep = {}
+        for num_workers in range(2, (os.cpu_count() * 2 or 2), 2):
+            results_sweep[num_workers] = benchmark(
+                root_path, limit, crop_and_resize, compare_torch, num_workers, False
+            )
+
+        # Save results to a json file
+
+        with open("benchmark_results_filesystem.json", "w") as f:
+            json.dump(results_sweep, f, indent=2)
+
+        return results_sweep
+
     print(
-        "
+        f"Running benchmark for {root_path} - {limit} samples - {num_workers} workers"
     )
 
+    # This setting is not exposed in the config, but an env variable can be used instead
+    os.environ["DATAGO_MAX_TASKS"] = str(num_workers)
+
     client_config = {
         "source_type": "file",
         "source_config": {
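
Not part of the package, but a small sketch of reading the sweep output written above; the per-worker-count result shape follows the `results` dicts this script builds further down, and JSON round-tripping turns the worker counts into string keys:

```python
import json

with open("benchmark_results_filesystem.json") as f:
    sweep = json.load(f)

for workers in sorted(sweep, key=int):  # keys are strings after json.dump
    res = sweep[workers]
    line = f"{workers:>3} workers: datago {res['datago']['fps']:.1f} fps"
    if "torch" in res:  # only present when the torch comparison ran
        line += f" | torch {res['torch']['fps']:.1f} fps"
    print(line)
```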
@@ -27,19 +48,14 @@
             "rank": 0,
             "world_size": 1,
         },
-        "image_config": {
-            "crop_and_resize": crop_and_resize,
-            "default_image_size": 1024,
-            "downsampling_ratio": 32,
-            "min_aspect_ratio": 0.5,
-            "max_aspect_ratio": 2.0,
-            "pre_encode_images": False,
-        },
         "prefetch_buffer_size": 256,
         "samples_buffer_size": 256,
         "limit": limit,
     }
 
+    if crop_and_resize:
+        client_config["image_config"] = IMAGE_CONFIG
+
     # Make sure in the following that we compare apples to apples, meaning in that case
     # that we materialize the payloads in the python scope in the expected format
     # (PIL.Image for images and masks for instance, numpy arrays for latents)
@@ -48,14 +64,20 @@
 
     img = None
     count = 0
-    for sample in tqdm(datago_dataset, dynamic_ncols=True):
+    for sample in tqdm(datago_dataset, desc="Datago", dynamic_ncols=True):
         assert sample["id"] != ""
         img = sample["image"]
+
+        if count < limit - 1:
+            del img
+            img = None  # Help with memory pressure
+
         count += 1
 
     assert count == limit, f"Expected {limit} samples, got {count}"
     fps = limit / (time.time() - start)
-
+    results = {"datago": {"fps": fps, "count": count}}
+    print(f"Datago - FPS {fps:.2f} - workers {num_workers}")
     del datago_dataset
 
     # Save the last image as a test
@@ -64,10 +86,9 @@
 
     # Let's compare against a classic pytorch dataloader
    if compare_torch:
-        from torchvision import datasets, transforms  # type: ignore
         from torch.utils.data import DataLoader
+        from torchvision import datasets, transforms  # type: ignore
 
-        print("Benchmarking torch dataloader")
         # Define the transformations to apply to each image
         transform = (
             transforms.Compose(
@@ -88,7 +109,6 @@
 
         # Create a DataLoader to allow for multiple workers
         # Use available CPU count for num_workers
-        num_workers = os.cpu_count() or 8  # Default to 8 if cpu_count returns None
         dataloader = DataLoader(
             dataset,
             batch_size=1,
@@ -100,12 +120,17 @@
         # Iterate over the DataLoader
         start = time.time()
         n_images = 0
-        for batch in tqdm(dataloader, dynamic_ncols=True):
+        for batch in tqdm(dataloader, desc="Torch", dynamic_ncols=True):
             n_images += len(batch)
             if n_images > limit:
                 break
+
+            del batch  # Help with memory pressure, same as above
         fps = n_images / (time.time() - start)
-
+        results["torch"] = {"fps": fps, "count": n_images}
+        print(f"Torch - FPS {fps:.2f} - workers {num_workers}")
+
+    return results
 
 
 if __name__ == "__main__":