datago 2025.8.1__tar.gz → 2025.12.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {datago-2025.8.1 → datago-2025.12.1}/Cargo.lock +73 -19
  2. {datago-2025.8.1 → datago-2025.12.1}/Cargo.toml +3 -2
  3. {datago-2025.8.1 → datago-2025.12.1}/PKG-INFO +62 -12
  4. {datago-2025.8.1 → datago-2025.12.1}/README.md +60 -10
  5. datago-2025.12.1/assets/epyc_vast.png +0 -0
  6. datago-2025.12.1/assets/epyc_wds.png +0 -0
  7. datago-2025.12.1/assets/zen3_ssd.png +0 -0
  8. {datago-2025.8.1 → datago-2025.12.1}/pyproject.toml +1 -1
  9. {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_db.py +16 -27
  10. datago-2025.12.1/python/benchmark_defaults.py +8 -0
  11. {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_filesystem.py +44 -19
  12. {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_webdataset.py +34 -20
  13. {datago-2025.8.1 → datago-2025.12.1}/python/dataset.py +8 -3
  14. datago-2025.12.1/python/raw_types.py +40 -0
  15. {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_client.py +9 -4
  16. {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_db.py +39 -32
  17. {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_edge_cases.py +6 -4
  18. {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_filesystem.py +7 -5
  19. datago-2025.12.1/python/test_pil_implicit_conversion.py +80 -0
  20. {datago-2025.8.1 → datago-2025.12.1}/src/client.rs +43 -23
  21. {datago-2025.8.1 → datago-2025.12.1}/src/generator_files.rs +7 -4
  22. {datago-2025.8.1 → datago-2025.12.1}/src/generator_http.rs +9 -6
  23. {datago-2025.8.1 → datago-2025.12.1}/src/generator_wds.rs +38 -21
  24. {datago-2025.8.1 → datago-2025.12.1}/src/image_processing.rs +111 -27
  25. {datago-2025.8.1 → datago-2025.12.1}/src/lib.rs +3 -1
  26. {datago-2025.8.1 → datago-2025.12.1}/src/main.rs +6 -2
  27. {datago-2025.8.1 → datago-2025.12.1}/src/structs.rs +182 -10
  28. {datago-2025.8.1 → datago-2025.12.1}/src/worker_files.rs +97 -53
  29. {datago-2025.8.1 → datago-2025.12.1}/src/worker_http.rs +149 -36
  30. {datago-2025.8.1 → datago-2025.12.1}/src/worker_wds.rs +30 -25
  31. datago-2025.8.1/python/raw_types.py +0 -65
  32. {datago-2025.8.1 → datago-2025.12.1}/.github/workflows/ci-cd.yml +0 -0
  33. {datago-2025.8.1 → datago-2025.12.1}/.github/workflows/rust.yml +0 -0
  34. {datago-2025.8.1 → datago-2025.12.1}/.gitignore +0 -0
  35. {datago-2025.8.1 → datago-2025.12.1}/.pre-commit-config.yaml +0 -0
  36. {datago-2025.8.1 → datago-2025.12.1}/LICENSE +0 -0
  37. {datago-2025.8.1 → datago-2025.12.1}/assets/447175851-2277afcb-8abf-4d17-b2db-dae27c6056d0.png +0 -0
  38. {datago-2025.8.1 → datago-2025.12.1}/requirements-tests.txt +0 -0
  39. {datago-2025.8.1 → datago-2025.12.1}/requirements.txt +0 -0
@@ -111,6 +111,16 @@ version = "0.7.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
+ [[package]]
+ name = "assert-json-diff"
+ version = "2.0.2"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
+ dependencies = [
+ "serde",
+ "serde_json",
+ ]
+
  [[package]]
  name = "async-channel"
  version = "1.9.0"
@@ -464,7 +474,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02"
  dependencies = [
  "smallvec",
- "target-lexicon 0.12.16",
+ "target-lexicon",
  ]
 
  [[package]]
@@ -613,7 +623,7 @@ dependencies = [
 
  [[package]]
  name = "datago"
- version = "2025.8.1"
+ version = "2025.12.1"
  dependencies = [
  "async-compression",
  "async-tar",
@@ -644,8 +654,27 @@ dependencies = [
  "tokio-util",
  "url",
  "walkdir",
+ "wiremock",
+ ]
+
+ [[package]]
+ name = "deadpool"
+ version = "0.10.0"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "fb84100978c1c7b37f09ed3ce3e5f843af02c2a2c431bae5b19230dad2c1b490"
+ dependencies = [
+ "async-trait",
+ "deadpool-runtime",
+ "num_cpus",
+ "tokio",
  ]
 
+ [[package]]
+ name = "deadpool-runtime"
+ version = "0.1.4"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b"
+
  [[package]]
  name = "dirs-next"
  version = "2.0.0"
@@ -1118,6 +1147,12 @@ version = "1.10.1"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
 
+ [[package]]
+ name = "httpdate"
+ version = "1.0.3"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
  [[package]]
  name = "hyper"
  version = "1.6.0"
@@ -1131,6 +1166,7 @@ dependencies = [
  "http",
  "http-body",
  "httparse",
+ "httpdate",
  "itoa",
  "pin-project-lite",
  "smallvec",
@@ -2026,9 +2062,9 @@ dependencies = [
 
  [[package]]
  name = "pyo3"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229"
+ checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
  dependencies = [
  "cfg-if",
  "indoc",
@@ -2044,19 +2080,19 @@ dependencies = [
 
  [[package]]
  name = "pyo3-build-config"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1"
+ checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
  dependencies = [
  "once_cell",
- "target-lexicon 0.13.2",
+ "target-lexicon",
  ]
 
  [[package]]
  name = "pyo3-ffi"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc"
+ checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
  dependencies = [
  "libc",
  "pyo3-build-config",
@@ -2064,9 +2100,9 @@ dependencies = [
 
  [[package]]
  name = "pyo3-macros"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44"
+ checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
  dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
@@ -2076,9 +2112,9 @@ dependencies = [
 
  [[package]]
  name = "pyo3-macros-backend"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855"
+ checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
  dependencies = [
  "heck",
  "proc-macro2",
@@ -2753,12 +2789,6 @@ version = "0.12.16"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
 
- [[package]]
- name = "target-lexicon"
- version = "0.13.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
-
  [[package]]
  name = "tempfile"
  version = "3.20.0"
@@ -3454,6 +3484,30 @@ dependencies = [
  "memchr",
  ]
 
+ [[package]]
+ name = "wiremock"
+ version = "0.6.4"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "a2b8b99d4cdbf36b239a9532e31fe4fb8acc38d1897c1761e161550a7dc78e6a"
+ dependencies = [
+ "assert-json-diff",
+ "async-trait",
+ "base64",
+ "deadpool",
+ "futures",
+ "http",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "log",
+ "once_cell",
+ "regex",
+ "serde",
+ "serde_json",
+ "tokio",
+ "url",
+ ]
+
  [[package]]
  name = "wit-bindgen-rt"
  version = "0.33.0"
@@ -1,7 +1,7 @@
  [package]
  name = "datago"
  edition = "2021"
- version = "2025.8.1"
+ version = "2025.12.1"
  readme = "README.md"
 
  [lib]
@@ -24,7 +24,7 @@ kanal = "0.1"
  clap = { version = "4.5.27", features = ["derive"] }
  tokio = { version = "1.43.1", features = ["rt-multi-thread", "macros"] }
  prettytable-rs = "0.10.0"
- pyo3 = { version = "0.24.1", features = ["extension-module"] }
+ pyo3 = { version = "0.22", features = ["extension-module"] }
  threadpool = "1.8.1"
  openssl = { version = "0.10", features = ["vendored"] }
  walkdir = "2.5.0"
@@ -46,6 +46,7 @@ fast_image_resize = { version ="5.1.3", features=["image"]}
 
  [dev-dependencies]
  tempfile = "3.13.0"
+ wiremock = "0.6.0"
 
  [profile.release]
  opt-level = 3
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datago
- Version: 2025.8.1
+ Version: 2025.12.1
  Classifier: Programming Language :: Rust
  Classifier: Programming Language :: Python :: Implementation :: CPython
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -8,7 +8,7 @@ Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: MIT License
  License-File: LICENSE
  Summary: A high performance dataloader for Python, written in Rust
- Author: Benjamin Lefaudeux
+ Author: Benjamin Lefaudeux, Roman Frigg
  Author-email: Photoroom <team@photoroom.com>
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
@@ -97,7 +97,7 @@ config = {
  "source_config": {
  "root_path": "myPath",
  "random_sampling": False, # True if used directly for training
- "rank": 0,
+ "rank": 0, # Optional, distributed workloads are possible
  "world_size": 1,
  },
  "limit": 200,
@@ -137,15 +137,6 @@ client_config = {
  "rank": 0,
  "world_size": 1,
  },
- # Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
- "image_config": {
- "crop_and_resize": True, # False to turn it off, or just omit this part of the config
- "default_image_size": 1024,
- "downsampling_ratio": 32,
- "min_aspect_ratio": 0.5,
- "max_aspect_ratio": 2.0,
- "pre_encode_images": False,
- },
  "prefetch_buffer_size": 128,
  "samples_buffer_size": 64,
  "limit": 1_000_000, # Dummy example, max number of samples you would like to serve
@@ -159,6 +150,38 @@ for _ in range(10):
 
  </details>
 
+ ## Process images on the fly
+
+ Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration, as in the example below.
+
+ Processing can be very CPU heavy, but it will be distributed over all CPU cores without requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
+
+ There are three main processing options to choose from:
+
+ - crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
+ - resize the images (the setting here refers to the square aspect ratio bucket, other buckets will differ of course)
+ - pre-encode the images to a specific format (jpg, png, ...)
+
+ ```python
+ config = {
+ "source_type": "file",
+ "source_config": {
+ "root_path": "myPath",
+ "random_sampling": False, # True if used directly for training
+ },
+ # Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
+ "image_config": {
+ "crop_and_resize": True, # False to turn it off, or just omit this part of the config
+ "default_image_size": 1024,
+ "downsampling_ratio": 32,
+ "min_aspect_ratio": 0.5,
+ "max_aspect_ratio": 2.0,
+ "pre_encode_images": False,
+ },
+ "limit": 200,
+ "samples_buffer_size": 32,
+ }
+ ```
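Consuming the processed samples works the same as before; since images come back as PIL by default, a quick sketch (reusing the `config` just above) that tallies the aspect ratio buckets actually produced — the attribute set on a sample depends on the source, so treat this as illustrative:

```python
import json
from collections import Counter

from datago import DatagoClient  # type: ignore

client = DatagoClient(json.dumps(config))  # `config` from the example above
client.start()  # optional, but reduces latency to the first sample

buckets: Counter = Counter()
for _ in range(200):
    sample = client.get_sample()
    if sample.id:
        # PIL .size is (width, height); with crop_and_resize on, only a
        # handful of aspect ratio buckets should show up here
        buckets[sample.image.size] += 1

print(buckets.most_common())
```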
 
  ## Match the raw exported buffers with typical python types
 
@@ -171,6 +194,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
 
  When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also an `initialize_logging` function in the `datago` module which, if called before using a client, allows customizing the log level. This only works if RUST_LOG is not set.
 
+ ## Env variables
+
+ A couple of env variables change the behavior of the library; they cover settings which felt too low level to be exposed in the config.
+
+ - `DATAGO_MAX_TASKS`: the number of threads used to load the samples. Defaults to a multiple of the CPU cores.
+ - `RUST_LOG`: see above; changes the log level for the whole library, useful for debugging or when reporting an issue here.
+ - `DATAGO_MAX_RETRIES`: number of retries for a failed sample load. Defaults to 3.
+
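These are plain process environment variables, so they can be set from the shell or from Python before the client is created — `benchmark_filesystem.py` below does exactly that for `DATAGO_MAX_TASKS`. A small sketch (the values are arbitrary):

```python
import os

# Set before instantiating DatagoClient so the Rust side picks them up
os.environ["DATAGO_MAX_TASKS"] = "16"   # cap the loader thread count
os.environ["DATAGO_MAX_RETRIES"] = "5"  # more tolerance for flaky storage
os.environ["RUST_LOG"] = "debug"        # verbose logging from the Rust side
```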
  </details><details> <summary><strong>Build it</strong></summary>
 
  ## Preamble
@@ -233,6 +264,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
 
  </details>
 
+ <details> <summary><strong>Benchmarks</strong></summary>
+ As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
+
+ In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is on the images and the higher quality they are, the more Datago will shine. The following benchmarks use ImageNet 1k, which is very low resolution and thus something of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
+
+ ### AMD Zen3 laptop - IN1k - disk
+ ![AMD Zen3 laptop & M2 SSD](assets/zen3_ssd.png)
+
+ ### AMD EPYC 9454 - IN1k - disk
+ ![AMD EPYC 9454](assets/epyc_vast.png)
+
+ The following benchmark uses the PD12M dataset, 12M images with many at high resolution. It's accessed through the webdataset front end, and datago is compared with the popular python webdataset library. Note that datago starts streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
+
+ ### AMD EPYC 9454 - pd12m - webdataset
+ ![AMD EPYC 9454](assets/epyc_wds.png)
+
+ </details>
+
  ## License
 
  MIT License
@@ -80,7 +80,7 @@ config = {
  "source_config": {
  "root_path": "myPath",
  "random_sampling": False, # True if used directly for training
- "rank": 0,
+ "rank": 0, # Optional, distributed workloads are possible
  "world_size": 1,
  },
  "limit": 200,
@@ -120,15 +120,6 @@ client_config = {
  "rank": 0,
  "world_size": 1,
  },
- # Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
- "image_config": {
- "crop_and_resize": True, # False to turn it off, or just omit this part of the config
- "default_image_size": 1024,
- "downsampling_ratio": 32,
- "min_aspect_ratio": 0.5,
- "max_aspect_ratio": 2.0,
- "pre_encode_images": False,
- },
  "prefetch_buffer_size": 128,
  "samples_buffer_size": 64,
  "limit": 1_000_000, # Dummy example, max number of samples you would like to serve
@@ -142,6 +133,38 @@ for _ in range(10):
 
  </details>
 
+ ## Process images on the fly
+
+ Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration, as in the example below.
+
+ Processing can be very CPU heavy, but it will be distributed over all CPU cores without requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
+
+ There are three main processing options to choose from:
+
+ - crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
+ - resize the images (the setting here refers to the square aspect ratio bucket, other buckets will differ of course)
+ - pre-encode the images to a specific format (jpg, png, ...)
+
+ ```python
+ config = {
+ "source_type": "file",
+ "source_config": {
+ "root_path": "myPath",
+ "random_sampling": False, # True if used directly for training
+ },
+ # Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
+ "image_config": {
+ "crop_and_resize": True, # False to turn it off, or just omit this part of the config
+ "default_image_size": 1024,
+ "downsampling_ratio": 32,
+ "min_aspect_ratio": 0.5,
+ "max_aspect_ratio": 2.0,
+ "pre_encode_images": False,
+ },
+ "limit": 200,
+ "samples_buffer_size": 32,
+ }
+ ```
 
  ## Match the raw exported buffers with typical python types
 
@@ -154,6 +177,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
 
  When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also an `initialize_logging` function in the `datago` module which, if called before using a client, allows customizing the log level. This only works if RUST_LOG is not set.
 
+ ## Env variables
+
+ A couple of env variables change the behavior of the library; they cover settings which felt too low level to be exposed in the config.
+
+ - `DATAGO_MAX_TASKS`: the number of threads used to load the samples. Defaults to a multiple of the CPU cores.
+ - `RUST_LOG`: see above; changes the log level for the whole library, useful for debugging or when reporting an issue here.
+ - `DATAGO_MAX_RETRIES`: number of retries for a failed sample load. Defaults to 3.
+
  </details><details> <summary><strong>Build it</strong></summary>
 
  ## Preamble
@@ -216,6 +247,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
 
  </details>
 
+ <details> <summary><strong>Benchmarks</strong></summary>
+ As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
+
+ In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is on the images and the higher quality they are, the more Datago will shine. The following benchmarks use ImageNet 1k, which is very low resolution and thus something of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
+
+ ### AMD Zen3 laptop - IN1k - disk
+ ![AMD Zen3 laptop & M2 SSD](assets/zen3_ssd.png)
+
+ ### AMD EPYC 9454 - IN1k - disk
+ ![AMD EPYC 9454](assets/epyc_vast.png)
+
+ The following benchmark uses the PD12M dataset, 12M images with many at high resolution. It's accessed through the webdataset front end, and datago is compared with the popular python webdataset library. Note that datago starts streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
+
+ ### AMD EPYC 9454 - pd12m - webdataset
+ ![AMD EPYC 9454](assets/epyc_wds.png)
+
+ </details>
+
  ## License
 
  MIT License
Binary files not shown (new assets: assets/epyc_vast.png, assets/epyc_wds.png, assets/zen3_ssd.png)
@@ -2,7 +2,7 @@
  name = "datago"
  dynamic = ["version"]
  authors = [
- { name = "Benjamin Lefaudeux" },
+ { name = "Benjamin Lefaudeux, Roman Frigg" },
  { name = "Photoroom", email = "team@photoroom.com" }
  ]
  description = "A high performance dataloader for Python, written in Rust"
@@ -1,11 +1,13 @@
- from datago import DatagoClient # type: ignore
+ import json
  import time
- from tqdm import tqdm
+
  import numpy as np
- from raw_types import raw_array_to_pil_image, raw_array_to_numpy
  import typer
- import json
+ from benchmark_defaults import IMAGE_CONFIG
+ from datago import DatagoClient # type: ignore
  from PIL import Image
+ from raw_types import raw_array_to_numpy
+ from tqdm import tqdm
 
 
  def benchmark(
@@ -31,19 +33,20 @@ def benchmark(
  "rank": 0,
  "world_size": 1,
  },
- "image_config": {
- "crop_and_resize": crop_and_resize,
- "default_image_size": 1024,
- "downsampling_ratio": 32,
- "min_aspect_ratio": 0.5,
- "max_aspect_ratio": 2.0,
- "pre_encode_images": encode_images,
- },
  "prefetch_buffer_size": 128,
  "samples_buffer_size": 64,
  "limit": limit,
  }
 
+ if crop_and_resize or encode_images:
+ client_config["image_config"] = IMAGE_CONFIG
+
+ if encode_images:
+ client_config["image_config"]["crop_and_resize"] = ( # type: ignore
+ crop_and_resize # You may want to encode images without resizing them
+ )
+ client_config["image_config"]["pre_encode_images"] = True # type: ignore
+
  client = DatagoClient(json.dumps(client_config))
  client.start() # Optional, but good practice to start the client to reduce latency to first sample (while you're instantiating models for instance)
  start = time.time()
@@ -55,21 +58,7 @@ def benchmark(
  for _ in tqdm(range(limit), dynamic_ncols=True):
  sample = client.get_sample()
  if sample.id:
- # Bring the masks and image to PIL
- if hasattr(sample, "image"):
- img = raw_array_to_pil_image(sample.image)
-
- if hasattr(sample, "masks"):
- for _, mask_buffer in sample.masks.items():
- mask = raw_array_to_pil_image(mask_buffer)
-
- if (
- hasattr(sample, "additional_images")
- and "masked_image" in sample.additional_images
- ):
- masked_image = raw_array_to_pil_image(
- sample.AdditionalImages["masked_image"]
- )
+ # Images are already PIL by default
 
  # Bring the latents to numpy
  if hasattr(sample, "latents"):
@@ -0,0 +1,8 @@
+ IMAGE_CONFIG = {
+ "crop_and_resize": True,
+ "default_image_size": 1024,
+ "downsampling_ratio": 32,
+ "min_aspect_ratio": 0.5,
+ "max_aspect_ratio": 2.0,
+ "pre_encode_images": False,
+ }
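Note that `IMAGE_CONFIG` is a shared module-level dict, which `benchmark_db.py` mutates in place for the pre-encoding case; if you borrow this pattern and want per-run tweaks without touching the shared defaults, a copy-based override is safer — a sketch, not part of the package:

```python
from benchmark_defaults import IMAGE_CONFIG

client_config = {
    "source_type": "file",
    "source_config": {"root_path": "myPath", "random_sampling": False},
}
# Copy before overriding so other importers keep the pristine defaults
client_config["image_config"] = {**IMAGE_CONFIG, "pre_encode_images": True}
```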
@@ -1,8 +1,11 @@
- import time
- from tqdm import tqdm
+ import json
  import os
+ import time
+
  import typer
+ from benchmark_defaults import IMAGE_CONFIG
  from dataset import DatagoIterDataset
+ from tqdm import tqdm
 
 
  def benchmark(
@@ -14,12 +17,30 @@ def benchmark(
  False, help="Crop and resize the images on the fly"
  ),
  compare_torch: bool = typer.Option(True, help="Compare against torch dataloader"),
+ num_workers: int = typer.Option(os.cpu_count(), help="Number of workers to use"),
+ sweep: bool = typer.Option(False, help="Sweep over the number of workers"),
  ):
- print(f"Running benchmark for {root_path} - {limit} samples")
+ if sweep:
+ results_sweep = {}
+ for num_workers in range(2, (os.cpu_count() * 2 or 2), 2):
+ results_sweep[num_workers] = benchmark(
+ root_path, limit, crop_and_resize, compare_torch, num_workers, False
+ )
+
+ # Save results to a json file
+
+ with open("benchmark_results_filesystem.json", "w") as f:
+ json.dump(results_sweep, f, indent=2)
+
+ return results_sweep
+
  print(
- "Please run the benchmark twice if you want to compare against torch dataloader, so that file caching affects both paths"
+ f"Running benchmark for {root_path} - {limit} samples - {num_workers} workers"
  )
 
+ # This setting is not exposed in the config, but an env variable can be used instead
+ os.environ["DATAGO_MAX_TASKS"] = str(num_workers)
+
  client_config = {
  "source_type": "file",
  "source_config": {
@@ -27,19 +48,14 @@ def benchmark(
  "rank": 0,
  "world_size": 1,
  },
- "image_config": {
- "crop_and_resize": crop_and_resize,
- "default_image_size": 1024,
- "downsampling_ratio": 32,
- "min_aspect_ratio": 0.5,
- "max_aspect_ratio": 2.0,
- "pre_encode_images": False,
- },
  "prefetch_buffer_size": 256,
  "samples_buffer_size": 256,
  "limit": limit,
  }
 
+ if crop_and_resize:
+ client_config["image_config"] = IMAGE_CONFIG
+
  # Make sure in the following that we compare apples to apples, meaning in that case
  # that we materialize the payloads in the python scope in the expected format
  # (PIL.Image for images and masks for instance, numpy arrays for latents)
@@ -48,14 +64,20 @@ def benchmark(
 
  img = None
  count = 0
- for sample in tqdm(datago_dataset, dynamic_ncols=True):
+ for sample in tqdm(datago_dataset, desc="Datago", dynamic_ncols=True):
  assert sample["id"] != ""
  img = sample["image"]
+
+ if count < limit - 1:
+ del img
+ img = None # Help with memory pressure
+
  count += 1
 
  assert count == limit, f"Expected {limit} samples, got {count}"
  fps = limit / (time.time() - start)
- print(f"Datago FPS {fps:.2f}")
+ results = {"datago": {"fps": fps, "count": count}}
+ print(f"Datago - FPS {fps:.2f} - workers {num_workers}")
  del datago_dataset
 
  # Save the last image as a test
@@ -64,10 +86,9 @@ def benchmark(
 
  # Let's compare against a classic pytorch dataloader
  if compare_torch:
- from torchvision import datasets, transforms # type: ignore
  from torch.utils.data import DataLoader
+ from torchvision import datasets, transforms # type: ignore
 
- print("Benchmarking torch dataloader")
  # Define the transformations to apply to each image
  transform = (
  transforms.Compose(
@@ -88,7 +109,6 @@ def benchmark(
 
  # Create a DataLoader to allow for multiple workers
  # Use available CPU count for num_workers
- num_workers = os.cpu_count() or 8 # Default to 8 if cpu_count returns None
  dataloader = DataLoader(
  dataset,
  batch_size=1,
@@ -100,12 +120,17 @@ def benchmark(
  # Iterate over the DataLoader
  start = time.time()
  n_images = 0
- for batch in tqdm(dataloader, dynamic_ncols=True):
+ for batch in tqdm(dataloader, desc="Torch", dynamic_ncols=True):
  n_images += len(batch)
  if n_images > limit:
  break
+
+ del batch # Help with memory pressure, same as above
  fps = n_images / (time.time() - start)
- print(f"Torch FPS {fps:.2f}")
+ results["torch"] = {"fps": fps, "count": n_images}
+ print(f"Torch - FPS {fps:.2f} - workers {num_workers}")
+
+ return results
 
 
  if __name__ == "__main__":
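The sweep dumps one entry per worker count into `benchmark_results_filesystem.json`, with `datago` (and, when `compare_torch` is on, `torch`) FPS figures. A throwaway sketch for plotting the sweep afterwards — matplotlib is assumed here and is not a dependency of this package:

```python
import json

import matplotlib.pyplot as plt

with open("benchmark_results_filesystem.json") as f:
    results = json.load(f)  # {"2": {"datago": {...}, "torch": {...}}, ...}

workers = sorted(results, key=int)  # JSON stringified the int keys
xs = [int(w) for w in workers]
plt.plot(xs, [results[w]["datago"]["fps"] for w in workers], label="datago")
if all("torch" in results[w] for w in workers):  # only present with compare_torch
    plt.plot(xs, [results[w]["torch"]["fps"] for w in workers], label="torch")
plt.xlabel("workers")
plt.ylabel("images / s")
plt.legend()
plt.show()
```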