datago 2025.6.2__tar.gz → 2025.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {datago-2025.6.2 → datago-2025.6.4}/Cargo.lock +8 -7
  2. {datago-2025.6.2 → datago-2025.6.4}/Cargo.toml +16 -1
  3. {datago-2025.6.2 → datago-2025.6.4}/PKG-INFO +1 -1
  4. datago-2025.6.4/python/test_datago_client.py +444 -0
  5. datago-2025.6.4/python/test_datago_edge_cases.py +428 -0
  6. {datago-2025.6.2 → datago-2025.6.4}/src/client.rs +2 -0
  7. datago-2025.6.4/src/generator_files.rs +472 -0
  8. {datago-2025.6.2 → datago-2025.6.4}/src/image_processing.rs +354 -0
  9. datago-2025.6.4/src/structs.rs +406 -0
  10. datago-2025.6.4/src/worker_files.rs +460 -0
  11. {datago-2025.6.2 → datago-2025.6.4}/src/worker_http.rs +7 -1
  12. datago-2025.6.2/src/generator_files.rs +0 -169
  13. datago-2025.6.2/src/structs.rs +0 -177
  14. datago-2025.6.2/src/worker_files.rs +0 -158
  15. {datago-2025.6.2 → datago-2025.6.4}/.github/workflows/ci-cd.yml +0 -0
  16. {datago-2025.6.2 → datago-2025.6.4}/.github/workflows/rust.yml +0 -0
  17. {datago-2025.6.2 → datago-2025.6.4}/.gitignore +0 -0
  18. {datago-2025.6.2 → datago-2025.6.4}/.pre-commit-config.yaml +0 -0
  19. {datago-2025.6.2 → datago-2025.6.4}/LICENSE +0 -0
  20. {datago-2025.6.2 → datago-2025.6.4}/README.md +0 -0
  21. {datago-2025.6.2 → datago-2025.6.4}/assets/447175851-2277afcb-8abf-4d17-b2db-dae27c6056d0.png +0 -0
  22. {datago-2025.6.2 → datago-2025.6.4}/pyproject.toml +0 -0
  23. {datago-2025.6.2 → datago-2025.6.4}/python/benchmark_db.py +0 -0
  24. {datago-2025.6.2 → datago-2025.6.4}/python/benchmark_filesystem.py +0 -0
  25. {datago-2025.6.2 → datago-2025.6.4}/python/benchmark_webdataset.py +0 -0
  26. {datago-2025.6.2 → datago-2025.6.4}/python/dataset.py +0 -0
  27. {datago-2025.6.2 → datago-2025.6.4}/python/raw_types.py +0 -0
  28. {datago-2025.6.2 → datago-2025.6.4}/python/test_datago_db.py +0 -0
  29. {datago-2025.6.2 → datago-2025.6.4}/python/test_datago_filesystem.py +0 -0
  30. {datago-2025.6.2 → datago-2025.6.4}/requirements-tests.txt +0 -0
  31. {datago-2025.6.2 → datago-2025.6.4}/requirements.txt +0 -0
  32. {datago-2025.6.2 → datago-2025.6.4}/src/generator_http.rs +0 -0
  33. {datago-2025.6.2 → datago-2025.6.4}/src/generator_wds.rs +0 -0
  34. {datago-2025.6.2 → datago-2025.6.4}/src/lib.rs +0 -0
  35. {datago-2025.6.2 → datago-2025.6.4}/src/main.rs +0 -0
  36. {datago-2025.6.2 → datago-2025.6.4}/src/worker_wds.rs +0 -0
@@ -500,7 +500,7 @@ dependencies = [
500
500
 
501
501
  [[package]]
502
502
  name = "datago"
503
- version = "2025.6.2"
503
+ version = "2025.6.4"
504
504
  dependencies = [
505
505
  "async-compression",
506
506
  "async-tar",
@@ -525,6 +525,7 @@ dependencies = [
525
525
  "serde",
526
526
  "serde_json",
527
527
  "tar",
528
+ "tempfile",
528
529
  "threadpool",
529
530
  "tokio",
530
531
  "tokio-util",
@@ -1771,7 +1772,7 @@ checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94"
1771
1772
  dependencies = [
1772
1773
  "rand_chacha 0.9.0",
1773
1774
  "rand_core 0.9.3",
1774
- "zerocopy 0.8.23",
1775
+ "zerocopy 0.8.25",
1775
1776
  ]
1776
1777
 
1777
1778
  [[package]]
@@ -2981,11 +2982,11 @@ dependencies = [
2981
2982
 
2982
2983
  [[package]]
2983
2984
  name = "zerocopy"
2984
- version = "0.8.23"
2985
+ version = "0.8.25"
2985
2986
  source = "registry+https://github.com/rust-lang/crates.io-index"
2986
- checksum = "fd97444d05a4328b90e75e503a34bad781f14e28a823ad3557f0750df1ebcbc6"
2987
+ checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
2987
2988
  dependencies = [
2988
- "zerocopy-derive 0.8.23",
2989
+ "zerocopy-derive 0.8.25",
2989
2990
  ]
2990
2991
 
2991
2992
  [[package]]
@@ -3001,9 +3002,9 @@ dependencies = [
3001
3002
 
3002
3003
  [[package]]
3003
3004
  name = "zerocopy-derive"
3004
- version = "0.8.23"
3005
+ version = "0.8.25"
3005
3006
  source = "registry+https://github.com/rust-lang/crates.io-index"
3006
- checksum = "6352c01d0edd5db859a63e2605f4ea3183ddbd15e2c4a9e7d32184df75e4f154"
3007
+ checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
3007
3008
  dependencies = [
3008
3009
  "proc-macro2",
3009
3010
  "quote",
@@ -1,7 +1,7 @@
1
1
  [package]
2
2
  name = "datago"
3
3
  edition = "2021"
4
- version = "2025.6.2"
4
+ version = "2025.6.4"
5
5
 
6
6
  [lib]
7
7
  # exposed by pyo3
@@ -43,8 +43,23 @@ bracoxide = "0.1.6"
43
43
  infer = "0.16.0"
44
44
  fast_image_resize = { version ="5.1.3", features=["image"]}
45
45
 
46
+ [dev-dependencies]
47
+ tempfile = "3.13.0"
48
+
46
49
  [profile.release]
47
50
  opt-level = 3
48
51
  lto = "fat"
49
52
  codegen-units = 1
50
53
  panic = "abort"
54
+
55
+ [target.x86_64-apple-darwin]
56
+ rustflags = [
57
+ "-C", "link-arg=-undefined",
58
+ "-C", "link-arg=dynamic_lookup",
59
+ ]
60
+
61
+ [target.aarch64-apple-darwin]
62
+ rustflags = [
63
+ "-C", "link-arg=-undefined",
64
+ "-C", "link-arg=dynamic_lookup",
65
+ ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datago
3
- Version: 2025.6.2
3
+ Version: 2025.6.4
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -0,0 +1,444 @@
1
+ import json
2
+ import tempfile
3
+ import os
4
+ from datago import DatagoClient, initialize_logging
5
+ from PIL import Image
6
+
7
+
8
+ def create_test_images(directory, count=5):
9
+ """Helper function to create test images in a directory."""
10
+ image_paths = []
11
+ for i in range(count):
12
+ img = Image.new(
13
+ "RGB", (100, 100), color=(i * 50 % 255, (i * 100) % 255, (i * 150) % 255)
14
+ )
15
+ path = os.path.join(directory, f"test_image_{i}.png")
16
+ img.save(path)
17
+ image_paths.append(path)
18
+ return image_paths
19
+
20
+
21
+ class TestDatagoClient:
22
+ """Test cases for DatagoClient functionality."""
23
+
24
+ def test_initialize_logging(self):
25
+ """Test the initialize_logging function."""
26
+ # Should return True on first call
27
+ result = initialize_logging("info")
28
+ assert isinstance(result, bool)
29
+
30
+ # Test with None parameter
31
+ result = initialize_logging(None)
32
+ assert isinstance(result, bool)
33
+
34
+ def test_client_instantiation_file_source(self):
35
+ """Test creating a client with file source configuration."""
36
+ with tempfile.TemporaryDirectory() as tmpdir:
37
+ create_test_images(tmpdir, 3)
38
+
39
+ config = {
40
+ "source_type": "file",
41
+ "source_config": {
42
+ "root_path": tmpdir,
43
+ "rank": 0,
44
+ "world_size": 1,
45
+ "random_sampling": False,
46
+ },
47
+ "limit": 3,
48
+ "samples_buffer_size": 10,
49
+ }
50
+
51
+ client = DatagoClient(json.dumps(config))
52
+ assert client is not None
53
+
54
+ # The client currently panics if the config is invalid, so this test is commented out.
55
+ # Uncomment this test once invalid configurations are reported as errors instead of panicking.
56
+ # def test_client_instantiation_invalid_config(self):
57
+ # """Test that invalid configuration raises an error."""
58
+ # invalid_config = '{"invalid": "config"}'
59
+
60
+ # with pytest.raises((ValueError, RuntimeError)):
61
+ # DatagoClient(invalid_config)
62
+
63
+ def test_client_start_stop_file_source(self):
64
+ """Test starting and stopping client with file source."""
65
+ with tempfile.TemporaryDirectory() as tmpdir:
66
+ create_test_images(tmpdir, 3)
67
+
68
+ config = {
69
+ "source_type": "file",
70
+ "source_config": {
71
+ "root_path": tmpdir,
72
+ "rank": 0,
73
+ "world_size": 1,
74
+ "random_sampling": False,
75
+ },
76
+ "limit": 3,
77
+ "samples_buffer_size": 10,
78
+ }
79
+
80
+ client = DatagoClient(json.dumps(config))
81
+ client.start()
82
+ client.stop()
83
+
84
+ def test_get_sample_file_source(self):
85
+ """Test getting samples from file source."""
86
+ with tempfile.TemporaryDirectory() as tmpdir:
87
+ create_test_images(tmpdir, 5)
88
+
89
+ config = {
90
+ "source_type": "file",
91
+ "source_config": {
92
+ "root_path": tmpdir,
93
+ "rank": 0,
94
+ "world_size": 1,
95
+ "random_sampling": False,
96
+ },
97
+ "limit": 3,
98
+ "samples_buffer_size": 10,
99
+ }
100
+
101
+ client = DatagoClient(json.dumps(config))
102
+
103
+ samples_received = []
104
+ for _ in range(3):
105
+ sample = client.get_sample()
106
+ if sample:
107
+ samples_received.append(sample)
108
+ else:
109
+ break
110
+
111
+ assert len(samples_received) <= 3
112
+
113
+ for sample in samples_received:
114
+ assert sample.id != ""
115
+ assert sample.source == "filesystem"
116
+ assert sample.image.width > 0
117
+ assert sample.image.height > 0
118
+ assert len(sample.image.data) > 0
119
+
120
+ def test_client_with_image_transformations(self):
121
+ """Test client with image transformation configuration."""
122
+ with tempfile.TemporaryDirectory() as tmpdir:
123
+ create_test_images(tmpdir, 3)
124
+
125
+ config = {
126
+ "source_type": "file",
127
+ "source_config": {
128
+ "root_path": tmpdir,
129
+ "rank": 0,
130
+ "world_size": 1,
131
+ "random_sampling": False,
132
+ },
133
+ "image_config": {
134
+ "crop_and_resize": True,
135
+ "default_image_size": 64,
136
+ "downsampling_ratio": 16,
137
+ "min_aspect_ratio": 0.5,
138
+ "max_aspect_ratio": 2.0,
139
+ "pre_encode_images": False,
140
+ "image_to_rgb8": True,
141
+ },
142
+ "limit": 2,
143
+ "samples_buffer_size": 10,
144
+ }
145
+
146
+ client = DatagoClient(json.dumps(config))
147
+ sample = client.get_sample()
148
+
149
+ assert sample is not None
150
+ assert sample.image.width <= 64
151
+ assert sample.image.height <= 64
152
+ assert sample.image.channels == 3 # RGB8
153
+
154
+ def test_client_with_image_encoding(self):
155
+ """Test client with image encoding enabled."""
156
+ with tempfile.TemporaryDirectory() as tmpdir:
157
+ create_test_images(tmpdir, 3)
158
+
159
+ config = {
160
+ "source_type": "file",
161
+ "source_config": {
162
+ "root_path": tmpdir,
163
+ "rank": 0,
164
+ "world_size": 1,
165
+ "random_sampling": False,
166
+ },
167
+ "image_config": {
168
+ "crop_and_resize": False,
169
+ "pre_encode_images": True,
170
+ "image_to_rgb8": False,
171
+ },
172
+ "limit": 2,
173
+ "samples_buffer_size": 10,
174
+ }
175
+
176
+ client = DatagoClient(json.dumps(config))
177
+ sample = client.get_sample()
178
+
179
+ assert sample is not None
180
+ assert sample.image.channels == -1 # Encoded images have channels = -1
181
+ assert len(sample.image.data) > 0
182
+
183
+ def test_random_sampling(self):
184
+ """Test that random sampling produces different results."""
185
+ with tempfile.TemporaryDirectory() as tmpdir:
186
+ create_test_images(tmpdir, 10)
187
+
188
+ config_base = {
189
+ "source_type": "file",
190
+ "source_config": {
191
+ "root_path": tmpdir,
192
+ "rank": 0,
193
+ "world_size": 1,
194
+ "random_sampling": True,
195
+ },
196
+ "limit": 5,
197
+ "samples_buffer_size": 10,
198
+ }
199
+
200
+ # Get two sets of samples with random sampling
201
+ client1 = DatagoClient(json.dumps(config_base))
202
+ samples1 = []
203
+ for _ in range(5):
204
+ sample = client1.get_sample()
205
+ if sample:
206
+ samples1.append(sample.id)
207
+ else:
208
+ break
209
+
210
+ client2 = DatagoClient(json.dumps(config_base))
211
+ samples2 = []
212
+ for _ in range(5):
213
+ sample = client2.get_sample()
214
+ if sample:
215
+ samples2.append(sample.id)
216
+ else:
217
+ break
218
+
219
+ # With random sampling, the samples should be different
220
+ assert len(samples1) > 0
221
+ assert len(samples2) > 0
222
+ assert (
223
+ len(set(samples1) & set(samples2)) < 5
224
+ ) # Expect some overlap, but not all
225
+
226
+ def test_world_size_and_rank(self):
227
+ """Test that different ranks get different subsets of data."""
228
+ with tempfile.TemporaryDirectory() as tmpdir:
229
+ create_test_images(tmpdir, 10)
230
+
231
+ config_rank0 = {
232
+ "source_type": "file",
233
+ "source_config": {
234
+ "root_path": tmpdir,
235
+ "rank": 0,
236
+ "world_size": 2,
237
+ "random_sampling": False,
238
+ },
239
+ "limit": 10,
240
+ "samples_buffer_size": 10,
241
+ }
242
+
243
+ config_rank1 = {
244
+ "source_type": "file",
245
+ "source_config": {
246
+ "root_path": tmpdir,
247
+ "rank": 1,
248
+ "world_size": 2,
249
+ "random_sampling": False,
250
+ },
251
+ "limit": 10,
252
+ "samples_buffer_size": 10,
253
+ }
254
+
255
+ client0 = DatagoClient(json.dumps(config_rank0))
256
+ samples0 = []
257
+ for _ in range(10):
258
+ sample = client0.get_sample()
259
+ if sample:
260
+ samples0.append(sample.id)
261
+ else:
262
+ break
263
+
264
+ client1 = DatagoClient(json.dumps(config_rank1))
265
+ samples1 = []
266
+ for _ in range(10):
267
+ sample = client1.get_sample()
268
+ if sample:
269
+ samples1.append(sample.id)
270
+ else:
271
+ break
272
+
273
+ # Different ranks should get different samples
274
+ assert len(samples0) > 0
275
+ assert len(samples1) > 0
276
+
277
+ # No overlap between ranks
278
+ overlap = set(samples0) & set(samples1)
279
+ assert len(overlap) == 0
280
+
281
+ def test_limit_respected(self):
282
+ """Test that the client respects the limit parameter."""
283
+ with tempfile.TemporaryDirectory() as tmpdir:
284
+ create_test_images(tmpdir, 10)
285
+
286
+ config = {
287
+ "source_type": "file",
288
+ "source_config": {
289
+ "root_path": tmpdir,
290
+ "rank": 0,
291
+ "world_size": 1,
292
+ "random_sampling": False,
293
+ },
294
+ "limit": 3,
295
+ "samples_buffer_size": 10,
296
+ }
297
+
298
+ client = DatagoClient(json.dumps(config))
299
+ samples_received = 0
300
+
301
+ while True:
302
+ sample = client.get_sample()
303
+ if sample:
304
+ samples_received += 1
305
+ else:
306
+ break
307
+
308
+ # Safety valve to prevent infinite loop
309
+ if samples_received > 10:
310
+ break
311
+
312
+ # Should respect the limit (might have small buffer)
313
+ assert samples_received <= 4 # Allow small buffer
314
+
315
+ def test_empty_directory(self):
316
+ """Test client behavior with empty directory."""
317
+ with tempfile.TemporaryDirectory() as tmpdir:
318
+ config = {
319
+ "source_type": "file",
320
+ "source_config": {
321
+ "root_path": tmpdir,
322
+ "rank": 0,
323
+ "world_size": 1,
324
+ "random_sampling": False,
325
+ },
326
+ "limit": 3,
327
+ "samples_buffer_size": 10,
328
+ }
329
+
330
+ client = DatagoClient(json.dumps(config))
331
+ sample = client.get_sample()
332
+
333
+ # Should return None when no files available
334
+ assert sample is None
335
+
336
+ def test_nonexistent_directory(self):
337
+ """Test client behavior with nonexistent directory."""
338
+ config = {
339
+ "source_type": "file",
340
+ "source_config": {
341
+ "root_path": "/nonexistent/directory",
342
+ "rank": 0,
343
+ "world_size": 1,
344
+ "random_sampling": False,
345
+ },
346
+ "limit": 3,
347
+ "samples_buffer_size": 10,
348
+ }
349
+
350
+ client = DatagoClient(json.dumps(config))
351
+ sample = client.get_sample()
352
+
353
+ # Should handle gracefully and return None
354
+ assert sample is None
355
+
356
+ def test_client_drop_cleanup(self):
357
+ """Test that client cleans up properly when dropped."""
358
+ with tempfile.TemporaryDirectory() as tmpdir:
359
+ create_test_images(tmpdir, 3)
360
+
361
+ config = {
362
+ "source_type": "file",
363
+ "source_config": {
364
+ "root_path": tmpdir,
365
+ "rank": 0,
366
+ "world_size": 1,
367
+ "random_sampling": False,
368
+ },
369
+ "limit": 3,
370
+ "samples_buffer_size": 10,
371
+ }
372
+
373
+ client = DatagoClient(json.dumps(config))
374
+ client.start()
375
+
376
+ # Client should clean up when it goes out of scope
377
+ del client
378
+
379
+ def test_multiple_starts_stops(self):
380
+ """Test that multiple start/stop calls don't cause issues."""
381
+ with tempfile.TemporaryDirectory() as tmpdir:
382
+ create_test_images(tmpdir, 3)
383
+
384
+ config = {
385
+ "source_type": "file",
386
+ "source_config": {
387
+ "root_path": tmpdir,
388
+ "rank": 0,
389
+ "world_size": 1,
390
+ "random_sampling": False,
391
+ },
392
+ "limit": 3,
393
+ "samples_buffer_size": 10,
394
+ }
395
+
396
+ client = DatagoClient(json.dumps(config))
397
+
398
+ # Multiple starts should be safe
399
+ client.start()
400
+ client.start()
401
+
402
+ # Multiple stops should be safe
403
+ client.stop()
404
+ client.stop()
405
+
406
+ def test_various_image_formats(self):
407
+ """Test client with various image formats."""
408
+ with tempfile.TemporaryDirectory() as tmpdir:
409
+ # Create images with different formats
410
+ formats = [
411
+ ("test1.png", "PNG"),
412
+ ("test2.jpg", "JPEG"),
413
+ ]
414
+
415
+ for filename, format_name in formats:
416
+ img = Image.new("RGB", (50, 50), color="red")
417
+ path = os.path.join(tmpdir, filename)
418
+ img.save(path, format=format_name)
419
+
420
+ config = {
421
+ "source_type": "file",
422
+ "source_config": {
423
+ "root_path": tmpdir,
424
+ "rank": 0,
425
+ "world_size": 1,
426
+ "random_sampling": False,
427
+ },
428
+ "limit": 4,
429
+ "samples_buffer_size": 10,
430
+ }
431
+
432
+ client = DatagoClient(json.dumps(config))
433
+
434
+ samples_received = 0
435
+ while True:
436
+ sample = client.get_sample()
437
+ if sample:
438
+ samples_received += 1
439
+ assert sample.image.width == 50
440
+ assert sample.image.height == 50
441
+ else:
442
+ break
443
+
444
+ assert samples_received == len(formats)