turboloader 2.3.2__tar.gz → 2.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. {turboloader-2.3.2 → turboloader-2.3.4}/PKG-INFO +1 -1
  2. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/datasets/generate_synthetic.py +57 -57
  3. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/memory/bench_memory.py +78 -69
  4. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/throughput/bench_pytorch.py +73 -67
  5. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/throughput/bench_tbl_v2.py +39 -42
  6. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/throughput/bench_turboloader.py +92 -89
  7. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/throughput/bench_webdataset.py +74 -64
  8. turboloader-2.3.4/benchmarks/transforms/bench_transforms.py +478 -0
  9. turboloader-2.3.4/benchmarks/visualization/plot_results.py +480 -0
  10. {turboloader-2.3.2 → turboloader-2.3.4}/pyproject.toml +1 -1
  11. {turboloader-2.3.2 → turboloader-2.3.4}/setup.py +105 -93
  12. {turboloader-2.3.2 → turboloader-2.3.4}/turboloader/__init__.py +2 -2
  13. turboloader-2.3.2/benchmarks/transforms/bench_transforms.py +0 -453
  14. turboloader-2.3.2/benchmarks/visualization/plot_results.py +0 -427
  15. {turboloader-2.3.2 → turboloader-2.3.4}/AUTHORS.md +0 -0
  16. {turboloader-2.3.2 → turboloader-2.3.4}/CMakeLists.txt +0 -0
  17. {turboloader-2.3.2 → turboloader-2.3.4}/CONTRIBUTING.md +0 -0
  18. {turboloader-2.3.2 → turboloader-2.3.4}/LICENSE +0 -0
  19. {turboloader-2.3.2 → turboloader-2.3.4}/MANIFEST.in +0 -0
  20. {turboloader-2.3.2 → turboloader-2.3.4}/README.md +0 -0
  21. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/01_pil_baseline.py +0 -0
  22. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/02_pytorch_naive.py +0 -0
  23. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/03_pytorch_optimized.py +0 -0
  24. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/04_pytorch_cached.py +0 -0
  25. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/05_turboloader.py +0 -0
  26. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/06_ffcv.py +0 -0
  27. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/07_dali.py +0 -0
  28. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/08_tensorflow.py +0 -0
  29. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/09_resnet50_training.py +0 -0
  30. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/BENCHMARK_PLAN.md +0 -0
  31. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/benchmark_advanced_transforms.py +0 -0
  32. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/comprehensive_benchmark.py +0 -0
  33. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/final_comprehensive_benchmark.py +0 -0
  34. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/format_converter_benchmark.py +0 -0
  35. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/generate_web_data.py +0 -0
  36. {turboloader-2.3.2 → turboloader-2.3.4}/benchmarks/run_all_benchmarks.py +0 -0
  37. {turboloader-2.3.2 → turboloader-2.3.4}/examples/README.md +0 -0
  38. {turboloader-2.3.2 → turboloader-2.3.4}/examples/avx512_performance.py +0 -0
  39. {turboloader-2.3.2 → turboloader-2.3.4}/examples/complete_v110_workflow.py +0 -0
  40. {turboloader-2.3.2 → turboloader-2.3.4}/examples/distributed_ddp.py +0 -0
  41. {turboloader-2.3.2 → turboloader-2.3.4}/examples/imagenet_resnet50.py +0 -0
  42. {turboloader-2.3.2 → turboloader-2.3.4}/examples/pytorch_lightning_example.py +0 -0
  43. {turboloader-2.3.2 → turboloader-2.3.4}/examples/tbl_conversion.py +0 -0
  44. {turboloader-2.3.2 → turboloader-2.3.4}/examples/transform_example.py +0 -0
  45. {turboloader-2.3.2 → turboloader-2.3.4}/setup.cfg +0 -0
  46. {turboloader-2.3.2 → turboloader-2.3.4}/src/cache/cache_key.hpp +0 -0
  47. {turboloader-2.3.2 → turboloader-2.3.4}/src/cache/disk_cache.hpp +0 -0
  48. {turboloader-2.3.2 → turboloader-2.3.4}/src/cache/lru_cache.hpp +0 -0
  49. {turboloader-2.3.2 → turboloader-2.3.4}/src/cache/tiered_cache.hpp +0 -0
  50. {turboloader-2.3.2 → turboloader-2.3.4}/src/core/object_pool.hpp +0 -0
  51. {turboloader-2.3.2 → turboloader-2.3.4}/src/core/sample.hpp +0 -0
  52. {turboloader-2.3.2 → turboloader-2.3.4}/src/core/spsc_ring_buffer.hpp +0 -0
  53. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/bmp_decoder.hpp +0 -0
  54. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/csv_decoder.hpp +0 -0
  55. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/image_decoder.hpp +0 -0
  56. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/jpeg_decoder.hpp +0 -0
  57. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/nvjpeg_decoder.hpp +0 -0
  58. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/parquet_decoder.hpp +0 -0
  59. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/png_decoder.hpp +0 -0
  60. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/tiff_decoder.hpp +0 -0
  61. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/video_decoder.hpp +0 -0
  62. {turboloader-2.3.2 → turboloader-2.3.4}/src/decode/webp_decoder.hpp +0 -0
  63. {turboloader-2.3.2 → turboloader-2.3.4}/src/distributed/distributed_dataloader.hpp +0 -0
  64. {turboloader-2.3.2 → turboloader-2.3.4}/src/formats/coco_voc_parser.hpp +0 -0
  65. {turboloader-2.3.2 → turboloader-2.3.4}/src/formats/tbl_v2_format.hpp +0 -0
  66. {turboloader-2.3.2 → turboloader-2.3.4}/src/gpu/multi_gpu_pipeline.cpp +0 -0
  67. {turboloader-2.3.2 → turboloader-2.3.4}/src/gpu/multi_gpu_pipeline.hpp +0 -0
  68. {turboloader-2.3.2 → turboloader-2.3.4}/src/io/io_uring_reader.hpp +0 -0
  69. {turboloader-2.3.2 → turboloader-2.3.4}/src/pipeline/error_recovery.hpp +0 -0
  70. {turboloader-2.3.2 → turboloader-2.3.4}/src/pipeline/pipeline.hpp +0 -0
  71. {turboloader-2.3.2 → turboloader-2.3.4}/src/pipeline/prefetch_pipeline.hpp +0 -0
  72. {turboloader-2.3.2 → turboloader-2.3.4}/src/pipeline/smart_batching.hpp +0 -0
  73. {turboloader-2.3.2 → turboloader-2.3.4}/src/python/turboloader_bindings.cpp +0 -0
  74. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/azure_blob_reader.hpp +0 -0
  75. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/gcs_reader.hpp +0 -0
  76. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/hdf5_reader.hpp +0 -0
  77. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/http_reader.hpp +0 -0
  78. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/reader_orchestrator.hpp +0 -0
  79. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/s3_reader.hpp +0 -0
  80. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/tar_reader.hpp +0 -0
  81. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/tbl_v2_reader.hpp +0 -0
  82. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/tfrecord_reader.hpp +0 -0
  83. {turboloader-2.3.2 → turboloader-2.3.4}/src/readers/zarr_reader.hpp +0 -0
  84. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/affine_transform.hpp +0 -0
  85. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/autoaugment_transform.hpp +0 -0
  86. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/blur_transform.hpp +0 -0
  87. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/color_jitter_transform.hpp +0 -0
  88. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/crop_transform.hpp +0 -0
  89. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/erasing_transform.hpp +0 -0
  90. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/flip_transform.hpp +0 -0
  91. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/gpu/gpu_transforms.hpp +0 -0
  92. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/grayscale_transform.hpp +0 -0
  93. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/modern_augment_transform.hpp +0 -0
  94. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/normalize_transform.hpp +0 -0
  95. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/pad_transform.hpp +0 -0
  96. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/perspective_transform.hpp +0 -0
  97. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/posterize_transform.hpp +0 -0
  98. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/resize_transform.hpp +0 -0
  99. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/rotation_transform.hpp +0 -0
  100. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/simd_utils.hpp +0 -0
  101. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/solarize_transform.hpp +0 -0
  102. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/tensor_conversion.hpp +0 -0
  103. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/transform_base.hpp +0 -0
  104. {turboloader-2.3.2 → turboloader-2.3.4}/src/transforms/transforms.hpp +0 -0
  105. {turboloader-2.3.2 → turboloader-2.3.4}/src/utils/image_dimensions.hpp +0 -0
  106. {turboloader-2.3.2 → turboloader-2.3.4}/src/writers/tbl_v2_writer.hpp +0 -0
  107. {turboloader-2.3.2 → turboloader-2.3.4}/tests/create_test_dataset.py +0 -0
  108. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_advanced_transforms.cpp +0 -0
  109. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_avx512_simd.cpp +0 -0
  110. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_cache.cpp +0 -0
  111. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_csv_decoder.cpp +0 -0
  112. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_distributed.cpp +0 -0
  113. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_gcs_reader.cpp +0 -0
  114. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_http_reader.cpp +0 -0
  115. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_image_decoder.cpp +0 -0
  116. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_integrations.py +0 -0
  117. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_jax_integration.py +0 -0
  118. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_multi_gpu.cpp +0 -0
  119. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_nvjpeg_decoder.cpp +0 -0
  120. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_parquet_decoder.cpp +0 -0
  121. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_pipeline_gpu_decode.cpp +0 -0
  122. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_prefetch_pipeline.cpp +0 -0
  123. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_pytorch_transforms.py +0 -0
  124. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_reader_orchestrator.cpp +0 -0
  125. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_s3_reader.cpp +0 -0
  126. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_smart_batching.cpp +0 -0
  127. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_tar_reader.cpp +0 -0
  128. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_tbl_v2.cpp +0 -0
  129. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_tensorflow_integration.py +0 -0
  130. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_transforms.cpp +0 -0
  131. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_transforms_tensorflow.py +0 -0
  132. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_unified_pipeline.cpp +0 -0
  133. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_v180_features.py +0 -0
  134. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_v190_features.py +0 -0
  135. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_video_decoder.cpp +0 -0
  136. {turboloader-2.3.2 → turboloader-2.3.4}/tests/test_webdataset_integration.py +0 -0
  137. {turboloader-2.3.2 → turboloader-2.3.4}/turboloader.egg-info/SOURCES.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: turboloader
3
- Version: 2.3.2
3
+ Version: 2.3.4
4
4
  Summary: Production-ready ML data loading library with distributed training support, SIMD-accelerated transforms, pipe operator composition, HDF5/TFRecord/Zarr support, and GPU transforms. Built with C++20 for maximum performance.
5
5
  Author: TurboLoader Contributors
6
6
  Author-email: Arnav Jain <arnav@example.com>
@@ -22,6 +22,7 @@ import numpy as np
22
22
 
23
23
  try:
24
24
  from PIL import Image
25
+
25
26
  HAS_PIL = True
26
27
  except ImportError:
27
28
  HAS_PIL = False
@@ -58,19 +59,19 @@ def generate_random_image(width: int, height: int, channels: int = 3) -> np.ndar
58
59
  def save_image_jpeg(img: np.ndarray, path: str, quality: int = 85):
59
60
  """Save image as JPEG"""
60
61
  if HAS_PIL:
61
- Image.fromarray(img).save(path, 'JPEG', quality=quality)
62
+ Image.fromarray(img).save(path, "JPEG", quality=quality)
62
63
  else:
63
64
  # Fallback: save as raw binary
64
- with open(path, 'wb') as f:
65
+ with open(path, "wb") as f:
65
66
  f.write(img.tobytes())
66
67
 
67
68
 
68
69
  def save_image_png(img: np.ndarray, path: str):
69
70
  """Save image as PNG"""
70
71
  if HAS_PIL:
71
- Image.fromarray(img).save(path, 'PNG')
72
+ Image.fromarray(img).save(path, "PNG")
72
73
  else:
73
- with open(path, 'wb') as f:
74
+ with open(path, "wb") as f:
74
75
  f.write(img.tobytes())
75
76
 
76
77
 
@@ -79,8 +80,8 @@ def generate_dataset_files(
79
80
  num_images: int,
80
81
  image_size: tuple = (256, 256),
81
82
  num_classes: int = 1000,
82
- format: str = 'jpeg',
83
- num_workers: int = 4
83
+ format: str = "jpeg",
84
+ num_workers: int = 4,
84
85
  ) -> list:
85
86
  """Generate dataset as individual image files"""
86
87
  os.makedirs(output_dir, exist_ok=True)
@@ -92,7 +93,7 @@ def generate_dataset_files(
92
93
  img = generate_random_image(image_size[0], image_size[1])
93
94
  label = idx % num_classes
94
95
 
95
- if format == 'jpeg':
96
+ if format == "jpeg":
96
97
  filename = f"img_{idx:08d}.jpg"
97
98
  filepath = os.path.join(output_dir, filename)
98
99
  save_image_jpeg(img, filepath)
@@ -116,8 +117,8 @@ def generate_dataset_files(
116
117
  print(f" Generated {i + 1}/{num_images} images")
117
118
 
118
119
  # Save labels file
119
- labels_path = os.path.join(output_dir, 'labels.txt')
120
- with open(labels_path, 'w') as f:
120
+ labels_path = os.path.join(output_dir, "labels.txt")
121
+ with open(labels_path, "w") as f:
121
122
  for filepath, label in zip(files, labels):
122
123
  f.write(f"{os.path.basename(filepath)},{label}\n")
123
124
 
@@ -130,14 +131,14 @@ def generate_tar_dataset(
130
131
  num_images: int,
131
132
  image_size: tuple = (256, 256),
132
133
  num_classes: int = 1000,
133
- format: str = 'jpeg'
134
+ format: str = "jpeg",
134
135
  ) -> str:
135
136
  """Generate dataset as a TAR archive"""
136
137
  import io
137
138
 
138
139
  print(f"Generating TAR archive with {num_images} images...")
139
140
 
140
- with tarfile.open(output_path, 'w') as tar:
141
+ with tarfile.open(output_path, "w") as tar:
141
142
  labels = []
142
143
 
143
144
  for idx in range(num_images):
@@ -147,11 +148,11 @@ def generate_tar_dataset(
147
148
  # Create in-memory file
148
149
  if HAS_PIL:
149
150
  buf = io.BytesIO()
150
- if format == 'jpeg':
151
- Image.fromarray(img).save(buf, 'JPEG', quality=85)
151
+ if format == "jpeg":
152
+ Image.fromarray(img).save(buf, "JPEG", quality=85)
152
153
  filename = f"img_{idx:08d}.jpg"
153
154
  else:
154
- Image.fromarray(img).save(buf, 'PNG')
155
+ Image.fromarray(img).save(buf, "PNG")
155
156
  filename = f"img_{idx:08d}.png"
156
157
  buf.seek(0)
157
158
  data = buf.getvalue()
@@ -170,9 +171,9 @@ def generate_tar_dataset(
170
171
  print(f" Added {idx + 1}/{num_images} images to TAR")
171
172
 
172
173
  # Add labels file
173
- labels_content = '\n'.join(f"{f},{l}" for f, l in labels)
174
- labels_data = labels_content.encode('utf-8')
175
- info = tarfile.TarInfo(name='labels.txt')
174
+ labels_content = "\n".join(f"{f},{l}" for f, l in labels)
175
+ labels_data = labels_content.encode("utf-8")
176
+ info = tarfile.TarInfo(name="labels.txt")
176
177
  info.size = len(labels_data)
177
178
  tar.addfile(info, io.BytesIO(labels_data))
178
179
 
@@ -185,7 +186,7 @@ def generate_varying_size_dataset(
185
186
  num_images: int,
186
187
  min_size: int = 128,
187
188
  max_size: int = 512,
188
- num_classes: int = 1000
189
+ num_classes: int = 1000,
189
190
  ) -> str:
190
191
  """Generate dataset with varying image sizes (for smart batching benchmarks)"""
191
192
  import io
@@ -193,7 +194,7 @@ def generate_varying_size_dataset(
193
194
  print(f"Generating varying-size TAR archive with {num_images} images...")
194
195
 
195
196
  sizes = []
196
- with tarfile.open(output_path, 'w') as tar:
197
+ with tarfile.open(output_path, "w") as tar:
197
198
  labels = []
198
199
 
199
200
  for idx in range(num_images):
@@ -207,7 +208,7 @@ def generate_varying_size_dataset(
207
208
 
208
209
  if HAS_PIL:
209
210
  buf = io.BytesIO()
210
- Image.fromarray(img).save(buf, 'JPEG', quality=85)
211
+ Image.fromarray(img).save(buf, "JPEG", quality=85)
211
212
  filename = f"img_{idx:08d}.jpg"
212
213
  buf.seek(0)
213
214
  data = buf.getvalue()
@@ -226,12 +227,12 @@ def generate_varying_size_dataset(
226
227
 
227
228
  # Add metadata
228
229
  meta = {
229
- 'num_images': num_images,
230
- 'sizes': sizes,
231
- 'labels': [(f, l) for f, l, w, h in labels]
230
+ "num_images": num_images,
231
+ "sizes": sizes,
232
+ "labels": [(f, l) for f, l, w, h in labels],
232
233
  }
233
- meta_data = json.dumps(meta).encode('utf-8')
234
- info = tarfile.TarInfo(name='metadata.json')
234
+ meta_data = json.dumps(meta).encode("utf-8")
235
+ info = tarfile.TarInfo(name="metadata.json")
235
236
  info.size = len(meta_data)
236
237
  tar.addfile(info, io.BytesIO(meta_data))
237
238
 
@@ -240,52 +241,51 @@ def generate_varying_size_dataset(
240
241
 
241
242
 
242
243
  def main():
243
- parser = argparse.ArgumentParser(description='Generate synthetic datasets for benchmarking')
244
- parser.add_argument('--output', '-o', type=str, required=True,
245
- help='Output path (directory for files, .tar for archive)')
246
- parser.add_argument('--num-images', '-n', type=int, default=10000,
247
- help='Number of images to generate')
248
- parser.add_argument('--width', type=int, default=256,
249
- help='Image width')
250
- parser.add_argument('--height', type=int, default=256,
251
- help='Image height')
252
- parser.add_argument('--num-classes', type=int, default=1000,
253
- help='Number of classes for labels')
254
- parser.add_argument('--format', choices=['jpeg', 'png'], default='jpeg',
255
- help='Image format')
256
- parser.add_argument('--type', choices=['files', 'tar', 'varying'], default='tar',
257
- help='Dataset type (files, tar archive, or varying sizes)')
258
- parser.add_argument('--workers', type=int, default=4,
259
- help='Number of worker threads')
244
+ parser = argparse.ArgumentParser(description="Generate synthetic datasets for benchmarking")
245
+ parser.add_argument(
246
+ "--output",
247
+ "-o",
248
+ type=str,
249
+ required=True,
250
+ help="Output path (directory for files, .tar for archive)",
251
+ )
252
+ parser.add_argument(
253
+ "--num-images", "-n", type=int, default=10000, help="Number of images to generate"
254
+ )
255
+ parser.add_argument("--width", type=int, default=256, help="Image width")
256
+ parser.add_argument("--height", type=int, default=256, help="Image height")
257
+ parser.add_argument(
258
+ "--num-classes", type=int, default=1000, help="Number of classes for labels"
259
+ )
260
+ parser.add_argument("--format", choices=["jpeg", "png"], default="jpeg", help="Image format")
261
+ parser.add_argument(
262
+ "--type",
263
+ choices=["files", "tar", "varying"],
264
+ default="tar",
265
+ help="Dataset type (files, tar archive, or varying sizes)",
266
+ )
267
+ parser.add_argument("--workers", type=int, default=4, help="Number of worker threads")
260
268
 
261
269
  args = parser.parse_args()
262
270
 
263
- if args.type == 'files':
271
+ if args.type == "files":
264
272
  generate_dataset_files(
265
273
  args.output,
266
274
  args.num_images,
267
275
  (args.width, args.height),
268
276
  args.num_classes,
269
277
  args.format,
270
- args.workers
278
+ args.workers,
271
279
  )
272
- elif args.type == 'tar':
280
+ elif args.type == "tar":
273
281
  generate_tar_dataset(
274
- args.output,
275
- args.num_images,
276
- (args.width, args.height),
277
- args.num_classes,
278
- args.format
282
+ args.output, args.num_images, (args.width, args.height), args.num_classes, args.format
279
283
  )
280
- elif args.type == 'varying':
284
+ elif args.type == "varying":
281
285
  generate_varying_size_dataset(
282
- args.output,
283
- args.num_images,
284
- min_size=128,
285
- max_size=512,
286
- num_classes=args.num_classes
286
+ args.output, args.num_images, min_size=128, max_size=512, num_classes=args.num_classes
287
287
  )
288
288
 
289
289
 
290
- if __name__ == '__main__':
290
+ if __name__ == "__main__":
291
291
  main()
@@ -23,6 +23,7 @@ import numpy as np
23
23
 
24
24
  try:
25
25
  import psutil
26
+
26
27
  HAS_PSUTIL = True
27
28
  except ImportError:
28
29
  HAS_PSUTIL = False
@@ -31,12 +32,14 @@ except ImportError:
31
32
  try:
32
33
  import torch
33
34
  from torch.utils.data import DataLoader, Dataset
35
+
34
36
  HAS_TORCH = True
35
37
  except ImportError:
36
38
  HAS_TORCH = False
37
39
 
38
40
  try:
39
41
  import turboloader
42
+
40
43
  HAS_TURBOLOADER = True
41
44
  except ImportError:
42
45
  HAS_TURBOLOADER = False
@@ -45,6 +48,7 @@ except ImportError:
45
48
  @dataclass
46
49
  class MemoryResult:
47
50
  """Memory benchmark result"""
51
+
48
52
  library: str
49
53
  config: Dict[str, Any]
50
54
  baseline_mb: float
@@ -96,18 +100,18 @@ class MemoryMonitor:
96
100
  """Get memory statistics"""
97
101
  if not self.measurements:
98
102
  return {
99
- 'min_mb': 0,
100
- 'max_mb': 0,
101
- 'avg_mb': 0,
102
- 'std_mb': 0,
103
+ "min_mb": 0,
104
+ "max_mb": 0,
105
+ "avg_mb": 0,
106
+ "std_mb": 0,
103
107
  }
104
108
 
105
109
  measurements = np.array(self.measurements)
106
110
  return {
107
- 'min_mb': np.min(measurements),
108
- 'max_mb': np.max(measurements),
109
- 'avg_mb': np.mean(measurements),
110
- 'std_mb': np.std(measurements),
111
+ "min_mb": np.min(measurements),
112
+ "max_mb": np.max(measurements),
113
+ "avg_mb": np.mean(measurements),
114
+ "std_mb": np.std(measurements),
111
115
  }
112
116
 
113
117
 
@@ -126,10 +130,7 @@ def force_gc():
126
130
 
127
131
 
128
132
  def benchmark_turboloader_memory(
129
- tar_path: str,
130
- batch_size: int = 64,
131
- num_workers: int = 4,
132
- num_batches: int = 50
133
+ tar_path: str, batch_size: int = 64, num_workers: int = 4, num_batches: int = 50
133
134
  ) -> Optional[MemoryResult]:
134
135
  """Benchmark TurboLoader memory usage"""
135
136
  if not HAS_TURBOLOADER:
@@ -142,12 +143,14 @@ def benchmark_turboloader_memory(
142
143
  monitor = MemoryMonitor(interval=0.05)
143
144
 
144
145
  try:
145
- transforms = turboloader.Compose([
146
- turboloader.Resize(256, 256),
147
- turboloader.RandomCrop(224, 224),
148
- turboloader.RandomHorizontalFlip(0.5),
149
- turboloader.ImageNetNormalize(),
150
- ])
146
+ transforms = turboloader.Compose(
147
+ [
148
+ turboloader.Resize(256, 256),
149
+ turboloader.RandomCrop(224, 224),
150
+ turboloader.RandomHorizontalFlip(0.5),
151
+ turboloader.ImageNetNormalize(),
152
+ ]
153
+ )
151
154
 
152
155
  loader = turboloader.DataLoader(
153
156
  tar_path,
@@ -176,22 +179,22 @@ def benchmark_turboloader_memory(
176
179
  monitor.stop()
177
180
  stats = monitor.get_stats()
178
181
 
179
- peak = stats['max_mb']
182
+ peak = stats["max_mb"]
180
183
  delta = peak - baseline
181
184
 
182
185
  return MemoryResult(
183
- library='turboloader',
186
+ library="turboloader",
184
187
  config={
185
- 'batch_size': batch_size,
186
- 'num_workers': num_workers,
187
- 'num_batches': num_batches,
188
+ "batch_size": batch_size,
189
+ "num_workers": num_workers,
190
+ "num_batches": num_batches,
188
191
  },
189
192
  baseline_mb=baseline,
190
193
  peak_mb=peak,
191
194
  delta_mb=delta,
192
- avg_mb=stats['avg_mb'],
193
- samples_per_mb=total_samples / delta if delta > 0 else float('inf'),
194
- timestamp=datetime.now().isoformat()
195
+ avg_mb=stats["avg_mb"],
196
+ samples_per_mb=total_samples / delta if delta > 0 else float("inf"),
197
+ timestamp=datetime.now().isoformat(),
195
198
  )
196
199
 
197
200
  except Exception as e:
@@ -205,7 +208,7 @@ def benchmark_pytorch_memory(
205
208
  batch_size: int = 64,
206
209
  num_workers: int = 4,
207
210
  num_batches: int = 50,
208
- cached: bool = True
211
+ cached: bool = True,
209
212
  ) -> Optional[MemoryResult]:
210
213
  """Benchmark PyTorch DataLoader memory usage"""
211
214
  if not HAS_TORCH:
@@ -223,9 +226,9 @@ def benchmark_pytorch_memory(
223
226
  self.samples = []
224
227
  self.cache = {} if cache else None
225
228
 
226
- with tarfile.open(tar_path, 'r') as tar:
229
+ with tarfile.open(tar_path, "r") as tar:
227
230
  for member in tar.getmembers():
228
- if member.name.endswith(('.jpg', '.jpeg', '.png', '.JPEG', '.JPG')):
231
+ if member.name.endswith((".jpg", ".jpeg", ".png", ".JPEG", ".JPG")):
229
232
  self.samples.append(member.name)
230
233
  if cache:
231
234
  f = tar.extractfile(member)
@@ -242,11 +245,11 @@ def benchmark_pytorch_memory(
242
245
  if self.cache:
243
246
  data = self.cache[filename]
244
247
  else:
245
- with tarfile.open(self.tar_path, 'r') as tar:
248
+ with tarfile.open(self.tar_path, "r") as tar:
246
249
  f = tar.extractfile(tar.getmember(filename))
247
250
  data = f.read()
248
251
 
249
- img = Image.open(BytesIO(data)).convert('RGB')
252
+ img = Image.open(BytesIO(data)).convert("RGB")
250
253
  if self.transform:
251
254
  img = self.transform(img)
252
255
 
@@ -259,13 +262,15 @@ def benchmark_pytorch_memory(
259
262
  monitor = MemoryMonitor(interval=0.05)
260
263
 
261
264
  try:
262
- transform = T.Compose([
263
- T.Resize((256, 256)),
264
- T.RandomCrop(224),
265
- T.RandomHorizontalFlip(),
266
- T.ToTensor(),
267
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
268
- ])
265
+ transform = T.Compose(
266
+ [
267
+ T.Resize((256, 256)),
268
+ T.RandomCrop(224),
269
+ T.RandomHorizontalFlip(),
270
+ T.ToTensor(),
271
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
272
+ ]
273
+ )
269
274
 
270
275
  dataset = TarDataset(tar_path, transform=transform, cache=cached)
271
276
 
@@ -288,28 +293,29 @@ def benchmark_pytorch_memory(
288
293
  monitor.stop()
289
294
  stats = monitor.get_stats()
290
295
 
291
- peak = stats['max_mb']
296
+ peak = stats["max_mb"]
292
297
  delta = peak - baseline
293
298
 
294
299
  return MemoryResult(
295
- library='pytorch' + ('_cached' if cached else ''),
300
+ library="pytorch" + ("_cached" if cached else ""),
296
301
  config={
297
- 'batch_size': batch_size,
298
- 'num_workers': num_workers,
299
- 'num_batches': num_batches,
300
- 'cached': cached,
302
+ "batch_size": batch_size,
303
+ "num_workers": num_workers,
304
+ "num_batches": num_batches,
305
+ "cached": cached,
301
306
  },
302
307
  baseline_mb=baseline,
303
308
  peak_mb=peak,
304
309
  delta_mb=delta,
305
- avg_mb=stats['avg_mb'],
306
- samples_per_mb=total_samples / delta if delta > 0 else float('inf'),
307
- timestamp=datetime.now().isoformat()
310
+ avg_mb=stats["avg_mb"],
311
+ samples_per_mb=total_samples / delta if delta > 0 else float("inf"),
312
+ timestamp=datetime.now().isoformat(),
308
313
  )
309
314
 
310
315
  except Exception as e:
311
316
  print(f"PyTorch error: {e}")
312
317
  import traceback
318
+
313
319
  traceback.print_exc()
314
320
  monitor.stop()
315
321
  return None
@@ -319,7 +325,7 @@ def run_memory_benchmarks(
319
325
  tar_path: str,
320
326
  batch_sizes: List[int] = [32, 64, 128],
321
327
  num_workers: int = 4,
322
- num_batches: int = 50
328
+ num_batches: int = 50,
323
329
  ) -> List[MemoryResult]:
324
330
  """Run all memory benchmarks"""
325
331
  results = []
@@ -337,9 +343,7 @@ def run_memory_benchmarks(
337
343
  # TurboLoader
338
344
  if HAS_TURBOLOADER:
339
345
  print(" TurboLoader...")
340
- result = benchmark_turboloader_memory(
341
- tar_path, batch_size, num_workers, num_batches
342
- )
346
+ result = benchmark_turboloader_memory(tar_path, batch_size, num_workers, num_batches)
343
347
  if result:
344
348
  print(f" Peak: {result.peak_mb:.1f} MB, Delta: {result.delta_mb:.1f} MB")
345
349
  results.append(result)
@@ -372,10 +376,10 @@ def run_memory_benchmarks(
372
376
 
373
377
  def save_results(results: List[MemoryResult], output_path: str):
374
378
  """Save results to JSON"""
375
- os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
379
+ os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
376
380
 
377
381
  data = [asdict(r) for r in results]
378
- with open(output_path, 'w') as f:
382
+ with open(output_path, "w") as f:
379
383
  json.dump(data, f, indent=2)
380
384
 
381
385
  print(f"\nResults saved to: {output_path}")
@@ -390,23 +394,28 @@ def print_summary(results: List[MemoryResult]):
390
394
  print("-" * 80)
391
395
 
392
396
  for r in results:
393
- batch = r.config.get('batch_size', 'N/A')
394
- print(f"{r.library:>20} {batch:>8} {r.peak_mb:>12.1f} {r.delta_mb:>12.1f} {r.samples_per_mb:>12.1f}")
397
+ batch = r.config.get("batch_size", "N/A")
398
+ print(
399
+ f"{r.library:>20} {batch:>8} {r.peak_mb:>12.1f} {r.delta_mb:>12.1f} {r.samples_per_mb:>12.1f}"
400
+ )
395
401
 
396
402
 
397
403
  def main():
398
- parser = argparse.ArgumentParser(description='Memory Usage Benchmark')
399
- parser.add_argument('--tar-path', type=str, required=True,
400
- help='Path to TAR dataset')
401
- parser.add_argument('--batch-sizes', type=int, nargs='+', default=[32, 64, 128],
402
- help='Batch sizes to test')
403
- parser.add_argument('--workers', type=int, default=4,
404
- help='Number of workers')
405
- parser.add_argument('--num-batches', type=int, default=50,
406
- help='Number of batches per benchmark')
407
- parser.add_argument('--output', type=str,
408
- default='benchmarks/results/memory/memory.json',
409
- help='Output path for results')
404
+ parser = argparse.ArgumentParser(description="Memory Usage Benchmark")
405
+ parser.add_argument("--tar-path", type=str, required=True, help="Path to TAR dataset")
406
+ parser.add_argument(
407
+ "--batch-sizes", type=int, nargs="+", default=[32, 64, 128], help="Batch sizes to test"
408
+ )
409
+ parser.add_argument("--workers", type=int, default=4, help="Number of workers")
410
+ parser.add_argument(
411
+ "--num-batches", type=int, default=50, help="Number of batches per benchmark"
412
+ )
413
+ parser.add_argument(
414
+ "--output",
415
+ type=str,
416
+ default="benchmarks/results/memory/memory.json",
417
+ help="Output path for results",
418
+ )
410
419
 
411
420
  args = parser.parse_args()
412
421
 
@@ -419,12 +428,12 @@ def main():
419
428
  args.tar_path,
420
429
  batch_sizes=args.batch_sizes,
421
430
  num_workers=args.workers,
422
- num_batches=args.num_batches
431
+ num_batches=args.num_batches,
423
432
  )
424
433
 
425
434
  save_results(results, args.output)
426
435
  print_summary(results)
427
436
 
428
437
 
429
- if __name__ == '__main__':
438
+ if __name__ == "__main__":
430
439
  main()