s3dlio 0.9.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (354) hide show
  1. s3dlio-0.9.26/.dockerignore +12 -0
  2. s3dlio-0.9.26/.env.bak +10 -0
  3. s3dlio-0.9.26/.github/ISSUE_TEMPLATE/data_gen_migration.md +123 -0
  4. s3dlio-0.9.26/.github/ISSUE_TEMPLATE/s3dlio-realism-epic.md +106 -0
  5. s3dlio-0.9.26/.github/copilot-instructions.md +409 -0
  6. s3dlio-0.9.26/.gitignore +31 -0
  7. s3dlio-0.9.26/Cargo.lock +7188 -0
  8. s3dlio-0.9.26/Cargo.toml +186 -0
  9. s3dlio-0.9.26/Dockerfile +102 -0
  10. s3dlio-0.9.26/LICENSE +661 -0
  11. s3dlio-0.9.26/PKG-INFO +797 -0
  12. s3dlio-0.9.26/README.md +787 -0
  13. s3dlio-0.9.26/SECURITY.md +14 -0
  14. s3dlio-0.9.26/aws-env +7 -0
  15. s3dlio-0.9.26/benches/data_gen_comparison.rs +282 -0
  16. s3dlio-0.9.26/benches/performance_microbenchmarks.rs +131 -0
  17. s3dlio-0.9.26/benches/rng_performance_benchmark.rs +111 -0
  18. s3dlio-0.9.26/benches/s3_microbenchmarks.rs +131 -0
  19. s3dlio-0.9.26/benches/simple_test.rs +13 -0
  20. s3dlio-0.9.26/build_pyo3.sh +3 -0
  21. s3dlio-0.9.26/configs/aws-root-bad-ca.pem +13 -0
  22. s3dlio-0.9.26/configs/aws-root-ca.pem +100 -0
  23. s3dlio-0.9.26/configs/test_config_hdf5.json +24 -0
  24. s3dlio-0.9.26/configs/test_config_npz.json +24 -0
  25. s3dlio-0.9.26/configs/test_config_tfrecord.json +24 -0
  26. s3dlio-0.9.26/docs/CLI_GUIDE.md +522 -0
  27. s3dlio-0.9.26/docs/Changelog.md +2647 -0
  28. s3dlio-0.9.26/docs/PYTHON_API_GUIDE.md +1342 -0
  29. s3dlio-0.9.26/docs/README.md +129 -0
  30. s3dlio-0.9.26/docs/api/Environment_Variables.md +178 -0
  31. s3dlio-0.9.26/docs/api/README.md +124 -0
  32. s3dlio-0.9.26/docs/archive/Changelog_pre_v090.md +2011 -0
  33. s3dlio-0.9.26/docs/archive/DATA_GEN_MIGRATION_SUMMARY.md +311 -0
  34. s3dlio-0.9.26/docs/archive/LIST_DELETE_PERFORMANCE_OPTIMIZATION.md +534 -0
  35. s3dlio-0.9.26/docs/archive/v0.9.10-RELEASE-SUMMARY.md +275 -0
  36. s3dlio-0.9.26/docs/archive/v0.9.10-WARNING-RESOLUTION.md +199 -0
  37. s3dlio-0.9.26/docs/bugs/AWS_SDK_TRACING_HANG_BUG_REPORT.md +344 -0
  38. s3dlio-0.9.26/docs/enhancement/AI_ML_Realism_Enhancement_Plan.md +123 -0
  39. s3dlio-0.9.26/docs/enhancement/Analysis of Async vs io_uring.md +72 -0
  40. s3dlio-0.9.26/docs/enhancement/BLOCK_STORAGE_INTERFACE.md +1113 -0
  41. s3dlio-0.9.26/docs/enhancement/README.md +225 -0
  42. s3dlio-0.9.26/docs/enhancement/SEWT_LOAD_BALANCING.md +560 -0
  43. s3dlio-0.9.26/docs/enhancement/dl-driver-realism-epic-template.md +125 -0
  44. s3dlio-0.9.26/docs/implementation-plans/AZURE-RANGE-ENGINE-IMPLEMENTATION.md +404 -0
  45. s3dlio-0.9.26/docs/implementation-plans/GCS_BACKEND_IMPLEMENTATION_PLAN.md +597 -0
  46. s3dlio-0.9.26/docs/implementation-plans/OPLOG_STREAMING_ANALYSIS.md +585 -0
  47. s3dlio-0.9.26/docs/implementation-plans/S3DLIO_OPLOG_IMPLEMENTATION_SUMMARY.md +268 -0
  48. s3dlio-0.9.26/docs/implementation-plans/S3DLIO_OPLOG_INTEGRATION.md +390 -0
  49. s3dlio-0.9.26/docs/implementation-plans/v0.9.10-REAL-object-storage-buffer-pool.md +484 -0
  50. s3dlio-0.9.26/docs/implementation-plans/v0.9.10-pre-stat-optimization.md +558 -0
  51. s3dlio-0.9.26/docs/implementation-plans/v0.9.9-buffer-pool-enhancement.md +297 -0
  52. s3dlio-0.9.26/docs/integration/DLIO_BENCHMARK_INTEGRATION.md +477 -0
  53. s3dlio-0.9.26/docs/performance/HowToZeroCopy.md +162 -0
  54. s3dlio-0.9.26/docs/performance/MultiPart_README.md +225 -0
  55. s3dlio-0.9.26/docs/performance/O_DIRECT_Implementation.md +104 -0
  56. s3dlio-0.9.26/docs/performance/Performance_Optimization_Summary.md +233 -0
  57. s3dlio-0.9.26/docs/performance/Performance_Profiling_Guide.md +390 -0
  58. s3dlio-0.9.26/docs/supplemental/ADAPTIVE-TUNING.md +238 -0
  59. s3dlio-0.9.26/docs/supplemental/CONFIGURATION-HIERARCHY.md +251 -0
  60. s3dlio-0.9.26/docs/supplemental/GCS-BACKEND-SELECTION.md +310 -0
  61. s3dlio-0.9.26/docs/supplemental/GCS-QUICK-START.md +116 -0
  62. s3dlio-0.9.26/docs/supplemental/MULTI_ENDPOINT_GUIDE.md +1001 -0
  63. s3dlio-0.9.26/docs/supplemental/OPERATION_LOGGING.md +335 -0
  64. s3dlio-0.9.26/docs/supplemental/OPLOG-GUIDE.md +509 -0
  65. s3dlio-0.9.26/docs/supplemental/RELEASE-CHECKLIST.md +140 -0
  66. s3dlio-0.9.26/docs/supplemental/STREAMING-ARCHITECTURE.md +315 -0
  67. s3dlio-0.9.26/docs/supplemental/TFRECORD-INDEX-QUICKREF.md +219 -0
  68. s3dlio-0.9.26/docs/supplemental/VERSION-MANAGEMENT.md +92 -0
  69. s3dlio-0.9.26/docs/supplemental/ZERO-COPY-API-REFERENCE.md +477 -0
  70. s3dlio-0.9.26/docs/supplemental/v0.9.8-GCS-BACKENDS-SUMMARY.md +216 -0
  71. s3dlio-0.9.26/docs/testing/BACKEND-TESTING.md +434 -0
  72. s3dlio-0.9.26/docs/testing/GCS-PAGINATION-TEST-GUIDE.md +214 -0
  73. s3dlio-0.9.26/docs/testing/GCS-TESTING-SUMMARY.md +91 -0
  74. s3dlio-0.9.26/docs/testing/PAGINATION-ANALYSIS-ALL-BACKENDS.md +240 -0
  75. s3dlio-0.9.26/docs/testing/TESTING-GUIDE.md +426 -0
  76. s3dlio-0.9.26/docs/testing/v0.9.9-phase1-testing-summary.md +188 -0
  77. s3dlio-0.9.26/examples/python/basic_operations.py +141 -0
  78. s3dlio-0.9.26/examples/python/data_loader.py +277 -0
  79. s3dlio-0.9.26/examples/python/oplog_example.py +347 -0
  80. s3dlio-0.9.26/examples/python/parallel_operations.py +210 -0
  81. s3dlio-0.9.26/examples/python/streaming_writer.py +200 -0
  82. s3dlio-0.9.26/examples/python/upload_download.py +198 -0
  83. s3dlio-0.9.26/examples/rust/adaptive_tuning_demo.rs +100 -0
  84. s3dlio-0.9.26/examples/rust/async_pool_dataloader_demo.rs +358 -0
  85. s3dlio-0.9.26/examples/rust/backend_comparison_demo.rs +164 -0
  86. s3dlio-0.9.26/examples/rust/debug_direct_io.rs +69 -0
  87. s3dlio-0.9.26/examples/rust/detailed_direct_io_debug.rs +97 -0
  88. s3dlio-0.9.26/examples/rust/large_scale_s3_test.rs +540 -0
  89. s3dlio-0.9.26/examples/rust/performance_monitoring_demo.rs +189 -0
  90. s3dlio-0.9.26/examples/rust/phase2_streaming_demo.rs +13 -0
  91. s3dlio-0.9.26/examples/rust/profile_s3_operations.rs +312 -0
  92. s3dlio-0.9.26/examples/rust/rust_api_basic_usage.rs +212 -0
  93. s3dlio-0.9.26/examples/rust/s3_backend_comparison.rs +169 -0
  94. s3dlio-0.9.26/examples/rust/simple_flamegraph_test.rs +85 -0
  95. s3dlio-0.9.26/examples/rust/test_aligned_buffers.rs +153 -0
  96. s3dlio-0.9.26/examples/rust/test_direct_io.rs +95 -0
  97. s3dlio-0.9.26/examples/rust/test_direct_io_comprehensive.rs +148 -0
  98. s3dlio-0.9.26/examples/rust/test_hybrid_io_debug.rs +78 -0
  99. s3dlio-0.9.26/examples/rust/test_oplog_multiple_operations.rs +124 -0
  100. s3dlio-0.9.26/examples/rust/validate_aligned_buffers.rs +82 -0
  101. s3dlio-0.9.26/fork-patches/README.md +36 -0
  102. s3dlio-0.9.26/install_pyo3_wheel.sh +4 -0
  103. s3dlio-0.9.26/integrations/dlio/README.md +99 -0
  104. s3dlio-0.9.26/integrations/dlio/pyproject.toml +54 -0
  105. s3dlio-0.9.26/integrations/dlio/src/dlio_s3dlio_storage/__init__.py +11 -0
  106. s3dlio-0.9.26/local-env +10 -0
  107. s3dlio-0.9.26/profiles/large_scale_download_profile.svg +491 -0
  108. s3dlio-0.9.26/profiles/large_scale_upload_profile.svg +491 -0
  109. s3dlio-0.9.26/profiles/simple_test_profile.svg +491 -0
  110. s3dlio-0.9.26/pyproject.toml +22 -0
  111. s3dlio-0.9.26/python/s3dlio/__init__.py +74 -0
  112. s3dlio-0.9.26/python/s3dlio/integrations/__init__.py +6 -0
  113. s3dlio-0.9.26/python/s3dlio/integrations/dlio/__init__.py +203 -0
  114. s3dlio-0.9.26/python/s3dlio/integrations/dlio/s3_torch_storage.py +254 -0
  115. s3dlio-0.9.26/python/s3dlio/integrations/dlio/s3dlio_storage.py +247 -0
  116. s3dlio-0.9.26/python/s3dlio/jax_tf.py +208 -0
  117. s3dlio-0.9.26/python/s3dlio/torch.py +399 -0
  118. s3dlio-0.9.26/python/tests/aws-s3dlio_compare_suite.py +223 -0
  119. s3dlio-0.9.26/python/tests/bench_s3-torch_v8.py +136 -0
  120. s3dlio-0.9.26/python/tests/compare_aws_s3dlio_loaders.py +145 -0
  121. s3dlio-0.9.26/python/tests/jax_smoke.py +27 -0
  122. s3dlio-0.9.26/python/tests/jax_tf_vers_4-3_demo.py +169 -0
  123. s3dlio-0.9.26/python/tests/multi-part_smoke.py +36 -0
  124. s3dlio-0.9.26/python/tests/pytorch_smoke.py +34 -0
  125. s3dlio-0.9.26/python/tests/run-bench.sh +7 -0
  126. s3dlio-0.9.26/python/tests/run_regression_tests.sh +28 -0
  127. s3dlio-0.9.26/python/tests/test_azure_api.py +207 -0
  128. s3dlio-0.9.26/python/tests/test_bug_fix.py +73 -0
  129. s3dlio-0.9.26/python/tests/test_checkpoint_basic.py +126 -0
  130. s3dlio-0.9.26/python/tests/test_checkpoint_basic_python.py +268 -0
  131. s3dlio-0.9.26/python/tests/test_checkpoint_framework_integration.py +307 -0
  132. s3dlio-0.9.26/python/tests/test_comprehensive_api.py +346 -0
  133. s3dlio-0.9.26/python/tests/test_comprehensive_python_v070.py +390 -0
  134. s3dlio-0.9.26/python/tests/test_compression_verification.py +63 -0
  135. s3dlio-0.9.26/python/tests/test_correct_functionality.py +412 -0
  136. s3dlio-0.9.26/python/tests/test_dali_compatibility.py +394 -0
  137. s3dlio-0.9.26/python/tests/test_data_gen_methods.py +194 -0
  138. s3dlio-0.9.26/python/tests/test_dataloader.py +35 -0
  139. s3dlio-0.9.26/python/tests/test_enhanced_api.py +381 -0
  140. s3dlio-0.9.26/python/tests/test_final_validation.py +362 -0
  141. s3dlio-0.9.26/python/tests/test_framework_integration.py +316 -0
  142. s3dlio-0.9.26/python/tests/test_gcs_api.py +204 -0
  143. s3dlio-0.9.26/python/tests/test_index_usage_examples.py +329 -0
  144. s3dlio-0.9.26/python/tests/test_installed_package.py +453 -0
  145. s3dlio-0.9.26/python/tests/test_loader_options_basic.py +131 -0
  146. s3dlio-0.9.26/python/tests/test_loader_options_realism.py +164 -0
  147. s3dlio-0.9.26/python/tests/test_loader_return_type_stability.py +162 -0
  148. s3dlio-0.9.26/python/tests/test_modular_api_regression.py +318 -0
  149. s3dlio-0.9.26/python/tests/test_multi_endpoint.py +369 -0
  150. s3dlio-0.9.26/python/tests/test_multipart_writer.py +59 -0
  151. s3dlio-0.9.26/python/tests/test_new_api.py +171 -0
  152. s3dlio-0.9.26/python/tests/test_new_dlio_s3.py +125 -0
  153. s3dlio-0.9.26/python/tests/test_numpy_functions.py +39 -0
  154. s3dlio-0.9.26/python/tests/test_parameter_validation.py +143 -0
  155. s3dlio-0.9.26/python/tests/test_phase1_optimization.py +176 -0
  156. s3dlio-0.9.26/python/tests/test_phase2_python_validation.py +190 -0
  157. s3dlio-0.9.26/python/tests/test_phase2_streaming_validation.py +191 -0
  158. s3dlio-0.9.26/python/tests/test_phase3_checksum_integration.py +269 -0
  159. s3dlio-0.9.26/python/tests/test_python_api.py +181 -0
  160. s3dlio-0.9.26/python/tests/test_python_compression.py +66 -0
  161. s3dlio-0.9.26/python/tests/test_python_mp_get.py +69 -0
  162. s3dlio-0.9.26/python/tests/test_python_npy_interop.py +349 -0
  163. s3dlio-0.9.26/python/tests/test_range_requests.py +197 -0
  164. s3dlio-0.9.26/python/tests/test_real_s3_api.py +90 -0
  165. s3dlio-0.9.26/python/tests/test_realism_baseline.py +172 -0
  166. s3dlio-0.9.26/python/tests/test_regex.py +111 -0
  167. s3dlio-0.9.26/python/tests/test_s3dlio_production_validation.py +363 -0
  168. s3dlio-0.9.26/python/tests/test_simple_api.py +48 -0
  169. s3dlio-0.9.26/python/tests/test_simple_python_v070.py +209 -0
  170. s3dlio-0.9.26/python/tests/test_simple_python_validation.py +133 -0
  171. s3dlio-0.9.26/python/tests/test_simple_runner.py +272 -0
  172. s3dlio-0.9.26/python/tests/test_streaming_direct.py +45 -0
  173. s3dlio-0.9.26/python/tests/test_streaming_python.py +123 -0
  174. s3dlio-0.9.26/python/tests/test_streaming_python_old.py +329 -0
  175. s3dlio-0.9.26/python/tests/test_tfrecord_index_python.py +196 -0
  176. s3dlio-0.9.26/python/tests/test_v0819_python_api.py +209 -0
  177. s3dlio-0.9.26/python/tests/test_v090_features.py +323 -0
  178. s3dlio-0.9.26/python/tests/test_working_functionality.py +399 -0
  179. s3dlio-0.9.26/python/tests/test_zero_copy.py +74 -0
  180. s3dlio-0.9.26/python/tests/test_zero_copy_comprehensive.py +321 -0
  181. s3dlio-0.9.26/python/tests/tf_smoke.py +24 -0
  182. s3dlio-0.9.26/python/tests/torch_vers_4-3_demo.py +226 -0
  183. s3dlio-0.9.26/python/tests/verify_s3_objects.py +257 -0
  184. s3dlio-0.9.26/s3dlio-metadata-extensions-plan.md +117 -0
  185. s3dlio-0.9.26/scripts/apache_backend_performance_test.sh +197 -0
  186. s3dlio-0.9.26/scripts/build_performance_variants.sh +50 -0
  187. s3dlio-0.9.26/scripts/build_runtime_selection.sh +187 -0
  188. s3dlio-0.9.26/scripts/cli_performance_validation.sh +175 -0
  189. s3dlio-0.9.26/scripts/compare_backends.sh +87 -0
  190. s3dlio-0.9.26/scripts/compare_backends_performance.sh +236 -0
  191. s3dlio-0.9.26/scripts/gcs-login.sh +1 -0
  192. s3dlio-0.9.26/scripts/gcs-native-env.template +21 -0
  193. s3dlio-0.9.26/scripts/gcs-s3-compat-env.template +17 -0
  194. s3dlio-0.9.26/scripts/long_duration_performance_test.sh +203 -0
  195. s3dlio-0.9.26/scripts/managed_performance_test.sh +173 -0
  196. s3dlio-0.9.26/scripts/profile_performance.sh +156 -0
  197. s3dlio-0.9.26/scripts/quick_performance_test.sh +137 -0
  198. s3dlio-0.9.26/scripts/run_backend_comparison.sh +95 -0
  199. s3dlio-0.9.26/scripts/run_tfrecord_tests.sh +105 -0
  200. s3dlio-0.9.26/scripts/s3dlio-cli +32 -0
  201. s3dlio-0.9.26/scripts/sustained_performance_test.sh +223 -0
  202. s3dlio-0.9.26/scripts/test-gcs-comprehensive.sh +64 -0
  203. s3dlio-0.9.26/scripts/test-gcs-final.sh +53 -0
  204. s3dlio-0.9.26/scripts/test_all.sh +50 -0
  205. s3dlio-0.9.26/scripts/test_azure_comprehensive.sh +52 -0
  206. s3dlio-0.9.26/scripts/test_azure_multi.sh +14 -0
  207. s3dlio-0.9.26/scripts/test_azure_range_engine.sh +93 -0
  208. s3dlio-0.9.26/scripts/test_azure_smoke.sh +9 -0
  209. s3dlio-0.9.26/scripts/test_delete_batch.sh +92 -0
  210. s3dlio-0.9.26/scripts/test_enhanced_performance.sh +366 -0
  211. s3dlio-0.9.26/scripts/test_gcs_backends.sh +158 -0
  212. s3dlio-0.9.26/scripts/test_gcs_community.sh +69 -0
  213. s3dlio-0.9.26/scripts/test_gcs_official.sh +69 -0
  214. s3dlio-0.9.26/scripts/test_v0819_all_backends.sh +172 -0
  215. s3dlio-0.9.26/src/adaptive_config.rs +402 -0
  216. s3dlio-0.9.26/src/api/advanced.rs +101 -0
  217. s3dlio-0.9.26/src/api.rs +258 -0
  218. s3dlio-0.9.26/src/azure_client.rs +535 -0
  219. s3dlio-0.9.26/src/bin/cli.rs +1479 -0
  220. s3dlio-0.9.26/src/checkpoint/latest.rs +372 -0
  221. s3dlio-0.9.26/src/checkpoint/manifest.rs +150 -0
  222. s3dlio-0.9.26/src/checkpoint/mod.rs +389 -0
  223. s3dlio-0.9.26/src/checkpoint/paths.rs +254 -0
  224. s3dlio-0.9.26/src/checkpoint/reader.rs +497 -0
  225. s3dlio-0.9.26/src/checkpoint/writer.rs +502 -0
  226. s3dlio-0.9.26/src/concurrency/mod.rs +10 -0
  227. s3dlio-0.9.26/src/concurrency/scheduler.rs +408 -0
  228. s3dlio-0.9.26/src/config.rs +158 -0
  229. s3dlio-0.9.26/src/constants.rs +261 -0
  230. s3dlio-0.9.26/src/data_formats/hdf5.rs +58 -0
  231. s3dlio-0.9.26/src/data_formats/mod.rs +10 -0
  232. s3dlio-0.9.26/src/data_formats/npz.rs +407 -0
  233. s3dlio-0.9.26/src/data_formats/raw.rs +7 -0
  234. s3dlio-0.9.26/src/data_formats/tfrecord.rs +73 -0
  235. s3dlio-0.9.26/src/data_gen.rs +827 -0
  236. s3dlio-0.9.26/src/data_gen_alt.rs +506 -0
  237. s3dlio-0.9.26/src/data_loader/async_pool_dataloader.rs +421 -0
  238. s3dlio-0.9.26/src/data_loader/dataloader.rs +185 -0
  239. s3dlio-0.9.26/src/data_loader/dataset.rs +80 -0
  240. s3dlio-0.9.26/src/data_loader/directio_bytes.rs +182 -0
  241. s3dlio-0.9.26/src/data_loader/fs_bytes.rs +196 -0
  242. s3dlio-0.9.26/src/data_loader/mod.rs +25 -0
  243. s3dlio-0.9.26/src/data_loader/options.rs +595 -0
  244. s3dlio-0.9.26/src/data_loader/prefetch.rs +64 -0
  245. s3dlio-0.9.26/src/data_loader/s3_bytes.rs +108 -0
  246. s3dlio-0.9.26/src/data_loader/sampler.rs +113 -0
  247. s3dlio-0.9.26/src/data_loader/transform.rs +68 -0
  248. s3dlio-0.9.26/src/download.rs +299 -0
  249. s3dlio-0.9.26/src/file_store.rs +814 -0
  250. s3dlio-0.9.26/src/file_store_direct.rs +1425 -0
  251. s3dlio-0.9.26/src/gcs_client.rs +726 -0
  252. s3dlio-0.9.26/src/google_gcs_client.rs +397 -0
  253. s3dlio-0.9.26/src/http/client.rs +281 -0
  254. s3dlio-0.9.26/src/http/mod.rs +9 -0
  255. s3dlio-0.9.26/src/lib.rs +196 -0
  256. s3dlio-0.9.26/src/memory.rs +256 -0
  257. s3dlio-0.9.26/src/metrics/enhanced.rs +511 -0
  258. s3dlio-0.9.26/src/metrics/mod.rs +31 -0
  259. s3dlio-0.9.26/src/mp.rs +547 -0
  260. s3dlio-0.9.26/src/multi_endpoint.rs +1129 -0
  261. s3dlio-0.9.26/src/multipart.rs +483 -0
  262. s3dlio-0.9.26/src/object_size_cache.rs +508 -0
  263. s3dlio-0.9.26/src/object_store.rs +2866 -0
  264. s3dlio-0.9.26/src/object_store_arrow.rs +487 -0
  265. s3dlio-0.9.26/src/object_store_logger.rs +504 -0
  266. s3dlio-0.9.26/src/page_cache.rs +136 -0
  267. s3dlio-0.9.26/src/performance/config.rs +276 -0
  268. s3dlio-0.9.26/src/performance/mod.rs +201 -0
  269. s3dlio-0.9.26/src/prefetch.rs +60 -0
  270. s3dlio-0.9.26/src/profiling.rs +228 -0
  271. s3dlio-0.9.26/src/progress.rs +109 -0
  272. s3dlio-0.9.26/src/python_api/python_advanced_api.rs +318 -0
  273. s3dlio-0.9.26/src/python_api/python_aiml_api.rs +1673 -0
  274. s3dlio-0.9.26/src/python_api/python_core_api.rs +1551 -0
  275. s3dlio-0.9.26/src/python_api/zero_copy_api.rs +296 -0
  276. s3dlio-0.9.26/src/python_api.rs +31 -0
  277. s3dlio-0.9.26/src/range_engine.rs +488 -0
  278. s3dlio-0.9.26/src/range_engine_generic.rs +508 -0
  279. s3dlio-0.9.26/src/s3_client.rs +368 -0
  280. s3dlio-0.9.26/src/s3_copy.rs +236 -0
  281. s3dlio-0.9.26/src/s3_logger.rs +369 -0
  282. s3dlio-0.9.26/src/s3_ops.rs +274 -0
  283. s3dlio-0.9.26/src/s3_utils.rs +1428 -0
  284. s3dlio-0.9.26/src/sharded_client.rs +374 -0
  285. s3dlio-0.9.26/src/streaming_writer.rs +192 -0
  286. s3dlio-0.9.26/src/tfrecord_index.rs +528 -0
  287. s3dlio-0.9.26/src/uri_utils.rs +347 -0
  288. s3dlio-0.9.26/tests/azure_blob_multi.rs +96 -0
  289. s3dlio-0.9.26/tests/azure_blob_sequence.rs +157 -0
  290. s3dlio-0.9.26/tests/azure_blob_smoke.rs +198 -0
  291. s3dlio-0.9.26/tests/common/mod.rs +44 -0
  292. s3dlio-0.9.26/tests/metadata_ops_tests.rs +378 -0
  293. s3dlio-0.9.26/tests/object_format_tests.rs +118 -0
  294. s3dlio-0.9.26/tests/performance_comparison.rs +318 -0
  295. s3dlio-0.9.26/tests/s3_uri_endpoint_tests.rs +218 -0
  296. s3dlio-0.9.26/tests/test_allocation_comparison.rs +275 -0
  297. s3dlio-0.9.26/tests/test_async_pool_dataloader.rs +253 -0
  298. s3dlio-0.9.26/tests/test_azure_comprehensive.rs +582 -0
  299. s3dlio-0.9.26/tests/test_azure_range_engine_integration.rs +289 -0
  300. s3dlio-0.9.26/tests/test_backend_parity.rs +60 -0
  301. s3dlio-0.9.26/tests/test_backend_performance_comparison.rs +342 -0
  302. s3dlio-0.9.26/tests/test_before_after_performance.rs +129 -0
  303. s3dlio-0.9.26/tests/test_buffer_pool_directio.rs +458 -0
  304. s3dlio-0.9.26/tests/test_buffer_pool_validation.rs +354 -0
  305. s3dlio-0.9.26/tests/test_cancellation.rs +391 -0
  306. s3dlio-0.9.26/tests/test_checkpoint_advanced.rs +180 -0
  307. s3dlio-0.9.26/tests/test_checkpoint_checksums.rs +153 -0
  308. s3dlio-0.9.26/tests/test_checkpoint_integration.rs +115 -0
  309. s3dlio-0.9.26/tests/test_comprehensive_speed_benchmark.rs +212 -0
  310. s3dlio-0.9.26/tests/test_comprehensive_streaming.rs +304 -0
  311. s3dlio-0.9.26/tests/test_compression_all_backends.rs +201 -0
  312. s3dlio-0.9.26/tests/test_compression_enhanced_reporting.rs +332 -0
  313. s3dlio-0.9.26/tests/test_data-gen.rs +197 -0
  314. s3dlio-0.9.26/tests/test_data_gen_alt.rs +311 -0
  315. s3dlio-0.9.26/tests/test_data_generation_enhancement.rs +257 -0
  316. s3dlio-0.9.26/tests/test_dataloader.rs +261 -0
  317. s3dlio-0.9.26/tests/test_definitive_speed_benchmark.rs +185 -0
  318. s3dlio-0.9.26/tests/test_direct_io.rs +307 -0
  319. s3dlio-0.9.26/tests/test_directio_range_engine.rs +178 -0
  320. s3dlio-0.9.26/tests/test_error_handling.rs +286 -0
  321. s3dlio-0.9.26/tests/test_file_range_engine.rs +160 -0
  322. s3dlio-0.9.26/tests/test_file_store.rs +256 -0
  323. s3dlio-0.9.26/tests/test_gcs_community.rs +368 -0
  324. s3dlio-0.9.26/tests/test_gcs_functional.rs +437 -0
  325. s3dlio-0.9.26/tests/test_gcs_official.rs +380 -0
  326. s3dlio-0.9.26/tests/test_gcs_smoke.rs +245 -0
  327. s3dlio-0.9.26/tests/test_high_compress.rs +30 -0
  328. s3dlio-0.9.26/tests/test_loader_return_type.py +68 -0
  329. s3dlio-0.9.26/tests/test_multi_npz.rs +164 -0
  330. s3dlio-0.9.26/tests/test_multi_process_performance.rs +291 -0
  331. s3dlio-0.9.26/tests/test_multipart.rs +107 -0
  332. s3dlio-0.9.26/tests/test_npy_serialization.rs +230 -0
  333. s3dlio-0.9.26/tests/test_object_size_cache_performance.rs +394 -0
  334. s3dlio-0.9.26/tests/test_object_store_integration.rs +111 -0
  335. s3dlio-0.9.26/tests/test_op_log_all_backends.sh +157 -0
  336. s3dlio-0.9.26/tests/test_performance.rs +78 -0
  337. s3dlio-0.9.26/tests/test_phase2_streaming.rs +95 -0
  338. s3dlio-0.9.26/tests/test_phase2_streaming_backends.rs +262 -0
  339. s3dlio-0.9.26/tests/test_phase2_validation.rs +237 -0
  340. s3dlio-0.9.26/tests/test_phase3_checksums.rs +151 -0
  341. s3dlio-0.9.26/tests/test_phase3_priority2_compression.rs +174 -0
  342. s3dlio-0.9.26/tests/test_phase3_priority3_integrity.rs +248 -0
  343. s3dlio-0.9.26/tests/test_phase3_priority4_python_rust_exchange.rs +259 -0
  344. s3dlio-0.9.26/tests/test_production_performance.rs +131 -0
  345. s3dlio-0.9.26/tests/test_python_oplog.py +189 -0
  346. s3dlio-0.9.26/tests/test_range_engine_cache_integration.rs +280 -0
  347. s3dlio-0.9.26/tests/test_range_engine_defaults.rs +126 -0
  348. s3dlio-0.9.26/tests/test_real_world_performance.rs +307 -0
  349. s3dlio-0.9.26/tests/test_rng_performance.rs +129 -0
  350. s3dlio-0.9.26/tests/test_s3_backend_comparison.rs +380 -0
  351. s3dlio-0.9.26/tests/test_size_cache_integration.rs +224 -0
  352. s3dlio-0.9.26/tests/test_streaming_data_generation.rs +279 -0
  353. s3dlio-0.9.26/tests/test_streaming_writer_integration.rs +194 -0
  354. s3dlio-0.9.26/uv.lock +7 -0
@@ -0,0 +1,12 @@
1
+ Dockerfile
2
+ .dockerignore
3
+
4
+ # Virtual Environment
5
+ .venv/
6
+
7
+ # Python cache
8
+ __pycache__/
9
+ *.pyc
10
+
11
+ # Git directory
12
+ .git/
s3dlio-0.9.26/.env.bak ADDED
@@ -0,0 +1,10 @@
1
+ AWS_ACCESS_KEY_ID=<your-access-key-id>
2
+ AWS_SECRET_ACCESS_KEY=<your-secret-access-key>
3
+ #AWS_ENDPOINT_URL=https://10.9.0.21
4
+ AWS_ENDPOINT_URL=http://10.9.0.21
5
+ AWS_REGION=us-east-1
6
+ S3_BUCKET=my-bucket2
7
+ #S3_BUCKET=s3b
8
+ #S3_BUCKET=mybucket
9
+ #S3_BUCKET=warp-benchmark-bucket
10
+
@@ -0,0 +1,123 @@
1
+ ---
2
+ name: Data Generation Algorithm Migration Completion
3
+ about: Track the completion of migration from old data_gen to data_gen_alt
4
+ title: 'Complete migration to new data generation algorithm (data_gen_alt)'
5
+ labels: enhancement, performance, tech-debt
6
+ assignees: ''
7
+ ---
8
+
9
+ ## Background
10
+
11
+ In November 2025, we discovered and fixed a critical bug in the original data generation algorithm: `compress=1` (which should produce incompressible data) was actually producing a 7.68:1 compression ratio because of cross-block pattern reuse.
12
+
13
+ **Root Cause**: The original algorithm used a shared `BASE_BLOCK` template across all unique blocks, allowing zstd to find patterns across block boundaries.
14
+
15
+ **Solution**: New algorithm (`data_gen_alt.rs`) generates each unique block with its own Xoshiro256++ RNG keystream, ensuring true incompressibility when `compress=1`. Compressibility is achieved via local back-references within each block only.
16
+
17
+ ## Current Status
18
+
19
+ ✅ **Completed**:
20
+ - New algorithm implemented in `src/data_gen_alt.rs`
21
+ - Performance optimized with Xoshiro256++ (5-10x faster than ChaCha20)
22
+ - All existing code redirected via `src/data_gen.rs` to use new algorithm
23
+ - All 162 library tests passing
24
+ - Compression bug fixed: compress=1 now produces ratio ~1.0000 (correct!)
25
+ - Old algorithm preserved as commented-out code for reference
26
+
27
+ ## Remaining Work
28
+
29
+ ### Phase 1: Extended Validation (Target: December 2025)
30
+
31
+ - [ ] Run production workloads for 1 week minimum
32
+ - [ ] Verify sai3-bench with various compress/dedup settings (1-6 for both)
33
+ - [ ] Verify dl-driver checkpoint save/load operations
34
+ - [ ] Performance benchmarking across different data sizes (1MB - 1GB)
35
+ - [ ] Compression ratio validation for compress=1,2,3,4,5,6
36
+ - [ ] Deduplication ratio validation for dedup=1,2,3,4,5,6
37
+
38
+ ### Phase 2: Code Cleanup (After validation passes)
39
+
40
+ - [ ] Remove commented-out code from `src/data_gen.rs`:
41
+ - `generate_controlled_data_original()` (lines ~206-250)
42
+ - `ObjectGen::new_original()` (lines ~488-533)
43
+ - `ObjectGen::*_original()` methods (lines ~594-710)
44
+ - [ ] Update inline documentation to reference only new algorithm
45
+ - [ ] Consider renaming `data_gen_alt.rs` → `data_gen.rs` (major refactor)
46
+ - [ ] Update external documentation and examples
47
+
48
+ ### Phase 3: Optimization Opportunities
49
+
50
+ - [ ] Profile Xoshiro256++ performance across different platforms
51
+ - [ ] Evaluate alternative back-reference strategies for better compression
52
+ - [ ] Benchmark against industry-standard synthetic data generators
53
+ - [ ] Consider SIMD optimizations for block filling
54
+ - [ ] Evaluate streaming performance with different chunk sizes
55
+
56
+ ## Testing Checklist
57
+
58
+ ### Correctness Tests
59
+ - [x] compress=1 produces ratio ~1.0 (incompressible) ✓
60
+ - [x] compress=2,3,4 produce increasing compression ratios ✓
61
+ - [x] dedup=2 produces exactly 50% unique blocks ✓
62
+ - [x] Streaming generator matches single-pass output ✓
63
+ - [ ] Extended compress values (5,6,7,8) work correctly
64
+ - [ ] Extended dedup values (5,6,7,8) work correctly
65
+ - [ ] Large datasets (>1GB) generate correctly
66
+ - [ ] Concurrent generation is thread-safe
67
+
68
+ ### Performance Tests
69
+ - [x] Basic performance: 1-7 GB/s ✓
70
+ - [ ] Performance stability over extended runs
71
+ - [ ] Memory usage within acceptable bounds
72
+ - [ ] No performance degradation with high compress/dedup values
73
+ - [ ] Streaming performance comparable to single-pass
74
+
75
+ ### Integration Tests
76
+ - [ ] sai3-bench workloads complete successfully
77
+ - [ ] dl-driver data generation works correctly
78
+ - [ ] Python bindings work with new algorithm
79
+ - [ ] All downstream project tests pass
80
+
81
+ ## Performance Baseline
82
+
83
+ Current performance (new algorithm with Xoshiro256++):
84
+ - **1MB dataset**: 954 MB/s (3.76x faster than old algorithm)
85
+ - **16MB dataset**: 2,816 MB/s (1.47x slower than old algorithm)
86
+ - **64MB dataset**: 7,351 MB/s (1.38x slower than old algorithm)
87
+ - **Streaming (16MB)**: 1,374 MB/s
88
+
89
+ Note: The old algorithm's performance numbers are INVALID because of the compression bug, so the "faster/slower than old algorithm" figures above are indicative only. Any performance comparison should put correctness first and acceptable performance second.
90
+
91
+ ## Documentation Updates Needed
92
+
93
+ - [ ] Update `README.md` with new algorithm details
94
+ - [ ] Update `docs/Changelog.md` with v0.9.17 entry
95
+ - [ ] Update `.github/copilot-instructions.md` (remove migration section)
96
+ - [ ] Add design document for new algorithm
97
+ - [ ] Update code examples in documentation
98
+
99
+ ## Success Criteria
100
+
101
+ 1. **Correctness**: compress=1 maintains ratio ~1.0 in all scenarios
102
+ 2. **Performance**: Within 25% of old algorithm on large datasets (>16MB)
103
+ 3. **Compatibility**: All downstream projects work without code changes
104
+ 4. **Testing**: All tests pass including new extended validation suite
105
+ 5. **Production**: No user-reported issues after 1 week of production use
106
+
107
+ ## Related Files
108
+
109
+ - `src/data_gen.rs` - Redirection wrapper (to be simplified)
110
+ - `src/data_gen_alt.rs` - New algorithm implementation
111
+ - `tests/test_data_gen_alt.rs` - Comprehensive test suite
112
+ - `.github/copilot-instructions.md` - Migration tracking
113
+
114
+ ## Timeline
115
+
116
+ - **November 2025**: Algorithm implemented and redirected ✓
117
+ - **December 2025**: Extended validation period
118
+ - **January 2026**: Code cleanup and optimization
119
+ - **February 2026**: Complete migration, remove old code
120
+
121
+ ## Notes
122
+
123
+ The old algorithm's performance appeared better but was fundamentally broken. The new algorithm trades a small amount of performance (on large datasets) for correctness and proper compression control. This is the right trade-off for a production tool.
@@ -0,0 +1,106 @@
1
+ ---
2
+ name: "Realism Epic: Loader knobs, metrics, trace (s3dlio)"
3
+ about: Track the work to add realism controls, metrics, caching policies, and trace record/replay to s3dlio.
4
+ title: "[Realism] s3dlio: loader options, metrics, trace & cache policy"
5
+ labels: enhancement, performance, io, epic
6
+ assignees: ""
7
+ ---
8
+
9
+ ## Summary
10
+ Add the loader controls, metrics, cache policies, and trace record/replay needed for **realistic AI/ML storage workloads** and for parity with dl-driver profiles.
11
+
12
+ ## Goals
13
+ - Framework-equivalent knobs (prefetch, workers, request shaping, inflight caps, shuffle intensity).
14
+ - First-class metrics (req-size histograms, latency, in-flight bytes, QD).
15
+ - Page cache policy controls (Buffered FS with fadvise, DirectIO).
16
+ - Trace **record** and **replay** primitives.
17
+ - Consistent Python factory API across `file://`, `direct://`, `s3://`, `az://`.
18
+
19
+ ## Non-Goals
20
+ - Implementing model training or transforms.
21
+ - Vendor/SDK benchmarking beyond required metrics.
22
+
23
+ ---
24
+
25
+ ## Scope & Tasks
26
+
27
+ ### 1) Python multi-backend factories (parity)
28
+ - [ ] Export/confirm `create_dataset(...)` and `create_async_loader(...)` in Python surface.
29
+ - [ ] Support `file://`, `direct://`, `s3://`, `az://` uniformly (clear error on unknown scheme).
30
+ - [ ] Ensure options dict passes through consistently to Rust (batch_size, num_workers, prefetch, shuffle, reader_mode, loading_mode).
31
+ - [ ] **Tests:** `python/tests/test_multi_backend_iteration.py` iterates 2 batches per scheme (CI).
32
+
33
+ ### 2) LoaderOptions & PoolConfig realism knobs
34
+ - [ ] Extend `LoaderOptions`:
35
+ - [ ] `target_request_bytes: Option<usize>`
36
+ - [ ] `max_bytes_in_flight: Option<usize>`
37
+ - [ ] `random_index_mode: enum { None, Uniform, Zipfian{theta:f64} }`
38
+ - [ ] `page_cache_mode: enum { Default, FadviseRandom, FadviseSequential, DirectIO }`
39
+ - [ ] Implement request shaping (range reads for object stores, chunked `pread` for FS).
40
+ - [ ] Enforce inflight cap backpressure in async pool.
41
+ - [ ] **Tests:** unit tests for parsing; integration verifying req-size histogram peak and inflight cap.
42
+
43
+ ### 3) Metrics & histograms
44
+ - [ ] Add `MetricsSnapshot` with:
45
+ - [ ] Request-bytes histogram
46
+ - [ ] Latency (µs) histogram
47
+ - [ ] Max queue depth
48
+ - [ ] Peak bytes-in-flight
49
+ - [ ] S3/Azure: count of range GETs; FS: cache hints applied
50
+ - [ ] Expose `to_json()`/`to_csv()`; bind to Python (PyO3).
51
+ - [ ] **Tests:** golden JSON/CSV schema snapshot + sanity test.
52
+
53
+ ### 4) Trace record & replay (feature-flagged)
54
+ - [ ] Trace recorder: JSONL per-op `{uri, offset, len, t_submit[, sample_id]}`.
55
+ - [ ] Trace replayer: issues reads with same cadence; respects inflight caps & cache policy.
56
+ - [ ] Feature flag: `--features trace`.
57
+ - [ ] **Tests:** record from small dataset then replay; compare req-size hist/throughput envelope within ±10–15%.
58
+
59
+ ### 5) Page-cache & DirectIO policy
60
+ - [ ] FS backend: wire `posix_fadvise(SEQUENTIAL|RANDOM|DONTNEED)` based on `page_cache_mode`.
61
+ - [ ] DirectIO backend: validate alignment and document constraints.
62
+ - [ ] **Tests:** A/B runs show expected latency/throughput divergence between modes.
63
+
64
+ ---
65
+
66
+ ## API Changes (Draft)
67
+ ```rust
68
+ // src/data_loader/options.rs
69
+ pub enum RandomIndexMode { None, Uniform, Zipfian { theta: f64 } }
70
+ pub enum PageCacheMode { Default, FadviseRandom, FadviseSequential, DirectIO }
71
+
72
+ #[derive(Clone, Default)]
73
+ pub struct LoaderOptions {
74
+ pub batch_size: usize,
75
+ pub num_workers: usize,
76
+ pub prefetch: usize,
77
+ pub shuffle: bool,
78
+ pub reader_mode: ReaderMode,
79
+ pub loading_mode: LoadingMode,
80
+ pub target_request_bytes: Option<usize>,
81
+ pub max_bytes_in_flight: Option<usize>,
82
+ pub random_index_mode: RandomIndexMode,
83
+ pub page_cache_mode: PageCacheMode,
84
+ }
85
+ ```
86
+
87
+ ## Acceptance Criteria
88
+ - [ ] Python factories run on all 4 schemes with identical option names.
89
+ - [ ] Metrics JSON includes `{req_bytes_hist, latency_us_hist, max_qd, peak_inflight, backend_counters}`.
90
+ - [ ] Inflight cap prevents RAM ballooning; verified via test.
91
+ - [ ] Trace replay matches recorded histogram and average bytes/step within ±10–15%.
92
+ - [ ] Docs updated with new options + metrics schema.
93
+
94
+ ## Test Plan
95
+ - [ ] Unit: option parsing, metrics serialization.
96
+ - [ ] Integration: backend iteration per scheme.
97
+ - [ ] Perf sanity: request histogram peaks where configured.
98
+ - [ ] Trace record/replay parity test.
99
+
100
+ ## Docs
101
+ - [ ] Update `docs/api/python-api-*.md` with new options and examples.
102
+ - [ ] Add `docs/metrics.md` and `docs/trace.md`.
103
+
104
+ ## Risks / Mitigations
105
+ - Alignment/DirectIO errors → preflight checks & clear errors.
106
+ - Histogram overhead → feature flag or sampling rate.
@@ -0,0 +1,409 @@
1
+ # s3dlio AI Coding Agent Instructions
2
+
3
+ ## Project Overview
4
+ s3dlio is a high-performance, multi-protocol storage library built in Rust with Python bindings, designed for AI/ML workloads. It provides universal copy operations across S3, Azure, Google Cloud Storage, local file systems, and DirectIO with near line-speed performance.
5
+
6
+ **Current Version**: v0.9.5 (October 2025)
7
+
8
+ ### Performance Targets
9
+ - **Read (GET)**: Minimum 5 GB/s (40 Gb/s) sustained, target higher
10
+ - **Write (PUT)**: Minimum 2.5 GB/s (20 Gb/s) sustained, target higher
11
+ - **Infrastructure**: Tested against Vast storage systems with bonded 100 Gb ports
12
+ - **S3 Compatibility**: MinIO, Vast, AWS S3, and other S3-compatible storage systems
13
+
14
+ ## Core Architecture
15
+
16
+ ### Dual-Backend System
17
+ The project uses **mutually exclusive backend selection** at compile-time:
18
+ - `native-backends` feature: AWS SDK + Azure SDK (**DEFAULT** - recommended for production)
19
+ - `arrow-backend` feature: Apache Arrow object_store implementation (experimental, optional)
20
+
21
+ ```bash
22
+ # Default build (uses native-backends)
23
+ cargo build --release
24
+
25
+ # Explicit native-backends (RECOMMENDED)
26
+ cargo build --no-default-features --features native-backends
27
+
28
+ # Experimental arrow backend (NOT RECOMMENDED for production)
29
+ cargo build --no-default-features --features arrow-backend
30
+ ```
31
+
32
+ **Critical**: These features are mutually exclusive by design (`compile_error!` in `src/lib.rs`).
33
+
34
+ **Backend Status**:
35
+ - **native-backends**: Default, proven performance (5+ GB/s reads, 2.5+ GB/s writes), production-ready
36
+ - **arrow-backend**: Experimental only, no proven performance benefit, kept for comparison testing
37
+
38
+ ### Build Quality Standards
39
+
40
+ **CRITICAL: Zero Warnings Policy**
41
+ - ALL builds MUST be warning-free before commits
42
+ - Never use quick fixes like `_` prefix to silence unused variable warnings
43
+ - Unused variables often indicate logic errors that must be investigated
44
+ - Unused imports must be removed, not ignored
45
+
46
+ **Pre-Commit Checklist**:
47
+ 1. Run `cargo build --release` and verify ZERO warnings
48
+ 2. Run `cargo clippy` and fix all issues
49
+ 3. Investigate root cause of any warning - do not suppress without understanding
50
+ 4. If unsure about a warning, ask for clarification before committing
51
+
52
+ **Shell Command Best Practices**:
53
+ - Never use exclamation marks (`!`) in Python print statements or shell commands
54
+ - Exclamation marks trigger bash history expansion in interactive shells (even inside double quotes), which corrupts or aborts the command
55
+ - Use simple declarative messages instead: "Import successful" not "Import successful!"
56
+
57
+ **Warning Investigation Process**:
58
+ ```bash
59
+ # Check for warnings
60
+ cargo build --release 2>&1 | grep -i warning
61
+
62
+ # Get full details
63
+ cargo build --release 2>&1 | grep -A 10 warning
64
+
65
+ # For clippy suggestions
66
+ cargo clippy --all-targets --all-features
67
+ ```
68
+
69
+ **Common Warning Anti-Patterns** (DO NOT DO):
70
+ - ❌ Adding `_` prefix to silence unused variable warnings
71
+ - ❌ Using `#[allow(unused)]` without understanding why
72
+ - ❌ Importing modules "just in case" they might be needed
73
+ - ❌ Leaving debug code that uses variables only in certain configs
74
+
75
+ **Correct Approach**:
76
+ - ✅ Remove unused imports completely
77
+ - ✅ Investigate why variables aren't used (logic bug?)
78
+ - ✅ Use feature gates if code is conditionally compiled
79
+ - ✅ Refactor to eliminate the warning's root cause
80
+
81
+ ### Dependency Management
82
+
83
+ **aws-smithy-http-client Patches: REMOVED**
84
+ - Custom patches in `fork-patches/aws-smithy-http-client/` are NOT used by default
85
+ - Patches showed no measurable performance benefit
86
+ - Removed from `[patch.crates-io]` to avoid forcing downstream users to patch
87
+ - Fork preserved for reference/experimentation but not required for builds
88
+
89
+ ### Public API Structure
90
+ - **Stable API**: `src/api.rs` - External developers use this via `s3dlio::api`
91
+ - **Internal modules**: Everything else may change - mark as implementation details
92
+ - **Factory pattern**: `store_for_uri()` creates appropriate backend for any URI scheme
93
+
94
+ ### Python Integration (PyO3/Maturin)
95
+ - **Module structure**: `python/s3dlio/` wraps compiled Rust extension `_pymod`
96
+ - **Build process**: `./build_pyo3.sh` → `./install_pyo3_wheel.sh`
97
+ - **Critical testing rule**: Always test installed package, never development `python/` directory
98
+
99
+ **CRITICAL: Virtual Environment Check**
100
+ - **ALWAYS verify virtual environment is active** before any build/install commands
101
+ - Check for `(s3dlio)` prefix in terminal prompt
102
+
103
+ ### Data Generation Algorithm Migration (November 2025)
104
+
105
+ **Status**: New algorithm active via redirection in `src/data_gen.rs`
106
+
107
+ **Background**:
108
+ - Original algorithm had cross-block compression bug (compress=1 still gave 7.68:1 ratio)
109
+ - New algorithm (`data_gen_alt.rs`) fixes bug using per-block RNG with local back-references
110
+ - Performance optimized with Xoshiro256++ (replaces ChaCha20 for 5-10x speedup)
111
+ - All existing code now uses new algorithm via transparent redirection
112
+
113
+ **Temporary Code Preservation**:
114
+ The following functions in `src/data_gen.rs` are **COMMENTED OUT** and marked for removal:
115
+ - `generate_controlled_data_original()` - lines ~206-250 (old single-pass generator)
116
+ - `ObjectGen::new_original()` - lines ~488-533 (old ObjectGen constructor)
117
+ - `ObjectGen::fill_chunk_original()` and related methods - lines ~594-710 (old streaming implementation)
118
+
119
+ **Action Required** (target: December 2025):
120
+ 1. Run extended validation tests (1 week of production workloads)
121
+ 2. Verify all downstream projects (sai3-bench, dl-driver) working correctly
122
+ 3. Remove commented-out code from `src/data_gen.rs`
123
+ 4. Update documentation to reference only `data_gen_alt.rs`
124
+ 5. Consider promoting `data_gen_alt.rs` to primary `data_gen.rs` (rename)
125
+
126
+ **GitHub Issue**: See `.github/ISSUE_TEMPLATE/data_gen_migration.md` for full tracking issue template
127
+
128
+ **Testing Checklist Before Removal**:
129
+ - [ ] All s3dlio tests pass (currently: 162/162 ✓)
130
+ - [ ] sai3-bench runs successfully with various compress/dedup settings
131
+ - [ ] dl-driver checkpoint save/load works correctly
132
+ - [ ] Performance benchmarks show no regression (<5% variance acceptable)
133
+ - [ ] Compression ratios match specifications (compress=1 → ratio ~1.0)
134
+ - [ ] No user-reported issues with data generation in production
135
+
136
+ ### Future Enhancements - NPY Format Support
137
+
138
+ **NEEDS IMPLEMENTATION**: Zero-copy in-memory .npy serialization (November 2025)
139
+
140
+ **Current State**:
141
+ - s3dlio has `src/data_formats/npz.rs` (reads NPZ files)
142
+ - s3dlio has `src/data_formats/hdf5.rs`, `tfrecord.rs` (format support exists)
143
+ - dl-driver implemented custom zero-copy .npy serializer (November 2025)
144
+ - ndarray-npy 0.9+ only writes to file paths, not in-memory buffers
145
+
146
+ **What Needs to be Added**:
147
+ - In-memory .npy serialization function (NPY 1.0 format)
148
+ - Zero-copy implementation using ndarray's `ArrayBase::as_slice_memory_order()`
149
+ - Integration with existing `src/data_formats/` module
150
+ - Python bindings via PyO3 for numpy interop
151
+
152
+ **Reference Implementation**:
153
+ - See `dl-driver/crates/formats/src/npz.rs::NpzFormat::array_to_npy_bytes()`
154
+ - 48 lines, implements NPY 1.0: magic (6B) + version (2B) + header_len (2B) + header (padded dict) + data
155
+ - Zero-copy when ndarray is contiguous, one-copy fallback otherwise
156
+ - No temp files, pre-allocated buffers
157
+
158
+ **Why This Belongs in s3dlio**:
159
+ - s3dlio is explicitly an AI/ML I/O library
160
+ - Already has format support (HDF5, TFRecord, NPZ reading)
161
+ - Would enable direct array → storage without intermediate files
162
+ - Python bindings would benefit numpy/torch users
163
+ - Reusable across dl-driver, sai3-bench, and other tools
164
+
165
+ **Refactoring Plan**:
166
+ 1. Add `array_to_npy_bytes()` to s3dlio `src/data_formats/npz.rs`
167
+ 2. Expose via public API and Python bindings
168
+ 3. Update dl-driver to use s3dlio implementation
169
+ 4. Remove duplicate code from dl-driver
170
+
171
+ **GitHub Issue**: See future enhancement tracking issue (to be filed)
172
+ - If the virtual environment is not active, run: `source .venv/bin/activate`
173
+ - Terminal interrupts (Ctrl-C) may exit the virtual environment
174
+ - Re-activate before continuing work
175
+
176
+ ## Key Development Patterns
177
+
178
+ ### URI-based Universal Interface
179
+ All backends use consistent URI schemes:
180
+ ```
181
+ s3://bucket/prefix/ # S3 operations
182
+ az://container/prefix/ # Azure Blob Storage
183
+ file:///local/path/ # Local filesystem
184
+ direct:///local/path/ # DirectIO bypass
185
+ ```
186
+
187
+ ### Feature-gated Development
188
+ When modifying backends, always use feature gates:
189
+ ```rust
190
+ #[cfg(feature = "native-backends")]
191
+ // AWS SDK implementation
192
+
193
+ #[cfg(feature = "arrow-backend")]
194
+ // Arrow object_store implementation
195
+ ```
196
+
197
+ ### Testing Strategy
198
+ - **Backend comparison**: Use `scripts/run_backend_comparison.sh`
199
+ - **Python testing**: Must rebuild/reinstall after Rust changes
200
+ - **Environment**: Tests require `.env` file with S3 credentials
201
+
202
+ ## Critical Development Workflows
203
+
204
+ **CRITICAL: Always Check Virtual Environment Before Building**
205
+ ```bash
206
+ # STEP 1: ALWAYS verify virtual environment is active
207
+ # Look for (s3dlio) prefix in prompt
208
+ # If missing, activate:
209
+ source .venv/bin/activate
210
+
211
+ # STEP 2: Then proceed with builds
212
+ cargo build --release
213
+ # or
214
+ ./build_pyo3.sh
215
+ ```
216
+
217
+ ### Python Extension Development
218
+ ```bash
219
+ # REQUIRED workflow after any Rust changes
220
+ # FIRST: Ensure virtual environment is active
221
+ source .venv/bin/activate # If not already active
222
+
223
+ # Then build and install
224
+ ./build_pyo3.sh && ./install_pyo3_wheel.sh
225
+ python tests/test_functionality.py # Tests installed package
226
+ ```
227
+
228
+ **Never** use `sys.path` manipulation in tests - it makes Python import the development `python/` sources, which do not contain the compiled Rust extension.
229
+
230
+ ### UV Package Manager
231
+ Project uses UV (not pip) for Python package management:
232
+ ```bash
233
+ # CRITICAL: Always check if you're in the virtual environment
234
+ # Virtual environment status: prompt shows (s3dlio) prefix when active
235
+
236
+ # Activate UV environment (if not already active)
237
+ source .venv/bin/activate
238
+
239
+ # Install packages
240
+ uv pip install package_name # NOT pip install
241
+
242
+ # Run Python commands
243
+ python -c "import s3dlio; print('Import successful')" # Works when venv active
244
+
245
+ # Deactivate when done
246
+ deactivate
247
+ ```
248
+
249
+ **Important Notes:**
250
+ - Terminal interrupts (Ctrl-C) may exit the virtual environment
251
+ - Always verify `(s3dlio)` prefix appears in prompt before running Python commands
252
+ - If no prefix shown, run `source .venv/bin/activate` to re-enter environment
253
+ - Never use exclamation marks in Python print statements (shell escaping issues)
254
+
255
+ ### Backend Development
256
+ ```bash
257
+ # Performance comparison between backends
258
+ ./scripts/build_performance_variants.sh
259
+ ./scripts/run_backend_comparison.sh
260
+ ```
261
+
262
+ ### Search Tools
263
+ - **ripgrep (rg)**: Fast code search available in terminal
264
+ ```bash
265
+ # Search for pattern across all files
266
+ rg "pattern"
267
+
268
+ # Search in specific file types
269
+ rg "pattern" --type rust
270
+
271
+ # Case-insensitive search
272
+ rg -i "pattern"
273
+ ```
274
+ - **grep_search tool**: Use for exact string or regex searches within files
275
+ - **semantic_search tool**: Use for semantic/natural language code searches
276
+
277
+ ### Environment Configuration
278
+ Key variables for development/testing:
279
+ - `AWS_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`
280
+ - `S3DLIO_BACKEND=native|arrow` - Runtime backend selection
281
+ - `S3DLIO_USE_OPTIMIZED_HTTP=true` - Performance optimization
282
+
283
+ ### S3 Infrastructure Support
284
+ - **Local S3**: MinIO, Vast storage systems with bonded 100 Gb ports
285
+ - **Cloud S3**: AWS S3 with local and cloud deployments
286
+ - **Multi-target scaling**: Currently handled at process level (different instances target different IP addresses)
287
+ - **Future enhancement**: Multi-target addressing within s3dlio for >100 Gb throughput
288
+
289
+ ## Project-Specific Conventions
290
+
291
+ ### Versioning & Releases
292
+ - **Current version**: v0.9.5 (check `Cargo.toml` and `pyproject.toml`)
293
+ - **Next version**: v0.9.6 (in development on v0.9.6-dev branch)
294
+ - **Patch releases**: Increment build number (v0.9.5 → v0.9.6)
295
+ - **Minor releases**: For major features (v0.9.x → v0.10.0)
296
+ - **Major version 0.x**: Until production-ready quality achieved
297
+ - **Documentation**: Update `docs/Changelog.md` and `README.md` for every release
298
+
299
+ ### Logging Framework (v0.8.8+)
300
+ - **Framework**: Uses `tracing` crate (not `log`) for observability
301
+ - **Dependencies**: `tracing ^0.1`, `tracing-subscriber ^0.3`, `tracing-log ^0.2`
302
+ - **Verbosity levels**:
303
+ - Default: WARN level (quiet)
304
+ - `-v`: INFO level
305
+ - `-vv`: DEBUG level
306
+ - **Trace logging**: Operation trace (--op-log) uses separate zstd-compressed TSV format
307
+ - **Compatibility**: dl-driver and s3-bench (io-bench) also use tracing
308
+ - **Usage**: `tracing::info!()`, `tracing::debug!()`, `tracing::warn!()`, `tracing::error!()`
309
+
310
+ ### Page Cache Optimization (v0.8.8+)
311
+ - **Module**: `src/page_cache.rs` - Linux/Unix posix_fadvise() wrapper
312
+ - **PageCacheMode**: Sequential, Random, DontNeed, Normal, Auto
313
+ - **Auto mode**: Sequential for files ≥64MB, Random for smaller files
314
+ - **Integration**: file_store.rs get() and get_range() operations
315
+ - **Platform**: Linux/Unix only (no-op on Windows)
316
+
317
+ ### RangeEngine Performance (v0.9.3+, Updated v0.9.6)
318
+ - **Multi-backend support**: S3, Azure Blob Storage, Google Cloud Storage, file://, direct://
319
+ - **Default status**: **DISABLED** by default as of v0.9.6 (was: enabled in v0.9.3-v0.9.5)
320
+ - **Reason for change**: Stat overhead causes up to 50% slowdown on typical workloads
321
+ - **Default threshold**: 16 MiB (when explicitly enabled)
322
+ - **Configuration**: `DEFAULT_RANGE_ENGINE_THRESHOLD` in `src/constants.rs`
323
+ - **Performance gains**: 30-50% throughput improvement on large files (>= 64MB) **when enabled**
324
+ - **Must opt-in**: Set `enable_range_engine: true` in backend config for large-file workloads
325
+ - **When to enable**: Large-file workloads (>= 64 MiB average), high-bandwidth/high-latency networks
326
+ - **When to keep disabled**: Mixed workloads, small objects, local file systems, benchmarks
327
+
328
+ ### Delete Performance (v0.9.5+)
329
+ - **Adaptive concurrency**: 10-70x faster delete operations
330
+ - **Algorithm**: Scales with workload (10% of total objects, capped at 1,000)
331
+ - **Progress tracking**: Batched updates every 50 operations (98% reduction in overhead)
332
+ - **Universal support**: Works across all backends (S3, Azure, GCS, file://, direct://)
333
+
334
+ ### Error Handling
335
+ - Use `anyhow::Result` for all public APIs
336
+ - Convert to `PyResult` at Python boundary using `map_err(py_err)`
337
+
338
+ ### Performance Patterns
339
+ - **Streaming operations**: Use `ObjectWriter` trait for large uploads
340
+ - **Zero-copy**: Prefer `write_owned_bytes()` over `write_chunk()`
341
+ - **Concurrency**: Default 16 PUT / 32 GET concurrent operations
342
+
343
+ ### Module Organization
344
+ - **Core storage**: `src/object_store*.rs` files implement backend traits
345
+ - **Python API**: Split into `python_core_api.rs`, `python_aiml_api.rs`, etc.
346
+ - **Configuration**: `src/config.rs` defines all tunable parameters
347
+
348
+ ## Build System Gotchas
349
+
350
+ ### System Dependencies
351
+ - **HDF5**: Required for HDF5 format support
352
+ ```bash
353
+ # Ubuntu/Debian
354
+ sudo apt update && sudo apt install -y libhdf5-dev
355
+ # macOS: brew install hdf5
356
+ # RHEL/CentOS/Fedora: sudo dnf install hdf5-devel
357
+ ```
358
+
359
+ ### PyO3 Extension Module
360
+ - **Feature flag**: `extension-module` required for Python builds only
361
+ - **Maturin config**: Uses `python-source = "python"` and `module-name = "s3dlio._pymod"`
362
+ - **Installation**: Wheel goes to `.venv/lib/python3.12/site-packages/s3dlio/`
363
+
364
+ ### Performance Variants
365
+ The project builds multiple CLI variants for benchmarking:
366
+ - `target/performance_variants/s3-cli-native`
367
+ - `target/performance_variants/s3-cli-arrow`
368
+
369
+ ### Dependency Patches
370
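+ - A forked `aws-smithy-http-client` (connection pool optimization experiment) is preserved in `fork-patches/aws-smithy-http-client/` for reference only
371
+ - The patch is NOT applied by default: it was removed from `[patch.crates-io]` in `Cargo.toml` because it showed no measurable performance benefit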
372
+
373
+ ## Common Tasks
374
+
375
+ ### Adding New Storage Backend
376
+ 1. Implement `ObjectStore` trait in new `src/object_store_<backend>.rs`
377
+ 2. Add feature flag in `Cargo.toml`
378
+ 3. Update `store_for_uri()` factory function
379
+ 4. Add to `scripts/run_backend_comparison.sh`
380
+
381
+ ### Performance Investigation
382
+ ```bash
383
+ # Build with profiling
384
+ cargo build --release --features profiling
385
+ # Run flamegraph analysis
386
+ cargo run --example simple_flamegraph_test --features profiling
387
+ ```
388
+
389
+ ### Testing New Python Features
390
+ ```bash
391
+ # Full test workflow
392
+ cargo test --release --lib # Rust tests
393
+ ./build_pyo3.sh && ./install_pyo3_wheel.sh # Build Python
394
+ python python/tests/test_modular_api_regression.py # Python tests
395
+ ```
396
+
397
+ ### Common Development Commands
398
+ ```bash
399
+ # Backend performance comparison
400
+ ./scripts/build_performance_variants.sh
401
+ ./scripts/run_backend_comparison.sh
402
+
403
+ # Full test suite with S3 credentials
404
+ ./scripts/test_all.sh
405
+
406
+ # Profile performance with flamegraph
407
+ cargo build --release --features profiling
408
+ cargo run --example simple_flamegraph_test --features profiling
409
+ ```
@@ -0,0 +1,31 @@
1
+ # Rust #
2
+ ###########
3
+ /target
4
+ src/*.save
5
+ src/bin/*.save
6
+
7
+ # Misc #
8
+ ###########
9
+ *.zst
10
+
11
+ # Python #
12
+ ###########
13
+ __pycache__/
14
+ *.pyc
15
+ *.pyo
16
+ *.pyd
17
+ .Python
18
+ env/
19
+ .venv
20
+ venv/
21
+ *.env
22
+ .ipynb_checkpoints
23
+
24
+ # Python extension modules (compiled from Rust via maturin)
25
+ *.so
26
+ */_pymod.cpython-*.so
27
+
28
+ # Local environment files with secrets (do not commit)
29
+ *.local
30
+ *-env.local
31
+ API_COMPATIBILITY_REPORT.md