arraymorph 0.2.0b2__cp313-cp313-macosx_15_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arraymorph/__init__.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ArrayMorph - HDF5 VOL connector for cloud object storage.
|
|
3
|
+
|
|
4
|
+
Supports AWS S3 and Azure Blob Storage via HDF5's Virtual Object Layer.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
# Keep in sync with the version declared in the distribution metadata; this
# build is published as the pre-release 0.2.0b2, not the final 0.2.0.
__version__ = "0.2.0b2"

# The compiled VOL plugin lives next to this file after installation
_PLUGIN_DIR = str(Path(__file__).parent / "lib")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_plugin_path() -> str:
    """Return the directory that holds the ArrayMorph VOL plugin (.so/.dylib).

    The returned string is the module-level plugin directory computed at
    import time. A typical use is exporting it as HDF5_PLUGIN_PATH:

        >>> import os
        >>> import arraymorph
        >>> os.environ["HDF5_PLUGIN_PATH"] = arraymorph.get_plugin_path()
    """
    plugin_dir: str = _PLUGIN_DIR
    return plugin_dir
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def enable() -> None:
    """Configure HDF5 environment variables to use ArrayMorph.

    Places the bundled plugin directory at the front of HDF5_PLUGIN_PATH and
    sets HDF5_VOL_CONNECTOR to "arraymorph" so that subsequent h5py calls
    route through the ArrayMorph VOL connector.

    Directories already listed in HDF5_PLUGIN_PATH (for example for HDF5
    filter plugins) are kept after the ArrayMorph directory instead of being
    discarded, and calling this function repeatedly does not add duplicate
    entries.

    NOTE(review): HDF5 appears to read these variables when the library is
    initialised, so call this before importing h5py, as the example below
    does - confirm against the HDF5 version in use.

    Usage:
        >>> import arraymorph
        >>> arraymorph.enable()
        >>> import h5py
        >>> f = h5py.File("s3://bucket/data.h5", "r")
    """
    # Preserve any plugin directories the user configured rather than
    # overwriting them; drop empty segments and an earlier copy of our own
    # directory so the value stays clean across repeated calls.
    existing = os.environ.get("HDF5_PLUGIN_PATH", "")
    others = [
        entry
        for entry in existing.split(os.pathsep)
        if entry and entry != _PLUGIN_DIR
    ]
    os.environ["HDF5_PLUGIN_PATH"] = os.pathsep.join([_PLUGIN_DIR, *others])
    os.environ["HDF5_VOL_CONNECTOR"] = "arraymorph"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def configure_s3(
|
|
45
|
+
bucket: str,
|
|
46
|
+
access_key: str = "",
|
|
47
|
+
secret_key: str = "",
|
|
48
|
+
endpoint: str | None = None,
|
|
49
|
+
region: str = "us-east-2",
|
|
50
|
+
use_tls: bool = False,
|
|
51
|
+
addressing_style: bool = False,
|
|
52
|
+
use_signed_payloads: bool = False,
|
|
53
|
+
) -> None:
|
|
54
|
+
"""Configure AWS S3 credentials and client behavior for ArrayMorph.
|
|
55
|
+
|
|
56
|
+
Sets the environment variables read by the VOL connector's S3 client
|
|
57
|
+
at initialization time. Call this before any h5py file operations.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
bucket: Name of the S3 bucket where HDF5 files are stored.
|
|
61
|
+
Maps to: BUCKET_NAME
|
|
62
|
+
access_key: Access key ID for authentication with the S3 service.
|
|
63
|
+
Maps to: AWS_ACCESS_KEY_ID
|
|
64
|
+
secret_key: Secret access key paired with access_key for authentication.
|
|
65
|
+
Maps to: AWS_SECRET_ACCESS_KEY
|
|
66
|
+
endpoint: Custom S3-compatible endpoint URL (e.g. 'http://localhost:3900').
|
|
67
|
+
When None, the S3 client targets the default AWS endpoint. Required
|
|
68
|
+
for any non-AWS S3-compatible object store (MinIO, Ceph, etc.).
|
|
69
|
+
Maps to: AWS_ENDPOINT_URL_S3
|
|
70
|
+
region: Region label used in SigV4 request signing. Must match the region
|
|
71
|
+
your bucket or S3-compatible store is configured with — a mismatch
|
|
72
|
+
produces signature validation errors. Defaults to 'us-east-2'.
|
|
73
|
+
Maps to: AWS_REGION
|
|
74
|
+
use_tls: Whether to use HTTPS (True) or HTTP (False) for S3 connections.
|
|
75
|
+
Set to False for object stores that do not have TLS configured.
|
|
76
|
+
Defaults to False.
|
|
77
|
+
Maps to: AWS_USE_TLS
|
|
78
|
+
addressing_style: URL addressing style for the S3 client. When True,
|
|
79
|
+
uses path-style ('endpoint/bucket/key'). When False, uses
|
|
80
|
+
virtual-hosted style ('bucket.endpoint/key'), which can cause the
|
|
81
|
+
S3 client to misinterpret the HDF5 filename as the bucket name.
|
|
82
|
+
Most S3-compatible stores require path-style addressing.
|
|
83
|
+
Defaults to False.
|
|
84
|
+
Maps to: AWS_S3_ADDRESSING_STYLE
|
|
85
|
+
use_signed_payloads: Whether to include the request body in the SigV4
|
|
86
|
+
signature (PayloadSigningPolicy::Always). Some S3-compatible stores
|
|
87
|
+
require signed payloads and will reject requests with signature
|
|
88
|
+
validation errors if this is disabled. Defaults to False.
|
|
89
|
+
Maps to: AWS_SIGNED_PAYLOADS
|
|
90
|
+
|
|
91
|
+
Example:
|
|
92
|
+
>>> import arraymorph
|
|
93
|
+
>>> arraymorph.configure_s3(
|
|
94
|
+
... bucket="my-bucket",
|
|
95
|
+
... access_key="my-access-key",
|
|
96
|
+
... secret_key="my-secret-key",
|
|
97
|
+
... endpoint="http://localhost:3900",
|
|
98
|
+
... region="us-east-1",
|
|
99
|
+
... use_tls=False,
|
|
100
|
+
... addressing_style=True,
|
|
101
|
+
... use_signed_payloads=True,
|
|
102
|
+
... )
|
|
103
|
+
>>> arraymorph.enable()
|
|
104
|
+
"""
|
|
105
|
+
if not (access_key and secret_key):
|
|
106
|
+
raise ValueError(
|
|
107
|
+
"configure_s3() requires both 'access_key' and 'secret_key'. "
|
|
108
|
+
"Set them explicitly or export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY "
|
|
109
|
+
"before calling this function."
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
os.environ["AWS_ACCESS_KEY_ID"] = access_key
|
|
113
|
+
os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key
|
|
114
|
+
os.environ["STORAGE_PLATFORM"] = "S3"
|
|
115
|
+
os.environ["BUCKET_NAME"] = bucket
|
|
116
|
+
os.environ["AWS_REGION"] = region
|
|
117
|
+
|
|
118
|
+
if endpoint:
|
|
119
|
+
os.environ["AWS_ENDPOINT_URL_S3"] = endpoint
|
|
120
|
+
|
|
121
|
+
os.environ["AWS_USE_TLS"] = str(use_tls).lower()
|
|
122
|
+
os.environ["AWS_S3_ADDRESSING_STYLE"] = "path" if addressing_style else "virtual"
|
|
123
|
+
os.environ["AWS_SIGNED_PAYLOADS"] = str(use_signed_payloads).lower()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def configure_azure(
|
|
127
|
+
container: str,
|
|
128
|
+
connection_string: str | None = None,
|
|
129
|
+
) -> None:
|
|
130
|
+
"""Configure Azure Blob Storage credentials for ArrayMorph.
|
|
131
|
+
|
|
132
|
+
Sets the environment variables read by the VOL connector's Azure client
|
|
133
|
+
at initialization time. Call this before any h5py file operations.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
container: Name of the Azure Blob Storage container where HDF5 files
|
|
137
|
+
are stored. Maps to: BUCKET_NAME
|
|
138
|
+
connection_string: Azure Storage connection string used to authenticate
|
|
139
|
+
and locate the storage account. If None, the connector will fall back
|
|
140
|
+
to the existing AZURE_STORAGE_CONNECTION_STRING environment variable.
|
|
141
|
+
Maps to: AZURE_STORAGE_CONNECTION_STRING
|
|
142
|
+
|
|
143
|
+
Example:
|
|
144
|
+
>>> import arraymorph
|
|
145
|
+
>>> arraymorph.configure_azure(
|
|
146
|
+
... container="my-container",
|
|
147
|
+
... connection_string="DefaultEndpointsProtocol=https;AccountName=...",
|
|
148
|
+
... )
|
|
149
|
+
>>> arraymorph.enable()
|
|
150
|
+
"""
|
|
151
|
+
if not connection_string and not os.environ.get("AZURE_STORAGE_CONNECTION_STRING"):
|
|
152
|
+
raise ValueError(
|
|
153
|
+
"configure_azure() requires a 'connection_string'. "
|
|
154
|
+
"Set it explicitly or export AZURE_STORAGE_CONNECTION_STRING "
|
|
155
|
+
"before calling this function."
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
os.environ["STORAGE_PLATFORM"] = "Azure"
|
|
159
|
+
os.environ["BUCKET_NAME"] = container
|
|
160
|
+
if connection_string:
|
|
161
|
+
os.environ["AZURE_STORAGE_CONNECTION_STRING"] = connection_string
|
|
Binary file
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: arraymorph
|
|
3
|
+
Version: 0.2.0b2
|
|
4
|
+
Summary: HDF5 VOL connector for cloud object storage (AWS S3, Azure Blob)
|
|
5
|
+
Author: ruochenj123, wangtg2013
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ICICLE-ai/ArrayMorph
|
|
8
|
+
Project-URL: Repository, https://github.com/ICICLE-ai/ArrayMorph
|
|
9
|
+
Project-URL: Issues, https://github.com/ICICLE-ai/ArrayMorph/issues
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Requires-Dist: h5py>=3.11.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# ArrayMorph
|
|
15
|
+
|
|
16
|
+
[Build status](https://github.com/ICICLE-ai/arraymorph/actions/workflows/build.yaml)
|
|
17
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
18
|
+
|
|
19
|
+
ArrayMorph enables efficient storage and retrieval of array data from cloud object stores, supporting AWS S3 and Azure Blob Storage. It is an HDF5 Virtual Object Layer (VOL) plugin that transparently routes HDF5 file operations to cloud storage — existing h5py or HDF5 C++ code works unchanged once the plugin is loaded.
|
|
20
|
+
|
|
21
|
+
**Tag**: CI4AI
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
# How-To Guides
|
|
26
|
+
|
|
27
|
+
## Install ArrayMorph
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install arraymorph
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Once installed, jump straight to [Configure credentials for AWS S3](#configure-credentials-for-aws-s3) or [Azure](#configure-credentials-for-azure-blob-storage) below.
|
|
34
|
+
|
|
35
|
+
If you need the standalone `lib_arraymorph` binary, you can [download a pre-built release](#download-a-pre-built-lib_arraymorph) or [build from source](#build-from-source).
|
|
36
|
+
|
|
37
|
+
## Configure credentials for AWS S3
|
|
38
|
+
|
|
39
|
+
Use the Python API before opening any HDF5 files:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import arraymorph
|
|
43
|
+
|
|
44
|
+
arraymorph.configure_s3(
|
|
45
|
+
bucket="my-bucket",
|
|
46
|
+
access_key="MY_ACCESS_KEY",
|
|
47
|
+
secret_key="MY_SECRET_KEY",
|
|
48
|
+
region="us-east-1",
|
|
49
|
+
use_tls=True,
|
|
50
|
+
)
|
|
51
|
+
arraymorph.enable()
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Or set environment variables directly:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
export STORAGE_PLATFORM=S3
|
|
58
|
+
export BUCKET_NAME=my-bucket
|
|
59
|
+
export AWS_ACCESS_KEY_ID=MY_ACCESS_KEY
|
|
60
|
+
export AWS_SECRET_ACCESS_KEY=MY_SECRET_KEY
|
|
61
|
+
export AWS_REGION=us-east-1
|
|
62
|
+
export HDF5_PLUGIN_PATH=$(python -c "import arraymorph; print(arraymorph.get_plugin_path())")
|
|
63
|
+
export HDF5_VOL_CONNECTOR=arraymorph
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Configure credentials for Azure Blob Storage
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import arraymorph
|
|
70
|
+
|
|
71
|
+
arraymorph.configure_azure(
|
|
72
|
+
container="my-container",
|
|
73
|
+
connection_string="DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;EndpointSuffix=core.windows.net",
|
|
74
|
+
)
|
|
75
|
+
arraymorph.enable()
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Or set environment variables directly:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
export STORAGE_PLATFORM=Azure
|
|
82
|
+
export BUCKET_NAME=my-container
|
|
83
|
+
export AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;..."
|
|
84
|
+
export HDF5_PLUGIN_PATH=$(python -c "import arraymorph; print(arraymorph.get_plugin_path())")
|
|
85
|
+
export HDF5_VOL_CONNECTOR=arraymorph
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Use an S3-compatible object store (MinIO, Ceph, Garage)
|
|
89
|
+
|
|
90
|
+
Pass `endpoint`, `addressing_style=True`, and `use_signed_payloads=True` to match the requirements of most self-hosted S3-compatible stores:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import arraymorph
|
|
94
|
+
|
|
95
|
+
arraymorph.configure_s3(
|
|
96
|
+
bucket="my-bucket",
|
|
97
|
+
access_key="MY_ACCESS_KEY",
|
|
98
|
+
secret_key="MY_SECRET_KEY",
|
|
99
|
+
endpoint="http://localhost:9000",
|
|
100
|
+
region="us-east-1",
|
|
101
|
+
use_tls=False,
|
|
102
|
+
addressing_style=True,
|
|
103
|
+
use_signed_payloads=True,
|
|
104
|
+
)
|
|
105
|
+
arraymorph.enable()
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Download a pre-built lib_arraymorph
|
|
109
|
+
|
|
110
|
+
Each [GitHub release](https://github.com/ICICLE-ai/ArrayMorph/releases) attaches standalone pre-compiled binaries of `lib_arraymorph` for all supported platforms:
|
|
111
|
+
|
|
112
|
+
| File | Platform |
|
|
113
|
+
| ---------------------------------- | ------------------- |
|
|
114
|
+
| `lib_arraymorph-linux-x86_64.so` | Linux x86_64 |
|
|
115
|
+
| `lib_arraymorph-linux-aarch64.so` | Linux aarch64 |
|
|
116
|
+
| `lib_arraymorph-macos-arm64.dylib` | macOS Apple Silicon |
|
|
117
|
+
|
|
118
|
+
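Download the file for your platform from the release assets, then set `HDF5_PLUGIN_PATH` to the directory containing it and set `HDF5_VOL_CONNECTOR=arraymorph` yourself. Do not rely on `arraymorph.enable()` to load a standalone binary: it directs HDF5 to the plugin bundled inside the Python package.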
|
|
119
|
+
|
|
120
|
+
## Build from source
|
|
121
|
+
|
|
122
|
+
Use this path if you want to compile `lib_arraymorph` yourself — for example to target a specific platform, contribute changes, or build a custom wheel.
|
|
123
|
+
|
|
124
|
+
### Prerequisites
|
|
125
|
+
|
|
126
|
+
- [vcpkg](https://github.com/microsoft/vcpkg) — installs the AWS and Azure C++ SDKs via CMake
|
|
127
|
+
- [CMake](https://cmake.org) and [Ninja](https://ninja-build.org)
|
|
128
|
+
- [uv](https://docs.astral.sh/uv/) — Python package manager
|
|
129
|
+
|
|
130
|
+
### Step 1 — Clone and create a virtual environment
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
git clone https://github.com/ICICLE-ai/ArrayMorph.git
|
|
134
|
+
cd ArrayMorph
|
|
135
|
+
uv venv
|
|
136
|
+
source .venv/bin/activate
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Step 2 — Install h5py
|
|
140
|
+
|
|
141
|
+
`lib_arraymorph` links against an HDF5 shared library at build time. Rather than requiring a separate system-wide HDF5 installation, the build system points CMake at the `.so` / `.dylib` that h5py already bundles. Install h5py first so those libraries are present:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
uv pip install h5py
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
On macOS the bundled libraries land in `.venv/lib/python*/site-packages/h5py/.dylibs/`; on Linux in `.venv/lib/python*/site-packages/h5py.libs/`.
|
|
148
|
+
|
|
149
|
+
### Step 3 — Configure and build the shared library
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
export HDF5_DIR=$(.venv/bin/python -c "import h5py,os; d=os.path.dirname(h5py.__file__); print(os.path.join(d,'.dylibs') if os.path.exists(os.path.join(d,'.dylibs')) else os.path.join(os.path.dirname(d),'h5py.libs'))")
|
|
153
|
+
|
|
154
|
+
cmake -B lib/build -S lib \
|
|
155
|
+
-DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT:-~/.vcpkg}/scripts/buildsystems/vcpkg.cmake \
|
|
156
|
+
-DCMAKE_BUILD_TYPE=Release \
|
|
157
|
+
-G Ninja
|
|
158
|
+
|
|
159
|
+
cmake --build lib/build
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
This produces `lib/build/lib_arraymorph.dylib` on macOS or `lib/build/lib_arraymorph.so` on Linux.
|
|
163
|
+
|
|
164
|
+
### Optional — Python package
|
|
165
|
+
|
|
166
|
+
If you also want to use the Python API, install the package in editable mode:
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
HDF5_DIR=$HDF5_DIR \
|
|
170
|
+
CMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT:-~/.vcpkg}/scripts/buildsystems/vcpkg.cmake \
|
|
171
|
+
uv pip install -e .
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Or build a redistributable wheel:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
HDF5_DIR=$HDF5_DIR \
|
|
178
|
+
CMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT:-~/.vcpkg}/scripts/buildsystems/vcpkg.cmake \
|
|
179
|
+
uv build --wheel --no-build-isolation
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
The wheel is written to `dist/`. Install it in any environment with:
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
pip install dist/arraymorph-*.whl
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
# Tutorials
|
|
191
|
+
|
|
192
|
+
## Write and read a chunked array on AWS S3
|
|
193
|
+
|
|
194
|
+
This tutorial walks through writing a 2-D NumPy array to a cloud HDF5 file and reading a slice of it back.
|
|
195
|
+
|
|
196
|
+
### Prerequisites
|
|
197
|
+
|
|
198
|
+
- An AWS account with an S3 bucket, or an S3-compatible object store
|
|
199
|
+
- ArrayMorph installed (`pip install arraymorph`)
|
|
200
|
+
|
|
201
|
+
### Step 1 — Configure and enable ArrayMorph
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
import arraymorph
|
|
205
|
+
|
|
206
|
+
arraymorph.configure_s3(
|
|
207
|
+
bucket="my-bucket",
|
|
208
|
+
access_key="MY_ACCESS_KEY",
|
|
209
|
+
secret_key="MY_SECRET_KEY",
|
|
210
|
+
region="us-east-1",
|
|
211
|
+
use_tls=True,
|
|
212
|
+
)
|
|
213
|
+
arraymorph.enable()
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
`arraymorph.enable()` sets `HDF5_PLUGIN_PATH` and `HDF5_VOL_CONNECTOR` in the current process. Any `h5py.File(...)` call made after this point is routed through ArrayMorph.
|
|
217
|
+
|
|
218
|
+
### Step 2 — Write array data
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
import h5py
|
|
222
|
+
import numpy as np
|
|
223
|
+
|
|
224
|
+
data = np.fromfunction(lambda i, j: i + j, (100, 100), dtype="i4")
|
|
225
|
+
|
|
226
|
+
with h5py.File("demo.h5", "w") as f:
|
|
227
|
+
f.create_dataset("values", data=data, chunks=(10, 10))
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
Each 10×10 chunk is stored as a separate object in your S3 bucket.
|
|
231
|
+
|
|
232
|
+
### Step 3 — Read a slice back
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
import h5py
|
|
236
|
+
|
|
237
|
+
with h5py.File("demo.h5", "r") as f:
|
|
238
|
+
dset = f["values"]
|
|
239
|
+
print(dset.dtype) # int32
|
|
240
|
+
print(dset[5:15, 5:15]) # fetches only the chunks that overlap this slice
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
Only the chunks that overlap the requested hyperslab are fetched from cloud storage — no full-file download occurs.
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
# Explanation
|
|
248
|
+
|
|
249
|
+
## How ArrayMorph works
|
|
250
|
+
|
|
251
|
+
ArrayMorph is implemented as an HDF5 **Virtual Object Layer (VOL)** connector. The VOL is an abstraction layer inside the HDF5 library that separates the public API from the storage implementation. By providing a plugin that registers itself as a VOL connector, ArrayMorph intercepts every HDF5 file operation before it reaches the native POSIX layer.
|
|
252
|
+
|
|
253
|
+
When `arraymorph.enable()` is called:
|
|
254
|
+
|
|
255
|
+
1. `HDF5_PLUGIN_PATH` is set to the directory containing the compiled shared library (`lib_arraymorph.so` / `lib_arraymorph.dylib`).
|
|
256
|
+
2. `HDF5_VOL_CONNECTOR=arraymorph` tells HDF5 to load and activate that plugin for all subsequent file operations.
|
|
257
|
+
|
|
258
|
+
From this point, a call like `h5py.File("demo.h5", "w")` does not touch the local filesystem. Instead, the VOL connector:
|
|
259
|
+
|
|
260
|
+
1. Reads cloud credentials from environment variables and constructs an AWS S3 or Azure Blob client (selected by `STORAGE_PLATFORM`).
|
|
261
|
+
2. On dataset read/write, translates the HDF5 hyperslab selection into a list of chunks and dispatches asynchronous get/put requests against the object store — one object per chunk.
|
|
262
|
+
|
|
263
|
+
### Chunked storage model
|
|
264
|
+
|
|
265
|
+
HDF5 datasets are divided into fixed-size chunks (e.g. `chunks=(64, 64)` for a 2-D dataset). ArrayMorph stores each chunk as an independent object in the bucket. The object key encodes the dataset path and chunk coordinates, so a partial read only fetches the chunks that overlap the requested slice. For large chunks, ArrayMorph can issue byte-range requests to retrieve only the needed bytes within a chunk object.
|
|
266
|
+
|
|
267
|
+
### Async I/O
|
|
268
|
+
|
|
269
|
+
Both the S3 and Azure backends use asynchronous operations dispatched to a thread pool. This allows ArrayMorph to fetch multiple chunks in parallel, which is important for workloads that access many chunks per read (e.g. strided access patterns in machine learning data loaders).
|
|
270
|
+
|
|
271
|
+
### Compatibility
|
|
272
|
+
|
|
273
|
+
Because the interception happens at the VOL layer, no changes to application code are required. Any program that opens HDF5 files with h5py or the HDF5 C++ API will automatically use ArrayMorph once the plugin is loaded.
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
# References
|
|
278
|
+
|
|
279
|
+
## Python API
|
|
280
|
+
|
|
281
|
+
### `arraymorph.enable() -> None`
|
|
282
|
+
|
|
283
|
+
Sets `HDF5_PLUGIN_PATH` and `HDF5_VOL_CONNECTOR` in the current process environment. Must be called before any `h5py.File(...)` call.
|
|
284
|
+
|
|
285
|
+
### `arraymorph.get_plugin_path() -> str`
|
|
286
|
+
|
|
287
|
+
Returns the directory containing the compiled VOL plugin. Useful when you need to set `HDF5_PLUGIN_PATH` manually.
|
|
288
|
+
|
|
289
|
+
### `arraymorph.configure_s3(bucket, access_key="", secret_key="", endpoint=None, region="us-east-2", use_tls=False, addressing_style=False, use_signed_payloads=False) -> None`
|
|
290
|
+
|
|
291
|
+
Configures the S3 client. All parameters are written to environment variables consumed by the C++ plugin at file-open time.
|
|
292
|
+
|
|
293
|
+
| Parameter | Environment variable | Default | Description |
|
|
294
|
+
| --------------------- | ------------------------- | ----------- | ---------------------------------------------------- |
|
|
295
|
+
| `bucket` | `BUCKET_NAME` | — | S3 bucket name |
|
|
296
|
+
| `access_key` | `AWS_ACCESS_KEY_ID` | — | Access key ID |
|
|
297
|
+
| `secret_key` | `AWS_SECRET_ACCESS_KEY` | — | Secret access key |
|
|
298
|
+
| `endpoint` | `AWS_ENDPOINT_URL_S3` | AWS default | Custom endpoint for S3-compatible stores |
|
|
299
|
+
| `region` | `AWS_REGION` | `us-east-2` | SigV4 signing region |
|
|
300
|
+
| `use_tls` | `AWS_USE_TLS` | `false` | Use HTTPS when `True` |
|
|
301
|
+
| `addressing_style` | `AWS_S3_ADDRESSING_STYLE` | `virtual` | `path` when `True`; required for most non-AWS stores |
|
|
302
|
+
| `use_signed_payloads` | `AWS_SIGNED_PAYLOADS` | `false` | Include request body in SigV4 signature |
|
|
303
|
+
|
|
304
|
+
### `arraymorph.configure_azure(container, connection_string=None) -> None`
|
|
305
|
+
|
|
306
|
+
Configures the Azure Blob client.
|
|
307
|
+
|
|
308
|
+
| Parameter | Environment variable | Default | Description |
|
|
309
|
+
| ------------------- | --------------------------------- | -------- | ------------------------------- |
|
|
310
|
+
| `container` | `BUCKET_NAME` | — | Azure container name |
|
|
311
|
+
| `connection_string` | `AZURE_STORAGE_CONNECTION_STRING` | From env | Azure Storage connection string |
|
|
312
|
+
|
|
313
|
+
## Environment variables
|
|
314
|
+
|
|
315
|
+
All configuration can be applied via environment variables without using the Python API. This is useful when running HDF5 C++ programs directly.
|
|
316
|
+
|
|
317
|
+
| Variable | Description |
|
|
318
|
+
| --------------------------------- | --------------------------------------------------- |
|
|
319
|
+
| `HDF5_PLUGIN_PATH` | Directory containing `lib_arraymorph.so` / `.dylib` |
|
|
320
|
+
| `HDF5_VOL_CONNECTOR` | Must be `arraymorph` to activate the plugin |
|
|
321
|
+
| `STORAGE_PLATFORM` | `S3` (default) or `Azure` |
|
|
322
|
+
| `BUCKET_NAME` | Bucket or container name |
|
|
323
|
+
| `AWS_ACCESS_KEY_ID` | S3 access key |
|
|
324
|
+
| `AWS_SECRET_ACCESS_KEY` | S3 secret key |
|
|
325
|
+
| `AWS_REGION` | SigV4 signing region |
|
|
326
|
+
| `AWS_ENDPOINT_URL_S3` | Custom S3-compatible endpoint URL |
|
|
327
|
+
| `AWS_USE_TLS` | `true` / `false` |
|
|
328
|
+
| `AWS_S3_ADDRESSING_STYLE` | `path` or `virtual` |
|
|
329
|
+
| `AWS_SIGNED_PAYLOADS` | `true` / `false` |
|
|
330
|
+
| `AZURE_STORAGE_CONNECTION_STRING` | Azure connection string |
|
|
331
|
+
|
|
332
|
+
## External references
|
|
333
|
+
|
|
334
|
+
- [HDF5 VOL connectors](https://docs.hdfgroup.org/hdf5/develop/_v_o_l.html)
|
|
335
|
+
- [AWS SDK for C++](https://github.com/aws/aws-sdk-cpp)
|
|
336
|
+
- [Azure SDK for C++](https://github.com/Azure/azure-sdk-for-cpp)
|
|
337
|
+
- [h5py documentation](https://docs.h5py.org/en/stable/)
|
|
338
|
+
|
|
339
|
+
---
|
|
340
|
+
|
|
341
|
+
## Acknowledgements
|
|
342
|
+
|
|
343
|
+
This project is supported by the National Science Foundation (NSF)-funded AI Institute for Intelligent Cyberinfrastructure with Computational Learning in the Environment (ICICLE) (OAC 2112606).
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
arraymorph-0.2.0b2.dist-info/RECORD,,
|
|
2
|
+
arraymorph-0.2.0b2.dist-info/WHEEL,sha256=MTtTVEOT0yvdVrqS1QxOocMTET7EOr78SiTGviBQ98U,141
|
|
3
|
+
arraymorph-0.2.0b2.dist-info/METADATA,sha256=dvY-8BAF7tdJwk7xyW8HRFTmtxw9vKrO0Oo5h3u52TY,14099
|
|
4
|
+
arraymorph-0.2.0b2.dist-info/licenses/LICENSE,sha256=dfxSWfn7Ool0X832DV85QnXw3rUwggqXZ1vVDGyxeEA,1145
|
|
5
|
+
arraymorph/__init__.py,sha256=Gz0-ZYGOoSpLbsLBlPIiF5buuizi-Q9iq-HHbsWlI44,6257
|
|
6
|
+
arraymorph/lib/lib_arraymorph.dylib,sha256=C7FcogaWo-QeYIwsvdrgpaC505ZyrWv3WE_CWXR_mJk,13434408
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Intelligent Cyberinfrastructure with Computational Learning in the Environment -- ICICLE
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|