arraymorph 0.2.0b2.dev0__cp314-cp314-macosx_15_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arraymorph/__init__.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ArrayMorph - HDF5 VOL connector for cloud object storage.
|
|
3
|
+
|
|
4
|
+
Supports AWS S3 and Azure Blob Storage via HDF5's Virtual Object Layer.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
__version__ = "0.2.0"
|
|
13
|
+
|
|
14
|
+
# The compiled VOL plugin lives next to this file after installation
|
|
15
|
+
_PLUGIN_DIR = str(Path(__file__).parent / "lib")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_plugin_path() -> str:
    """Return the directory that holds the ArrayMorph VOL plugin (.so/.dylib).

    Assign the result to HDF5_PLUGIN_PATH so HDF5 can locate the connector:
    >>> import arraymorph
    >>> os.environ["HDF5_PLUGIN_PATH"] = arraymorph.get_plugin_path()
    """
    plugin_dir = _PLUGIN_DIR
    return plugin_dir
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def enable() -> None:
    """Configure HDF5 environment variables to use ArrayMorph.

    Sets HDF5_PLUGIN_PATH and HDF5_VOL_CONNECTOR so that any
    subsequent h5py calls route through the ArrayMorph VOL connector.

    Usage:
        >>> import arraymorph
        >>> arraymorph.enable()
        >>> import h5py
        >>> f = h5py.File("s3://bucket/data.h5", "r")
    """
    # Both variables are read by the HDF5 library at file-open time.
    settings = {
        "HDF5_PLUGIN_PATH": _PLUGIN_DIR,
        "HDF5_VOL_CONNECTOR": "arraymorph",
    }
    os.environ.update(settings)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def configure_s3(
    bucket: str,
    access_key: str = "",
    secret_key: str = "",
    endpoint: str | None = None,
    region: str = "us-east-2",
    use_tls: bool = False,
    addressing_style: bool = False,
    use_signed_payloads: bool = False,
) -> None:
    """Configure AWS S3 credentials and client behavior for ArrayMorph.

    Sets the environment variables read by the VOL connector's S3 client
    at initialization time. Call this before any h5py file operations.

    Args:
        bucket: Name of the S3 bucket where HDF5 files are stored.
            Maps to: BUCKET_NAME
        access_key: Access key ID for authentication with the S3 service.
            If empty, falls back to the existing AWS_ACCESS_KEY_ID
            environment variable. Maps to: AWS_ACCESS_KEY_ID
        secret_key: Secret access key paired with access_key for authentication.
            If empty, falls back to the existing AWS_SECRET_ACCESS_KEY
            environment variable. Maps to: AWS_SECRET_ACCESS_KEY
        endpoint: Custom S3-compatible endpoint URL (e.g. 'http://localhost:3900').
            When None, the S3 client targets the default AWS endpoint. Required
            for any non-AWS S3-compatible object store (MinIO, Ceph, etc.).
            Maps to: AWS_ENDPOINT_URL_S3
        region: Region label used in SigV4 request signing. Must match the region
            your bucket or S3-compatible store is configured with — a mismatch
            produces signature validation errors. Defaults to 'us-east-2'.
            Maps to: AWS_REGION
        use_tls: Whether to use HTTPS (True) or HTTP (False) for S3 connections.
            Set to False for object stores that do not have TLS configured.
            Defaults to False.
            Maps to: AWS_USE_TLS
        addressing_style: URL addressing style for the S3 client. When True,
            uses path-style ('endpoint/bucket/key'). When False, uses
            virtual-hosted style ('bucket.endpoint/key'), which can cause the
            S3 client to misinterpret the HDF5 filename as the bucket name.
            Most S3-compatible stores require path-style addressing.
            Defaults to False.
            Maps to: AWS_S3_ADDRESSING_STYLE
        use_signed_payloads: Whether to include the request body in the SigV4
            signature (PayloadSigningPolicy::Always). Some S3-compatible stores
            require signed payloads and will reject requests with signature
            validation errors if this is disabled. Defaults to False.
            Maps to: AWS_SIGNED_PAYLOADS

    Raises:
        ValueError: If credentials are neither passed explicitly nor already
            present in the environment.

    Example:
        >>> import arraymorph
        >>> arraymorph.configure_s3(
        ...     bucket="my-bucket",
        ...     access_key="my-access-key",
        ...     secret_key="my-secret-key",
        ...     endpoint="http://localhost:3900",
        ...     region="us-east-1",
        ...     use_tls=False,
        ...     addressing_style=True,
        ...     use_signed_payloads=True,
        ... )
        >>> arraymorph.enable()
    """
    # Fall back to pre-exported credentials, mirroring configure_azure() and
    # honoring the behavior the error message below promises. Previously the
    # check ignored the environment, so the "export ... before calling"
    # advice in the error message could never work.
    access_key = access_key or os.environ.get("AWS_ACCESS_KEY_ID", "")
    secret_key = secret_key or os.environ.get("AWS_SECRET_ACCESS_KEY", "")
    if not (access_key and secret_key):
        raise ValueError(
            "configure_s3() requires both 'access_key' and 'secret_key'. "
            "Set them explicitly or export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY "
            "before calling this function."
        )

    os.environ["AWS_ACCESS_KEY_ID"] = access_key
    os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key
    os.environ["STORAGE_PLATFORM"] = "S3"
    os.environ["BUCKET_NAME"] = bucket
    os.environ["AWS_REGION"] = region

    if endpoint:
        os.environ["AWS_ENDPOINT_URL_S3"] = endpoint

    os.environ["AWS_USE_TLS"] = str(use_tls).lower()
    os.environ["AWS_S3_ADDRESSING_STYLE"] = "path" if addressing_style else "virtual"
    os.environ["AWS_SIGNED_PAYLOADS"] = str(use_signed_payloads).lower()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def configure_azure(
    container: str,
    connection_string: str | None = None,
) -> None:
    """Configure Azure Blob Storage credentials for ArrayMorph.

    Sets the environment variables read by the VOL connector's Azure client
    at initialization time. Call this before any h5py file operations.

    Args:
        container: Name of the Azure Blob Storage container where HDF5 files
            are stored. Maps to: BUCKET_NAME
        connection_string: Azure Storage connection string used to authenticate
            and locate the storage account. If None, the connector will fall back
            to the existing AZURE_STORAGE_CONNECTION_STRING environment variable.
            Maps to: AZURE_STORAGE_CONNECTION_STRING

    Example:
        >>> import arraymorph
        >>> arraymorph.configure_azure(
        ...     container="my-container",
        ...     connection_string="DefaultEndpointsProtocol=https;AccountName=...",
        ... )
        >>> arraymorph.enable()
    """
    env_conn = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
    if not (connection_string or env_conn):
        raise ValueError(
            "configure_azure() requires a 'connection_string'. "
            "Set it explicitly or export AZURE_STORAGE_CONNECTION_STRING "
            "before calling this function."
        )

    os.environ["STORAGE_PLATFORM"] = "Azure"
    os.environ["BUCKET_NAME"] = container
    # An explicit argument takes precedence over a pre-exported value.
    if connection_string:
        os.environ["AZURE_STORAGE_CONNECTION_STRING"] = connection_string
|
|
Binary file
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: arraymorph
|
|
3
|
+
Version: 0.2.0b2.dev0
|
|
4
|
+
Summary: HDF5 VOL connector for cloud object storage (AWS S3, Azure Blob)
|
|
5
|
+
Author: ruochenj123, wangtg2013
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ICICLE-ai/ArrayMorph
|
|
8
|
+
Project-URL: Repository, https://github.com/ICICLE-ai/ArrayMorph
|
|
9
|
+
Project-URL: Issues, https://github.com/ICICLE-ai/ArrayMorph/issues
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Requires-Dist: h5py>=3.11.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# ArrayMorph
|
|
15
|
+
|
|
16
|
+
[](https://github.com/ICICLE-ai/arraymorph/actions/workflows/build.yml)
|
|
17
|
+
[](https://opensource.org/licenses/MIT)
|
|
18
|
+
|
|
19
|
+
ArrayMorph is software for efficiently managing array data stored on cloud object storage. It supports both the HDF5 C++ API and the h5py API. The data returned by the h5py API consists of NumPy arrays. By using the h5py API, users can access array data stored on the cloud and feed the read data into machine learning pipelines seamlessly.
|
|
20
|
+
|
|
21
|
+
**Tag**: CI4AI
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
# How-To Guides
|
|
26
|
+
|
|
27
|
+
## Install dependencies
|
|
28
|
+
|
|
29
|
+
It is recommended to use Conda (and conda-forge) for managing dependencies.
|
|
30
|
+
|
|
31
|
+
1. Install [Miniconda](https://docs.anaconda.com/miniconda/)
|
|
32
|
+
2. Install [conda-build](https://docs.conda.io/projects/conda-build/en/stable/install-conda-build.html) for installing local conda packages
|
|
33
|
+
3. Create and activate environment with dependencies:
|
|
34
|
+
```bash
|
|
35
|
+
conda create -n arraymorph conda-forge::gxx=9
|
|
36
|
+
conda activate arraymorph
|
|
37
|
+
conda install -n arraymorph cmake conda-forge::hdf5=1.14.2 conda-forge::aws-sdk-cpp conda-forge::azure-storage-blobs-cpp conda-forge::h5py
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Install ArrayMorph via ArrayMorph local conda package
|
|
41
|
+
```bash
|
|
42
|
+
git clone https://github.com/ICICLE-ai/arraymorph.git
|
|
43
|
+
cd arraymorph/arraymorph_channel
|
|
44
|
+
conda index .
|
|
45
|
+
conda install -n arraymorph arraymorph -c file://$(pwd) -c conda-forge
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Install ArrayMorph from source code
|
|
49
|
+
|
|
50
|
+
### Build ArrayMorph
|
|
51
|
+
```bash
|
|
52
|
+
git clone https://github.com/ICICLE-ai/arraymorph.git
|
|
53
|
+
cd arraymorph/arraymorph
|
|
54
|
+
cmake -B ./build -S . -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
|
|
55
|
+
cd build
|
|
56
|
+
make
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Enable VOL plugin:
|
|
60
|
+
```bash
|
|
61
|
+
export HDF5_PLUGIN_PATH=/path/to/arraymorph/arraymorph/build/src
|
|
62
|
+
export HDF5_VOL_CONNECTOR=arraymorph
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Configure Environment for Cloud Access
|
|
66
|
+
|
|
67
|
+
### AWS Configuration:
|
|
68
|
+
```bash
|
|
69
|
+
export STORAGE_PLATFORM=S3
|
|
70
|
+
export BUCKET_NAME=XXXXXX
|
|
71
|
+
export AWS_ACCESS_KEY_ID=XXXXXX
|
|
72
|
+
export AWS_SECRET_ACCESS_KEY=XXXXXX
|
|
73
|
+
export AWS_REGION=us-east-2 # or your bucket's region
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Azure Configuration:
|
|
77
|
+
```bash
|
|
78
|
+
export STORAGE_PLATFORM=Azure
|
|
79
|
+
export BUCKET_NAME=XXXXXX
|
|
80
|
+
export AZURE_STORAGE_CONNECTION_STRING=XXXXXX
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
# Tutorials
|
|
86
|
+
|
|
87
|
+
## Run a simple example: Writing and Reading HDF5 files from Cloud
|
|
88
|
+
|
|
89
|
+
### Prerequisites:
|
|
90
|
+
- AWS or Azure cloud account with credentials
|
|
91
|
+
- S3 bucket or Azure container
|
|
92
|
+
- ArrayMorph dependencies installed
|
|
93
|
+
|
|
94
|
+
### Steps:
|
|
95
|
+
1. Activate conda environment
|
|
96
|
+
```bash
|
|
97
|
+
conda activate arraymorph
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
2. Write sample HDF5 data to the cloud
|
|
101
|
+
```bash
|
|
102
|
+
cd examples/python
|
|
103
|
+
python3 write.py
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
3. Read data back from cloud HDF5 file
|
|
107
|
+
```bash
|
|
108
|
+
cd examples/python
|
|
109
|
+
python3 read.py
|
|
110
|
+
```
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
# Explanation
|
|
114
|
+
|
|
115
|
+
### How ArrayMorph Works
|
|
116
|
+
|
|
117
|
+
ArrayMorph plugs into the HDF5 stack using a VOL (Virtual Object Layer) plugin that intercepts file operations and routes them to cloud object storage instead of local files. This allows existing HDF5 APIs (both C++ and h5py in Python) to operate on cloud-based data seamlessly, enabling transparent cloud access for scientific or ML pipelines.
|
|
118
|
+
|
|
119
|
+
It supports:
|
|
120
|
+
- Cloud backends: AWS S3 and Azure Blob
|
|
121
|
+
- File formats: Current binary data stream (we plan to extend to other formats like jpg in the future)
|
|
122
|
+
- Languages: C++ and Python (via h5py compatibility)
|
|
123
|
+
|
|
124
|
+
The system is designed to be efficient in latency-sensitive scenarios and aims to integrate well with large-scale distributed training and inference.
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## References
|
|
129
|
+
|
|
130
|
+
- [HDF5 VOL connectors](https://docs.hdfgroup.org/hdf5/develop/_v_o_l.html)
|
|
131
|
+
- [AWS SDK for C++](https://github.com/aws/aws-sdk-cpp)
|
|
132
|
+
- [Azure SDK for C++](https://github.com/Azure/azure-sdk-for-cpp)
|
|
133
|
+
- [h5py documentation](https://docs.h5py.org/en/stable/)
|
|
134
|
+
- [conda-forge](https://conda-forge.org/)
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## Acknowledgements
|
|
139
|
+
|
|
140
|
+
This project is supported by:
|
|
141
|
+
|
|
142
|
+
*National Science Foundation (NSF) funded AI institute for Intelligent Cyberinfrastructure with Computational Learning in the Environment (ICICLE) (OAC 2112606)*
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
arraymorph/__init__.py,sha256=Gz0-ZYGOoSpLbsLBlPIiF5buuizi-Q9iq-HHbsWlI44,6257
|
|
2
|
+
arraymorph/lib/lib_arraymorph.dylib,sha256=ujM3DRZPhmlbzC5PIxcOuBkHQayxhezywk6hrYjgevM,14191016
|
|
3
|
+
arraymorph-0.2.0b2.dev0.dist-info/RECORD,,
|
|
4
|
+
arraymorph-0.2.0b2.dev0.dist-info/WHEEL,sha256=nEGB58_ldWTXGbP8ePRzOy9ovTHE0yL0yPMzHJ-Wakw,141
|
|
5
|
+
arraymorph-0.2.0b2.dev0.dist-info/METADATA,sha256=Br1w8Hol-GWPFZLlDrtSuBRA-KoLiz9e198JNo3MPyg,4526
|
|
6
|
+
arraymorph-0.2.0b2.dev0.dist-info/licenses/LICENSE,sha256=dfxSWfn7Ool0X832DV85QnXw3rUwggqXZ1vVDGyxeEA,1145
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Intelligent Cyberinfrastructure with Computational Learning in the Environment -- ICICLE
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|