dask-array 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_array/__init__.py +228 -0
- dask_array/_backends.py +76 -0
- dask_array/_backends_array.py +99 -0
- dask_array/_blockwise.py +1410 -0
- dask_array/_broadcast.py +272 -0
- dask_array/_chunk.py +445 -0
- dask_array/_chunk_types.py +54 -0
- dask_array/_collection.py +1644 -0
- dask_array/_concatenate.py +331 -0
- dask_array/_core_utils.py +1365 -0
- dask_array/_dispatch.py +141 -0
- dask_array/_einsum.py +277 -0
- dask_array/_expr.py +544 -0
- dask_array/_expr_flow.py +586 -0
- dask_array/_gufunc.py +805 -0
- dask_array/_histogram.py +617 -0
- dask_array/_map_blocks.py +652 -0
- dask_array/_new_collection.py +10 -0
- dask_array/_numpy_compat.py +135 -0
- dask_array/_overlap.py +1159 -0
- dask_array/_rechunk.py +1050 -0
- dask_array/_reshape.py +710 -0
- dask_array/_routines.py +102 -0
- dask_array/_shuffle.py +448 -0
- dask_array/_stack.py +264 -0
- dask_array/_svg.py +291 -0
- dask_array/_templates.py +29 -0
- dask_array/_test_utils.py +257 -0
- dask_array/_ufunc.py +385 -0
- dask_array/_utils.py +349 -0
- dask_array/_visualize.py +223 -0
- dask_array/_xarray.py +337 -0
- dask_array/core/__init__.py +34 -0
- dask_array/core/_blockwise_funcs.py +312 -0
- dask_array/core/_conversion.py +422 -0
- dask_array/core/_from_graph.py +97 -0
- dask_array/creation/__init__.py +71 -0
- dask_array/creation/_arange.py +121 -0
- dask_array/creation/_diag.py +116 -0
- dask_array/creation/_diagonal.py +241 -0
- dask_array/creation/_eye.py +103 -0
- dask_array/creation/_linspace.py +102 -0
- dask_array/creation/_mesh.py +134 -0
- dask_array/creation/_ones_zeros.py +454 -0
- dask_array/creation/_pad.py +270 -0
- dask_array/creation/_repeat.py +55 -0
- dask_array/creation/_tile.py +36 -0
- dask_array/creation/_tri.py +28 -0
- dask_array/creation/_utils.py +296 -0
- dask_array/fft.py +320 -0
- dask_array/io/__init__.py +39 -0
- dask_array/io/_base.py +10 -0
- dask_array/io/_from_array.py +257 -0
- dask_array/io/_from_delayed.py +95 -0
- dask_array/io/_from_graph.py +54 -0
- dask_array/io/_from_npy_stack.py +67 -0
- dask_array/io/_store.py +336 -0
- dask_array/io/_tiledb.py +159 -0
- dask_array/io/_to_npy_stack.py +65 -0
- dask_array/io/_zarr.py +449 -0
- dask_array/linalg/__init__.py +39 -0
- dask_array/linalg/_cholesky.py +234 -0
- dask_array/linalg/_lu.py +300 -0
- dask_array/linalg/_norm.py +94 -0
- dask_array/linalg/_qr.py +601 -0
- dask_array/linalg/_solve.py +349 -0
- dask_array/linalg/_svd.py +394 -0
- dask_array/linalg/_tensordot.py +334 -0
- dask_array/linalg/_utils.py +74 -0
- dask_array/manipulation/__init__.py +45 -0
- dask_array/manipulation/_expand.py +321 -0
- dask_array/manipulation/_flip.py +92 -0
- dask_array/manipulation/_roll.py +78 -0
- dask_array/manipulation/_transpose.py +309 -0
- dask_array/random/__init__.py +125 -0
- dask_array/random/_choice.py +181 -0
- dask_array/random/_expr.py +256 -0
- dask_array/random/_generator.py +441 -0
- dask_array/random/_random_state.py +259 -0
- dask_array/random/_utils.py +84 -0
- dask_array/reductions/__init__.py +84 -0
- dask_array/reductions/_arg_reduction.py +130 -0
- dask_array/reductions/_common.py +1082 -0
- dask_array/reductions/_cumulative.py +522 -0
- dask_array/reductions/_percentile.py +261 -0
- dask_array/reductions/_reduction.py +725 -0
- dask_array/reductions/_trace.py +56 -0
- dask_array/routines/__init__.py +133 -0
- dask_array/routines/_apply.py +84 -0
- dask_array/routines/_bincount.py +112 -0
- dask_array/routines/_broadcast.py +111 -0
- dask_array/routines/_coarsen.py +115 -0
- dask_array/routines/_diff.py +79 -0
- dask_array/routines/_gradient.py +158 -0
- dask_array/routines/_indexing.py +65 -0
- dask_array/routines/_insert_delete.py +132 -0
- dask_array/routines/_misc.py +122 -0
- dask_array/routines/_nonzero.py +72 -0
- dask_array/routines/_search.py +123 -0
- dask_array/routines/_select.py +113 -0
- dask_array/routines/_statistics.py +171 -0
- dask_array/routines/_topk.py +82 -0
- dask_array/routines/_triangular.py +74 -0
- dask_array/routines/_unique.py +232 -0
- dask_array/routines/_where.py +62 -0
- dask_array/slicing/__init__.py +67 -0
- dask_array/slicing/_basic.py +550 -0
- dask_array/slicing/_blocks.py +138 -0
- dask_array/slicing/_bool_index.py +145 -0
- dask_array/slicing/_setitem.py +329 -0
- dask_array/slicing/_squeeze.py +101 -0
- dask_array/slicing/_utils.py +1133 -0
- dask_array/slicing/_vindex.py +282 -0
- dask_array/stacking/__init__.py +15 -0
- dask_array/stacking/_block.py +83 -0
- dask_array/stacking/_simple.py +58 -0
- dask_array/templates/array.html.j2 +48 -0
- dask_array/tests/__init__.py +0 -0
- dask_array/tests/conftest.py +22 -0
- dask_array/tests/test_api.py +40 -0
- dask_array/tests/test_binary_op_chunks.py +107 -0
- dask_array/tests/test_coarse_slice_through_blockwise.py +362 -0
- dask_array/tests/test_collection.py +799 -0
- dask_array/tests/test_creation.py +1102 -0
- dask_array/tests/test_expr_flow.py +143 -0
- dask_array/tests/test_linalg.py +1130 -0
- dask_array/tests/test_map_blocks_multi_output.py +104 -0
- dask_array/tests/test_rechunk_pushdown.py +214 -0
- dask_array/tests/test_reductions.py +1091 -0
- dask_array/tests/test_routines.py +2853 -0
- dask_array/tests/test_shuffle_chunks.py +67 -0
- dask_array/tests/test_slice_pushdown.py +968 -0
- dask_array/tests/test_slice_through_blockwise.py +678 -0
- dask_array/tests/test_slice_through_overlap.py +366 -0
- dask_array/tests/test_slice_through_reshape.py +272 -0
- dask_array/tests/test_slicing.py +839 -0
- dask_array/tests/test_transpose_slice_pushdown.py +208 -0
- dask_array/tests/test_visualize.py +94 -0
- dask_array/tests/test_xarray.py +193 -0
- dask_array-0.1.0.dist-info/METADATA +48 -0
- dask_array-0.1.0.dist-info/RECORD +144 -0
- dask_array-0.1.0.dist-info/WHEEL +4 -0
- dask_array-0.1.0.dist-info/entry_points.txt +2 -0
- dask_array-0.1.0.dist-info/licenses/LICENSE +29 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from dask_array.io._base import IO
|
|
9
|
+
from dask_array._utils import meta_from_array
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
pass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FromDelayed(IO):
    """Array expression that exposes one delayed value as a single-chunk array.

    Operands
    --------
    value
        Object exposing a ``key`` attribute. When it is a dask collection,
        its graph is merged into the layer produced by this expression.
    shape
        Sequence of dimension sizes; every dimension becomes exactly one chunk.
    dtype, _meta
        Optional hints used to build the meta array.
    _name_prefix
        When truthy, used verbatim as the expression name.
    """

    _parameters = ["value", "shape", "dtype", "_meta", "_name_prefix"]
    _defaults = {"dtype": None, "_meta": None, "_name_prefix": None}

    @functools.cached_property
    def _meta(self):
        """Build the meta array from ``_meta`` / ``dtype``, else an empty array."""
        given_meta = self.operand("_meta")
        given_dtype = self.operand("dtype")
        shape = self.operand("shape")

        if given_meta is None:
            # No meta supplied: fall back to a zero-size array of matching rank.
            zero_shape = (0,) * len(shape)
            if given_dtype is None:
                return np.empty(zero_shape)
            return np.empty(zero_shape, dtype=given_dtype)

        if given_dtype is None:
            # Borrow the dtype from the supplied meta when it exposes one.
            given_dtype = getattr(given_meta, "dtype", None)
        return meta_from_array(given_meta, dtype=given_dtype)

    @functools.cached_property
    def chunks(self):
        """One chunk per dimension, spanning that whole dimension."""
        return tuple([(size,) for size in self.operand("shape")])

    @functools.cached_property
    def _name(self):
        """The user-supplied prefix if truthy, else a token-based name."""
        custom = self.operand("_name_prefix")
        return custom if custom else "from-delayed-" + self.deterministic_token

    def _layer(self):
        """Return a graph mapping the single output block to ``value.key``."""
        from dask._task_spec import Alias
        from dask.base import is_dask_collection

        delayed_value = self.operand("value")
        shape = self.operand("shape")
        # The only block of the array sits at index (0, 0, ..., 0).
        out_key = (self._name,) + (0,) * len(shape)
        graph = {out_key: Alias(key=out_key, target=delayed_value.key)}
        if is_dask_collection(delayed_value):
            # Pull in the tasks that actually produce the aliased value.
            graph.update(delayed_value.__dask_graph__())
        return graph
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def from_delayed(value, shape, dtype=None, meta=None, name=None):
    """Create a dask array from a dask delayed value

    Handy for assembling dask arrays ad hoc out of dask delayed objects,
    especially together with stack and concatenate.

    The resulting dask array consists of a single chunk.

    Parameters
    ----------
    value
        The delayed value. An object that is not a ``Delayed`` but exposes a
        ``key`` attribute is wrapped with ``dask.delayed`` first.
    shape
        Shape of the resulting array.
    dtype, meta : optional
        Forwarded to the ``FromDelayed`` expression (``meta`` as ``_meta``).
    name : optional
        Forwarded to the ``FromDelayed`` expression as ``_name_prefix``.

    Examples
    --------
    >>> import dask
    >>> import dask_array as da
    >>> import numpy as np
    >>> value = dask.delayed(np.ones)(5)
    >>> array = da.from_delayed(value, (5,), dtype=float)
    >>> array
    dask.array<from-value, shape=(5,), dtype=float64, chunksize=(5,), chunktype=numpy.ndarray>
    >>> array.compute()
    array([1., 1., 1., 1., 1.])
    """
    from dask.delayed import Delayed, delayed

    from dask_array._new_collection import new_collection

    # Key-bearing objects that are not already Delayed get wrapped.
    if not isinstance(value, Delayed):
        if hasattr(value, "key"):
            value = delayed(value)

    expr = FromDelayed(
        value=value,
        shape=shape,
        dtype=dtype,
        _meta=meta,
        _name_prefix=name,
    )
    return new_collection(expr)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
|
|
5
|
+
from dask import istask
|
|
6
|
+
from dask_array._expr import ArrayExpr
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FromGraph(ArrayExpr):
    """Array expression backed by an already-built graph layer.

    The ``layer`` operand is either a plain mapping of keys to values/tasks
    or an object exposing a ``layers`` attribute (treated as a high-level
    graph). ``keys`` lists the output keys of that graph; ``_layer`` re-labels
    them with this expression's own name.
    """

    _parameters = ["layer", "_meta", "chunks", "keys", "name_prefix", "_dependencies"]
    _defaults = {"_dependencies": ()}

    @functools.cached_property
    def _meta(self):
        """The meta array exactly as passed in."""
        return self.operand("_meta")

    @functools.cached_property
    def chunks(self):
        """The chunks exactly as passed in."""
        return self.operand("chunks")

    @functools.cached_property
    def _name(self):
        """``name_prefix`` joined to the deterministic token with a dash."""
        return self.operand("name_prefix") + "-" + self.deterministic_token

    def dependencies(self):
        """Return the explicitly supplied dependencies as a new list."""
        return list(self.operand("_dependencies"))

    def _layer(self):
        """Return the graph with its output keys renamed to ``self._name``."""
        graph = self.operand("layer")
        output_keys = set(self.operand("keys"))

        if not hasattr(graph, "layers"):
            # Persist case: a dict of computed values whose keys may no longer
            # match ours (optimization can rename keys). If none of its tuple
            # keys are ours, simply re-label every tuple key.
            tuple_keys = {key for key in graph if isinstance(key, tuple)}
            if tuple_keys and tuple_keys.isdisjoint(output_keys):
                renamed = {}
                for key, value in graph.items():
                    if isinstance(key, tuple):
                        key = (self._name, *key[1:])
                    renamed[key] = value
                return renamed

        # HLG case (e.g., from BlockView): the graph holds tasks and their
        # dependencies. Rename output keys, keep everything else untouched.
        relabelled = {}
        for key, value in dict(graph).items():
            if key not in output_keys:
                # Dependency - keep as-is.
                relabelled[key] = value
                continue

            renamed_key = (self._name, *key[1:])
            if istask(value):
                # Point the new key at the original and keep the original task.
                relabelled[renamed_key] = key
                relabelled[key] = value
            else:
                # Plain value: a simple rename suffices.
                relabelled[renamed_key] = value
        return relabelled
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import os
|
|
5
|
+
import pickle
|
|
6
|
+
from itertools import product
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from dask_array.io._base import IO
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FromNpyStack(IO):
    """Array expression that loads an array from a directory of ``.npy`` files.

    The directory must contain a pickled ``info`` file providing the
    ``"chunks"``, ``"dtype"`` and ``"axis"`` entries, plus one ``<i>.npy``
    file per chunk along ``axis``.
    """

    _parameters = ["dirname", "mmap_mode"]
    _defaults = {"mmap_mode": "r"}

    @functools.cached_property
    def _info(self):
        """Read the ``info`` file once and cache the unpickled result."""
        info_path = os.path.join(self.operand("dirname"), "info")
        # NOTE(review): unpickling can execute arbitrary code; only point
        # ``dirname`` at directories from a trusted source.
        with open(info_path, "rb") as fh:
            return pickle.load(fh)

    @functools.cached_property
    def _meta(self):
        """Zero-size array with the stored dtype and one axis per chunks entry."""
        stored = self._info
        rank = len(stored["chunks"])
        return np.empty((0,) * rank, dtype=stored["dtype"])

    @functools.cached_property
    def chunks(self):
        """Chunk structure as recorded in the ``info`` file."""
        return self._info["chunks"]

    @functools.cached_property
    def _name(self):
        return "from-npy-stack-" + self.deterministic_token

    def _layer(self):
        """Map each block key to a ``np.load`` task for its ``.npy`` file."""
        dirname = self.operand("dirname")
        mmap_mode = self.operand("mmap_mode")
        stored = self._info
        chunks = stored["chunks"]
        axis = stored["axis"]

        block_ranges = [range(len(dim_chunks)) for dim_chunks in chunks]
        block_keys = list(product([self._name], *block_ranges))
        # One file per chunk along ``axis``; files are paired with the block
        # keys in iteration order.
        load_tasks = []
        for file_no in range(len(chunks[axis])):
            path = os.path.join(dirname, f"{file_no}.npy")
            load_tasks.append((np.load, path, mmap_mode))
        return dict(zip(block_keys, load_tasks))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def from_npy_stack(dirname, mmap_mode="r"):
    """Load dask array from stack of npy files

    Parameters
    ----------
    dirname: string
        Directory containing the .npy files
    mmap_mode: (None or 'r')
        Memory-map mode handed to the loader of each file

    See Also
    --------
    to_npy_stack
    """
    from dask_array._new_collection import new_collection

    stack_expr = FromNpyStack(dirname=dirname, mmap_mode=mmap_mode)
    return new_collection(stack_expr)
|
dask_array/io/_store.py
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Collection
|
|
4
|
+
from threading import Lock
|
|
5
|
+
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from numpy.typing import ArrayLike
|
|
11
|
+
|
|
12
|
+
from dask.delayed import Delayed
|
|
13
|
+
|
|
14
|
+
from dask.base import named_schedulers
|
|
15
|
+
from dask.utils import SerializableLock
|
|
16
|
+
|
|
17
|
+
from dask_array._utils import is_arraylike
|
|
18
|
+
from dask_array.slicing._utils import fuse_slice
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_scheduler_lock(collection, scheduler):
    """Get an appropriate lock for the given collection and scheduler.

    Parameters
    ----------
    collection
        Object whose ``__dask_scheduler__`` is used when ``scheduler`` is None.
    scheduler
        A scheduler name registered in ``named_schedulers``, any other
        scheduler object, or None.

    Returns
    -------
    ``False`` when the resolved scheduler is the one registered under
    ``"synchronous"``; a new ``SerializableLock`` otherwise.
    """
    chosen = collection.__dask_scheduler__ if scheduler is None else scheduler
    # Names resolve through the registry; anything else is used unchanged.
    resolved = named_schedulers.get(chosen, chosen)
    synchronous_get = named_schedulers.get("synchronous", None)
    if resolved is synchronous_get:
        return False
    return SerializableLock()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def load_store_chunk(
    x: Any,
    out: Any,
    index: slice | None,
    region: slice | None,
    lock: Any,
    return_stored: bool,
    load_stored: bool,
) -> Any:
    """
    A function inserted in a Dask graph for storing a chunk.

    Parameters
    ----------
    x: array-like or None
        The chunk to write. Nothing is written when it is ``None`` or has
        ``size == 0``.
    out: array-like
        Destination supporting ``out[index] = ...``.
    index: slice-like
        Location inside ``out`` (or inside ``out[region]`` when ``region`` is
        given) that receives ``x``.
    region: slice-like or None
        Optional sub-region of ``out``. When truthy it is fused with
        ``index``; when ``index`` is falsy the region itself is the index.
    lock: Lock-like or False
        Held while ``out`` is written and read; skipped when falsy.
    return_stored: bool
        Whether to return ``out``.
    load_stored: bool
        Whether to return the array stored in ``out``.
        Ignored if ``return_stored`` is not ``True``.

    Returns
    -------

    If return_stored=True and load_stored=False
        out
    If return_stored=True and load_stored=True
        out[index]
    If return_stored=False
        None

    Examples
    --------

    >>> a = np.ones((5, 6))
    >>> b = np.empty(a.shape)
    >>> load_store_chunk(a, b, (slice(None), slice(None)), None, False, False, False)
    """
    if region:
        # Equivalent to `out[region][index]`
        index = fuse_slice(region, index) if index else region

    if lock:
        lock.acquire()
    try:
        if x is not None and x.size != 0:
            out[index] = x if is_arraylike(x) else np.asanyarray(x)

        if not return_stored:
            return None
        return out[index] if load_stored else out
    finally:
        # Always give the lock back, even when the write or read raised.
        if lock:
            lock.release()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# Type variable tying ``load_chunk``'s return annotation to its ``out`` argument.
A = TypeVar("A", bound="ArrayLike")


def load_chunk(out: A, index: slice, lock: Any, region: slice | None) -> A:
    """Load a chunk from an array-like object.

    Thin wrapper over ``load_store_chunk`` that writes nothing (``x`` is
    ``None``) and asks for the stored data back, i.e. ``out`` indexed by
    ``index`` combined with ``region``. Used for loading stored chunks back
    into dask arrays.
    """
    # Positional order: x, out, index, region, lock, return_stored, load_stored.
    return load_store_chunk(None, out, index, region, lock, True, True)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def store(
    sources,
    targets,
    lock: bool | Lock = True,
    regions: tuple[slice, ...] | Collection[tuple[slice, ...]] | None = None,
    compute: bool = True,
    return_stored: bool = False,
    load_stored: bool | None = None,
    **kwargs,
):
    """Store dask arrays in array-like objects, overwrite data in target

    Writes dask arrays into objects that support numpy-style setitem
    indexing, one chunk at a time so that memory does not fill up. For best
    performance, align the block size of the storage target with the block
    size of your array.

    If your data fits in memory then you may prefer calling
    ``np.array(myarray)`` instead.

    Parameters
    ----------

    sources: Array or collection of Arrays
    targets: array-like or Delayed or collection of array-likes and/or Delayeds
        These should support setitem syntax ``target[10:20] = ...``.
        If sources is a single item, targets must be a single item; if sources is a
        collection of arrays, targets must be a matching collection.
    lock: boolean or threading.Lock, optional
        Whether or not to lock the data stores while storing.
        Pass True (a lock is chosen via ``get_scheduler_lock``), False (don't
        lock) or a particular :class:`threading.Lock` object to be shared
        among all writes.
    regions: tuple of slices or collection of tuples of slices, optional
        Each ``region`` tuple in ``regions`` should be such that
        ``target[region].shape = source.shape``
        for the corresponding source and target in sources and targets,
        respectively. If this is a tuple, the contents will be assumed to be
        slices, so do not provide a tuple of tuples.
    compute: boolean, optional
        If true compute immediately; otherwise return the lazy result.
    return_stored: boolean, optional
        Optionally return the stored result (default False).
    load_stored: boolean, optional
        Optionally return the stored result, loaded in to memory (default None).
        If None, ``load_stored`` is True if ``return_stored`` is True and
        ``compute`` is False. *This is an advanced option.*
        When False, store will return the appropriate ``target`` for each chunk that is stored.
        Directly computing this result is not what you want.
        Instead, you can use the returned ``target`` to execute followup operations to the store.
    kwargs:
        Parameters passed to compute/persist (only used if compute=True)

    Returns
    -------

    If compute=True and return_stored=False
        None
    Otherwise
        The resulting array when there is exactly one source, else a tuple
        with one array per source.

    Raises
    ------
    ValueError
        If any source is not a dask array, or the numbers of sources,
        targets and regions do not match.

    Examples
    --------

    >>> import h5py  # doctest: +SKIP
    >>> f = h5py.File('myfile.hdf5', mode='a')  # doctest: +SKIP
    >>> dset = f.create_dataset('/data', shape=x.shape,
    ...                                  chunks=x.chunks,
    ...                                  dtype='f8')  # doctest: +SKIP

    >>> store(x, dset)  # doctest: +SKIP

    Alternatively store many arrays at the same time

    >>> store([x, y, z], [dset1, dset2, dset3])  # doctest: +SKIP
    """
    from dask.base import persist
    from dask.layers import ArraySliceDep

    from dask_array._collection import Array
    from dask_array._map_blocks import map_blocks

    # Normalise the single-array call form to the list form.
    if isinstance(sources, Array):
        sources, targets = [sources], [targets]
    targets = cast("Collection[ArrayLike | Delayed]", targets)

    if not all(isinstance(src, Array) for src in sources):
        raise ValueError("All sources must be dask array objects")

    if len(sources) != len(targets):
        raise ValueError(f"Different number of sources [{len(sources)}] and targets [{len(targets)}]")

    if regions is None or isinstance(regions, tuple):
        # A single region (or none) applies to every source.
        regions_list = [regions] * len(sources)
    else:
        regions_list = list(regions)
        if len(sources) != len(regions_list):
            raise ValueError(
                f"Different number of sources [{len(sources)}] and "
                f"targets [{len(targets)}] than regions [{len(regions_list)}]"
            )
    del regions

    if load_stored is None:
        load_stored = return_stored and not compute

    if lock is True:
        lock = get_scheduler_lock(Array, kwargs.get("scheduler"))

    # One lazy "store" array per (source, target, region) triple.
    stored = [
        map_blocks(
            load_store_chunk,
            src,
            target,
            ArraySliceDep(src.chunks),
            region=region,
            lock=lock,
            return_stored=return_stored,
            load_stored=load_stored,
            name="store-map",
            meta=src._meta,
        )
        for src, target, region in zip(sources, targets, regions_list)
    ]

    if compute and not return_stored:
        import dask

        dask.compute(stored, **kwargs)
        return None

    if compute:
        # Run the writes now, then build lazy loaders over the persisted result.
        persisted = persist(*stored, **kwargs)
        stored = [
            map_blocks(
                load_chunk,
                arr,
                ArraySliceDep(arr.chunks),
                lock=lock,
                region=region,
                name="load-stored",
                meta=arr._meta,
            )
            for arr, region in zip(persisted, regions_list)
        ]

    if len(stored) == 1:
        return stored[0]
    return tuple(stored)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def to_hdf5(filename, *args, chunks=True, **kwargs):
    """Store arrays in HDF5 file

    Saves one or more dask arrays into datapaths of an HDF5 file, creating
    the datasets as needed and opening/closing the file cleanly.

    Parameters
    ----------
    filename
        Path handed to ``h5py.File`` (opened with ``mode="a"``).
    *args
        Either a single ``{datapath: array}`` dictionary, or a datapath
        string followed by one dask array.
    chunks: tuple or ``True``
        Chunk shape, or ``True`` to pass the chunks from the dask array.
        Defaults to ``True``.
    **kwargs
        Forwarded to ``require_dataset`` for every dataset.

    Raises
    ------
    ValueError
        If ``args`` matches neither accepted form.

    Examples
    --------

    >>> da.to_hdf5('myfile.hdf5', '/x', x)  # doctest: +SKIP

    or

    >>> da.to_hdf5('myfile.hdf5', {'/x': x, '/y': y})  # doctest: +SKIP

    Optionally provide arguments as though to ``h5py.File.create_dataset``

    >>> da.to_hdf5('myfile.hdf5', '/x', x, compression='lzf', shuffle=True)  # doctest: +SKIP

    >>> da.to_hdf5('myfile.hdf5', '/x', x, chunks=(10,20,30))  # doctest: +SKIP

    This can also be used as a method on a single Array

    >>> x.to_hdf5('myfile.hdf5', '/x')  # doctest: +SKIP

    See Also
    --------
    da.store
    h5py.File.create_dataset
    """
    from dask_array._collection import Array

    if len(args) == 1 and isinstance(args[0], dict):
        (data,) = args
    elif len(args) == 2 and isinstance(args[0], str) and isinstance(args[1], Array):
        datapath, array = args
        data = {datapath: array}
    else:
        raise ValueError("Please provide {'/data/path': array} dictionary")

    import h5py

    with h5py.File(filename, mode="a") as f:
        datasets = []
        for datapath, array in data.items():
            datasets.append(
                f.require_dataset(
                    datapath,
                    shape=array.shape,
                    dtype=array.dtype,
                    # ``True`` means: use the first chunk size of every dimension.
                    chunks=tuple(c[0] for c in array.chunks) if chunks is True else chunks,
                    **kwargs,
                )
            )
        # The write must happen while the file is still open.
        store(list(data.values()), datasets)
|
dask_array/io/_tiledb.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dask_array.io._zarr import _check_regular_chunks
|
|
4
|
+
from dask_array.core._conversion import from_array
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _tiledb_to_chunks(tiledb_array):
    """Return a list with the ``tile`` of every dimension of the array's schema."""
    schema = tiledb_array.schema
    return [schema.domain.dim(axis).tile for axis in range(schema.ndim)]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def from_tiledb(uri, attribute=None, chunks=None, storage_options=None, **kwargs):
    """Load array from the TileDB storage format

    See https://docs.tiledb.io for more information about TileDB.

    Parameters
    ----------
    uri: TileDB array or str
        Location of the array to load, or an already opened TileDB array
    attribute: str or None
        Attribute selection (single-attribute view on multi-attribute array)
    chunks: sequence or None
        Chunk sizes, one entry per dimension. When falsy, the tile extents of
        the TileDB schema are used.
    storage_options: dict or None
        Configuration passed to ``tiledb.open``. An optional ``"key"`` entry
        is taken out and passed as the encryption key instead. The caller's
        dictionary is left untouched.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------

    A Dask Array

    Raises
    ------
    ValueError
        If the array is sparse, or is open for writing.
    TypeError
        If no ``attribute`` is given for a multi-attribute array.

    Examples
    --------

    >>> import tempfile, tiledb
    >>> import dask_array as da, numpy as np
    >>> uri = tempfile.NamedTemporaryFile().name
    >>> _ = tiledb.from_numpy(uri, np.arange(0,9).reshape(3,3))  # create a tiledb array
    >>> tdb_ar = da.from_tiledb(uri)  # read back the array
    >>> tdb_ar.shape
    (3, 3)
    >>> tdb_ar.mean().compute()
    4.0
    """
    import tiledb

    # Work on a copy: popping "key" must not mutate the caller's dictionary.
    tiledb_config = dict(storage_options) if storage_options else {}
    key = tiledb_config.pop("key", None)

    if isinstance(uri, tiledb.Array):
        tdb = uri
    else:
        tdb = tiledb.open(uri, attr=attribute, config=tiledb_config, key=key)

    if tdb.schema.sparse:
        raise ValueError("Sparse TileDB arrays are not supported")

    if not attribute:
        if tdb.schema.nattr > 1:
            raise TypeError("keyword 'attribute' must be provided when loading a multi-attribute TileDB array")
        else:
            attribute = tdb.schema.attr(0).name

    if tdb.iswritable:
        raise ValueError("TileDB array must be open for reading")

    chunks = chunks or _tiledb_to_chunks(tdb)

    # Kept as an assert so callers that catch AssertionError keep working.
    assert len(chunks) == tdb.schema.ndim

    return from_array(tdb, chunks, name=f"tiledb-{uri}")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def to_tiledb(
    darray,
    uri,
    compute=True,
    return_stored=False,
    storage_options=None,
    key=None,
    **kwargs,
):
    """Save array to the TileDB storage format

    Save 'array' using the TileDB storage manager, to any TileDB-supported URI,
    including local disk, S3, or HDFS.

    See https://docs.tiledb.io for more information about TileDB.

    Parameters
    ----------

    darray: dask.array
        A dask array to write.
    uri:
        Any supported TileDB storage location, or an open, writable TileDB
        array.
    storage_options: dict
        Dict containing any configuration options for the TileDB backend.
        see https://docs.tiledb.io/en/stable/tutorials/config.html
        When ``key`` is not given, an optional ``"key"`` entry is taken out
        and used as the encryption key. The caller's dictionary is left
        untouched.
    compute, return_stored: see ``store()``
    key: str or None
        Encryption key
    **kwargs
        Forwarded to ``tiledb.empty_like`` when ``uri`` is a string.

    Returns
    -------

    None
        Unless ``return_stored`` is set to ``True`` (``False`` by default)

    Raises
    ------
    ValueError
        If ``darray`` is irregularly chunked, ``uri`` is neither a string nor
        a TileDB array, the target's dtype or ndim differs from ``darray``'s,
        or the target is not open and writable.

    Notes
    -----

    TileDB only supports regularly-chunked arrays.
    TileDB `tile extents`_ correspond to form 2 of the dask
    `chunk specification`_, and the conversion is
    done automatically for supported arrays.

    Examples
    --------

    >>> import dask_array as da, tempfile
    >>> uri = tempfile.NamedTemporaryFile().name
    >>> data = da.random.random(5,5)
    >>> da.to_tiledb(data, uri)
    >>> import tiledb
    >>> tdb_ar = tiledb.open(uri)
    >>> all(tdb_ar == data)
    True

    .. _chunk specification: http://docs.dask.org/en/latest/array-chunks.html
    .. _tile extents: https://docs.tiledb.io/en/stable/tutorials/tiling-dense.html
    """
    import tiledb

    # Work on a copy: popping "key" must not mutate the caller's dictionary.
    tiledb_config = dict(storage_options) if storage_options else {}
    # encryption key, if any; an explicit ``key`` argument takes precedence
    key = key or tiledb_config.pop("key", None)

    if not _check_regular_chunks(darray.chunks):
        raise ValueError(
            "Attempt to save array to TileDB with irregular chunking, please call `arr.rechunk(...)` first."
        )

    if isinstance(uri, str):
        # The first chunk size of each dimension becomes the tile extent.
        chunks = [c[0] for c in darray.chunks]
        # create a suitable, empty, writable TileDB array
        tdb = tiledb.empty_like(uri, darray, tile=chunks, config=tiledb_config, key=key, **kwargs)
    elif isinstance(uri, tiledb.Array):
        tdb = uri
        # sanity checks
        if not ((darray.dtype == tdb.dtype) and (darray.ndim == tdb.ndim)):
            raise ValueError("Target TileDB array layout is not compatible with source array")
    else:
        raise ValueError(
            "'uri' must be string pointing to supported TileDB store location or an open, writable TileDB array."
        )

    if not (tdb.isopen and tdb.iswritable):
        raise ValueError("Target TileDB array is not open and writable.")

    return darray.store(tdb, lock=False, compute=compute, return_stored=return_stored)
|