dask-array 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_array/__init__.py +228 -0
- dask_array/_backends.py +76 -0
- dask_array/_backends_array.py +99 -0
- dask_array/_blockwise.py +1410 -0
- dask_array/_broadcast.py +272 -0
- dask_array/_chunk.py +445 -0
- dask_array/_chunk_types.py +54 -0
- dask_array/_collection.py +1644 -0
- dask_array/_concatenate.py +331 -0
- dask_array/_core_utils.py +1365 -0
- dask_array/_dispatch.py +141 -0
- dask_array/_einsum.py +277 -0
- dask_array/_expr.py +544 -0
- dask_array/_expr_flow.py +586 -0
- dask_array/_gufunc.py +805 -0
- dask_array/_histogram.py +617 -0
- dask_array/_map_blocks.py +652 -0
- dask_array/_new_collection.py +10 -0
- dask_array/_numpy_compat.py +135 -0
- dask_array/_overlap.py +1159 -0
- dask_array/_rechunk.py +1050 -0
- dask_array/_reshape.py +710 -0
- dask_array/_routines.py +102 -0
- dask_array/_shuffle.py +448 -0
- dask_array/_stack.py +264 -0
- dask_array/_svg.py +291 -0
- dask_array/_templates.py +29 -0
- dask_array/_test_utils.py +257 -0
- dask_array/_ufunc.py +385 -0
- dask_array/_utils.py +349 -0
- dask_array/_visualize.py +223 -0
- dask_array/_xarray.py +337 -0
- dask_array/core/__init__.py +34 -0
- dask_array/core/_blockwise_funcs.py +312 -0
- dask_array/core/_conversion.py +422 -0
- dask_array/core/_from_graph.py +97 -0
- dask_array/creation/__init__.py +71 -0
- dask_array/creation/_arange.py +121 -0
- dask_array/creation/_diag.py +116 -0
- dask_array/creation/_diagonal.py +241 -0
- dask_array/creation/_eye.py +103 -0
- dask_array/creation/_linspace.py +102 -0
- dask_array/creation/_mesh.py +134 -0
- dask_array/creation/_ones_zeros.py +454 -0
- dask_array/creation/_pad.py +270 -0
- dask_array/creation/_repeat.py +55 -0
- dask_array/creation/_tile.py +36 -0
- dask_array/creation/_tri.py +28 -0
- dask_array/creation/_utils.py +296 -0
- dask_array/fft.py +320 -0
- dask_array/io/__init__.py +39 -0
- dask_array/io/_base.py +10 -0
- dask_array/io/_from_array.py +257 -0
- dask_array/io/_from_delayed.py +95 -0
- dask_array/io/_from_graph.py +54 -0
- dask_array/io/_from_npy_stack.py +67 -0
- dask_array/io/_store.py +336 -0
- dask_array/io/_tiledb.py +159 -0
- dask_array/io/_to_npy_stack.py +65 -0
- dask_array/io/_zarr.py +449 -0
- dask_array/linalg/__init__.py +39 -0
- dask_array/linalg/_cholesky.py +234 -0
- dask_array/linalg/_lu.py +300 -0
- dask_array/linalg/_norm.py +94 -0
- dask_array/linalg/_qr.py +601 -0
- dask_array/linalg/_solve.py +349 -0
- dask_array/linalg/_svd.py +394 -0
- dask_array/linalg/_tensordot.py +334 -0
- dask_array/linalg/_utils.py +74 -0
- dask_array/manipulation/__init__.py +45 -0
- dask_array/manipulation/_expand.py +321 -0
- dask_array/manipulation/_flip.py +92 -0
- dask_array/manipulation/_roll.py +78 -0
- dask_array/manipulation/_transpose.py +309 -0
- dask_array/random/__init__.py +125 -0
- dask_array/random/_choice.py +181 -0
- dask_array/random/_expr.py +256 -0
- dask_array/random/_generator.py +441 -0
- dask_array/random/_random_state.py +259 -0
- dask_array/random/_utils.py +84 -0
- dask_array/reductions/__init__.py +84 -0
- dask_array/reductions/_arg_reduction.py +130 -0
- dask_array/reductions/_common.py +1082 -0
- dask_array/reductions/_cumulative.py +522 -0
- dask_array/reductions/_percentile.py +261 -0
- dask_array/reductions/_reduction.py +725 -0
- dask_array/reductions/_trace.py +56 -0
- dask_array/routines/__init__.py +133 -0
- dask_array/routines/_apply.py +84 -0
- dask_array/routines/_bincount.py +112 -0
- dask_array/routines/_broadcast.py +111 -0
- dask_array/routines/_coarsen.py +115 -0
- dask_array/routines/_diff.py +79 -0
- dask_array/routines/_gradient.py +158 -0
- dask_array/routines/_indexing.py +65 -0
- dask_array/routines/_insert_delete.py +132 -0
- dask_array/routines/_misc.py +122 -0
- dask_array/routines/_nonzero.py +72 -0
- dask_array/routines/_search.py +123 -0
- dask_array/routines/_select.py +113 -0
- dask_array/routines/_statistics.py +171 -0
- dask_array/routines/_topk.py +82 -0
- dask_array/routines/_triangular.py +74 -0
- dask_array/routines/_unique.py +232 -0
- dask_array/routines/_where.py +62 -0
- dask_array/slicing/__init__.py +67 -0
- dask_array/slicing/_basic.py +550 -0
- dask_array/slicing/_blocks.py +138 -0
- dask_array/slicing/_bool_index.py +145 -0
- dask_array/slicing/_setitem.py +329 -0
- dask_array/slicing/_squeeze.py +101 -0
- dask_array/slicing/_utils.py +1133 -0
- dask_array/slicing/_vindex.py +282 -0
- dask_array/stacking/__init__.py +15 -0
- dask_array/stacking/_block.py +83 -0
- dask_array/stacking/_simple.py +58 -0
- dask_array/templates/array.html.j2 +48 -0
- dask_array/tests/__init__.py +0 -0
- dask_array/tests/conftest.py +22 -0
- dask_array/tests/test_api.py +40 -0
- dask_array/tests/test_binary_op_chunks.py +107 -0
- dask_array/tests/test_coarse_slice_through_blockwise.py +362 -0
- dask_array/tests/test_collection.py +799 -0
- dask_array/tests/test_creation.py +1102 -0
- dask_array/tests/test_expr_flow.py +143 -0
- dask_array/tests/test_linalg.py +1130 -0
- dask_array/tests/test_map_blocks_multi_output.py +104 -0
- dask_array/tests/test_rechunk_pushdown.py +214 -0
- dask_array/tests/test_reductions.py +1091 -0
- dask_array/tests/test_routines.py +2853 -0
- dask_array/tests/test_shuffle_chunks.py +67 -0
- dask_array/tests/test_slice_pushdown.py +968 -0
- dask_array/tests/test_slice_through_blockwise.py +678 -0
- dask_array/tests/test_slice_through_overlap.py +366 -0
- dask_array/tests/test_slice_through_reshape.py +272 -0
- dask_array/tests/test_slicing.py +839 -0
- dask_array/tests/test_transpose_slice_pushdown.py +208 -0
- dask_array/tests/test_visualize.py +94 -0
- dask_array/tests/test_xarray.py +193 -0
- dask_array-0.1.0.dist-info/METADATA +48 -0
- dask_array-0.1.0.dist-info/RECORD +144 -0
- dask_array-0.1.0.dist-info/WHEEL +4 -0
- dask_array-0.1.0.dist-info/entry_points.txt +2 -0
- dask_array-0.1.0.dist-info/licenses/LICENSE +29 -0
dask_array/_histogram.py
ADDED
|
@@ -0,0 +1,617 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from builtins import range as _range
|
|
4
|
+
from functools import cached_property, reduce
|
|
5
|
+
from operator import mul
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from dask._task_spec import Task, TaskRef
|
|
10
|
+
from dask_array._collection import Array, asarray, new_collection
|
|
11
|
+
from dask_array._expr import ArrayExpr
|
|
12
|
+
from dask.base import is_dask_collection
|
|
13
|
+
from dask.delayed import Delayed
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _block_hist(x, bins_range, weights=None):
|
|
17
|
+
"""Compute histogram for a single block."""
|
|
18
|
+
bins, range_ = bins_range
|
|
19
|
+
return np.histogram(x, bins, range=range_, weights=weights)[0][np.newaxis]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _linspace_from_range(bins_range):
|
|
23
|
+
"""Create bin edges from (num_bins, (start, stop)) at compute time."""
|
|
24
|
+
bins, (start, stop) = bins_range
|
|
25
|
+
return np.linspace(start, stop, num=int(bins) + 1)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _flatten_keys(arr):
|
|
29
|
+
"""Flatten array keys in C order."""
|
|
30
|
+
for idx in np.ndindex(arr.numblocks):
|
|
31
|
+
yield (arr._name,) + idx
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _to_ref(val):
    """Return a ``TaskRef`` to ``(val._name,)`` for an ``ArrayExpr``, else ``val``.

    The generated key carries no block indices.
    NOTE(review): that presumably means the expression is expected to be
    0-dimensional - confirm against callers.
    """
    return TaskRef((val._name,)) if isinstance(val, ArrayExpr) else val
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class HistogramBinned(ArrayExpr):
    """Per-chunk histogram expression.

    The result is a 2-D array with one row per input chunk and one column
    per bin; each row holds the histogram of a single chunk.  Summing over
    axis 0 yields the histogram of the whole array.

    ``bins`` and the range endpoints may be concrete values or
    ``ArrayExpr`` instances that are resolved at compute time.
    """

    _parameters = ["array", "bins", "range_start", "range_stop", "weights", "nbins"]
    _defaults = dict.fromkeys(("range_start", "range_stop", "weights", "nbins"))

    @cached_property
    def _meta(self):
        # The weights dtype wins when weights are supplied; otherwise use
        # whatever dtype numpy produces for unweighted counts.
        if self.weights is not None:
            dtype = self.weights._meta.dtype
        else:
            dtype = np.histogram([])[0].dtype
        return np.empty((0, 0), dtype=dtype)

    @cached_property
    def chunks(self):
        # One row (chunk of size 1) per block of the input array.
        rows = (1,) * reduce(mul, self.array.numblocks, 1)

        # An explicit ``nbins`` takes priority; concrete bin edges give
        # ``len(edges) - 1``; lazy bin edges leave the width unknown.
        nbins = self.nbins
        if nbins is None and not isinstance(self.bins, ArrayExpr):
            nbins = len(self.bins) - 1

        if nbins is None or isinstance(nbins, ArrayExpr):
            return (rows, (np.nan,))
        return (rows, (int(nbins),))

    @cached_property
    def _name(self):
        return f"histogram-{self.deterministic_token}"

    def _layer(self) -> dict:
        from dask._task_spec import List as TaskList

        block_keys = list(_flatten_keys(self.array))

        # Lazy bins are referenced through their first block key
        # (all-zero block index); concrete bins are embedded directly.
        if isinstance(self.bins, ArrayExpr):
            bins_arg = TaskRef((self.bins._name,) + (0,) * self.bins.ndim)
        else:
            bins_arg = self.bins

        # ``_block_hist`` expects ``(bins, (start, stop))``.
        limits = TaskList(_to_ref(self.range_start), _to_ref(self.range_stop))
        bins_range = TaskList(bins_arg, limits)

        # Optional trailing weight reference for every block.
        if self.weights is None:
            trailing = [() for _ in block_keys]
        else:
            trailing = [(TaskRef(w),) for w in _flatten_keys(self.weights)]

        layer = {}
        for row, (block_key, extra) in enumerate(zip(block_keys, trailing)):
            out_key = (self._name, row, 0)
            layer[out_key] = Task(
                out_key,
                _block_hist,
                TaskRef(block_key),
                bins_range,
                *extra,
            )
        return layer

    @property
    def _dependencies(self):
        # The input array always; the remaining operands only when lazy.
        optional = (self.bins, self.range_start, self.range_stop, self.weights)
        return [self.array, *(op for op in optional if isinstance(op, ArrayExpr))]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class LinspaceDelayed(ArrayExpr):
    """Single-block bin-edge expression whose inputs may be lazy.

    Any of ``num_bins``, ``range_start`` and ``range_stop`` may be an
    ``ArrayExpr``; the edges are produced at compute time by
    ``_linspace_from_range``.
    """

    _parameters = ["num_bins", "range_start", "range_stop"]

    @cached_property
    def _meta(self):
        return np.linspace(0, 1, num=2)

    @cached_property
    def chunks(self):
        # The length is unknown while the bin count is itself lazy.
        count = self.num_bins
        length = np.nan if isinstance(count, ArrayExpr) else int(count) + 1
        return ((length,),)

    @cached_property
    def _name(self):
        return f"linspace-delayed-{self.deterministic_token}"

    def _layer(self) -> dict:
        from dask._task_spec import List as TaskList

        out_key = (self._name, 0)
        count_ref = _to_ref(self.num_bins)
        limits = TaskList(_to_ref(self.range_start), _to_ref(self.range_stop))
        return {out_key: Task(out_key, _linspace_from_range, TaskList(count_ref, limits))}

    @property
    def _dependencies(self):
        found = []
        for operand in (self.range_start, self.range_stop, self.num_bins):
            if isinstance(operand, ArrayExpr):
                found.append(operand)
        return found
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def histogram(a, bins=None, range=None, normed=False, weights=None, density=None):
    """
    Blocked variant of :func:`numpy.histogram`.

    Parameters
    ----------
    a : dask.array.Array
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars, optional
        Either an iterable specifying the bin edges, or the number of bins,
        in which case ``range`` is required.
    range : (float, float), optional
        The lower and upper range of the bins.  The endpoints may be
        dask Arrays.
    normed : bool, optional
        Deprecated, use density instead.  Any value other than ``False``
        raises ``ValueError``.
    weights : dask.array.Array, optional
        Weights for the histogram; must have the same chunks as ``a``.
    density : bool, optional
        If True, normalize the histogram.

    Returns
    -------
    hist : dask Array
        The histogram values.
    bin_edges : numpy.ndarray or dask Array
        The bin edges.  A NumPy array when ``bins`` and ``range`` are both
        concrete; a dask Array when either of them is a dask collection.

    Raises
    ------
    ValueError
        If ``bins`` is missing, if a bin count is given without ``range``,
        if ``weights`` and ``a`` are chunked differently, if ``normed`` is
        used, if ``range`` does not hold exactly two items, or if ``bins``
        is not 1-dimensional.
    TypeError
        If ``range`` is not a sized sequence or array.
    NotImplementedError
        If ``density`` is requested with a lazy scalar ``bins``, or if
        ``bins`` or a ``range`` endpoint is a dask collection other than a
        dask Array.
    """
    if isinstance(bins, Array):
        scalar_bins = bins.ndim == 0
    elif isinstance(bins, Delayed):
        scalar_bins = bins._length is None or bins._length == 1
    else:
        scalar_bins = np.ndim(bins) == 0

    if bins is None or (scalar_bins and range is None):
        raise ValueError(
            "dask.array.histogram requires either specifying "
            "bins as an iterable or specifying both a range and "
            "the number of bins"
        )

    if weights is not None and weights.chunks != a.chunks:
        raise ValueError("Input array and weights must have the same chunked structure")

    if normed is not False:
        raise ValueError(
            "The normed= keyword argument has been deprecated. "
            "Please use density instead. "
            "See the numpy.histogram docstring for more information."
        )

    if density and scalar_bins and isinstance(bins, (Array, Delayed)):
        raise NotImplementedError(
            "When `density` is True, `bins` cannot be a scalar Dask object. "
            "It must be a concrete number or a (possibly-delayed) array/sequence of bin edges."
        )

    if range is not None:
        try:
            if len(range) != 2:
                raise ValueError(f"range must be a sequence or array of length 2, but got {len(range)} items")
            if isinstance(range, (Array, np.ndarray)) and range.shape != (2,):
                raise ValueError(
                    f"range must be a 1-dimensional array of two items, but got an array of shape {range.shape}"
                )
        except TypeError:
            raise TypeError(f"Expected a sequence or array for range, not {range}") from None

    range_has_dask = range is not None and any(is_dask_collection(r) for r in range)
    bins_is_dask_array = isinstance(bins, Array)

    if is_dask_collection(bins) and not bins_is_dask_array:
        raise NotImplementedError("Delayed bins (non-Array) not yet supported in array-expr. Use a dask Array instead.")

    range_start = range[0] if range is not None else None
    range_stop = range[1] if range is not None else None

    # Only dask Arrays can be wired into the graph: ``_to_ref`` converts
    # ArrayExpr operands and passes everything else through untouched, so
    # any other dask collection would reach numpy as an uncomputed object.
    for endpoint in (range_start, range_stop):
        if is_dask_collection(endpoint) and not isinstance(endpoint, Array):
            raise NotImplementedError(
                "Delayed range values (non-Array) not yet supported in array-expr. Use a dask Array instead."
            )

    range_start_expr = range_start.expr if isinstance(range_start, Array) else range_start
    range_stop_expr = range_stop.expr if isinstance(range_stop, Array) else range_stop
    weights_expr = weights.expr if weights is not None else None

    lazy_spec = range_has_dask or bins_is_dask_array
    if lazy_spec:
        if scalar_bins:
            # Bin count plus (possibly lazy) range: edges are produced by a
            # linspace expression evaluated at compute time.
            nbins_expr = bins.expr if bins_is_dask_array else int(bins)
            bins_expr = LinspaceDelayed(nbins_expr, range_start_expr, range_stop_expr)
            bin_edges = new_collection(bins_expr)
        else:
            # Explicit bin edges, handled as a dask Array.
            if not bins_is_dask_array:
                bins = asarray(bins)
            if bins.ndim != 1:
                raise ValueError(f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}")

            # numpy.histogram needs all edges at once, so use a single chunk.
            bins_expr = bins.rechunk(-1).expr

            # With an unknown edge count the expression itself is passed as
            # ``nbins`` so the output width is reported as unknown.
            if np.isnan(bins.shape[0]):
                nbins_expr = bins.expr
            else:
                nbins_expr = int(bins.shape[0]) - 1
            bin_edges = bins
    else:
        # Fully concrete specification: materialise the edges with numpy.
        if scalar_bins:
            bins = np.linspace(range_start, range_stop, num=int(bins) + 1)
        else:
            if not isinstance(bins, np.ndarray):
                bins = np.asarray(bins)
            if bins.ndim != 1:
                raise ValueError(f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}")
        bins_expr = bin_edges = bins
        nbins_expr = None

    hist_expr = HistogramBinned(
        a.expr,
        bins_expr,
        range_start_expr,
        range_stop_expr,
        weights_expr,
        nbins_expr,
    )

    # One row per chunk; summing the rows gives the final histogram.
    n = new_collection(hist_expr).sum(axis=0)

    if density:
        if lazy_spec:
            db = (bin_edges[1:] - bin_edges[:-1]).astype(float)
        else:
            db = asarray(np.diff(bin_edges).astype(float), chunks=n.chunks)
        n = n / db / n.sum()

    return n, bin_edges
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _block_histogramdd_rect(sample, bins, range, weights):
|
|
341
|
+
"""Call numpy.histogramdd for a blocked/chunked calculation.
|
|
342
|
+
|
|
343
|
+
Slurps the result into an additional outer axis; this new axis
|
|
344
|
+
will be used to stack chunked calls of the numpy function and add
|
|
345
|
+
them together later.
|
|
346
|
+
"""
|
|
347
|
+
return np.histogramdd(sample, bins, range=range, weights=weights)[0][np.newaxis]
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _block_histogramdd_multiarg(*args):
|
|
351
|
+
"""Call numpy.histogramdd for a multi argument blocked/chunked calculation.
|
|
352
|
+
|
|
353
|
+
The last three arguments _must be_ (bins, range, weights).
|
|
354
|
+
"""
|
|
355
|
+
bins, range_, weights = args[-3:]
|
|
356
|
+
sample = args[:-3]
|
|
357
|
+
return np.histogramdd(sample, bins=bins, range=range_, weights=weights)[0][np.newaxis]
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
class HistogramDDBinned(ArrayExpr):
    """Per-chunk N-dimensional histogram expression.

    The result has shape ``(n_chunks, *nbins)``: the leading axis indexes
    the input chunks and the remaining axes are the histogram dimensions.
    Summing over axis 0 yields the histogram of the whole sample.
    """

    _parameters = [
        "sample",
        "bins",
        "range",
        "weights",
        "rectangular_sample",
        "n_chunks",
        "D",
    ]
    _defaults = {"range": None, "weights": None}

    @cached_property
    def _meta(self):
        # Default to the dtype numpy produces for unweighted counts, then
        # let the weights dtype override it when weights are supplied.
        empty_sample = np.empty((0, self.D))
        dtype = np.histogramdd(empty_sample)[0].dtype
        if self.weights is not None:
            dtype = self.weights._meta.dtype
        # One zero-length axis per histogram dimension plus the chunk axis.
        return np.empty((0,) * (self.D + 1), dtype=dtype)

    @cached_property
    def chunks(self):
        # ``bins`` holds one edge array per dimension; each contributes a
        # single chunk of ``len(edges) - 1`` bins.
        per_dim = tuple((len(edge) - 1,) for edge in self.bins)
        return ((1,) * self.n_chunks,) + per_dim

    @cached_property
    def _name(self):
        return f"histogramdd-{self.deterministic_token}"

    def _layer(self) -> dict:
        D = self.D
        n_chunks = self.n_chunks

        # Every output block sits at index 0 along each histogram axis.
        trailing_zeros = (0,) * D

        # One optional weight reference per chunk.
        if self.weights is None:
            weight_refs = [None] * n_chunks
        else:
            weight_refs = [TaskRef(k) if k else None for k in _flatten_keys(self.weights)]

        edge_arrays = [np.asarray(b) for b in self.bins]

        layer = {}
        if self.rectangular_sample:
            # A single 2-D sample array: one task per block of it.
            sample_keys = _flatten_keys(self.sample)
            for i, (block_key, w_ref) in enumerate(zip(sample_keys, weight_refs)):
                out_key = (self._name, i, *trailing_zeros)
                layer[out_key] = Task(
                    out_key,
                    _block_histogramdd_rect,
                    TaskRef(block_key),
                    edge_arrays,
                    self.range,
                    w_ref,
                )
        else:
            # A sequence of coordinate arrays: the i-th task receives the
            # i-th block of each of them.
            per_coord_keys = [list(_flatten_keys(s)) for s in self.sample]
            for i in _range(n_chunks):
                out_key = (self._name, i, *trailing_zeros)
                coord_refs = [TaskRef(per_coord_keys[j][i]) for j in _range(D)]
                layer[out_key] = Task(
                    out_key,
                    _block_histogramdd_multiarg,
                    *coord_refs,
                    edge_arrays,
                    self.range,
                    weight_refs[i],
                )
        return layer

    def dependencies(self):
        # ``sample`` is either one expression or a sequence of them.
        deps = [self.sample] if self.rectangular_sample else list(self.sample)
        if self.weights is not None:
            deps.append(self.weights)
        return deps
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def histogramdd(sample, bins, range=None, normed=None, weights=None, density=None):
    """Blocked variant of :func:`numpy.histogramdd`.

    Parameters
    ----------
    sample : dask.array.Array (N, D) or sequence of dask.array.Array
        Multidimensional data to be histogrammed.  A single array must be
        2-D and chunked along axis 0 only; a sequence of coordinate arrays
        must all share the same chunks.
    bins : sequence of arrays describing bin edges, int, or sequence of ints
        The bin specification.  Wherever a number of bins is given instead
        of explicit edges, ``range`` is required.
    range : sequence of pairs, optional
        The outer bin edges for each dimension.
    normed : bool, optional
        Alias for density.
    weights : dask.array.Array, optional
        Weights for the histogram.
    density : bool, optional
        If True, normalize the histogram.

    Returns
    -------
    hist : dask Array
        The histogram values.
    edges : list of arrays
        The bin edges, one dask Array per dimension.

    Raises
    ------
    TypeError
        If both ``normed`` and ``density`` are given.
    NotImplementedError
        If ``bins`` or ``range`` contain dask collections.
    ValueError
        If the sample, weights, bins or range are inconsistent with each
        other, or if a number of bins is given without ``range``.
    """
    # Resolve the normed/density aliases.
    if normed is None:
        if density is None:
            density = False
    elif density is None:
        density = normed
    else:
        raise TypeError("Cannot specify both 'normed' and 'density'")

    # Lazy bins/range are not supported by this implementation.
    dc_bins = is_dask_collection(bins)
    if isinstance(bins, (list, tuple)):
        dc_bins = dc_bins or any(is_dask_collection(b) for b in bins)
    dc_range = any(is_dask_collection(r) for r in range) if range is not None else False
    if dc_bins or dc_range:
        raise NotImplementedError("Passing dask collections to bins=... or range=... is not supported.")

    # Determine sample structure.
    if hasattr(sample, "shape"):
        if len(sample.shape) != 2:
            raise ValueError("Single array input to histogramdd should be columnar")
        _, D = sample.shape
        n_chunks = sample.numblocks[0]
        rectangular_sample = True
        if sample.shape[1:] != sample.chunksize[1:]:
            raise ValueError("Input array can only be chunked along the 0th axis.")
    elif isinstance(sample, (tuple, list)):
        rectangular_sample = False
        D = len(sample)
        n_chunks = sample[0].numblocks[0]
        for i in _range(1, D):
            if sample[i].chunks != sample[0].chunks:
                raise ValueError("All coordinate arrays must be chunked identically.")
    else:
        raise ValueError("Incompatible sample. Must be a 2D array or a sequence of 1D arrays.")

    # Validate weights.
    if weights is not None:
        if rectangular_sample and weights.chunks[0] != sample.chunks[0]:
            raise ValueError(
                "Input array and weights must have the same shape and chunk structure along the first dimension."
            )
        elif not rectangular_sample and weights.numblocks[0] != n_chunks:
            raise ValueError("Input arrays and weights must have the same shape and chunk structure.")

    # Validate bins.
    if isinstance(bins, (list, tuple)):
        if len(bins) != D:
            raise ValueError("The dimension of bins must be equal to the dimension of the sample.")

    # Validate range.
    if range is not None:
        if len(range) != D:
            raise ValueError("range argument requires one entry, a min max pair, per dimension.")
        if not all(len(r) == 2 for r in range):
            raise ValueError("range argument should be a sequence of pairs")

    # A single bin count applies to every dimension.
    if isinstance(bins, (int, np.integer)):
        bins = (bins,) * D

    # Build explicit edges per dimension.  A scalar entry is a bin count and
    # needs that dimension's range; the data is not inspected to find
    # min/max, so without a range there is nothing to build edges from.
    edges = []
    for dim, spec in enumerate(bins):
        if np.ndim(spec) == 0:
            if range is None:
                raise ValueError(
                    "range is required when bins is given as a number of bins "
                    "rather than as explicit bin edges."
                )
            lower, upper = range[dim]
            edges.append(np.linspace(lower, upper, spec + 1))
        else:
            edges.append(np.asarray(spec))

    # Get sample expression(s).
    if rectangular_sample:
        sample_expr = sample.expr
    else:
        sample_expr = tuple(s.expr for s in sample)

    # Create the histogramdd expression: one slab per input chunk.
    hist_expr = HistogramDDBinned(
        sample_expr,
        tuple(edges),
        range,
        weights.expr if weights is not None else None,
        rectangular_sample,
        n_chunks,
        D,
    )
    mapped = new_collection(hist_expr)

    # Sum over chunks to get the final histogram.
    n = mapped.sum(axis=0)

    # Density: divide by each bin's volume and by the total count.
    if density:
        width_divider = np.ones(n.shape)
        for i in _range(D):
            # Broadcast this dimension's bin widths along axis i.
            shape = np.ones(D, int)
            shape[i] = width_divider.shape[i]
            width_divider *= np.diff(edges[i]).reshape(shape)
        width_divider = asarray(width_divider, chunks=n.chunks)
        return n / width_divider / n.sum(), [asarray(e) for e in edges]

    return n, [asarray(e) for e in edges]
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Blocked variant of :func:`numpy.histogram2d`.

    Delegates to ``histogramdd`` with ``(x, y)`` as the sample and
    forwards every other argument unchanged.

    Parameters
    ----------
    x : dask.array.Array
        x-coordinates of the points.
    y : dask.array.Array
        y-coordinates of the points.
    bins : sequence of arrays, int, or sequence of ints
        The bin specification.
    range : tuple of pairs, optional
        The bin edges ((xmin, xmax), (ymin, ymax)).
    normed : bool, optional
        Alias for density.
    weights : dask.array.Array, optional
        Weights for the histogram.
    density : bool, optional
        If True, normalize the histogram.

    Returns
    -------
    hist : dask Array
        The histogram values.
    xedges : array
        The x bin edges.
    yedges : array
        The y bin edges.
    """
    options = {
        "bins": bins,
        "range": range,
        "normed": normed,
        "weights": weights,
        "density": density,
    }
    result = histogramdd((x, y), **options)
    hist, all_edges = result
    x_edges = all_edges[0]
    y_edges = all_edges[1]
    return hist, x_edges, y_edges
|