dask-array 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. dask_array/__init__.py +228 -0
  2. dask_array/_backends.py +76 -0
  3. dask_array/_backends_array.py +99 -0
  4. dask_array/_blockwise.py +1410 -0
  5. dask_array/_broadcast.py +272 -0
  6. dask_array/_chunk.py +445 -0
  7. dask_array/_chunk_types.py +54 -0
  8. dask_array/_collection.py +1644 -0
  9. dask_array/_concatenate.py +331 -0
  10. dask_array/_core_utils.py +1365 -0
  11. dask_array/_dispatch.py +141 -0
  12. dask_array/_einsum.py +277 -0
  13. dask_array/_expr.py +544 -0
  14. dask_array/_expr_flow.py +586 -0
  15. dask_array/_gufunc.py +805 -0
  16. dask_array/_histogram.py +617 -0
  17. dask_array/_map_blocks.py +652 -0
  18. dask_array/_new_collection.py +10 -0
  19. dask_array/_numpy_compat.py +135 -0
  20. dask_array/_overlap.py +1159 -0
  21. dask_array/_rechunk.py +1050 -0
  22. dask_array/_reshape.py +710 -0
  23. dask_array/_routines.py +102 -0
  24. dask_array/_shuffle.py +448 -0
  25. dask_array/_stack.py +264 -0
  26. dask_array/_svg.py +291 -0
  27. dask_array/_templates.py +29 -0
  28. dask_array/_test_utils.py +257 -0
  29. dask_array/_ufunc.py +385 -0
  30. dask_array/_utils.py +349 -0
  31. dask_array/_visualize.py +223 -0
  32. dask_array/_xarray.py +337 -0
  33. dask_array/core/__init__.py +34 -0
  34. dask_array/core/_blockwise_funcs.py +312 -0
  35. dask_array/core/_conversion.py +422 -0
  36. dask_array/core/_from_graph.py +97 -0
  37. dask_array/creation/__init__.py +71 -0
  38. dask_array/creation/_arange.py +121 -0
  39. dask_array/creation/_diag.py +116 -0
  40. dask_array/creation/_diagonal.py +241 -0
  41. dask_array/creation/_eye.py +103 -0
  42. dask_array/creation/_linspace.py +102 -0
  43. dask_array/creation/_mesh.py +134 -0
  44. dask_array/creation/_ones_zeros.py +454 -0
  45. dask_array/creation/_pad.py +270 -0
  46. dask_array/creation/_repeat.py +55 -0
  47. dask_array/creation/_tile.py +36 -0
  48. dask_array/creation/_tri.py +28 -0
  49. dask_array/creation/_utils.py +296 -0
  50. dask_array/fft.py +320 -0
  51. dask_array/io/__init__.py +39 -0
  52. dask_array/io/_base.py +10 -0
  53. dask_array/io/_from_array.py +257 -0
  54. dask_array/io/_from_delayed.py +95 -0
  55. dask_array/io/_from_graph.py +54 -0
  56. dask_array/io/_from_npy_stack.py +67 -0
  57. dask_array/io/_store.py +336 -0
  58. dask_array/io/_tiledb.py +159 -0
  59. dask_array/io/_to_npy_stack.py +65 -0
  60. dask_array/io/_zarr.py +449 -0
  61. dask_array/linalg/__init__.py +39 -0
  62. dask_array/linalg/_cholesky.py +234 -0
  63. dask_array/linalg/_lu.py +300 -0
  64. dask_array/linalg/_norm.py +94 -0
  65. dask_array/linalg/_qr.py +601 -0
  66. dask_array/linalg/_solve.py +349 -0
  67. dask_array/linalg/_svd.py +394 -0
  68. dask_array/linalg/_tensordot.py +334 -0
  69. dask_array/linalg/_utils.py +74 -0
  70. dask_array/manipulation/__init__.py +45 -0
  71. dask_array/manipulation/_expand.py +321 -0
  72. dask_array/manipulation/_flip.py +92 -0
  73. dask_array/manipulation/_roll.py +78 -0
  74. dask_array/manipulation/_transpose.py +309 -0
  75. dask_array/random/__init__.py +125 -0
  76. dask_array/random/_choice.py +181 -0
  77. dask_array/random/_expr.py +256 -0
  78. dask_array/random/_generator.py +441 -0
  79. dask_array/random/_random_state.py +259 -0
  80. dask_array/random/_utils.py +84 -0
  81. dask_array/reductions/__init__.py +84 -0
  82. dask_array/reductions/_arg_reduction.py +130 -0
  83. dask_array/reductions/_common.py +1082 -0
  84. dask_array/reductions/_cumulative.py +522 -0
  85. dask_array/reductions/_percentile.py +261 -0
  86. dask_array/reductions/_reduction.py +725 -0
  87. dask_array/reductions/_trace.py +56 -0
  88. dask_array/routines/__init__.py +133 -0
  89. dask_array/routines/_apply.py +84 -0
  90. dask_array/routines/_bincount.py +112 -0
  91. dask_array/routines/_broadcast.py +111 -0
  92. dask_array/routines/_coarsen.py +115 -0
  93. dask_array/routines/_diff.py +79 -0
  94. dask_array/routines/_gradient.py +158 -0
  95. dask_array/routines/_indexing.py +65 -0
  96. dask_array/routines/_insert_delete.py +132 -0
  97. dask_array/routines/_misc.py +122 -0
  98. dask_array/routines/_nonzero.py +72 -0
  99. dask_array/routines/_search.py +123 -0
  100. dask_array/routines/_select.py +113 -0
  101. dask_array/routines/_statistics.py +171 -0
  102. dask_array/routines/_topk.py +82 -0
  103. dask_array/routines/_triangular.py +74 -0
  104. dask_array/routines/_unique.py +232 -0
  105. dask_array/routines/_where.py +62 -0
  106. dask_array/slicing/__init__.py +67 -0
  107. dask_array/slicing/_basic.py +550 -0
  108. dask_array/slicing/_blocks.py +138 -0
  109. dask_array/slicing/_bool_index.py +145 -0
  110. dask_array/slicing/_setitem.py +329 -0
  111. dask_array/slicing/_squeeze.py +101 -0
  112. dask_array/slicing/_utils.py +1133 -0
  113. dask_array/slicing/_vindex.py +282 -0
  114. dask_array/stacking/__init__.py +15 -0
  115. dask_array/stacking/_block.py +83 -0
  116. dask_array/stacking/_simple.py +58 -0
  117. dask_array/templates/array.html.j2 +48 -0
  118. dask_array/tests/__init__.py +0 -0
  119. dask_array/tests/conftest.py +22 -0
  120. dask_array/tests/test_api.py +40 -0
  121. dask_array/tests/test_binary_op_chunks.py +107 -0
  122. dask_array/tests/test_coarse_slice_through_blockwise.py +362 -0
  123. dask_array/tests/test_collection.py +799 -0
  124. dask_array/tests/test_creation.py +1102 -0
  125. dask_array/tests/test_expr_flow.py +143 -0
  126. dask_array/tests/test_linalg.py +1130 -0
  127. dask_array/tests/test_map_blocks_multi_output.py +104 -0
  128. dask_array/tests/test_rechunk_pushdown.py +214 -0
  129. dask_array/tests/test_reductions.py +1091 -0
  130. dask_array/tests/test_routines.py +2853 -0
  131. dask_array/tests/test_shuffle_chunks.py +67 -0
  132. dask_array/tests/test_slice_pushdown.py +968 -0
  133. dask_array/tests/test_slice_through_blockwise.py +678 -0
  134. dask_array/tests/test_slice_through_overlap.py +366 -0
  135. dask_array/tests/test_slice_through_reshape.py +272 -0
  136. dask_array/tests/test_slicing.py +839 -0
  137. dask_array/tests/test_transpose_slice_pushdown.py +208 -0
  138. dask_array/tests/test_visualize.py +94 -0
  139. dask_array/tests/test_xarray.py +193 -0
  140. dask_array-0.1.0.dist-info/METADATA +48 -0
  141. dask_array-0.1.0.dist-info/RECORD +144 -0
  142. dask_array-0.1.0.dist-info/WHEEL +4 -0
  143. dask_array-0.1.0.dist-info/entry_points.txt +2 -0
  144. dask_array-0.1.0.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,617 @@
1
+ from __future__ import annotations
2
+
3
+ from builtins import range as _range
4
+ from functools import cached_property, reduce
5
+ from operator import mul
6
+
7
+ import numpy as np
8
+
9
+ from dask._task_spec import Task, TaskRef
10
+ from dask_array._collection import Array, asarray, new_collection
11
+ from dask_array._expr import ArrayExpr
12
+ from dask.base import is_dask_collection
13
+ from dask.delayed import Delayed
14
+
15
+
16
def _block_hist(x, bins_range, weights=None):
    """Histogram one block and return the counts as a single-row 2D array.

    ``bins_range`` is a two-item sequence ``(bins, range)`` whose items are
    forwarded to :func:`numpy.histogram`.  The bin edges NumPy returns are
    discarded; a leading axis of length one is prepended to the counts.
    """
    bin_spec, value_range = bins_range
    counts, _edges = np.histogram(x, bins=bin_spec, range=value_range, weights=weights)
    return counts[np.newaxis]
20
+
21
+
22
def _linspace_from_range(bins_range):
    """Build evenly spaced bin edges from ``(num_bins, (start, stop))``.

    Returns ``num_bins + 1`` edges running from ``start`` to ``stop``
    inclusive.  Meant to be called at compute time, once the values are
    concrete.
    """
    count, limits = bins_range
    lower, upper = limits
    n_edges = int(count) + 1
    return np.linspace(lower, upper, num=n_edges)
26
+
27
+
28
def _flatten_keys(arr):
    """Yield the block keys ``(arr._name, *block_index)`` of ``arr`` in C order."""
    yield from ((arr._name, *block_index) for block_index in np.ndindex(arr.numblocks))
32
+
33
+
34
def _to_ref(val):
    """Return a ``TaskRef`` to the key ``(val._name,)`` for an ``ArrayExpr``.

    Any other value is passed through unchanged.
    """
    return TaskRef((val._name,)) if isinstance(val, ArrayExpr) else val
39
+
40
+
41
class HistogramBinned(ArrayExpr):
    """Per-chunk histogram expression.

    The result is a 2D array with one row per input chunk and one column per
    bin; each row holds the histogram of a single chunk.  Summing over axis 0
    (``.sum(axis=0)``) yields the histogram of the whole input.

    ``bins`` and the range endpoints may be concrete values or ``ArrayExpr``
    instances that are only resolved at compute time.
    """

    _parameters = ["array", "bins", "range_start", "range_stop", "weights", "nbins"]
    _defaults = dict(range_start=None, range_stop=None, weights=None, nbins=None)

    @cached_property
    def _meta(self):
        # Weighted histograms take the dtype of the weights; unweighted ones
        # use whatever dtype numpy.histogram produces for its counts.
        if self.weights is not None:
            dtype = self.weights._meta.dtype
        else:
            dtype = np.histogram([])[0].dtype
        return np.empty((0, 0), dtype=dtype)

    @cached_property
    def chunks(self):
        # One row of height 1 per block of the input array.
        rows = (1,) * reduce(mul, self.array.numblocks, 1)

        nbins = self.nbins
        if nbins is None and not isinstance(self.bins, ArrayExpr):
            nbins = len(self.bins) - 1

        # The bin count is unknown (NaN) when it is only available lazily.
        unknown = nbins is None or isinstance(nbins, ArrayExpr)
        width = np.nan if unknown else int(nbins)
        return (rows, (width,))

    @cached_property
    def _name(self):
        return f"histogram-{self.deterministic_token}"

    def _layer(self) -> dict:
        from dask._task_spec import List as TaskList

        # Lazy bins are referenced through the block at index 0 along every
        # dimension of the bins array; concrete bins are embedded directly.
        if isinstance(self.bins, ArrayExpr):
            bins_arg = TaskRef((self.bins._name, *((0,) * self.bins.ndim)))
        else:
            bins_arg = self.bins

        limits = TaskList(_to_ref(self.range_start), _to_ref(self.range_stop))
        bins_range = TaskList(bins_arg, limits)

        block_keys = list(_flatten_keys(self.array))
        if self.weights is None:
            # No extra positional argument: _block_hist falls back to weights=None.
            extra_args = [()] * len(block_keys)
        else:
            extra_args = [(TaskRef(wk),) for wk in _flatten_keys(self.weights)]

        graph = {}
        for row, (block_key, tail) in enumerate(zip(block_keys, extra_args)):
            out_key = (self._name, row, 0)
            graph[out_key] = Task(out_key, _block_hist, TaskRef(block_key), bins_range, *tail)
        return graph

    @property
    def _dependencies(self):
        optional = (self.bins, self.range_start, self.range_stop, self.weights)
        return [self.array] + [op for op in optional if isinstance(op, ArrayExpr)]
126
+
127
+
128
class LinspaceDelayed(ArrayExpr):
    """Bin-edge expression whose bin count and/or endpoints may be lazy.

    Produces a single-block 1D array of ``num_bins + 1`` evenly spaced edges
    between ``range_start`` and ``range_stop``.
    """

    _parameters = ["num_bins", "range_start", "range_stop"]

    @cached_property
    def _meta(self):
        return np.linspace(0, 1, 2)

    @cached_property
    def chunks(self):
        # The length is unknown (NaN) while the bin count is still an expression.
        lazy_count = isinstance(self.num_bins, ArrayExpr)
        n_edges = np.nan if lazy_count else int(self.num_bins) + 1
        return ((n_edges,),)

    @cached_property
    def _name(self):
        return f"linspace-delayed-{self.deterministic_token}"

    def _layer(self) -> dict:
        from dask._task_spec import List as TaskList

        limits = TaskList(_to_ref(self.range_start), _to_ref(self.range_stop))
        spec = TaskList(_to_ref(self.num_bins), limits)
        key = (self._name, 0)
        return {key: Task(key, _linspace_from_range, spec)}

    @property
    def _dependencies(self):
        candidates = (self.range_start, self.range_stop, self.num_bins)
        return list(filter(lambda operand: isinstance(operand, ArrayExpr), candidates))
159
+
160
+
161
def histogram(a, bins=None, range=None, normed=False, weights=None, density=None):
    """
    Blocked variant of :func:`numpy.histogram`.

    Parameters
    ----------
    a : dask.array.Array
        Input data. The histogram is computed over the flattened array.
    bins : int or sequence of scalars, optional
        Either the bin edges, or a number of bins (in which case ``range``
        must be given as well).
    range : (float, float), optional
        The lower and upper range of the bins.
    normed : bool, optional
        Deprecated; any value other than ``False`` raises. Use ``density``.
    weights : dask.array.Array, optional
        Weights for the histogram, chunked exactly like ``a``.
    density : bool, optional
        If True, normalize the histogram.

    Returns
    -------
    hist : dask Array
        The histogram values.
    bin_edges : dask Array or numpy.ndarray
        The bin edges (a NumPy array when bins and range are all concrete).

    Raises
    ------
    ValueError
        If neither bin edges nor a (bin count, range) pair is supplied, if
        ``weights`` is chunked differently from ``a``, if ``normed`` is used,
        or if ``range``/``bins`` have the wrong length or dimensionality.
    TypeError
        If ``range`` is not a sequence or array.
    NotImplementedError
        For a scalar dask ``bins`` combined with ``density``, or for bins that
        are a dask collection other than an Array.
    """
    from dask.base import is_dask_collection

    # Decide whether ``bins`` is a bin count (scalar) or a set of bin edges.
    if isinstance(bins, Array):
        bins_is_scalar = bins.ndim == 0
    elif isinstance(bins, Delayed):
        bins_is_scalar = bins._length is None or bins._length == 1
    else:
        bins_is_scalar = np.ndim(bins) == 0

    if bins is None or (bins_is_scalar and range is None):
        raise ValueError(
            "dask.array.histogram requires either specifying "
            "bins as an iterable or specifying both a range and "
            "the number of bins"
        )

    if weights is not None and weights.chunks != a.chunks:
        raise ValueError("Input array and weights must have the same chunked structure")

    if normed is not False:
        raise ValueError(
            "The normed= keyword argument has been deprecated. "
            "Please use density instead. "
            "See the numpy.histogram docstring for more information."
        )

    if density and bins_is_scalar and isinstance(bins, (Array, Delayed)):
        raise NotImplementedError(
            "When `density` is True, `bins` cannot be a scalar Dask object. "
            "It must be a concrete number or a (possibly-delayed) array/sequence of bin edges."
        )

    if range is not None:
        try:
            n_items = len(range)
            if n_items != 2:
                raise ValueError(f"range must be a sequence or array of length 2, but got {n_items} items")
            if isinstance(range, (Array, np.ndarray)) and range.shape != (2,):
                raise ValueError(
                    f"range must be a 1-dimensional array of two items, but got an array of shape {range.shape}"
                )
        except TypeError:
            # Only a missing len()/shape is translated; the ValueErrors above propagate.
            raise TypeError(f"Expected a sequence or array for range, not {range}") from None

    # Work out which inputs are lazy.
    lazy_range = range is not None and any(is_dask_collection(r) for r in range)
    bins_is_array = isinstance(bins, Array)

    if is_dask_collection(bins) and not bins_is_array:
        raise NotImplementedError("Delayed bins (non-Array) not yet supported in array-expr. Use a dask Array instead.")

    # Split the range into endpoints and unwrap dask Arrays to expressions.
    if range is None:
        range_start = range_stop = None
    else:
        range_start, range_stop = range[0], range[1]
    start_expr = range_start.expr if isinstance(range_start, Array) else range_start
    stop_expr = range_stop.expr if isinstance(range_stop, Array) else range_stop

    if not (lazy_range or bins_is_array):
        # Fully concrete bins/range: the edges can be materialised right away.
        if bins_is_scalar:
            bins = np.linspace(range[0], range[1], num=int(bins) + 1)
        else:
            bins = bins if isinstance(bins, np.ndarray) else np.asarray(bins)
            if bins.ndim != 1:
                raise ValueError(f"bins must be a 1-dimensional array or sequence, got shape {bins.shape}")

        per_chunk = new_collection(
            HistogramBinned(
                a.expr,
                bins,
                start_expr,
                stop_expr,
                weights.expr if weights is not None else None,
            )
        )
        # Collapse the per-chunk rows into the final histogram.
        n = per_chunk.sum(axis=0)

        if density:
            widths = asarray(np.diff(bins).astype(float), chunks=n.chunks)
            n = n / widths / n.sum()

        return n, bins

    # Lazy bins and/or range.
    if bins_is_scalar:
        # Bin count plus a lazy range: the edges come from a linspace expression.
        count = bins.expr if isinstance(bins, Array) else int(bins)
        edges_expr = LinspaceDelayed(count, start_expr, stop_expr)
        bins_edges = new_collection(edges_expr)
        hist_bins = edges_expr
    else:
        # Explicit bin edges, converted to a dask Array if necessary.
        if not isinstance(bins, Array):
            bins = asarray(bins)
        if bins.ndim != 1:
            raise ValueError(f"bins must be a 1-dimensional array or sequence, got shape {bins.ndim}D")

        # numpy.histogram needs all edges at once, so use a single chunk.
        single_chunk = bins.rechunk(-1)

        # The bin count is len(bins) - 1, unless the length is unknown.
        count = bins.expr if np.isnan(bins.shape[0]) else int(bins.shape[0]) - 1
        hist_bins = single_chunk.expr
        bins_edges = bins

    per_chunk = new_collection(
        HistogramBinned(
            a.expr,
            hist_bins,
            start_expr,
            stop_expr,
            weights.expr if weights is not None else None,
            count,
        )
    )
    n = per_chunk.sum(axis=0)

    if density:
        widths = (bins_edges[1:] - bins_edges[:-1]).astype(float)
        n = n / widths / n.sum()

    return n, bins_edges
338
+
339
+
340
def _block_histogramdd_rect(sample, bins, range, weights):
    """Run :func:`numpy.histogramdd` on one block of a rectangular sample.

    The counts are returned with an extra leading axis of length one; that
    axis is used to stack the per-chunk results so they can be added
    together later.  The edges NumPy returns are discarded.
    """
    counts, _edges = np.histogramdd(sample, bins=bins, range=range, weights=weights)
    return counts[np.newaxis]
348
+
349
+
350
def _block_histogramdd_multiarg(*args):
    """Run :func:`numpy.histogramdd` on one block of separate coordinate arrays.

    The trailing three positional arguments _must be_ ``(bins, range,
    weights)``; everything before them is treated as the coordinate arrays.
    The counts are returned with an extra leading axis of length one.
    """
    *coords, bin_spec, value_range, block_weights = args
    counts, _edges = np.histogramdd(tuple(coords), bins=bin_spec, range=value_range, weights=block_weights)
    return counts[np.newaxis]
358
+
359
+
360
class HistogramDDBinned(ArrayExpr):
    """Per-chunk multi-dimensional histogram expression.

    The result has shape ``(nchunks, *nbins)``: the leading axis indexes the
    input chunks and the remaining axes are the histogram of that chunk.
    Summing over axis 0 (``.sum(axis=0)``) yields the final histogram.
    """

    _parameters = [
        "sample",
        "bins",
        "range",
        "weights",
        "rectangular_sample",
        "n_chunks",
        "D",
    ]
    _defaults = dict(range=None, weights=None)

    @cached_property
    def _meta(self):
        # Start from the dtype numpy.histogramdd produces; weights override it.
        dtype = np.histogramdd(np.empty((0, self.D)))[0].dtype
        weights = self.weights
        if weights is not None:
            dtype = weights._meta.dtype
        # One zero-length axis for the chunk axis plus one per dimension.
        return np.empty((0,) * (self.D + 1), dtype=dtype)

    @cached_property
    def chunks(self):
        # Each histogram axis is one chunk holding len(edges) - 1 bins.
        per_axis = tuple((len(edges) - 1,) for edges in self.bins)
        return ((1,) * self.n_chunks,) + per_axis

    @cached_property
    def _name(self):
        return f"histogramdd-{self.deterministic_token}"

    def _layer(self) -> dict:
        ndim = self.D
        nblocks = self.n_chunks

        # Block index along every histogram axis.
        trailing = (0,) * ndim

        # One weight key per chunk, or None placeholders without weights.
        if self.weights is None:
            weight_keys = [None] * nblocks
        else:
            weight_keys = list(_flatten_keys(self.weights))

        # The edges are handed to numpy as a list of arrays.
        edges = [np.asarray(b) for b in self.bins]
        graph = {}

        if self.rectangular_sample:
            # A single 2D array: one task per block of the sample.
            pairs = zip(list(_flatten_keys(self.sample)), weight_keys)
            for row, (sample_key, weight_key) in enumerate(pairs):
                out_key = (self._name, row, *trailing)
                graph[out_key] = Task(
                    out_key,
                    _block_histogramdd_rect,
                    TaskRef(sample_key),
                    edges,
                    self.range,
                    TaskRef(weight_key) if weight_key else None,
                )
            return graph

        # A sequence of 1D arrays: combine the i-th block of every coordinate.
        keys_per_dim = [list(_flatten_keys(coord)) for coord in self.sample]
        for row in _range(nblocks):
            out_key = (self._name, row, *trailing)
            coord_refs = [TaskRef(keys_per_dim[dim][row]) for dim in _range(ndim)]
            weight_key = weight_keys[row]
            graph[out_key] = Task(
                out_key,
                _block_histogramdd_multiarg,
                *coord_refs,
                edges,
                self.range,
                TaskRef(weight_key) if weight_key else None,
            )
        return graph

    def dependencies(self):
        deps = [self.sample] if self.rectangular_sample else list(self.sample)
        if self.weights is not None:
            deps.append(self.weights)
        return deps
452
+
453
+
454
def histogramdd(sample, bins, range=None, normed=None, weights=None, density=None):
    """Blocked variant of :func:`numpy.histogramdd`.

    Parameters
    ----------
    sample : dask.array.Array (N, D) or sequence of dask.array.Array
        Multidimensional data to be histogrammed. A single array may only be
        chunked along its first axis; a sequence of coordinate arrays must be
        chunked identically.
    bins : sequence of arrays describing bin edges, int, or sequence of ints
        The bin specification. Integer bin counts require ``range``.
    range : sequence of pairs, optional
        The outer bin edges for each dimension.
    normed : bool, optional
        Alias for density.
    weights : dask.array.Array, optional
        Weights for the histogram.
    density : bool, optional
        If True, normalize the histogram.

    Returns
    -------
    hist : dask Array
        The histogram values.
    edges : list of arrays
        The bin edges.

    Raises
    ------
    TypeError
        If both ``normed`` and ``density`` are given.
    NotImplementedError
        If ``bins`` or ``range`` contain dask collections.
    ValueError
        If the sample, weights, bins or range are structurally incompatible,
        or if the bins cannot be turned into 1-dimensional edge arrays (for
        example integer bin counts without a ``range``).
    """
    # Handle normed/density
    if normed is None:
        if density is None:
            density = False
    elif density is None:
        density = normed
    else:
        raise TypeError("Cannot specify both 'normed' and 'density'")

    # Check for dask collections in bins/range
    dc_bins = is_dask_collection(bins)
    if isinstance(bins, (list, tuple)):
        dc_bins = dc_bins or any(is_dask_collection(b) for b in bins)
    dc_range = any(is_dask_collection(r) for r in range) if range is not None else False
    if dc_bins or dc_range:
        raise NotImplementedError("Passing dask collections to bins=... or range=... is not supported.")

    # Determine sample structure
    if hasattr(sample, "shape"):
        if len(sample.shape) != 2:
            raise ValueError("Single array input to histogramdd should be columnar")
        _, D = sample.shape
        n_chunks = sample.numblocks[0]
        rectangular_sample = True
        if sample.shape[1:] != sample.chunksize[1:]:
            raise ValueError("Input array can only be chunked along the 0th axis.")
    elif isinstance(sample, (tuple, list)):
        rectangular_sample = False
        D = len(sample)
        n_chunks = sample[0].numblocks[0]
        for i in _range(1, D):
            if sample[i].chunks != sample[0].chunks:
                raise ValueError("All coordinate arrays must be chunked identically.")
    else:
        raise ValueError("Incompatible sample. Must be a 2D array or a sequence of 1D arrays.")

    # Validate weights
    if weights is not None:
        if rectangular_sample and weights.chunks[0] != sample.chunks[0]:
            raise ValueError(
                "Input array and weights must have the same shape and chunk structure along the first dimension."
            )
        elif not rectangular_sample and weights.numblocks[0] != n_chunks:
            raise ValueError("Input arrays and weights must have the same shape and chunk structure.")

    # Validate bins
    if isinstance(bins, (list, tuple)):
        if len(bins) != D:
            raise ValueError("The dimension of bins must be equal to the dimension of the sample.")

    # Validate range
    if range is not None:
        if len(range) != D:
            raise ValueError("range argument requires one entry, a min max pair, per dimension.")
        if not all(len(r) == 2 for r in range):
            raise ValueError("range argument should be a sequence of pairs")

    # Convert bins to tuple if single int
    if isinstance(bins, int):
        bins = (bins,) * D

    # Compute edges. The pair structure of ``range`` was already validated
    # above, so it does not need to be re-checked here.
    if all(isinstance(b, int) for b in bins) and range is not None:
        edges = [np.linspace(r[0], r[1], b + 1) for b, r in zip(bins, range)]
    else:
        edges = [np.asarray(b) for b in bins]
        # Integer bin counts without a range (or any other non 1-D entry) end
        # up here as 0-d/N-d arrays; the expression needs len(edges) per
        # dimension, so fail now with a clear message instead of later with
        # "len() of unsized object".
        if any(e.ndim != 1 for e in edges):
            raise ValueError(
                "bins must be given either as integers together with a range= argument "
                "or as 1-dimensional arrays of bin edges, one per dimension."
            )

    # Get sample expression(s)
    if rectangular_sample:
        sample_expr = sample.expr
    else:
        sample_expr = tuple(s.expr for s in sample)

    # Create the histogramdd expression
    hist_expr = HistogramDDBinned(
        sample_expr,
        tuple(edges),
        range,
        weights.expr if weights is not None else None,
        rectangular_sample,
        n_chunks,
        D,
    )
    mapped = new_collection(hist_expr)

    # Sum over chunks to get the final histogram
    n = mapped.sum(axis=0)

    # Handle density normalization: divide every cell by its volume (the
    # product of the bin widths along each dimension) and by the total count.
    if density:
        width_divider = np.ones(n.shape)
        for i in _range(D):
            shape = np.ones(D, int)
            shape[i] = width_divider.shape[i]
            # Reshape the widths so they broadcast along dimension i only.
            width_divider *= np.diff(edges[i]).reshape(shape)
        width_divider = asarray(width_divider, chunks=n.chunks)
        n = n / width_divider / n.sum()

    return n, [asarray(e) for e in edges]
578
+
579
+
580
def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Blocked variant of :func:`numpy.histogram2d`.

    Delegates to :func:`histogramdd` with ``(x, y)`` as the sample.

    Parameters
    ----------
    x : dask.array.Array
        x-coordinates of the points.
    y : dask.array.Array
        y-coordinates of the points.
    bins : sequence of arrays, int, or sequence of ints
        The bin specification.
    range : tuple of pairs, optional
        The bin edges ((xmin, xmax), (ymin, ymax)).
    normed : bool, optional
        Alias for density.
    weights : dask.array.Array, optional
        Weights for the histogram.
    density : bool, optional
        If True, normalize the histogram.

    Returns
    -------
    hist : dask Array
        The histogram values.
    xedges : array
        The x bin edges.
    yedges : array
        The y bin edges.
    """
    options = dict(bins=bins, range=range, normed=normed, weights=weights, density=density)
    counts, edges = histogramdd((x, y), **options)
    xedges = edges[0]
    yedges = edges[1]
    return counts, xedges, yedges