dask-array 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_array/__init__.py +228 -0
- dask_array/_backends.py +76 -0
- dask_array/_backends_array.py +99 -0
- dask_array/_blockwise.py +1410 -0
- dask_array/_broadcast.py +272 -0
- dask_array/_chunk.py +445 -0
- dask_array/_chunk_types.py +54 -0
- dask_array/_collection.py +1644 -0
- dask_array/_concatenate.py +331 -0
- dask_array/_core_utils.py +1365 -0
- dask_array/_dispatch.py +141 -0
- dask_array/_einsum.py +277 -0
- dask_array/_expr.py +544 -0
- dask_array/_expr_flow.py +586 -0
- dask_array/_gufunc.py +805 -0
- dask_array/_histogram.py +617 -0
- dask_array/_map_blocks.py +652 -0
- dask_array/_new_collection.py +10 -0
- dask_array/_numpy_compat.py +135 -0
- dask_array/_overlap.py +1159 -0
- dask_array/_rechunk.py +1050 -0
- dask_array/_reshape.py +710 -0
- dask_array/_routines.py +102 -0
- dask_array/_shuffle.py +448 -0
- dask_array/_stack.py +264 -0
- dask_array/_svg.py +291 -0
- dask_array/_templates.py +29 -0
- dask_array/_test_utils.py +257 -0
- dask_array/_ufunc.py +385 -0
- dask_array/_utils.py +349 -0
- dask_array/_visualize.py +223 -0
- dask_array/_xarray.py +337 -0
- dask_array/core/__init__.py +34 -0
- dask_array/core/_blockwise_funcs.py +312 -0
- dask_array/core/_conversion.py +422 -0
- dask_array/core/_from_graph.py +97 -0
- dask_array/creation/__init__.py +71 -0
- dask_array/creation/_arange.py +121 -0
- dask_array/creation/_diag.py +116 -0
- dask_array/creation/_diagonal.py +241 -0
- dask_array/creation/_eye.py +103 -0
- dask_array/creation/_linspace.py +102 -0
- dask_array/creation/_mesh.py +134 -0
- dask_array/creation/_ones_zeros.py +454 -0
- dask_array/creation/_pad.py +270 -0
- dask_array/creation/_repeat.py +55 -0
- dask_array/creation/_tile.py +36 -0
- dask_array/creation/_tri.py +28 -0
- dask_array/creation/_utils.py +296 -0
- dask_array/fft.py +320 -0
- dask_array/io/__init__.py +39 -0
- dask_array/io/_base.py +10 -0
- dask_array/io/_from_array.py +257 -0
- dask_array/io/_from_delayed.py +95 -0
- dask_array/io/_from_graph.py +54 -0
- dask_array/io/_from_npy_stack.py +67 -0
- dask_array/io/_store.py +336 -0
- dask_array/io/_tiledb.py +159 -0
- dask_array/io/_to_npy_stack.py +65 -0
- dask_array/io/_zarr.py +449 -0
- dask_array/linalg/__init__.py +39 -0
- dask_array/linalg/_cholesky.py +234 -0
- dask_array/linalg/_lu.py +300 -0
- dask_array/linalg/_norm.py +94 -0
- dask_array/linalg/_qr.py +601 -0
- dask_array/linalg/_solve.py +349 -0
- dask_array/linalg/_svd.py +394 -0
- dask_array/linalg/_tensordot.py +334 -0
- dask_array/linalg/_utils.py +74 -0
- dask_array/manipulation/__init__.py +45 -0
- dask_array/manipulation/_expand.py +321 -0
- dask_array/manipulation/_flip.py +92 -0
- dask_array/manipulation/_roll.py +78 -0
- dask_array/manipulation/_transpose.py +309 -0
- dask_array/random/__init__.py +125 -0
- dask_array/random/_choice.py +181 -0
- dask_array/random/_expr.py +256 -0
- dask_array/random/_generator.py +441 -0
- dask_array/random/_random_state.py +259 -0
- dask_array/random/_utils.py +84 -0
- dask_array/reductions/__init__.py +84 -0
- dask_array/reductions/_arg_reduction.py +130 -0
- dask_array/reductions/_common.py +1082 -0
- dask_array/reductions/_cumulative.py +522 -0
- dask_array/reductions/_percentile.py +261 -0
- dask_array/reductions/_reduction.py +725 -0
- dask_array/reductions/_trace.py +56 -0
- dask_array/routines/__init__.py +133 -0
- dask_array/routines/_apply.py +84 -0
- dask_array/routines/_bincount.py +112 -0
- dask_array/routines/_broadcast.py +111 -0
- dask_array/routines/_coarsen.py +115 -0
- dask_array/routines/_diff.py +79 -0
- dask_array/routines/_gradient.py +158 -0
- dask_array/routines/_indexing.py +65 -0
- dask_array/routines/_insert_delete.py +132 -0
- dask_array/routines/_misc.py +122 -0
- dask_array/routines/_nonzero.py +72 -0
- dask_array/routines/_search.py +123 -0
- dask_array/routines/_select.py +113 -0
- dask_array/routines/_statistics.py +171 -0
- dask_array/routines/_topk.py +82 -0
- dask_array/routines/_triangular.py +74 -0
- dask_array/routines/_unique.py +232 -0
- dask_array/routines/_where.py +62 -0
- dask_array/slicing/__init__.py +67 -0
- dask_array/slicing/_basic.py +550 -0
- dask_array/slicing/_blocks.py +138 -0
- dask_array/slicing/_bool_index.py +145 -0
- dask_array/slicing/_setitem.py +329 -0
- dask_array/slicing/_squeeze.py +101 -0
- dask_array/slicing/_utils.py +1133 -0
- dask_array/slicing/_vindex.py +282 -0
- dask_array/stacking/__init__.py +15 -0
- dask_array/stacking/_block.py +83 -0
- dask_array/stacking/_simple.py +58 -0
- dask_array/templates/array.html.j2 +48 -0
- dask_array/tests/__init__.py +0 -0
- dask_array/tests/conftest.py +22 -0
- dask_array/tests/test_api.py +40 -0
- dask_array/tests/test_binary_op_chunks.py +107 -0
- dask_array/tests/test_coarse_slice_through_blockwise.py +362 -0
- dask_array/tests/test_collection.py +799 -0
- dask_array/tests/test_creation.py +1102 -0
- dask_array/tests/test_expr_flow.py +143 -0
- dask_array/tests/test_linalg.py +1130 -0
- dask_array/tests/test_map_blocks_multi_output.py +104 -0
- dask_array/tests/test_rechunk_pushdown.py +214 -0
- dask_array/tests/test_reductions.py +1091 -0
- dask_array/tests/test_routines.py +2853 -0
- dask_array/tests/test_shuffle_chunks.py +67 -0
- dask_array/tests/test_slice_pushdown.py +968 -0
- dask_array/tests/test_slice_through_blockwise.py +678 -0
- dask_array/tests/test_slice_through_overlap.py +366 -0
- dask_array/tests/test_slice_through_reshape.py +272 -0
- dask_array/tests/test_slicing.py +839 -0
- dask_array/tests/test_transpose_slice_pushdown.py +208 -0
- dask_array/tests/test_visualize.py +94 -0
- dask_array/tests/test_xarray.py +193 -0
- dask_array-0.1.0.dist-info/METADATA +48 -0
- dask_array-0.1.0.dist-info/RECORD +144 -0
- dask_array-0.1.0.dist-info/WHEEL +4 -0
- dask_array-0.1.0.dist-info/entry_points.txt +2 -0
- dask_array-0.1.0.dist-info/licenses/LICENSE +29 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""Percentile functions for dask arrays."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import warnings
|
|
6
|
+
from collections.abc import Iterator
|
|
7
|
+
from functools import wraps
|
|
8
|
+
from numbers import Number
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from tlz import merge
|
|
12
|
+
|
|
13
|
+
from dask_array._dispatch import empty_lookup, percentile_lookup
|
|
14
|
+
from dask.base import tokenize
|
|
15
|
+
from dask.utils import derived_from
|
|
16
|
+
|
|
17
|
+
from dask_array.core import from_graph
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@wraps(np.percentile)
def _percentile(a, q, method="linear"):
    # ``wraps`` replaces this function's ``__doc__`` with numpy's, so the
    # contract is described in comments rather than in a docstring.
    #
    # Per-chunk kernel: returns ``(percentile_values, chunk_length)``; an
    # empty chunk yields ``(None, 0)``.
    size = len(a)
    if size == 0:
        return None, size

    # One-shot iterators are materialised so ``q`` can be indexed below.
    if isinstance(q, Iterator):
        q = list(q)

    if a.dtype.name == "category":
        # Categoricals: take percentiles of the integer codes, then map the
        # codes back onto the original categories.
        code_percentiles = np.percentile(a.cat.codes, q, method=method)
        import pandas as pd

        rebuilt = pd.Categorical.from_codes(
            code_percentiles, a.dtype.categories, a.dtype.ordered
        )
        return rebuilt, size

    if type(a.dtype).__name__ == "DatetimeTZDtype":
        import pandas as pd

        if isinstance(a, (pd.Series, pd.Index)):
            a = a.values

    if np.issubdtype(a.dtype, np.datetime64):
        # Datetimes are reduced through their int64 representation and then
        # cast back to the original datetime dtype.
        pandas_like = type(a).__name__ in ("Series", "Index")
        as_int = a.astype("i8") if pandas_like else a.view("i8")
        out = np.percentile(as_int, q, method=method).astype(a.dtype)
        if q[0] == 0:
            # Never report a 0th percentile above the true minimum.
            out[0] = min(out[0], a.min())
        return out, size

    # Non-numeric dtypes cannot be interpolated; fall back to "nearest".
    chosen = method if np.issubdtype(a.dtype, np.number) else "nearest"
    return np.percentile(a, q, method=chosen), size
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _tdigest_chunk(a):
    """Build a ``crick.TDigest`` from the values of a single chunk ``a``."""
    # Imported lazily: ``crick`` is only needed on the t-digest code path.
    from crick import TDigest

    digest = TDigest()
    digest.update(a)
    return digest
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _percentiles_from_tdigest(qs, digests):
    """Merge per-chunk t-digests and evaluate them at percentiles ``qs``.

    ``qs`` is divided by 100 before being handed to ``TDigest.quantile``;
    the result is returned as a NumPy array.
    """
    # Imported lazily: ``crick`` is only needed on the t-digest code path.
    from crick import TDigest

    combined = TDigest()
    combined.merge(*digests)

    fractions = qs / 100.0
    return np.array(combined.quantile(fractions))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def merge_percentiles(finalq, qs, vals, method="lower", Ns=None, raise_on_nan=True):
    """Combine several percentile calculations of different data.

    Each input dataset is described by the percentiles that were evaluated on
    it (``qs[i]``), the values found at those percentiles (``vals[i]``) and a
    count (``Ns[i]``). The per-dataset results are pooled, sorted by value and
    re-read at ``finalq``.

    Parameters
    ----------
    finalq : array_like or iterator
        Percentiles to report for the combined data. It is multiplied by
        ``sum(Ns)`` and compared against cumulative sums of ``diff(q) * N``,
        so it has to be on the same scale as the entries of ``qs``.
    qs : sequence of sequences
        One sequence of percentiles per dataset.
    vals : sequence
        One array of percentile values per dataset. When ``Ns`` is ``None``
        every element must instead be a ``(values, N)`` pair.
    method : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
        How to pick a value when a requested percentile falls between two
        pooled entries. Forced to ``'nearest'`` when the values' dtype is not
        a NumPy number.
    Ns : sequence, optional
        One count per dataset. Datasets whose count is falsy (e.g. ``0``) are
        ignored.
    raise_on_nan : bool, optional
        Controls what happens when every dataset is ignored: raise
        ``ValueError`` (default) or return an all-NaN array.

    Returns
    -------
    The merged percentile values; a ``pandas.Categorical`` when the inputs
    are categorical.

    Raises
    ------
    ValueError
        If no dataset has a truthy count and ``raise_on_nan`` is true, or if
        ``method`` is not one of the supported names.
    """
    from dask_array._utils import array_safe

    # Materialise one-shot iterators so the inputs can be indexed and reused.
    if isinstance(finalq, Iterator):
        finalq = list(finalq)
    finalq = array_safe(finalq, like=finalq)
    qs = [list(q) for q in qs]
    vals = list(vals)
    if Ns is None:
        # ``vals`` holds (values, N) pairs; split them apart.
        vals, Ns = zip(*vals)
    Ns = list(Ns)

    # Keep only the datasets with a truthy count, then transpose back into
    # three parallel tuples.
    L = list(zip(*((q, val, N) for q, val, N in zip(qs, vals, Ns) if N)))
    if not L:
        if raise_on_nan:
            raise ValueError("No non-trivial arrays found")
        # NOTE(review): the "- 2" presumably strips the 0 and 100 sentinels
        # that `percentile` pads onto each q — confirm for other callers.
        return np.full(len(qs[0]) - 2, np.nan)
    qs, vals, Ns = L

    if vals[0].dtype.name == "category":
        # Categoricals: merge on the integer codes, then rebuild a
        # Categorical using the first dataset's categories and ordering.
        result = merge_percentiles(finalq, qs, [v.codes for v in vals], method, Ns, raise_on_nan)
        import pandas as pd

        return pd.Categorical.from_codes(result, vals[0].categories, vals[0].ordered)
    if not np.issubdtype(vals[0].dtype, np.number):
        method = "nearest"

    # NOTE(review): `qs`, `vals` and `Ns` were just unpacked from a single
    # zip, so they always have equal length here and this check cannot fire;
    # mismatched inputs are silently truncated by the zip above.
    if len(vals) != len(qs) or len(Ns) != len(qs):
        raise ValueError("qs, vals, and Ns parameters must be the same length")

    # Turn each dataset's percentiles into weights: the difference between
    # consecutive percentiles (the first one kept as-is) scaled by N.
    total_len = sum(len(q) for q in qs)
    counts = empty_lookup.dispatch(type(finalq))(total_len, dtype=finalq.dtype)
    start = 0
    for q, N in zip(qs, Ns):
        length = len(q)
        count = empty_lookup.dispatch(type(finalq))(length, dtype=finalq.dtype)
        count[1:] = np.diff(array_safe(q, like=q[0]))
        count[0] = q[0]
        count *= N
        counts[start : start + length] = count
        start += length

    # Pool all values and order them (with their weights) by value.
    combined_vals = np.concatenate(vals)
    combined_counts = array_safe(counts, like=combined_vals)
    sort_order = np.argsort(combined_vals)
    combined_vals = np.take(combined_vals, sort_order)
    combined_counts = np.take(combined_counts, sort_order)

    # Cumulative weight reached at each pooled value.
    combined_q = np.cumsum(combined_counts)

    # Requested percentiles expressed on the same cumulative-weight scale.
    finalq = array_safe(finalq, like=combined_vals)
    desired_q = finalq * sum(Ns)

    if method == "linear":
        rv = np.interp(desired_q, combined_q, combined_vals)
    else:
        # Bracket each requested position between two pooled entries.
        left = np.searchsorted(combined_q, desired_q, side="left")
        right = np.searchsorted(combined_q, desired_q, side="right") - 1
        # Clamp in place so `left` never indexes past the last value.
        np.minimum(left, len(combined_vals) - 1, out=left)
        lower = np.minimum(left, right)
        upper = np.maximum(left, right)
        if method == "lower":
            rv = combined_vals[lower]
        elif method == "higher":
            rv = combined_vals[upper]
        elif method == "midpoint":
            rv = 0.5 * (combined_vals[lower] + combined_vals[upper])
        elif method == "nearest":
            lower_residual = np.abs(combined_q[lower] - desired_q)
            upper_residual = np.abs(combined_q[upper] - desired_q)
            mask = lower_residual > upper_residual
            # `index` aliases `lower`; the assignment below edits it in place.
            index = lower
            index[mask] = upper[mask]
            rv = combined_vals[index]
        else:
            raise ValueError("interpolation method can only be 'linear', 'lower', 'higher', 'midpoint', or 'nearest'")
    return rv
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def percentile(a, q, method="linear", internal_method="default", **kwargs):
    """Approximate percentile of an array.

    For 1-D input a task graph is built by hand: each chunk is summarised
    (either by per-chunk percentiles or by a t-digest) and the summaries are
    merged in a single final task. Input with more than one dimension is
    delegated to ``quantile`` after dividing ``q`` by 100.

    Parameters
    ----------
    a : Array
    q : array_like of float
        Percentile or sequence of percentiles to compute, which must be between
        0 and 100 inclusive.
    method : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
        The interpolation method to use when the desired percentile lies
        between two data points.
    internal_method : {'default', 'dask', 'tdigest'}, optional
        What internal method to use. By default will use dask's internal custom
        algorithm (``'dask'``). Only consulted for 1-D input. ``'tdigest'`` is
        used only together with ``method='linear'`` on integer or floating
        data; otherwise the dask algorithm is used.
    **kwargs
        For 1-D input only the deprecated ``interpolation=`` spelling of
        ``method=`` is accepted. For N-D input the keywords are forwarded to
        ``quantile``.

    Returns
    -------
    Array
        For 1-D input, a single-chunk array with one entry per requested
        percentile, reshaped to 0-D when ``q`` was a plain number. For N-D
        input, whatever ``quantile`` returns.

    Raises
    ------
    TypeError
        If unknown keyword arguments are passed for 1-D input.
    ValueError
        If ``internal_method`` is not one of the allowed names.
    NotImplementedError
        If ``a`` is 0-dimensional.
    """
    from dask_array._utils import array_safe, meta_from_array
    from dask_array.reductions import quantile

    if a.ndim == 1:
        allowed_internal_methods = {"default", "dask", "tdigest"}

        if method in allowed_internal_methods:
            warnings.warn(
                "The `method=` argument was renamed to `internal_method=`",
                FutureWarning,
            )
            internal_method = method
            # The value was a backend name, not an interpolation method.
            # Restore the default so it is not forwarded to the per-chunk
            # percentile / merge step (which would reject it) and so that the
            # t-digest branch, which requires "linear", stays reachable. A
            # legacy `interpolation=` keyword still overrides this below.
            method = "linear"

        if "interpolation" in kwargs:
            warnings.warn(
                "The `interpolation=` argument to percentile was renamed to `method= ` ",
                FutureWarning,
            )
            method = kwargs.pop("interpolation")

        if kwargs:
            # Name the offending keywords instead of printing `dict_keys([...])`.
            unexpected = ", ".join(repr(key) for key in kwargs)
            raise TypeError(f"percentile() got an unexpected keyword argument {unexpected}")

        # Remember whether a scalar was requested so the result can be
        # collapsed to 0-D at the end.
        q_is_number = isinstance(q, Number)
        if q_is_number:
            q = [q]
        q = array_safe(q, like=meta_from_array(a))
        token = tokenize(a, q, method)

        dtype = a.dtype
        if np.issubdtype(dtype, np.integer):
            # Integer input: use the dtype that true division produces.
            dtype = (array_safe([], dtype=dtype, like=meta_from_array(a)) / 0.5).dtype
        meta = meta_from_array(a, dtype=dtype)

        if internal_method not in allowed_internal_methods:
            raise ValueError(f"`internal_method=` must be one of {allowed_internal_methods}")

        if (
            internal_method == "tdigest"
            and method == "linear"
            and (np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.integer))
        ):
            from dask.utils import import_required

            import_required("crick", "crick is a required dependency for using the t-digest method.")

            # One t-digest per chunk, merged by a single final task.
            name = "percentile_tdigest_chunk-" + token
            dsk = {(name, i): (_tdigest_chunk, key) for i, key in enumerate(a.__dask_keys__())}

            name2 = "percentile_tdigest-" + token
            dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))}

        else:
            # Pad the requested percentiles with 0 and 100 so every chunk
            # also reports its extremes; the sentinels are created through
            # `empty_lookup` so they match the array type of `q`.
            zero = empty_lookup.dispatch(type(q))(1, dtype=q.dtype)
            zero[:] = 0

            hundred = empty_lookup.dispatch(type(q))(1, dtype=q.dtype)
            hundred[:] = 100

            calc_q = np.concatenate((zero, q, hundred))
            name = "percentile_chunk-" + token
            dsk = {(name, i): (percentile_lookup, key, calc_q, method) for i, key in enumerate(a.__dask_keys__())}

            name2 = "percentile-" + token
            dsk2 = {
                (name2, 0): (
                    merge_percentiles,
                    q,
                    [calc_q] * len(a.chunks[0]),
                    sorted(dsk),
                    method,
                )
            }
        dsk = merge(dsk, dsk2)
        # Merge the dependency graph with our new tasks
        full_dsk = dict(a.__dask_graph__())
        full_dsk.update(dsk)
        arr = from_graph(full_dsk, meta, ((len(q),),), [(name2, 0)], name2)
        return arr.reshape(()) if q_is_number else arr

    elif a.ndim > 1:
        # Convert percent to a fraction, dividing by a same-typed 100 for
        # floating input.
        q = np.true_divide(q, a.dtype.type(100) if a.dtype.kind == "f" else 100)
        return quantile(a, q, method=method, **kwargs)
    else:
        raise NotImplementedError("support for arrays of ndim 0 is not implemented.")
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
@derived_from(np)
def nanpercentile(a, q, **kwargs):
    # Thin wrapper: rescale percent to a fraction and defer to ``nanquantile``.
    from dask_array.reductions import nanquantile

    # Divide by a same-typed 100 for floating input, a plain 100 otherwise.
    if a.dtype.kind == "f":
        divisor = a.dtype.type(100)
    else:
        divisor = 100
    fractions = np.true_divide(q, divisor)

    return nanquantile(a, fractions, **kwargs)
|