PyPI - dask-array - Versions diffs - 0.1.0__py3-none-any.whl - Mend

dask-array 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (144) hide show

dask_array/__init__.py +228 -0
dask_array/_backends.py +76 -0
dask_array/_backends_array.py +99 -0
dask_array/_blockwise.py +1410 -0
dask_array/_broadcast.py +272 -0
dask_array/_chunk.py +445 -0
dask_array/_chunk_types.py +54 -0
dask_array/_collection.py +1644 -0
dask_array/_concatenate.py +331 -0
dask_array/_core_utils.py +1365 -0
dask_array/_dispatch.py +141 -0
dask_array/_einsum.py +277 -0
dask_array/_expr.py +544 -0
dask_array/_expr_flow.py +586 -0
dask_array/_gufunc.py +805 -0
dask_array/_histogram.py +617 -0
dask_array/_map_blocks.py +652 -0
dask_array/_new_collection.py +10 -0
dask_array/_numpy_compat.py +135 -0
dask_array/_overlap.py +1159 -0
dask_array/_rechunk.py +1050 -0
dask_array/_reshape.py +710 -0
dask_array/_routines.py +102 -0
dask_array/_shuffle.py +448 -0
dask_array/_stack.py +264 -0
dask_array/_svg.py +291 -0
dask_array/_templates.py +29 -0
dask_array/_test_utils.py +257 -0
dask_array/_ufunc.py +385 -0
dask_array/_utils.py +349 -0
dask_array/_visualize.py +223 -0
dask_array/_xarray.py +337 -0
dask_array/core/__init__.py +34 -0
dask_array/core/_blockwise_funcs.py +312 -0
dask_array/core/_conversion.py +422 -0
dask_array/core/_from_graph.py +97 -0
dask_array/creation/__init__.py +71 -0
dask_array/creation/_arange.py +121 -0
dask_array/creation/_diag.py +116 -0
dask_array/creation/_diagonal.py +241 -0
dask_array/creation/_eye.py +103 -0
dask_array/creation/_linspace.py +102 -0
dask_array/creation/_mesh.py +134 -0
dask_array/creation/_ones_zeros.py +454 -0
dask_array/creation/_pad.py +270 -0
dask_array/creation/_repeat.py +55 -0
dask_array/creation/_tile.py +36 -0
dask_array/creation/_tri.py +28 -0
dask_array/creation/_utils.py +296 -0
dask_array/fft.py +320 -0
dask_array/io/__init__.py +39 -0
dask_array/io/_base.py +10 -0
dask_array/io/_from_array.py +257 -0
dask_array/io/_from_delayed.py +95 -0
dask_array/io/_from_graph.py +54 -0
dask_array/io/_from_npy_stack.py +67 -0
dask_array/io/_store.py +336 -0
dask_array/io/_tiledb.py +159 -0
dask_array/io/_to_npy_stack.py +65 -0
dask_array/io/_zarr.py +449 -0
dask_array/linalg/__init__.py +39 -0
dask_array/linalg/_cholesky.py +234 -0
dask_array/linalg/_lu.py +300 -0
dask_array/linalg/_norm.py +94 -0
dask_array/linalg/_qr.py +601 -0
dask_array/linalg/_solve.py +349 -0
dask_array/linalg/_svd.py +394 -0
dask_array/linalg/_tensordot.py +334 -0
dask_array/linalg/_utils.py +74 -0
dask_array/manipulation/__init__.py +45 -0
dask_array/manipulation/_expand.py +321 -0
dask_array/manipulation/_flip.py +92 -0
dask_array/manipulation/_roll.py +78 -0
dask_array/manipulation/_transpose.py +309 -0
dask_array/random/__init__.py +125 -0
dask_array/random/_choice.py +181 -0
dask_array/random/_expr.py +256 -0
dask_array/random/_generator.py +441 -0
dask_array/random/_random_state.py +259 -0
dask_array/random/_utils.py +84 -0
dask_array/reductions/__init__.py +84 -0
dask_array/reductions/_arg_reduction.py +130 -0
dask_array/reductions/_common.py +1082 -0
dask_array/reductions/_cumulative.py +522 -0
dask_array/reductions/_percentile.py +261 -0
dask_array/reductions/_reduction.py +725 -0
dask_array/reductions/_trace.py +56 -0
dask_array/routines/__init__.py +133 -0
dask_array/routines/_apply.py +84 -0
dask_array/routines/_bincount.py +112 -0
dask_array/routines/_broadcast.py +111 -0
dask_array/routines/_coarsen.py +115 -0
dask_array/routines/_diff.py +79 -0
dask_array/routines/_gradient.py +158 -0
dask_array/routines/_indexing.py +65 -0
dask_array/routines/_insert_delete.py +132 -0
dask_array/routines/_misc.py +122 -0
dask_array/routines/_nonzero.py +72 -0
dask_array/routines/_search.py +123 -0
dask_array/routines/_select.py +113 -0
dask_array/routines/_statistics.py +171 -0
dask_array/routines/_topk.py +82 -0
dask_array/routines/_triangular.py +74 -0
dask_array/routines/_unique.py +232 -0
dask_array/routines/_where.py +62 -0
dask_array/slicing/__init__.py +67 -0
dask_array/slicing/_basic.py +550 -0
dask_array/slicing/_blocks.py +138 -0
dask_array/slicing/_bool_index.py +145 -0
dask_array/slicing/_setitem.py +329 -0
dask_array/slicing/_squeeze.py +101 -0
dask_array/slicing/_utils.py +1133 -0
dask_array/slicing/_vindex.py +282 -0
dask_array/stacking/__init__.py +15 -0
dask_array/stacking/_block.py +83 -0
dask_array/stacking/_simple.py +58 -0
dask_array/templates/array.html.j2 +48 -0
dask_array/tests/__init__.py +0 -0
dask_array/tests/conftest.py +22 -0
dask_array/tests/test_api.py +40 -0
dask_array/tests/test_binary_op_chunks.py +107 -0
dask_array/tests/test_coarse_slice_through_blockwise.py +362 -0
dask_array/tests/test_collection.py +799 -0
dask_array/tests/test_creation.py +1102 -0
dask_array/tests/test_expr_flow.py +143 -0
dask_array/tests/test_linalg.py +1130 -0
dask_array/tests/test_map_blocks_multi_output.py +104 -0
dask_array/tests/test_rechunk_pushdown.py +214 -0
dask_array/tests/test_reductions.py +1091 -0
dask_array/tests/test_routines.py +2853 -0
dask_array/tests/test_shuffle_chunks.py +67 -0
dask_array/tests/test_slice_pushdown.py +968 -0
dask_array/tests/test_slice_through_blockwise.py +678 -0
dask_array/tests/test_slice_through_overlap.py +366 -0
dask_array/tests/test_slice_through_reshape.py +272 -0
dask_array/tests/test_slicing.py +839 -0
dask_array/tests/test_transpose_slice_pushdown.py +208 -0
dask_array/tests/test_visualize.py +94 -0
dask_array/tests/test_xarray.py +193 -0
dask_array-0.1.0.dist-info/METADATA +48 -0
dask_array-0.1.0.dist-info/RECORD +144 -0
dask_array-0.1.0.dist-info/WHEEL +4 -0
dask_array-0.1.0.dist-info/entry_points.txt +2 -0
dask_array-0.1.0.dist-info/licenses/LICENSE +29 -0

dask_array/linalg/_svd.py ADDED Viewed

@@ -0,0 +1,394 @@
+"""SVD decomposition for array-expr."""
+from __future__ import annotations
+import functools
+import operator
+import numpy as np
+from dask_array._new_collection import new_collection
+from dask._task_spec import Task, TaskRef
+from dask_array._expr import ArrayExpr
+from dask_array.random import RandomState, default_rng
+from dask_array._utils import meta_from_array, svd_flip
+from dask.base import wait
+from dask.utils import derived_from
+class InCoreSVD(ArrayExpr):
+    """In-core SVD decomposition."""
+    _parameters = ["r"]
+    @functools.cached_property
+    def _meta(self):
+        uu, ss, vvh = np.linalg.svd(np.ones(shape=(1, 1), dtype=self.r.dtype))
+        return (
+            meta_from_array(self.r._meta, ndim=2, dtype=uu.dtype),
+            meta_from_array(self.r._meta, ndim=1, dtype=ss.dtype),
+            meta_from_array(self.r._meta, ndim=2, dtype=vvh.dtype),
+        )
+    @functools.cached_property
+    def chunks(self):
+        return self.r.chunks
+    @functools.cached_property
+    def _name(self):
+        return f"svd-core-{self.deterministic_token}"
+    def _layer(self):
+        out_key = (self._name, 0, 0)
+        in_key = (self.r._name, 0, 0)
+        dsk = {out_key: Task(out_key, np.linalg.svd, TaskRef(in_key))}
+        return dsk
+class InCoreSVDU(ArrayExpr):
+    """Extract U from in-core SVD."""
+    _parameters = ["incore_svd"]
+    @functools.cached_property
+    def _meta(self):
+        return self.incore_svd._meta[0]
+    @functools.cached_property
+    def chunks(self):
+        m = self.incore_svd.r.shape[0]
+        return ((m,), (m,))
+    @functools.cached_property
+    def _name(self):
+        return f"svd-u-{self.deterministic_token}"
+    def _layer(self):
+        out_key = (self._name, 0, 0)
+        in_key = (self.incore_svd._name, 0, 0)
+        dsk = {out_key: Task(out_key, operator.getitem, TaskRef(in_key), 0)}
+        return dsk
+class InCoreSVDS(ArrayExpr):
+    """Extract S from in-core SVD."""
+    _parameters = ["incore_svd"]
+    @functools.cached_property
+    def _meta(self):
+        return self.incore_svd._meta[1]
+    @functools.cached_property
+    def chunks(self):
+        m = self.incore_svd.r.shape[0]
+        n = self.incore_svd.r.shape[1]
+        k = min(m, n)
+        return ((k,),)
+    @functools.cached_property
+    def _name(self):
+        return f"svd-s-{self.deterministic_token}"
+    def _layer(self):
+        out_key = (self._name, 0)
+        in_key = (self.incore_svd._name, 0, 0)
+        dsk = {out_key: Task(out_key, operator.getitem, TaskRef(in_key), 1)}
+        return dsk
+class InCoreSVDVh(ArrayExpr):
+    """Extract Vh from in-core SVD."""
+    _parameters = ["incore_svd"]
+    @functools.cached_property
+    def _meta(self):
+        return self.incore_svd._meta[2]
+    @functools.cached_property
+    def chunks(self):
+        n = self.incore_svd.r.shape[1]
+        return ((n,), (n,))
+    @functools.cached_property
+    def _name(self):
+        return f"svd-vh-{self.deterministic_token}"
+    def _layer(self):
+        out_key = (self._name, 0, 0)
+        in_key = (self.incore_svd._name, 0, 0)
+        dsk = {out_key: Task(out_key, operator.getitem, TaskRef(in_key), 2)}
+        return dsk
+class BlockMatMul(ArrayExpr):
+    """Block-wise matrix multiplication: Q @ U_r.
+    Q has multiple row blocks, U_r is a single block.
+    Result has same row blocks as Q.
+    """
+    _parameters = ["q", "u"]
+    @functools.cached_property
+    def _meta(self):
+        return self.q._meta
+    @functools.cached_property
+    def chunks(self):
+        return (self.q.chunks[0], self.u.chunks[1])
+    @functools.cached_property
+    def _name(self):
+        return f"block-matmul-{self.deterministic_token}"
+    def _layer(self):
+        dsk = {}
+        numblocks = len(self.q.chunks[0])
+        u_key = (self.u._name, 0, 0)
+        for i in range(numblocks):
+            out_key = (self._name, i, 0)
+            q_key = (self.q._name, i, 0)
+            dsk[out_key] = Task(out_key, np.dot, TaskRef(q_key), TaskRef(u_key))
+        return dsk
+def _tsqr_svd(q_expr, r_expr, data_expr):
+    """Compute SVD from TSQR factorization."""
+    svd_r = InCoreSVD(r_expr)
+    u_r = InCoreSVDU(svd_r)
+    s = InCoreSVDS(svd_r)
+    vh = InCoreSVDVh(svd_r)
+    u_final = BlockMatMul(q_expr, u_r)
+    return new_collection(u_final), new_collection(s), new_collection(vh)
+@derived_from(np.linalg)
+def svd(a, full_matrices=False, compute_uv=True):
+    """Singular Value Decomposition.
+    Parameters
+    ----------
+    a : Array
+        Input array
+    full_matrices : bool
+        Full matrices are not supported when ``compute_uv=True``. The default
+        returns reduced factors with ``k = min(a.shape)``.
+    compute_uv : bool
+        If True, compute U and Vh in addition to S
+    Returns
+    -------
+    u, s, vh : Array, Array, Array
+        SVD factors when ``compute_uv=True``
+    s : Array
+        Singular values when ``compute_uv=False``
+    """
+    from dask_array.core import asanyarray
+    from dask_array.linalg._qr import tsqr
+    if full_matrices and compute_uv:
+        raise NotImplementedError("full_matrices=True is not supported")
+    a = asanyarray(a)
+    if a.ndim != 2:
+        raise ValueError("Array must be 2D")
+    nr, nc = len(a.chunks[0]), len(a.chunks[1])
+    if nr > 1 and nc > 1:
+        raise NotImplementedError(
+            "Array must be chunked in one dimension only. "
+            "This function (svd) only supports tall-and-skinny or short-and-fat "
+            "matrices (see da.linalg.svd_compressed for SVD on fully chunked arrays).\n"
+            f"Input shape: {a.shape}\n"
+            f"Input numblocks: {(nr, nc)}\n"
+        )
+    if nr >= nc:
+        u, s, v = tsqr(a, compute_svd=True)
+        if a.shape[0] < a.shape[1]:
+            k = min(a.shape)
+            u, v = u[:, :k], v[:k, :]
+    else:
+        vt, s, ut = tsqr(a.T, compute_svd=True)
+        u, v = ut.T, vt.T
+        if a.shape[0] > a.shape[1]:
+            k = min(a.shape)
+            u, v = u[:, :k], v[:k, :]
+    if not compute_uv:
+        return s
+    return u, s, v
+def compression_level(n, q, n_oversamples=10, min_subspace_size=20):
+    """Compression level to use in svd_compressed.
+    Given the size ``n`` of a space, compress that to one of size
+    ``q`` plus n_oversamples.
+    Parameters
+    ----------
+    n: int
+        Column/row dimension of original matrix
+    q: int
+        Size of the desired subspace
+    n_oversamples: int, default=10
+        Number of oversamples used for generating the sampling matrix.
+    min_subspace_size : int, default=20
+        Minimum subspace size.
+    Returns
+    -------
+    int
+        Compression level
+    """
+    return min(max(min_subspace_size, q + n_oversamples), n)
+def compression_matrix(
+    data,
+    q,
+    iterator="power",
+    n_power_iter=0,
+    n_oversamples=10,
+    seed=None,
+    compute=False,
+):
+    """Randomly sample matrix to find most active subspace.
+    Parameters
+    ----------
+    data: Array
+    q: int
+        Size of the desired subspace
+    iterator: {'power', 'QR'}, default='power'
+        Define the technique used for iterations
+    n_power_iter: int
+        Number of power iterations
+    n_oversamples: int, default=10
+        Number of oversamples
+    compute : bool
+        Whether or not to compute data at each use
+    Returns
+    -------
+    Array
+        Compression matrix
+    """
+    from dask_array.core import asanyarray
+    from dask_array.linalg._qr import tsqr
+    data = asanyarray(data)
+    if iterator not in ["power", "QR"]:
+        raise ValueError(f"Iterator '{iterator}' not valid, must be one of ['power', 'QR']")
+    m, n = data.shape
+    comp_level = compression_level(min(m, n), q, n_oversamples=n_oversamples)
+    if isinstance(seed, RandomState):
+        state = seed
+    else:
+        state = default_rng(seed)
+    datatype = np.float64
+    if (data.dtype).type in {np.float32, np.complex64}:
+        datatype = np.float32
+    omega = state.standard_normal(size=(n, comp_level), chunks=(data.chunks[1], (comp_level,))).astype(
+        datatype, copy=False
+    )
+    mat_h = data.dot(omega)
+    if iterator == "power":
+        for _ in range(n_power_iter):
+            if compute:
+                mat_h = mat_h.persist()
+                wait(mat_h)
+            tmp = data.T.dot(mat_h)
+            if compute:
+                tmp = tmp.persist()
+                wait(tmp)
+            mat_h = data.dot(tmp)
+        q_mat, _ = tsqr(mat_h)
+    else:
+        q_mat, _ = tsqr(mat_h)
+        for _ in range(n_power_iter):
+            if compute:
+                q_mat = q_mat.persist()
+                wait(q_mat)
+            q_mat, _ = tsqr(data.T.dot(q_mat))
+            if compute:
+                q_mat = q_mat.persist()
+                wait(q_mat)
+            q_mat, _ = tsqr(data.dot(q_mat))
+    return q_mat.T
+def svd_compressed(
+    a,
+    k,
+    iterator="power",
+    n_power_iter=0,
+    n_oversamples=10,
+    seed=None,
+    compute=False,
+    coerce_signs=True,
+):
+    """Randomly compressed rank-k thin Singular Value Decomposition.
+    This computes the approximate singular value decomposition of a large
+    array.  This algorithm is generally faster than the normal algorithm
+    but does not provide exact results.
+    Parameters
+    ----------
+    a: Array
+        Input array
+    k: int
+        Rank of the desired thin SVD decomposition.
+    iterator: {'power', 'QR'}, default='power'
+        Define the technique used for iterations
+    n_power_iter: int, default=0
+        Number of power iterations
+    n_oversamples: int, default=10
+        Number of oversamples
+    compute : bool
+        Whether or not to compute data at each use
+    coerce_signs : bool
+        Whether or not to apply sign coercion to singular vectors
+    Returns
+    -------
+    u:  Array, unitary / orthogonal
+    s:  Array, singular values in decreasing order
+    v:  Array, unitary / orthogonal
+    """
+    from dask_array.core import asanyarray
+    from dask_array.linalg._qr import tsqr
+    a = asanyarray(a)
+    comp = compression_matrix(
+        a,
+        k,
+        iterator=iterator,
+        n_power_iter=n_power_iter,
+        n_oversamples=n_oversamples,
+        seed=seed,
+        compute=compute,
+    )
+    if compute:
+        comp = comp.persist()
+        wait(comp)
+    a_compressed = comp.dot(a)
+    v, s, ut = tsqr(a_compressed.T, compute_svd=True)
+    u = comp.T.dot(ut.T)
+    v = v.T
+    u = u[:, :k]
+    s = s[:k]
+    v = v[:k, :]
+    if coerce_signs:
+        u, v = svd_flip(u, v)
+    return u, s, v

dask_array/linalg/_tensordot.py ADDED Viewed

@@ -0,0 +1,334 @@
+"""Tensor operations for array-expr (tensordot, dot, vdot, matmul)."""
+from __future__ import annotations
+from collections.abc import Iterable
+from numbers import Integral
+import numpy as np
+from dask_array._core_utils import is_scalar_for_elemwise, tensordot_lookup
+from dask.utils import derived_from
+def _result_type(*args):
+    """Compute result dtype for operation."""
+    args = [a if is_scalar_for_elemwise(a) else a.dtype for a in args]
+    return np.result_type(*args)
+def _tensordot(a, b, axes, is_sparse):
+    """Helper function for tensordot that handles the actual numpy computation."""
+    x = max([a, b], key=lambda x: x.__array_priority__)
+    tensordot = tensordot_lookup.dispatch(type(x))
+    x = tensordot(a, b, axes=axes)
+    if is_sparse and len(axes[0]) == 1:
+        return x
+    else:
+        ind = [slice(None, None)] * x.ndim
+        for a in sorted(axes[0]):
+            ind.insert(a, None)
+        x = x[tuple(ind)]
+        return x
+def _tensordot_is_sparse(x):
+    """Check if array is sparse (scipy sparse, not pydata sparse)."""
+    is_sparse = "sparse" in str(type(x._meta))
+    if is_sparse:
+        # exclude pydata sparse arrays, no workaround required for these in tensordot
+        is_sparse = "sparse._coo.core.COO" not in str(type(x._meta))
+    return is_sparse
+@derived_from(np)
+def tensordot(lhs, rhs, axes=2):
+    """Compute tensor dot product along specified axes.
+    Parameters
+    ----------
+    lhs : array_like
+        Left argument
+    rhs : array_like
+        Right argument
+    axes : int or tuple of (int, int) or tuple of (sequence[int], sequence[int])
+        If integer, sum over the last `axes` axes of `lhs` and first `axes`
+        axes of `rhs`. If tuple, specifies axes to contract.
+    Returns
+    -------
+    output : dask array
+    See Also
+    --------
+    numpy.tensordot
+    """
+    from dask_array._collection import Array, asarray, blockwise
+    if not isinstance(lhs, Array):
+        lhs = asarray(lhs)
+    if not isinstance(rhs, Array):
+        rhs = asarray(rhs)
+    if isinstance(axes, Iterable):
+        left_axes, right_axes = axes
+    else:
+        left_axes = tuple(range(lhs.ndim - axes, lhs.ndim))
+        right_axes = tuple(range(0, axes))
+    if isinstance(left_axes, Integral):
+        left_axes = (left_axes,)
+    if isinstance(right_axes, Integral):
+        right_axes = (right_axes,)
+    if isinstance(left_axes, list):
+        left_axes = tuple(left_axes)
+    if isinstance(right_axes, list):
+        right_axes = tuple(right_axes)
+    is_sparse = _tensordot_is_sparse(lhs) or _tensordot_is_sparse(rhs)
+    if is_sparse and len(left_axes) == 1:
+        concatenate = True
+    else:
+        concatenate = False
+    dt = np.promote_types(lhs.dtype, rhs.dtype)
+    left_index = list(range(lhs.ndim))
+    right_index = list(range(lhs.ndim, lhs.ndim + rhs.ndim))
+    out_index = left_index + right_index
+    adjust_chunks = {}
+    for l, r in zip(left_axes, right_axes):
+        out_index.remove(right_index[r])
+        right_index[r] = left_index[l]
+        if concatenate:
+            out_index.remove(left_index[l])
+        else:
+            adjust_chunks[left_index[l]] = lambda c: 1
+    # Compute explicit meta to preserve masked array type
+    # (compute_meta fails for masked arrays due to reshape issues with 0-dim arrays)
+    meta = None
+    for arr in (lhs, rhs):
+        if hasattr(arr._meta, "mask"):  # MaskedArray check
+            out_ndim = len(out_index)
+            meta = np.ma.empty((0,) * out_ndim, dtype=dt)
+            break
+    intermediate = blockwise(
+        _tensordot,
+        out_index,
+        lhs,
+        left_index,
+        rhs,
+        right_index,
+        dtype=dt,
+        concatenate=concatenate,
+        adjust_chunks=adjust_chunks,
+        axes=(left_axes, right_axes),
+        is_sparse=is_sparse,
+        meta=meta,
+    )
+    if concatenate:
+        return intermediate
+    else:
+        left_axes = [ax if ax >= 0 else lhs.ndim + ax for ax in left_axes]
+        return intermediate.sum(axis=left_axes)
+@derived_from(np, ua_args=["out"])
+def dot(a, b):
+    """Dot product of two arrays.
+    For 2-D arrays it is equivalent to matrix multiplication,
+    for 1-D arrays to inner product of vectors.
+    Parameters
+    ----------
+    a : array_like
+        First argument
+    b : array_like
+        Second argument
+    Returns
+    -------
+    output : dask array
+    See Also
+    --------
+    numpy.dot
+    tensordot
+    """
+    return tensordot(a, b, axes=((a.ndim - 1,), (b.ndim - 2,)))
+@derived_from(np)
+def vdot(a, b):
+    """Return the dot product of two vectors.
+    The vdot function handles complex numbers differently than dot:
+    if the first argument is complex the complex conjugate of the
+    first argument is used for the calculation of the dot product.
+    Parameters
+    ----------
+    a : array_like
+        First argument
+    b : array_like
+        Second argument
+    Returns
+    -------
+    output : dask array
+    See Also
+    --------
+    numpy.vdot
+    dot
+    """
+    from dask_array._collection import ravel
+    return dot(ravel(a).conj(), ravel(b))
+def _matmul(a, b):
+    """Helper function for matmul that handles the actual numpy computation."""
+    xp = np
+    # Check for cupy
+    try:
+        import cupy
+        if hasattr(a, "__cuda_array_interface__") or hasattr(b, "__cuda_array_interface__"):
+            xp = cupy
+    except ImportError:
+        pass
+    chunk = xp.matmul(a, b)
+    # Since we have performed the contraction via xp.matmul
+    # but blockwise expects all dimensions back (including
+    # the contraction-axis in the 2nd-to-last position of
+    # the output), we must then put it back in the expected
+    # the position ourselves:
+    return chunk[..., xp.newaxis, :]
+def _sum_wo_cat(a, axis=None, dtype=None):
+    """Sum without concatenation - used for matmul reduction."""
+    from functools import partial, reduce
+    from dask_array.reductions import reduction
+    def _chunk_sum(a, axis=None, dtype=None, keepdims=None):
+        # Caution: this is not your conventional array-sum: due
+        # to the special nature of the preceding blockwise con-
+        # traction, each chunk is expected to have exactly the
+        # same shape, with a size of 1 for the dimension given
+        # by `axis` (the reduction axis). This makes mere ele-
+        # ment-wise addition of the arrays possible. Besides,
+        # the output can be merely squeezed to lose the `axis`-
+        # dimension when keepdims = False
+        if type(a) is list:
+            out = reduce(partial(np.add, dtype=dtype), a)
+        else:
+            out = a
+        if keepdims:
+            return out
+        else:
+            return out.squeeze(axis[0])
+    if dtype is None:
+        dtype = getattr(np.zeros(1, dtype=a.dtype).sum(), "dtype", object)
+    if a.shape[axis] == 1:
+        from dask_array._collection import squeeze
+        return squeeze(a, axis=axis)
+    return reduction(a, _chunk_sum, _chunk_sum, axis=axis, dtype=dtype, concatenate=False)
+@derived_from(np)
+def matmul(a, b):
+    """Matrix product of two arrays.
+    Parameters
+    ----------
+    a : array_like
+        First argument
+    b : array_like
+        Second argument
+    Returns
+    -------
+    output : dask array
+    See Also
+    --------
+    numpy.matmul
+    """
+    from dask_array._collection import asanyarray, blockwise
+    a = asanyarray(a)
+    b = asanyarray(b)
+    if a.ndim == 0 or b.ndim == 0:
+        raise ValueError("`matmul` does not support scalars.")
+    a_is_1d = False
+    if a.ndim == 1:
+        a_is_1d = True
+        a = a[np.newaxis, :]
+    b_is_1d = False
+    if b.ndim == 1:
+        b_is_1d = True
+        b = b[:, np.newaxis]
+    if a.ndim < b.ndim:
+        a = a[(b.ndim - a.ndim) * (np.newaxis,)]
+    elif a.ndim > b.ndim:
+        b = b[(a.ndim - b.ndim) * (np.newaxis,)]
+    # out_ind includes all dimensions to prevent contraction
+    # in the blockwise below. We set the last two dimensions
+    # of the output to the contraction axis and the 2nd
+    # (last) dimension of b in that order
+    out_ind = tuple(range(a.ndim + 1))
+    # lhs_ind includes `a`/LHS dimensions
+    lhs_ind = tuple(range(a.ndim))
+    # on `b`/RHS everything above 2nd dimension, is the same
+    # as `a`, -2 dimension is "contracted" with the last dimension
+    # of `a`, last dimension of `b` is `b` specific
+    rhs_ind = tuple(range(a.ndim - 2)) + (lhs_ind[-1], a.ndim)
+    out = blockwise(
+        _matmul,
+        out_ind,
+        a,
+        lhs_ind,
+        b,
+        rhs_ind,
+        adjust_chunks={lhs_ind[-1]: 1},
+        dtype=_result_type(a, b),
+        concatenate=False,
+    )
+    # Because contraction + concatenate in blockwise leads to high
+    # memory footprints, we want to avoid them. Instead we will perform
+    # blockwise (without contraction) followed by reduction. More about
+    # this issue: https://github.com/dask/dask/issues/6874
+    # We will also perform the reduction without concatenation
+    out = _sum_wo_cat(out, axis=-2)
+    if a_is_1d or b_is_1d:
+        from dask_array._collection import squeeze
+        if a_is_1d:
+            out = squeeze(out, axis=-2)
+        if b_is_1d:
+            out = squeeze(out, axis=-1)
+    return out