RubyGems - gumath - Versions diffs - 0.2.0dev5 → 0.2.0dev8 - Mend

gumath 0.2.0dev5 → 0.2.0dev8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (99)

checksums.yaml +4 -4
data/CONTRIBUTING.md +7 -2
data/Gemfile +0 -3
data/ext/ruby_gumath/GPATH +0 -0
data/ext/ruby_gumath/GRTAGS +0 -0
data/ext/ruby_gumath/GTAGS +0 -0
data/ext/ruby_gumath/extconf.rb +0 -5
data/ext/ruby_gumath/functions.c +10 -2
data/ext/ruby_gumath/gufunc_object.c +15 -4
data/ext/ruby_gumath/gufunc_object.h +9 -3
data/ext/ruby_gumath/gumath/Makefile +63 -0
data/ext/ruby_gumath/gumath/Makefile.in +1 -0
data/ext/ruby_gumath/gumath/config.h +56 -0
data/ext/ruby_gumath/gumath/config.h.in +3 -0
data/ext/ruby_gumath/gumath/config.log +497 -0
data/ext/ruby_gumath/gumath/config.status +1034 -0
data/ext/ruby_gumath/gumath/configure +375 -4
data/ext/ruby_gumath/gumath/configure.ac +47 -3
data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
data/ext/ruby_gumath/gumath/setup.py +67 -6
data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
data/ext/ruby_gumath/include/gumath.h +55 -14
data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
data/ext/ruby_gumath/lib/libgumath.a +0 -0
data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/ruby_gumath.c +231 -70
data/ext/ruby_gumath/ruby_gumath.h +4 -1
data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
data/ext/ruby_gumath/util.c +34 -0
data/ext/ruby_gumath/util.h +9 -0
data/gumath.gemspec +3 -2
data/lib/gumath.rb +55 -1
data/lib/gumath/version.rb +2 -2
data/lib/ruby_gumath.so +0 -0
metadata +63 -10
data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449

data/ext/ruby_gumath/gumath/libgumath/libgumath.a ADDED

Binary file

data/ext/ruby_gumath/gumath/libgumath/libgumath.so ADDED

	@@ -0,0 +1 @@
1	+ ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3

data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 ADDED

	@@ -0,0 +1 @@
1	+ ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3

data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 ADDED

Binary file

data/ext/ruby_gumath/gumath/libgumath/nploops.o ADDED

Binary file

data/ext/ruby_gumath/gumath/libgumath/pdist.o ADDED

Binary file

data/ext/ruby_gumath/gumath/libgumath/quaternion.o ADDED

Binary file

data/ext/ruby_gumath/gumath/libgumath/tbl.o ADDED

Binary file

data/ext/ruby_gumath/gumath/libgumath/thread.c CHANGED

@@ -72,7 +72,7 @@ clear_all_slices(xnd_t *slices[], int *nslices, int stop)
 {
     for (int i = 0; i < stop; i++) {
         for (int k = 0; k < nslices[i]; k++) {
-            ndt_del((ndt_t *)slices[i][k].type);
+            ndt_decref(slices[i][k].type);
         }
         ndt_free(slices[i]);
     }
@@ -94,16 +94,27 @@ apply_thread(void *arg)
 int
 gm_apply_thread(const gm_kernel_t *kernel, xnd_t stack[], int outer_dims,
-                uint32_t flags, const int64_t nthreads, ndt_context_t *ctx)
+                const int64_t nthreads, ndt_context_t *ctx)
 {
     const int nrows = (int)kernel->set->sig->Function.nargs;
     ALLOCA(xnd_t *, slices, nrows);
     ALLOCA(int, nslices, nrows);
     struct thread_info *tinfo;
     int ncols, tnum;
+    bool use_threads = true;
-    if (nthreads <= 1 || nrows == 0 || outer_dims == 0 ||
-        !(flags & NDT_STRIDED)) {
+    if (nthreads <= 1 || nrows == 0 || outer_dims == 0) {
+        use_threads = false;
+    }
+    for (int i = 0; i < nrows; i++) {
+        const ndt_t *t = stack[i].type;
+        if (!ndt_is_ndarray(t) || ndt_nelem(t) < GM_THREAD_CUTOFF) {
+            use_threads = false;
+        }
+    }
+    if (!use_threads) {
         return gm_apply(kernel, stack, outer_dims, ctx);
     }
@@ -147,6 +158,7 @@ gm_apply_thread(const gm_kernel_t *kernel, xnd_t stack[], int outer_dims,
                                  &tinfo[tnum]);
         if (ret != 0) {
             clear_all_slices(slices, nslices, nrows);
+            ndt_free(tinfo);
             ndt_err_format(ctx, NDT_RuntimeError, "could not create thread");
             return -1;
         }
@@ -169,6 +181,7 @@ gm_apply_thread(const gm_kernel_t *kernel, xnd_t stack[], int outer_dims,
     }
     clear_all_slices(slices, nslices, nrows);
+    ndt_free(tinfo);
     return ndt_err_occurred(ctx) ? -1 : 0;
 }

data/ext/ruby_gumath/gumath/libgumath/thread.o ADDED

Binary file

data/ext/ruby_gumath/gumath/libgumath/xndloops.c CHANGED

@@ -38,11 +38,99 @@
 #include "ndtypes.h"
 #include "xnd.h"
 #include "gumath.h"
+#include "overflow.h"
+static int _gm_xnd_map(const gm_xnd_kernel_t f, xnd_t stack[], const int nargs,
+                       const int outer_dims, ndt_context_t *ctx);
+int
+array_shape_check(xnd_t *x, const int64_t shape, ndt_context_t *ctx)
+{
+    const ndt_t *t = x->type;
+    if (t->tag != Array) {
+        ndt_err_format(ctx, NDT_RuntimeError,
+            "type mismatch in outer dimensions");
+        return -1;
+    }
+    if (XND_ARRAY_DATA(x->ptr) == NULL) {
+        bool overflow = false;
+        const int64_t size = MULi64(shape, t->Array.itemsize, &overflow);
+        if (overflow) {
+            ndt_err_format(ctx, NDT_ValueError,
+                "datasize of flexible array is too large");
+            return -1;
+        }
+        char *data = ndt_aligned_calloc(t->align, size);
+        if (data == NULL) {
+            ndt_err_format(ctx, NDT_MemoryError, "out of memory");
+            return -1;
+        }
+        XND_ARRAY_SHAPE(x->ptr) = shape;
+        XND_ARRAY_DATA(x->ptr) = data;
+        return 0;
+    }
+    else if (XND_ARRAY_SHAPE(x->ptr) != shape) {
+        ndt_err_format(ctx, NDT_RuntimeError,
+            "shape mismatch in outer dimensions");
+        return -1;
+    }
+    else {
+        return 0;
+    }
+}
+static inline bool
+any_stored_index(xnd_t stack[], const int nargs)
+{
+    for (int i = 0; i < nargs; i++) {
+        if (stack[i].ptr == NULL) {
+            continue;
+        }
+        const ndt_t *t = stack[i].type;
+        if (have_stored_index(t)) {
+            return true;
+        }
+    }
+    return false;
+}
 int
 gm_xnd_map(const gm_xnd_kernel_t f, xnd_t stack[], const int nargs,
            const int outer_dims, ndt_context_t *ctx)
+{
+    if (any_stored_index(stack, nargs)) {
+        ALLOCA(xnd_t, next, nargs);
+        for (int i = 0; i < nargs; i++) {
+            const ndt_t *t = stack[i].type;
+            if (have_stored_index(t)) {
+                next[i] = apply_stored_indices(&stack[i], ctx);
+                if (xnd_err_occurred(&next[i])) {
+                    return -1;
+                }
+            }
+            else {
+                next[i] = stack[i];
+            }
+        }
+        return _gm_xnd_map(f, next, nargs, outer_dims, ctx);
+    }
+    return _gm_xnd_map(f, stack, nargs, outer_dims, ctx);
+}
+static int
+_gm_xnd_map(const gm_xnd_kernel_t f, xnd_t stack[], const int nargs,
+            const int outer_dims, ndt_context_t *ctx)
 {
     ALLOCA(xnd_t, next, nargs);
     const ndt_t *t;
@@ -123,6 +211,28 @@ gm_xnd_map(const gm_xnd_kernel_t f, xnd_t stack[], const int nargs,
         return 0;
     }
+    case Array: {
+        const int64_t shape = XND_ARRAY_SHAPE(stack[0].ptr);
+        for (int k = 1; k < nargs; k++) {
+            if (array_shape_check(&stack[k], shape, ctx) < 0) {
+                return -1;
+            }
+        }
+        for (int64_t i = 0; i < shape; i++) {
+            for (int k = 0; k < nargs; k++) {
+                next[k] = xnd_array_next(&stack[k], i);
+            }
+            if (gm_xnd_map(f, next, nargs, outer_dims-1, ctx) < 0) {
+                return -1;
+            }
+        }
+        return 0;
+    }
     default:
         ndt_err_format(ctx, NDT_NotImplementedError, "unsupported type");
         return -1;

data/ext/ruby_gumath/gumath/libgumath/xndloops.o ADDED

Binary file

data/ext/ruby_gumath/gumath/python/gumath/__init__.py CHANGED

@@ -33,7 +33,157 @@
 from ndtypes import ndt
 from xnd import xnd
 from ._gumath import *
+from . import functions as _fn
+try:
+    from . import cuda as _cd
+except ImportError:
+    _cd = None
+__all__ = ['cuda', 'fold', 'functions', 'get_max_threads', 'gufunc', 'reduce',
+           'set_max_threads', 'unsafe_add_kernel', 'vfold', 'xndvectorize']
+# ==============================================================================
+#                              Init identity elements
+# ==============================================================================
+# This is done here now, perhaps it should be on the C level.
+_fn.add.identity = 0
+_fn.multiply.identity = 1
+# ==============================================================================
+#                             General fold function
+# ==============================================================================
+def fold(f, acc, x):
+    return vfold(x, f=f, acc=acc)
+# ==============================================================================
+#                        NumPy's reduce in terms of fold
+# ==============================================================================
+def _get_axes(axes, ndim):
+    type_err = "'axes' must be None, a single integer or a tuple of integers"
+    value_err = "axis with value %d out of range"
+    duplicate_err = "'axes' argument contains duplicate values"
+    if axes is None:
+        axes = tuple(range(ndim))
+    elif isinstance(axes, int):
+        axes = (axes,)
+    elif not isinstance(axes, tuple) or \
+         any(not isinstance(v, int) for v in axes):
+        raise TypeError(type_err)
+    if any(n >= ndim for n in axes):
+        raise ValueError(value_err % n)
+    if len(set(axes)) != len(axes):
+        raise ValueError(duplicate_err)
+    return list(axes)
+def _copyto(dest, value):
+    x = xnd(value, dtype=dest.dtype)
+    _fn.copy(x, out=dest)
+def reduce_cpu(f, x, axes, dtype):
+    """NumPy's reduce in terms of fold."""
+    axes = _get_axes(axes, x.ndim)
+    if not axes:
+        return x
+    permute = [n for n in range(x.ndim) if n not in axes]
+    permute = axes + permute
+    T = x.transpose(permute=permute)
+    N = len(axes)
+    t = T.type.at(N, dtype=dtype)
+    acc = x.empty(t, device=x.device)
+    if f.identity is not None:
+        _copyto(acc, f.identity)
+        tl = T
+    elif N == 1 and T.type.shape[0] > 0:
+        hd, tl = T[0], T[1:]
+        acc[()] = hd
+    else:
+        raise ValueError(
+            "reduction not possible for function without an identity element")
+    return fold(f, acc, tl)
+def reduce_cuda(g, x, axes, dtype):
+    """Reductions in CUDA use the thrust library for speed and have limited
+       functionality."""
+    if axes != 0:
+        raise NotImplementedError("'axes' keyword is not implemented for CUDA")
+    return g(x, dtype=dtype)
+def get_cuda_reduction_func(f):
+    if _cd is None:
+        return None
+    elif f == _cd.add:
+        return _cd.reduce_add
+    elif f == _cd.multiply:
+        return _cd.reduce_multiply
+    else:
+        return None
+def reduce(f, x, axes=0, dtype=None):
+    if dtype is None:
+        dtype = maxcast[x.dtype]
+    g = get_cuda_reduction_func(f)
+    if g is not None:
+        return reduce_cuda(g, x, axes, dtype)
+    return reduce_cpu(f, x, axes, dtype)
+maxcast = {
+  ndt("int8"): ndt("int64"),
+  ndt("int16"): ndt("int64"),
+  ndt("int32"): ndt("int64"),
+  ndt("int64"): ndt("int64"),
+  ndt("uint8"): ndt("uint64"),
+  ndt("uint16"): ndt("uint64"),
+  ndt("uint32"): ndt("uint64"),
+  ndt("uint64"): ndt("uint64"),
+  ndt("bfloat16"): ndt("float64"),
+  ndt("float16"): ndt("float64"),
+  ndt("float32"): ndt("float64"),
+  ndt("float64"): ndt("float64"),
+  ndt("complex32"): ndt("complex128"),
+  ndt("complex64"): ndt("complex128"),
+  ndt("complex128"): ndt("complex128"),
+  ndt("?int8"): ndt("?int64"),
+  ndt("?int16"): ndt("?int64"),
+  ndt("?int32"): ndt("?int64"),
+  ndt("?int64"): ndt("?int64"),
+  ndt("?uint8"): ndt("?uint64"),
+  ndt("?uint16"): ndt("?uint64"),
+  ndt("?uint32"): ndt("?uint64"),
+  ndt("?uint64"): ndt("?uint64"),
+  ndt("?bfloat16"): ndt("?float64"),
+  ndt("?float16"): ndt("?float64"),
+  ndt("?float32"): ndt("?float64"),
+  ndt("?float64"): ndt("?float64"),
+  ndt("?complex32"): ndt("?complex128"),
+  ndt("?complex64"): ndt("?complex128"),
+  ndt("?complex128"): ndt("?complex128"),
+}
+# ==============================================================================
+#                         Numba's GUVectorize on xnd arrays
+# ==============================================================================
 try:
     import numpy as np

data/ext/ruby_gumath/gumath/python/gumath/_gumath.c CHANGED

@@ -45,16 +45,24 @@
 #define GUMATH_MODULE
 #include "pygumath.h"
 #ifdef _MSC_VER
   #ifndef UNUSED
     #define UNUSED
   #endif
+  #include <float.h>
+  #include <fenv.h>
+  #pragma fenv_access(on)
 #else
   #if defined(__GNUC__) && !defined(__INTEL_COMPILER)
     #define UNUSED __attribute__((unused))
   #else
     #define UNUSED
   #endif
+  #include <fenv.h>
+  #if 0 /* Not supported by gcc and clang. */
+    #pragma STDC FENV_ACCESS ON
+  #endif
 #endif
@@ -73,6 +81,9 @@ static gm_tbl_t *table = NULL;
 /* Xnd type */
 static PyTypeObject *xnd = NULL;
+/* Empty positional arguments */
+static PyObject *positional_empty = NULL;
 /* Maximum number of threads */
 static int64_t max_threads = 1;
@@ -95,7 +106,7 @@ seterr(ndt_context_t *ctx)
 static PyTypeObject Gufunc_Type;
 static PyObject *
-gufunc_new(const gm_tbl_t *tbl, const char *name)
+gufunc_new(const gm_tbl_t *tbl, const char *name, const uint32_t flags)
 {
     NDT_STATIC_CONTEXT(ctx);
     GufuncObject *self;
@@ -106,12 +117,16 @@ gufunc_new(const gm_tbl_t *tbl, const char *name)
     }
     self->tbl = tbl;
+    self->flags = flags;
     self->name = ndt_strdup(name, &ctx);
     if (self->name == NULL) {
         return seterr(&ctx);
     }
+    self->identity = Py_None;
+    Py_INCREF(self->identity);
     return (PyObject *)self;
 }
@@ -119,6 +134,7 @@ static void
 gufunc_dealloc(GufuncObject *self)
 {
     ndt_free(self->name);
+    Py_DECREF(self->identity);
     PyObject_Del(self);
 }
@@ -128,124 +144,317 @@ gufunc_dealloc(GufuncObject *self)
 /****************************************************************************/
 static void
-clear_objects(PyObject **a, Py_ssize_t len)
+clear_pystack(PyObject *pystack[], Py_ssize_t len)
+{
+    for (Py_ssize_t i = 0; i < len; i++) {
+        Py_CLEAR(pystack[i]);
+    }
+}
+static int
+parse_args(PyObject *pystack[NDT_MAX_ARGS], int *py_nin, int *py_nout, int *py_nargs,
+           PyObject *args, PyObject *out)
 {
-    Py_ssize_t i;
+    Py_ssize_t nin;
+    Py_ssize_t nout;
-    for (i = 0; i < len; i++) {
-        Py_CLEAR(a[i]);
+    if (!args || !PyTuple_Check(args)) {
+        const char *name = args ? Py_TYPE(args)->tp_name : "NULL";
+        PyErr_Format(PyExc_SystemError,
+            "internal error: expected tuple, got '%.200s'", name);
+        return -1;
     }
+    nin = PyTuple_GET_SIZE(args);
+    if (nin > NDT_MAX_ARGS) {
+        PyErr_Format(PyExc_TypeError,
+            "maximum number of arguments is %d, got %n", NDT_MAX_ARGS, nin);
+        return -1;
+    }
+    for (Py_ssize_t i = 0; i < nin; i++) {
+        PyObject *v = PyTuple_GET_ITEM(args, i);
+        if (!Xnd_Check(v)) {
+            PyErr_Format(PyExc_TypeError,
+                "expected xnd argument, got '%.200s'", Py_TYPE(v)->tp_name);
+            return -1;
+        }
+        pystack[i] = v;
+    }
+    if (out == NULL) {
+        nout = 0;
+    }
+    else {
+        if (Xnd_Check(out)) {
+            nout = 1;
+            if (nin+nout > NDT_MAX_ARGS) {
+                PyErr_Format(PyExc_TypeError,
+                    "maximum number of arguments is %d, got %n", NDT_MAX_ARGS, nin+nout);
+                return -1;
+            }
+            pystack[nin] = out;
+        }
+        else if (PyTuple_Check(out)) {
+            nout = PyTuple_GET_SIZE(out);
+            if (nout > NDT_MAX_ARGS || nin+nout > NDT_MAX_ARGS) {
+                PyErr_Format(PyExc_TypeError,
+                    "maximum number of arguments is %d, got %n", NDT_MAX_ARGS, nin+nout);
+                return -1;
+            }
+            for (Py_ssize_t i = 0; i < nout; i++) {
+                PyObject *v = PyTuple_GET_ITEM(out, i);
+                if (!Xnd_Check(v)) {
+                    PyErr_Format(PyExc_TypeError,
+                        "expected xnd argument, got '%.200s'", Py_TYPE(v)->tp_name);
+                    return -1;
+                }
+                pystack[nin+i] = v;
+            }
+        }
+        else {
+            PyErr_Format(PyExc_TypeError,
+                "'out' argument must be xnd or a tuple of xnd, got '%.200s'",
+                Py_TYPE(out)->tp_name);
+            return -1;
+        }
+    }
+    for (int i = 0; i < nin+nout; i++) {
+        Py_INCREF(pystack[i]);
+    }
+    *py_nin = (int)nin;
+    *py_nout = (int)nout;
+    *py_nargs = (int)nin+(int)nout;
+    return 0;
 }
 static PyObject *
-gufunc_call(GufuncObject *self, PyObject *args, PyObject *kwds)
+_gufunc_call(GufuncObject *self, PyObject *args, PyObject *kwargs,
+             bool enable_threads, bool check_broadcast)
 {
+    static char *kwlist[] = {"out", "dtype", "cls", NULL};
+    PyObject *out = Py_None;
+    PyObject *dt = Py_None;
+    PyObject *cls = Py_None;
     NDT_STATIC_CONTEXT(ctx);
-    const Py_ssize_t nin = PyTuple_GET_SIZE(args);
-    PyObject **a = &PyTuple_GET_ITEM(args, 0);
-    PyObject *result[NDT_MAX_ARGS];
-    ndt_apply_spec_t spec = ndt_apply_spec_empty;
-    const ndt_t *in_types[NDT_MAX_ARGS];
+    PyObject *pystack[NDT_MAX_ARGS];
     xnd_t stack[NDT_MAX_ARGS];
+    const ndt_t *types[NDT_MAX_ARGS];
+    int64_t li[NDT_MAX_ARGS];
+    ndt_apply_spec_t spec = ndt_apply_spec_empty;
     gm_kernel_t kernel;
-    int i, k;
+    bool have_cpu_device = false;
+    ndt_t *dtype = NULL;
+    int nin, nout, nargs;
+    int k;
-    if (kwds && PyDict_Size(kwds) > 0) {
-        PyErr_SetString(PyExc_TypeError,
-            "gufunc calls do not support keywords");
+    if (!PyArg_ParseTupleAndKeywords(positional_empty, kwargs, "|$OOO", kwlist,
+                                     &out, &dt, &cls)) {
         return NULL;
     }
-    if (nin > NDT_MAX_ARGS) {
+    out = out == Py_None ? NULL : out;
+    dt = dt == Py_None ? NULL : dt;
+    cls = cls == Py_None ? (PyObject *)xnd : cls;
+    if (dt != NULL) {
+        if (out != NULL) {
+            PyErr_SetString(PyExc_TypeError,
+                "the 'out' and 'dtype' arguments are mutually exclusive");
+            return NULL;
+        }
+        if (!Ndt_Check(dt)) {
+            PyErr_Format(PyExc_TypeError,
+                "'dtype' argument must be ndt, got '%.200s'",
+                Py_TYPE(dt)->tp_name);
+                return NULL;
+        }
+        dtype = (ndt_t *)NDT(dt);
+        ndt_incref(dtype);
+    }
+    if (!PyType_Check(cls) || !PyType_IsSubtype((PyTypeObject *)cls, xnd)) {
         PyErr_SetString(PyExc_TypeError,
-            "invalid number of arguments");
+            "the 'cls' argument must be a subtype of 'xnd'");
+        return NULL;
+    }
+    if (parse_args(pystack, &nin, &nout, &nargs, args, out) < 0) {
         return NULL;
     }
+    assert(nout == 0 || dtype == NULL);
-    for (i = 0; i < nin; i++) {
-        if (!Xnd_Check(a[i])) {
-            PyErr_SetString(PyExc_TypeError, "arguments must be xnd");
+    for (k = 0; k < nargs; k++) {
+        const XndObject *x = (XndObject *)pystack[k];
+        if (!(x->mblock->xnd->flags&XND_CUDA_MANAGED)) {
+            have_cpu_device = true;
+        }
+        stack[k] = *CONST_XND((PyObject *)x);
+        types[k] = stack[k].type;
+        li[k] = stack[k].index;
+    }
+    if (have_cpu_device) {
+        if (self->flags & GM_CUDA_MANAGED_FUNC) {
+            PyErr_SetString(PyExc_ValueError,
+                "cannot run a cuda function on xnd objects with cpu memory");
+            clear_pystack(pystack, nargs);
             return NULL;
         }
-        stack[i] = *CONST_XND(a[i]);
-        in_types[i] = stack[i].type;
     }
-    kernel = gm_select(&spec, self->tbl, self->name, in_types, (int)nin, stack, &ctx);
+    kernel = gm_select(&spec, self->tbl, self->name, types, li, nin, nout,
+                       nout && check_broadcast, stack, &ctx);
     if (kernel.set == NULL) {
         return seterr(&ctx);
     }
-    if (spec.nbroadcast > 0) {
-        for (i = 0; i < nin; i++) {
-            stack[i].type = spec.broadcast[i];
+    if (dtype) {
+        if (spec.nout != 1) {
+            ndt_err_format(&ctx, NDT_TypeError,
+                "the 'dtype' argument is only supported for a single "
+                "return value");
+            ndt_apply_spec_clear(&spec);
+            ndt_decref(dtype);
+            return seterr(&ctx);
+        }
+        const ndt_t *u = spec.types[spec.nin];
+        const ndt_t *v = ndt_copy_contiguous_dtype(u, dtype, 0, &ctx);
+        ndt_apply_spec_clear(&spec);
+        ndt_decref(dtype);
+        if (v == NULL) {
+            return seterr(&ctx);
+        }
+        types[nin] = v;
+        kernel = gm_select(&spec, self->tbl, self->name, types, li, nin, 1,
+                           1 && check_broadcast, stack, &ctx);
+        if (kernel.set == NULL) {
+            return seterr(&ctx);
         }
     }
-    for (i = 0; i < spec.nout; i++) {
-        if (ndt_is_concrete(spec.out[i])) {
-            PyObject *x = Xnd_EmptyFromType(xnd, spec.out[i]);
-            if (x == NULL) {
-                clear_objects(result, i);
-                ndt_apply_spec_clear(&spec);
+    /*
+     * Replace args/kwargs types with types after substitution and broadcasting.
+     * This includes 'out' types, if explicitly passed as kwargs.
+     */
+    for (int i = 0; i < spec.nargs; i++) {
+        stack[i].type = spec.types[i];
+    }
+    if (nout == 0) {
+        /* 'out' types have been inferred, create new XndObjects. */
+        for (int i = 0; i < spec.nout; i++) {
+            if (ndt_is_concrete(spec.types[nin+i])) {
+                uint32_t flags = self->flags == GM_CUDA_MANAGED_FUNC ? XND_CUDA_MANAGED : 0;
+                PyObject *x = Xnd_EmptyFromType((PyTypeObject *)cls, spec.types[nin+i], flags);
+                if (x == NULL) {
+                    clear_pystack(pystack, nin+i);
+                    ndt_apply_spec_clear(&spec);
                 return NULL;
             }
-            result[i] = x;
+            pystack[nin+i] = x;
             stack[nin+i] = *CONST_XND(x);
-         }
-         else {
-            result[i] = NULL;
-            stack[nin+i] = xnd_error;
-         }
+            }
+            else {
+                clear_pystack(pystack, nin+i);
+                ndt_apply_spec_clear(&spec);
+                PyErr_SetString(PyExc_ValueError,
+                    "arguments with abstract types are temporarily disabled");
+                return NULL;
+            }
+        }
     }
-#ifdef HAVE_PTHREAD_H
-    if (gm_apply_thread(&kernel, stack, spec.outer_dims, spec.flags,
-        max_threads, &ctx) < 0) {
-        clear_objects(result, spec.nout);
-        return seterr(&ctx);
-    }
-#else
-    if (gm_apply(&kernel, stack, spec.outer_dims, &ctx) < 0) {
-        clear_objects(result, spec.nout);
-        return seterr(&ctx);
-    }
-#endif
+    if (self->flags == GM_CUDA_MANAGED_FUNC) {
+    #if HAVE_CUDA
+        if (!check_broadcast) {
+            ndt_err_format(&ctx, NDT_NotImplementedError,
+               "fold() is currently not supported on cuda");
+            clear_pystack(pystack, spec.nargs);
+            ndt_apply_spec_clear(&spec);
+            return seterr(&ctx);
+        }
-    for (i = 0; i < spec.nout; i++) {
-        if (ndt_is_abstract(spec.out[i])) {
-            ndt_del(spec.out[i]);
-            PyObject *x = Xnd_FromXnd(xnd, &stack[nin+i]);
-            stack[nin+i] = xnd_error;
-            if (x == NULL) {
-                clear_objects(result, i);
-                for (k = i+1; k < spec.nout; k++) {
-                    if (ndt_is_abstract(spec.out[k])) {
-                        xnd_del_buffer(&stack[nin+k], XND_OWN_ALL);
-                    }
-                }
-            }
-            result[i] = x;
+        const int ret = gm_apply(&kernel, stack, spec.outer_dims, &ctx);
+        if (xnd_cuda_device_synchronize(&ctx) < 0 || ret < 0) {
+            clear_pystack(pystack, spec.nargs);
+            ndt_apply_spec_clear(&spec);
+            return seterr(&ctx);
         }
+    #else
+        ndt_err_format(&ctx, NDT_RuntimeError,
+           "internal error: GM_CUDA_MANAGED_FUNC set in a build without cuda support");
+        clear_pystack(pystack, spec.nargs);
+        ndt_apply_spec_clear(&spec);
+        return seterr(&ctx);
+    #endif
     }
+    else {
+    #ifdef HAVE_PTHREAD_H
+        const int rounding = fegetround();
+        fesetround(FE_TONEAREST);
+        const int64_t N = enable_threads ? max_threads : 1;
+        const int ret = gm_apply_thread(&kernel, stack, spec.outer_dims, N,
+                                        &ctx);
+        fesetround(rounding);
+        if (ret < 0) {
+            clear_pystack(pystack, spec.nargs);
+            ndt_apply_spec_clear(&spec);
+            return seterr(&ctx);
+        }
+    #else
+        const int rounding = fegetround();
+        fesetround(FE_TONEAREST);
+        const int ret = gm_apply(&kernel, stack, spec.outer_dims, &ctx);
-    if (spec.nbroadcast > 0) {
-        for (i = 0; i < nin; i++) {
-            ndt_del(spec.broadcast[i]);
+        fesetround(rounding);
+        if (ret < 0) {
+            clear_pystack(pystack, spec.nargs);
+            ndt_apply_spec_clear(&spec);
+            return seterr(&ctx);
         }
+    #endif
     }
-    switch (spec.nout) {
-    case 0: Py_RETURN_NONE;
-    case 1: return result[0];
+    nin = spec.nin;
+    nout = spec.nout;
+    nargs = spec.nargs;
+    ndt_apply_spec_clear(&spec);
+    switch (nout) {
+    case 0: {
+        clear_pystack(pystack, nargs);
+        Py_RETURN_NONE;
+    }
+    case 1: {
+        clear_pystack(pystack, nin);
+        return pystack[nin];
+    }
     default: {
-        PyObject *tuple = PyTuple_New(spec.nout);
+        PyObject *tuple = PyTuple_New(nout);
         if (tuple == NULL) {
-            clear_objects(result, spec.nout);
+            clear_pystack(pystack, nargs);
             return NULL;
         }
-        for (i = 0; i < spec.nout; i++) {
-            PyTuple_SET_ITEM(tuple, i, result[i]);
+        for (int i = 0; i < nout; i++) {
+            PyTuple_SET_ITEM(tuple, i, pystack[nin+i]);
         }
         return tuple;
       }
@@ -253,7 +462,39 @@ gufunc_call(GufuncObject *self, PyObject *args, PyObject *kwds)
 }
 static PyObject *
-gufunc_kernels(GufuncObject *self, PyObject *args GM_UNUSED)
+gufunc_call(GufuncObject *self, PyObject *args, PyObject *kwargs)
+{
+    return _gufunc_call(self, args, kwargs, true, true);
+}
+static PyObject *
+gufunc_getdevice(GufuncObject *self, PyObject *args GM_UNUSED)
+{
+    if (self->flags & GM_CUDA_MANAGED_FUNC) {
+        return PyUnicode_FromString("cuda:managed");
+    }
+    Py_RETURN_NONE;
+}
+static PyObject *
+gufunc_getidentity(GufuncObject *self, PyObject *args GM_UNUSED)
+{
+    Py_INCREF(self->identity);
+    return self->identity;
+}
+static int
+gufunc_setidentity(GufuncObject *self, PyObject *value, void *closure GM_UNUSED)
+{
+    Py_DECREF(self->identity);
+    Py_INCREF(value);
+    self->identity = value;
+    return 0;
+}
+static PyObject *
+gufunc_getkernels(GufuncObject *self, PyObject *args GM_UNUSED)
 {
     NDT_STATIC_CONTEXT(ctx);
     PyObject *list, *tmp;
@@ -294,7 +535,9 @@ gufunc_kernels(GufuncObject *self, PyObject *args GM_UNUSED)
+/* Get/set descriptor table: "device" and "kernels" have a getter only,
+ * "identity" has both a getter and a setter.  Terminated by {NULL}. */
 static PyGetSetDef gufunc_getsets [] =
 {
-  { "kernels", (getter)gufunc_kernels, NULL, NULL, NULL},
+  { "device", (getter)gufunc_getdevice, NULL, NULL, NULL},
+  { "identity", (getter)gufunc_getidentity, (setter)gufunc_setidentity, NULL, NULL},
+  { "kernels", (getter)gufunc_getkernels, NULL, NULL, NULL},
   {NULL}
 };
@@ -323,13 +566,25 @@ struct map_args {
     const gm_tbl_t *tbl;
 };
+/* Nonzero iff the type of 'v' is exactly Gufunc_Type (pointer equality on
+ * the type object). */
+static int
+Gufunc_CheckExact(const PyObject *v)
+{
+    const int is_exact = (&Gufunc_Type == Py_TYPE(v));
+    return is_exact;
+}
+/* Result of PyObject_TypeCheck(v, &Gufunc_Type). */
+static int
+Gufunc_Check(const PyObject *v)
+{
+    const int is_gufunc = PyObject_TypeCheck(v, &Gufunc_Type);
+    return is_gufunc;
+}
 static int
 add_function(const gm_func_t *f, void *args)
 {
     struct map_args *a = (struct map_args *)args;
     PyObject *func;
-    func = gufunc_new(a->tbl, f->name);
+    func = gufunc_new(a->tbl, f->name, GM_CPU_FUNC);
     if (func == NULL) {
         return -1;
     }
@@ -349,10 +604,40 @@ Gumath_AddFunctions(PyObject *m, const gm_tbl_t *tbl)
     return 0;
 }
+/*
+ * gm_tbl_map callback: create a gufunc for 'f' with the
+ * GM_CUDA_MANAGED_FUNC flag and add it to the module in 'args'
+ * (a struct map_args *) under the name f->name.
+ *
+ * Returns 0 on success, -1 with a Python exception set on failure.
+ */
+static int
+add_cuda_function(const gm_func_t *f, void *args)
+{
+    struct map_args *a = (struct map_args *)args;
+    PyObject *func;
+    func = gufunc_new(a->tbl, f->name, GM_CUDA_MANAGED_FUNC);
+    if (func == NULL) {
+        return -1;
+    }
+    /* PyModule_AddObject steals the reference only on success; on failure
+     * the caller still owns 'func' and must release it. */
+    if (PyModule_AddObject(a->module, f->name, func) < 0) {
+        Py_DECREF(func);
+        return -1;
+    }
+    return 0;
+}
+/* Run add_cuda_function over every entry of 'tbl' via gm_tbl_map, with
+ * module 'm' as the target.  Returns 0 on success, -1 if the map call
+ * reports a negative result. */
+static int
+Gumath_AddCudaFunctions(PyObject *m, const gm_tbl_t *tbl)
+{
+    struct map_args map_ctx = { .module = m, .tbl = tbl };
+    return (gm_tbl_map(tbl, add_cuda_function, &map_ctx) < 0) ? -1 : 0;
+}
 static PyObject *
 init_api(void)
 {
+    gumath_api[Gufunc_CheckExact_INDEX] = (void *)Gufunc_CheckExact;
+    gumath_api[Gufunc_Check_INDEX] = (void *)Gufunc_Check;
     gumath_api[Gumath_AddFunctions_INDEX] = (void *)Gumath_AddFunctions;
+    gumath_api[Gumath_AddFunctions_INDEX] = (void *)Gumath_AddFunctions;
+    gumath_api[Gumath_AddCudaFunctions_INDEX] = (void *)Gumath_AddCudaFunctions;
     return PyCapsule_New(gumath_api, "gumath._gumath._API", NULL);
 }
@@ -362,6 +647,75 @@ init_api(void)
 /*                                  Module                                  */
 /****************************************************************************/
+/*
+ * vfold(*args, f=<gufunc>, acc=<xnd>)
+ *
+ * Calls the gufunc 'f' through _gufunc_call with 'acc' prepended to the
+ * positional arguments.  The keyword dict passed along contains
+ * "out" = acc and "cls" = type(acc); both trailing boolean flags of
+ * _gufunc_call are false (gufunc_call passes true for both).
+ *
+ * 'f' and 'acc' are keyword-only.  Raises TypeError if 'f' is not a gufunc
+ * or 'acc' is not an xnd instance.  Returns the result of _gufunc_call, or
+ * NULL with an exception set.
+ */
+static PyObject *
+gufunc_vfold(PyObject *m GM_UNUSED, PyObject *args, PyObject *kwargs)
+{
+    static char *kwlist[] = {"f", "acc", NULL};
+    PyObject *func = Py_None;
+    PyObject *acc = Py_None;
+    PyObject *tuple;
+    PyObject *dict;
+    PyObject *res;
+    Py_ssize_t size, i;
+    /* Parse the keywords against an empty positional tuple so that 'args'
+     * stays untouched for the gufunc.  PyArg_ParseTupleAndKeywords returns
+     * 0 (not a negative value) on failure. */
+    if (!PyArg_ParseTupleAndKeywords(positional_empty, kwargs, "|$OO", kwlist,
+                                     &func, &acc)) {
+        return NULL;
+    }
+    if (!Gufunc_Check(func)) {
+        /* %s needs a C string: pass the type's name, not the type object. */
+        PyErr_Format(PyExc_TypeError,
+            "vfold: expected gufunc object, got '%.200s'",
+            Py_TYPE(func)->tp_name);
+        return NULL;
+    }
+    if (!Xnd_Check(acc)) {
+        PyErr_Format(PyExc_TypeError,
+            "vfold: expected xnd instance, got '%.200s'",
+            Py_TYPE(acc)->tp_name);
+        return NULL;
+    }
+    /* Push the accumulator onto the argument stack. */
+    size = PyTuple_Size(args);
+    if (size < 0) {
+        return NULL;
+    }
+    tuple = PyTuple_New(size+1);
+    if (tuple == NULL) {
+        return NULL;
+    }
+    /* PyTuple_SET_ITEM steals a reference, so each item is increfed first. */
+    Py_INCREF(acc);
+    PyTuple_SET_ITEM(tuple, 0, acc);
+    for (i = 0; i < size; i++) {
+        PyObject *v = PyTuple_GET_ITEM(args, i);
+        Py_INCREF(v);
+        PyTuple_SET_ITEM(tuple, i+1, v);
+    }
+    /* Simultaneously use the accumulator as the 'out' argument. */
+    dict = PyDict_New();
+    if (dict == NULL) {
+        Py_DECREF(tuple);
+        return NULL;
+    }
+    if (PyDict_SetItemString(dict, "out", acc) < 0) {
+        Py_DECREF(dict);
+        Py_DECREF(tuple);
+        return NULL;
+    }
+    if (PyDict_SetItemString(dict, "cls", (PyObject *)(Py_TYPE(acc))) < 0) {
+        Py_DECREF(dict);
+        Py_DECREF(tuple);
+        return NULL;
+    }
+    res = _gufunc_call((GufuncObject *)func, tuple, dict, false, false);
+    Py_DECREF(tuple);
+    Py_DECREF(dict);
+    return res;
+}
 static PyObject *
 unsafe_add_kernel(PyObject *m GM_UNUSED, PyObject *args, PyObject *kwds)
 {
@@ -388,8 +742,8 @@ unsafe_add_kernel(PyObject *m GM_UNUSED, PyObject *args, PyObject *kwds)
     k.name = name;
     k.sig = sig;
-    if (strcmp(tag, "Opt") == 0) {
-        k.Opt = p;
+    if (strcmp(tag, "Opt") == 0) { /* XXX */
+        k.OptC = p;
     }
     else if (strcmp(tag, "C") == 0) {
         k.C = p;
@@ -418,7 +772,7 @@ unsafe_add_kernel(PyObject *m GM_UNUSED, PyObject *args, PyObject *kwds)
         return seterr(&ctx);
     }
-    return gufunc_new(table, f->name);
+    return gufunc_new(table, f->name, GM_CPU_FUNC);
 }
 static void
@@ -490,6 +844,7 @@ set_max_threads(PyObject *m UNUSED, PyObject *obj)
 static PyMethodDef gumath_methods [] =
 {
   /* Methods */
+  { "vfold", (PyCFunction)gufunc_vfold, METH_VARARGS|METH_KEYWORDS, NULL },
   { "unsafe_add_kernel", (PyCFunction)unsafe_add_kernel, METH_VARARGS|METH_KEYWORDS, NULL },
   { "get_max_threads", (PyCFunction)get_max_threads, METH_NOARGS, NULL },
   { "set_max_threads", (PyCFunction)set_max_threads, METH_O, NULL },
@@ -554,11 +909,21 @@ PyInit__gumath(void)
         goto error;
     }
+    positional_empty = PyTuple_New(0);
+    if (positional_empty == NULL) {
+        goto error;
+    }
     m = PyModule_Create(&gumath_module);
     if (m == NULL) {
         goto error;
     }
+    Py_INCREF(&Gufunc_Type);
+    if (PyModule_AddObject(m, "gufunc", (PyObject *)&Gufunc_Type) < 0) {
+        goto error;
+    }
     Py_INCREF(capsule);
     if (PyModule_AddObject(m, "_API", capsule) < 0) {
         goto error;
@@ -571,6 +936,7 @@ PyInit__gumath(void)
     return m;
 error:
+    Py_CLEAR(positional_empty);
     Py_CLEAR(xnd);
     Py_CLEAR(m);
     return NULL;