RubyGems - gumath - Versions diffs - 0.2.0dev5 → 0.2.0dev8 - Mend

gumath 0.2.0dev5 → 0.2.0dev8

Files changed (99) hide show

checksums.yaml +4 -4
data/CONTRIBUTING.md +7 -2
data/Gemfile +0 -3
data/ext/ruby_gumath/GPATH +0 -0
data/ext/ruby_gumath/GRTAGS +0 -0
data/ext/ruby_gumath/GTAGS +0 -0
data/ext/ruby_gumath/extconf.rb +0 -5
data/ext/ruby_gumath/functions.c +10 -2
data/ext/ruby_gumath/gufunc_object.c +15 -4
data/ext/ruby_gumath/gufunc_object.h +9 -3
data/ext/ruby_gumath/gumath/Makefile +63 -0
data/ext/ruby_gumath/gumath/Makefile.in +1 -0
data/ext/ruby_gumath/gumath/config.h +56 -0
data/ext/ruby_gumath/gumath/config.h.in +3 -0
data/ext/ruby_gumath/gumath/config.log +497 -0
data/ext/ruby_gumath/gumath/config.status +1034 -0
data/ext/ruby_gumath/gumath/configure +375 -4
data/ext/ruby_gumath/gumath/configure.ac +47 -3
data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
data/ext/ruby_gumath/gumath/setup.py +67 -6
data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
data/ext/ruby_gumath/include/gumath.h +55 -14
data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
data/ext/ruby_gumath/lib/libgumath.a +0 -0
data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/ruby_gumath.c +231 -70
data/ext/ruby_gumath/ruby_gumath.h +4 -1
data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
data/ext/ruby_gumath/util.c +34 -0
data/ext/ruby_gumath/util.h +9 -0
data/gumath.gemspec +3 -2
data/lib/gumath.rb +55 -1
data/lib/gumath/version.rb +2 -2
data/lib/ruby_gumath.so +0 -0
metadata +63 -10
data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449

data/ext/ruby_gumath/gumath/setup.py CHANGED

@@ -43,6 +43,7 @@ from glob import glob
 import platform
 import subprocess
 import shutil
+import argparse
 import warnings
@@ -55,6 +56,15 @@ LONG_DESCRIPTION = """\
 warnings.simplefilter("ignore", UserWarning)
+# Pre-parse and remove the '-j' argument from sys.argv.
+parser = argparse.ArgumentParser()
+parser.add_argument('-j', default=None)
+values, rest = parser.parse_known_args()
+PARALLEL = values.j
+sys.argv = sys.argv[:1] + rest
 if sys.platform == "darwin":
     LIBNAME = "libgumath.dylib"
     LIBSONAME = "libgumath.0.dylib"
@@ -154,7 +164,10 @@ if len(sys.argv) == 2:
         path = module_path + ':' + python_path if python_path else module_path
         env = os.environ.copy()
         env['PYTHONPATH'] = path
-        ret = subprocess.call([sys.executable, "python/test_gumath.py"], env=env)
+        ret = subprocess.call([sys.executable, "python/test_gumath.py", "--long"], env=env)
+        if ret != 0:
+            sys.exit(ret)
+        ret = subprocess.call([sys.executable, "python/test_xndarray.py"], env=env)
         sys.exit(ret)
     elif sys.argv[1] == 'clean':
         shutil.rmtree("build", ignore_errors=True)
@@ -173,11 +186,27 @@ if len(sys.argv) == 2:
     else:
         pass
+def get_config_vars():  # Parse "config.h" into a dict mapping macro names to integer values.
+    f = open("config.h")  # relative path: resolved against the current working directory
+    config_vars = {}
+    for line in f:
+        if line.startswith("#define"):
+            l = line.split()  # expected layout: ["#define", NAME, VALUE, ...]
+            try:
+                config_vars[l[1]] = int(l[2])  # NOTE(review): a bare "#define NAME" has no l[2]; the IndexError is not caught below
+            except ValueError:
+                pass  # values that int() cannot parse are skipped, so such names stay absent from the dict
+        elif line.startswith("/* #undef"):
+            l = line.split()  # e.g. ["/*", "#undef", NAME, "*/"], so NAME is l[2]
+            config_vars[l[2]] = 0  # a commented-out #undef is recorded as 0
+    f.close()  # NOTE(review): skipped if the loop raises; a "with" block would close the file on every path
+    return config_vars
 def gumath_extensions():
     add_include_dirs = [".", "libgumath", "ndtypes/python/ndtypes", "xnd/python/xnd"] + INCLUDES
     add_library_dirs = ["libgumath", "ndtypes/libndtypes", "xnd/libxnd"] + LIBS
     add_depends = []
+    config_vars = {}
     if sys.platform == "win32":
         add_libraries = ["libndtypes-0.2.0dev3.dll", "libxnd-0.2.0dev3.dll", "libgumath-0.2.0dev3.dll"]
@@ -199,6 +228,14 @@ def gumath_extensions():
                  os.system("vcbuild32.bat")
             os.chdir("..")
     else:
+        if BUILD_ALL:
+            cflags = '"-I%s -I%s"' % tuple(CONFIGURE_INCLUDES)
+            ldflags = '"-L%s -L%s"' % tuple(CONFIGURE_LIBS)
+            make = "make -j%d" % int(PARALLEL) if PARALLEL else "make"
+            os.system("./configure CFLAGS=%s LDFLAGS=%s && %s" % (cflags, ldflags, make))
+        config_vars = get_config_vars()
         add_extra_compile_args = ["-Wextra", "-Wno-missing-field-initializers", "-std=c11"]
         if sys.platform == "darwin":
             add_libraries = ["ndtypes", "xnd", "gumath"]
@@ -209,10 +246,16 @@ def gumath_extensions():
             add_extra_link_args = []
             add_runtime_library_dirs = ["$ORIGIN"]
-        if BUILD_ALL:
-            cflags = '"-I%s -I%s"' % tuple(CONFIGURE_INCLUDES)
-            ldflags = '"-L%s -L%s"' % tuple(CONFIGURE_LIBS)
-            os.system("./configure CFLAGS=%s LDFLAGS=%s && make" % (cflags, ldflags))
+        if config_vars["HAVE_CUDA"]:
+            add_libraries += ["cudart"]
+            for d in [
+                "/usr/cuda/lib",
+                "/usr/cuda/lib64",
+                "/usr/local/cuda/lib/",
+                "/usr/local/cuda/lib64"]:
+                if os.path.isdir(d):
+                    add_library_dirs.append(d)
     def gumath_ext():
         sources = ["python/gumath/_gumath.c"]
@@ -244,6 +287,21 @@ def gumath_extensions():
             runtime_library_dirs = add_runtime_library_dirs
         )
+    def cuda_ext():  # Describe the "gumath.cuda" extension module, built from python/gumath/cuda.c.
+        sources = ["python/gumath/cuda.c"]
+        return Extension (
+            "gumath.cuda",
+            include_dirs = add_include_dirs,  # the add_* names are not defined here; presumably locals of the enclosing function
+            library_dirs = add_library_dirs,
+            depends = add_depends,
+            sources = sources,
+            libraries = add_libraries,
+            extra_compile_args = add_extra_compile_args,
+            extra_link_args = add_extra_link_args,
+            runtime_library_dirs = add_runtime_library_dirs
+        )
     def examples_ext():
         sources = ["python/gumath/examples.c"]
@@ -259,7 +317,10 @@ def gumath_extensions():
             runtime_library_dirs = add_runtime_library_dirs
         )
-    return [gumath_ext(), functions_ext(), examples_ext()]
+    extensions = [gumath_ext(), functions_ext(), examples_ext()]
+    if config_vars.get("HAVE_CUDA"):
+        extensions += [cuda_ext()]
+    return extensions
 setup (
     name = "gumath",

data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc ADDED

@@ -0,0 +1,35 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cuda_runtime.h>
+static void  /* Terminate the program with exit status 1 unless err is cudaSuccess. */
+check(cudaError_t err)
+{
+    if (err != cudaSuccess) {
+        exit(1);  /* nothing is printed: the exit status is the only failure signal */
+    }
+}
+static int  /* Return the smaller of x and y (x when they are equal). */
+min(int x, int y)
+{
+    return x <= y ? x : y;
+}
+int main()  /* Print the smallest value of prop.major * 10 + prop.minor over all CUDA devices. */
+{
+    int res = INT_MAX;  /* NOTE(review): INT_MAX needs <climits>, which this file does not include directly - presumably pulled in transitively; confirm */
+    cudaDeviceProp prop;
+    int count, i, n;
+    check(cudaGetDeviceCount(&count));  /* any CUDA error exits with status 1 via check() */
+    for (i = 0; i < count; i++) {
+        check(cudaGetDeviceProperties(&prop, i));
+        n = prop.major * 10 + prop.minor;  /* e.g. major 7, minor 5 gives 75 */
+        res = min(res, n);  /* running minimum over devices 0..count-1 */
+    }
+    printf("%d", res);  /* no trailing newline. NOTE(review): if count is 0 the loop never runs and INT_MAX is printed */
+    return 0;
+}

data/ext/ruby_gumath/include/gumath.h CHANGED

@@ -34,6 +34,17 @@
 #ifndef GUMATH_H
 #define GUMATH_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef __cplusplus
+  #include <cstdint>
+#else
+  #include <stdint.h>
+#endif
 #include "ndtypes.h"
 #include "xnd.h"
@@ -65,7 +76,8 @@
 #endif
-#define GM_MAX_KERNELS 512
+#define GM_MAX_KERNELS 8192
+#define GM_THREAD_CUTOFF 1000000
 typedef float float32_t;
 typedef double float64_t;
@@ -74,15 +86,25 @@ typedef double float64_t;
 typedef int (* gm_xnd_kernel_t)(xnd_t stack[], ndt_context_t *ctx);
 typedef int (* gm_strided_kernel_t)(char **args, intptr_t *dimensions, intptr_t *steps, void *data);
-/* Collection of specialized kernels for a single function signature. */
+/*
+ * Collection of specialized kernels for a single function signature.
+ *
+ * NOTE: The specialized kernel lookup scheme is transitional and may
+ * be replaced by something else.
+ *
+ * This should be considered as a first version of a kernel request
+ * protocol.
+ */
 typedef struct {
-    ndt_t *sig;
+    const ndt_t *sig;
     const ndt_constraint_t *constraint;
     /* Xnd signatures */
-    gm_xnd_kernel_t Opt;     /* dispatch ensures elementwise, at least 1D, contiguous in last dimensions */
-    gm_xnd_kernel_t C;       /* dispatch ensures c-contiguous in inner dimensions */
-    gm_xnd_kernel_t Fortran; /* dispatch ensures f-contiguous in inner dimensions */
+    gm_xnd_kernel_t OptC;    /* C in inner+1 dimensions */
+    gm_xnd_kernel_t OptZ;    /* C in inner dimensions, C or zero stride in (inner+1)th. */
+    gm_xnd_kernel_t OptS;    /* strided in (inner+1)th. */
+    gm_xnd_kernel_t C;       /* C in inner dimensions */
+    gm_xnd_kernel_t Fortran; /* Fortran in inner dimensions */
     gm_xnd_kernel_t Xnd;     /* selected if non-contiguous or the other fields are NULL */
     /* NumPy signature */
@@ -99,11 +121,17 @@ typedef struct {
     const char *name;
     const char *sig;
     const ndt_constraint_t *constraint;
+    uint32_t cap;
-    gm_xnd_kernel_t Opt;
+    /* Xnd signatures */
+    gm_xnd_kernel_t OptC;
+    gm_xnd_kernel_t OptZ;
+    gm_xnd_kernel_t OptS;
     gm_xnd_kernel_t C;
     gm_xnd_kernel_t Fortran;
     gm_xnd_kernel_t Xnd;
+    /* NumPy signature */
     gm_strided_kernel_t Strided;
 } gm_kernel_init_t;
@@ -115,7 +143,10 @@ typedef struct {
 /* Multimethod with associated kernels */
 typedef struct gm_func gm_func_t;
-typedef const gm_kernel_set_t *(*gm_typecheck_t)(ndt_apply_spec_t *spec, const gm_func_t *f, const ndt_t *in[], int nin, ndt_context_t *ctx);
+typedef const gm_kernel_set_t *(*gm_typecheck_t)(ndt_apply_spec_t *spec, const gm_func_t *f,
+                                                 const ndt_t *in[], const int64_t li[],
+                                                 int nin, int nout, bool check_broadcast,
+                                                 ndt_context_t *ctx);
 struct gm_func {
     char *name;
     gm_typecheck_t typecheck; /* Experimental optimized type-checking, may be NULL. */
@@ -139,10 +170,10 @@ GM_API int gm_add_kernel(gm_tbl_t *tbl, const gm_kernel_init_t *kernel, ndt_cont
 GM_API int gm_add_kernel_typecheck(gm_tbl_t *tbl, const gm_kernel_init_t *kernel, ndt_context_t *ctx, gm_typecheck_t f);
 GM_API gm_kernel_t gm_select(ndt_apply_spec_t *spec, const gm_tbl_t *tbl, const char *name,
-                             const ndt_t *in_types[], int nin, const xnd_t args[],
-                             ndt_context_t *ctx);
+                             const ndt_t *types[], const int64_t li[], int nin, int nout,
+                             bool check_broadcast, const xnd_t args[], ndt_context_t *ctx);
 GM_API int gm_apply(const gm_kernel_t *kernel, xnd_t stack[], int outer_dims, ndt_context_t *ctx);
-GM_API int gm_apply_thread(const gm_kernel_t *kernel, xnd_t stack[], int outer_dims, uint32_t flags, const int64_t nthreads, ndt_context_t *ctx);
+GM_API int gm_apply_thread(const gm_kernel_t *kernel, xnd_t stack[], int outer_dims, const int64_t nthreads, ndt_context_t *ctx);
 /******************************************************************************/
@@ -171,6 +202,7 @@ GM_API int gm_np_map(const gm_strided_kernel_t f,
 /*                                  Xnd loops                                 */
 /******************************************************************************/
+GM_API int array_shape_check(xnd_t *x, const int64_t shape, ndt_context_t *ctx);
 GM_API int gm_xnd_map(const gm_xnd_kernel_t f, xnd_t stack[], const int nargs,
                       const int outer_dims, ndt_context_t *ctx);
@@ -191,10 +223,14 @@ GM_API int gm_tbl_map(const gm_tbl_t *tbl, int (*f)(const gm_func_t *, void *sta
 /******************************************************************************/
 GM_API void gm_init(void);
-GM_API int gm_init_unary_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
-GM_API int gm_init_binary_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
+GM_API int gm_init_cpu_unary_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
+GM_API int gm_init_cpu_binary_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
+GM_API int gm_init_bitwise_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
+GM_API int gm_init_cuda_unary_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
+GM_API int gm_init_cuda_binary_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
 GM_API int gm_init_example_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
-GM_API int gm_init_bfloat16_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
 GM_API int gm_init_graph_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
 GM_API int gm_init_quaternion_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
 GM_API int gm_init_pdist_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
@@ -202,4 +238,9 @@ GM_API int gm_init_pdist_kernels(gm_tbl_t *tbl, ndt_context_t *ctx);
 GM_API void gm_finalize(void);
+#ifdef __cplusplus
+} /* END extern "C" */
+#endif
 #endif /* GUMATH_H */

data/ext/ruby_gumath/include/ruby_gumath.h CHANGED

@@ -33,8 +33,11 @@
 #define RUBY_GUMATH_H
 /* Classes */
-VALUE cGumath;
+extern VALUE cGumath;
+/* C API call for adding functions from a gumath kernel table to a Ruby module.
+ * Only adds CPU functions.
+ */
 int rb_gumath_add_functions(VALUE module, const gm_tbl_t *tbl);
 #define GUMATH_FUNCTION_HASH rb_intern("@gumath_functions")

data/ext/ruby_gumath/lib/libgumath.a CHANGED

Binary file

data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 CHANGED

Binary file

data/ext/ruby_gumath/ruby_gumath.c CHANGED

@@ -43,12 +43,14 @@ static gm_tbl_t *table = NULL;
 /* Maximum number of threads */
 static int64_t max_threads = 1;
 static int initialized = 0;
-extern VALUE cGumath;
+VALUE cGumath;
 /****************************************************************************/
 /*                               Error handling                             */
 /****************************************************************************/
+static VALUE rb_eValueError;
 VALUE
 seterr(ndt_context_t *ctx)
 {
@@ -59,117 +61,274 @@ seterr(ndt_context_t *ctx)
 /*                               Instance methods                           */
 /****************************************************************************/
+/* Parse optional arguments passed to GuFuncObject#call.
+ *
+ * Populates the rbstack with all the input arguments. Then checks whether
+ * the 'out' kwarg has been specified and populates the rest of rbstack
+ * with contents of 'out'.
+ */
+void  /* NOTE(review): not declared static - confirm whether any other translation unit needs this symbol */
+parse_args(VALUE *rbstack, int *rb_nin, int *rb_nout, int *rb_nargs, int noptargs,
+           VALUE *argv, VALUE out)
+{
+  size_t nin = noptargs, nout;  /* the first noptargs entries of argv are treated as the inputs */
+  if (noptargs == 0) {
+    *rb_nin = 0;  /* the store at the end of the function writes *rb_nin again */
+  }
+  for (int i = 0; i < nin; i++) {  /* NOTE(review): compares int i against size_t nin */
+    if (!rb_is_a(argv[i], cXND)) {
+      rb_raise(rb_eArgError, "expected xnd arguments.");
+    }
+    rbstack[i] = argv[i];  /* inputs occupy rbstack[0..nin-1]; no bound on nin is checked in this function */
+  }
+  if (out == Qnil) {
+    nout = 0;  /* no 'out' given: only the inputs are placed on rbstack */
+  }
+  else {
+    if (rb_xnd_check_type(out)) {  /* case 1: 'out' is a single XND object */
+      nout = 1;
+      if (nin + nout > NDT_MAX_ARGS) {
+        rb_raise(rb_eTypeError, "max number of arguments is %d, got %ld.",
+                 NDT_MAX_ARGS, nin+nout);  /* NOTE(review): nin+nout is size_t but is formatted with %ld */
+      }
+      rbstack[nin] = out;
+    }
+    else if (RB_TYPE_P(out, T_ARRAY)) {  /* case 2: 'out' is an Array; every element must be an XND */
+      nout = rb_ary_size(out);
+      if (nout > NDT_MAX_ARGS || nin+nout > NDT_MAX_ARGS) {
+        rb_raise(rb_eTypeError, "max number of arguments is %d, got %ld.",
+                 NDT_MAX_ARGS, nin+nout);  /* NOTE(review): same size_t / %ld mismatch as above */
+      }
+      for (int i = 0; i < nout; ++i) {  /* NOTE(review): compares int i against size_t nout */
+        VALUE v = rb_ary_entry(out, i);
+        if (!rb_is_a(v, cXND)) {
+          rb_raise(rb_eTypeError, "expected xnd argument in all elements of out array.");
+        }
+        rbstack[nin+i] = v;  /* outputs follow the inputs on rbstack */
+      }
+    }
+    else {
+      rb_raise(rb_eTypeError, "'out' argument must of type XND or Array of XND objects.");  /* NOTE(review): message reads "must of type" - "be" appears to be missing */
+    }
+  }
+  *rb_nin = (int)nin;
+  *rb_nout = (int)nout;
+  *rb_nargs = (int)nin + (int)nout;
+}
+/* Implement call method on the GufuncObject call. */
 static VALUE  /* Select a kernel via gm_select() for the given arguments, apply it, and return the output object(s). */
 Gumath_GufuncObject_call(int argc, VALUE *argv, VALUE self)
 {
+  VALUE out = Qnil;
+  VALUE dt = Qnil;
+  VALUE cls = Qnil;
   NDT_STATIC_CONTEXT(ctx);
+  VALUE rbstack[NDT_MAX_ARGS], opts = Qnil;
   xnd_t stack[NDT_MAX_ARGS];
-  const ndt_t *in_types[NDT_MAX_ARGS];
+  const ndt_t *types[NDT_MAX_ARGS];
   gm_kernel_t kernel;
   ndt_apply_spec_t spec = ndt_apply_spec_empty;
-  GufuncObject *self_p;
-  VALUE result[NDT_MAX_ARGS];
-  int i, k;
-  size_t nin = argc;
+  int64_t li[NDT_MAX_ARGS];
+  NdtObject *dt_p;  /* NOTE(review): not referenced anywhere else in this function */
+  int k;
+  ndt_t *dtype = NULL;
+  int nin = argc, nout, nargs;
+  bool have_cpu_device = false;  /* set below when any argument is not CUDA-managed */
+  GufuncObject * self_p;
+  bool check_broadcast = true, enable_threads = true;  /* neither flag is reassigned in this function */
   if (argc > NDT_MAX_ARGS) {
     rb_raise(rb_eArgError, "too many arguments.");
   }
+  /* parse keyword arguments. */
+  int noptargs = argc;
+  for (int i = 0; i < argc; ++i) {
+    if (RB_TYPE_P(argv[i], T_HASH)) {  /* the first Hash argument ends the positional inputs and becomes opts */
+      noptargs = i;
+      opts = argv[i];
+      break;
+    }
+  }
+  if (NIL_P(opts)) { opts = rb_hash_new(); }
+  out = rb_hash_aref(opts, ID2SYM(rb_intern("out")));
+  dt = rb_hash_aref(opts, ID2SYM(rb_intern("dtype")));
+  cls = rb_hash_aref(opts, ID2SYM(rb_intern("cls")));
+  if (NIL_P(cls)) { cls = cXND; }
+  if (!NIL_P(dt)) {
+    if (!NIL_P(out)) {
+      rb_raise(rb_eArgError, "the 'out' and 'dtype' arguments are mutually exclusive.");
+    }
-  /* Prepare arguments for sending into gumath function. */
-  for (i = 0; i < argc; i++) {
-    if (!rb_xnd_check_type(argv[i])) {
-      VALUE str = rb_funcall(argv[i], rb_intern("inspect"), 0, NULL);
-      rb_raise(rb_eArgError, "Args must be XND. Received %s.", RSTRING_PTR(str));
+    if (!rb_ndtypes_check_type(dt)) {
+      rb_raise(rb_eArgError, "'dtype' argument must be an NDT object.");
     }
+    dtype = (ndt_t *)rb_ndtypes_const_ndt(dt);
+    ndt_incref(dtype);  /* balanced by the ndt_decref() calls in the 'if (dtype)' section below */
+  }
-    stack[i] = *rb_xnd_const_xnd(argv[i]);
-    in_types[i] = stack[i].type;
+  if (!rb_klass_has_ancestor(cls, cXND)) {
+    rb_raise(rb_eTypeError, "the 'cls' argument must be a subtype of 'xnd'.");  /* NOTE(review): raised after ndt_incref(dtype) with no ndt_decref on this path - possible reference leak; confirm */
   }
-  /* Select the gumath function to be called from the function table. */
+  /* parse leading optional arguments */
+  parse_args(rbstack, &nin, &nout, &nargs, noptargs, argv, out);
+  for (k = 0; k < nargs; ++k) {
+    if (!rb_xnd_is_cuda_managed(rbstack[k])) {
+      have_cpu_device = true;
+    }
+    stack[k] = *rb_xnd_const_xnd(rbstack[k]);  /* copies the xnd_t view by value onto the local stack array */
+    types[k] = stack[k].type;
+    li[k] = stack[k].index;
+  }
   GET_GUOBJ(self, self_p);
+  if (have_cpu_device) {
+    if (self_p->flags & GM_CUDA_MANAGED_FUNC) {  /* NOTE(review): tested with '&' here but with '==' further down - confirm which is intended */
+      rb_raise(rb_eValueError,
+               "cannot run a cuda function on xnd objects with cpu memory.");
+    }
+  }
+  kernel = gm_select(&spec, self_p->table, self_p->name, types, li, nin, nout,
+                     nout && check_broadcast, stack, &ctx);
-  kernel = gm_select(&spec, self_p->table, self_p->name, in_types, argc, stack, &ctx);
   if (kernel.set == NULL) {
     seterr(&ctx);
     raise_error();
   }
-  if (spec.nbroadcast > 0) {
-    for (i = 0; i < argc; i++) {
-      stack[i].type = spec.broadcast[i];
+  if (dtype) {
+    if (spec.nout != 1) {
+      ndt_err_format(&ctx, NDT_TypeError,
+                     "the 'dtype' argument is only supported for a single "
+                     "return value.");
+      ndt_apply_spec_clear(&spec);
+      ndt_decref(dtype);
+      seterr(&ctx);
+      raise_error();
     }
-  }
-  /* Populate output values with empty XND objects. */
-  for (i = 0; i < spec.nout; i++) {
-    if (ndt_is_concrete(spec.out[i])) {
-      VALUE x = rb_xnd_empty_from_type(spec.out[i]);
-      if (x == NULL) {
-        ndt_apply_spec_clear(&spec);
-        rb_raise(rb_eNoMemError, "could not allocate empty XND object.");
-      }
-      result[i] = x;
-      stack[nin+i] = *rb_xnd_const_xnd(x);
+    const ndt_t *u = spec.types[spec.nin];  /* the first output type found by the initial gm_select() */
+    const ndt_t *v = ndt_copy_contiguous_dtype(u, dtype, 0, &ctx);
+    ndt_apply_spec_clear(&spec);
+    ndt_decref(dtype);
+    if (v == NULL) {
+      seterr(&ctx);
+      raise_error();
     }
-    else {
-      result[i] = NULL;
-      stack[nin+i] = xnd_error;
+    types[nin] = v;
+    kernel = gm_select(&spec, self_p->table, self_p->name, types, li, nin, 1,
+                       1 && check_broadcast, stack, &ctx);  /* second selection, now with the dtype-derived output type in types[nin] */
+    if (kernel.set == NULL) {
+      seterr(&ctx);
+      raise_error();
     }
   }
-  /* Actually call the kernel function with prepared input and output args. */
-#ifdef HAVE_PTHREAD_H
-  if (gm_apply_thread(&kernel, stack, spec.outer_dims, spec.flags,
-                      max_threads, &ctx) < 0) {
-    seterr(&ctx);
-    raise_error();
+  /*
+   * Replace args/kwargs types with types after substitution and broadcasting.
+   * This includes 'out' types, if explicitly passed as kwargs.
+   */
+  for (int i = 0; i < spec.nargs; ++i) {
+    stack[i].type = spec.types[i];
+  }
+  if (nout == 0) {
+    /* 'out' types have been inferred, create new XndObjects. */
+    VALUE x;
+    for (int i = 0; i < spec.nout; ++i) {
+      if (ndt_is_concrete(spec.types[nin+i])) {
+        uint32_t flags = self_p->flags == GM_CUDA_MANAGED_FUNC ? XND_CUDA_MANAGED : 0;
+        x = rb_xnd_empty_from_type(cls, spec.types[nin+i], flags);
+        rbstack[nin+i] = x;
+        stack[nin+i] = *rb_xnd_const_xnd(x);
+      }
+      else {
+        rb_raise(rb_eValueError,
+                 "args with abstract types are temporarily disabled.");  /* NOTE(review): unlike the other error paths, spec is not cleared before raising - confirm */
+      }
+    }
   }
+  if (self_p->flags == GM_CUDA_MANAGED_FUNC) {
+#ifdef HAVE_CUDA
+    /* populate with CUDA specific stuff */
 #else
-  if (gm_apply(&kernel, stack, spec.outer_dims, &ctx) < 0) {
+    ndt_err_format(&ctx, NDT_RuntimeError,
+                   "internal error: GM_CUDA_MANAGED_FUNC set in a build without cuda support");
+    ndt_apply_spec_clear(&spec);
     seterr(&ctx);
     raise_error();
+#endif // HAVE_CUDA
   }
-#endif
-  /* Prepare output XND objects. */
-  for (i = 0; i < spec.nout; i++) {
-    if (ndt_is_abstract(spec.out[i])) {
-      ndt_del(spec.out[i]);
-      VALUE x = rb_xnd_from_xnd(&stack[nin+i]);
-      stack[nin+i] = xnd_error;
-      if (x == NULL) {
-        for (k = i+i; k < spec.nout; k++) {
-          if (ndt_is_abstract(spec.out[k])) {
-            xnd_del_buffer(&stack[nin+k], XND_OWN_ALL);
-          }
-        }
-      }
-      result[i] = x;
-    }
-  }
+  else {
+#ifdef HAVE_PTHREAD_H
+    const int rounding = fegetround();  /* saved so the caller's rounding mode can be restored after the kernel runs */
+    fesetround(FE_TONEAREST);
+    const int64_t N = enable_threads ? max_threads : 1;
+    const int ret = gm_apply_thread(&kernel, stack, spec.outer_dims, N, &ctx);
+    fesetround(rounding);
-  if (spec.nbroadcast > 0) {
-    for (i = 0; i < nin; ++i) {
-      ndt_del(spec.broadcast[i]);
+    if (ret < 0) {
+      ndt_apply_spec_clear(&spec);
+      seterr(&ctx);
+      raise_error();
     }
+#else
+    const int rounding = fegetround();
+    fesetround(FE_TONEAREST);
+    const int ret = gm_apply(&kernel, stack, spec.outer_dims, &ctx);
+    fesetround(rounding);
+    if (ret < 0) {
+      ndt_apply_spec_clear(&spec);
+      seterr(&ctx);
+      raise_error();
+    }
+#endif // HAVE_PTHREAD_H
   }
-  /* Return result */
-  switch(spec.nout) {
-  case 0: return Qnil;
-  case 1: return result[0];
+  nin = spec.nin;  /* copy the counts out of spec before it is cleared below */
+  nout = spec.nout;
+  nargs = spec.nargs;
+  ndt_apply_spec_clear(&spec);
+  switch (nout) {
+  case 0: {
+    return Qnil;
+  }
+  case 1: {
+    return rbstack[nin];  /* the single output sits directly after the inputs */
+  }
   default: {
-    VALUE tuple = array_new(spec.nout);
-    for (i = 0; i < spec.nout; ++i) {
-      rb_ary_store(tuple, i, result[i]);
+    VALUE arr = rb_ary_new2(nout);
+    for (int i = 0; i < nout; ++i) {
+      rb_ary_store(arr, i, rbstack[nin+i]);
     }
-    return tuple;
+    return arr;
   }
   }
 }
 /****************************************************************************/
 /*                               Singleton methods                          */
 /****************************************************************************/
@@ -225,7 +384,7 @@ add_function(const gm_func_t *f, void *args)
   struct map_args *a = (struct map_args *)args;
   VALUE func, func_hash;
-  func = GufuncObject_alloc(a->table, f->name);
+  func = GufuncObject_alloc(a->table, f->name, GM_CPU_FUNC);
   if (func == NULL) {
     return -1;
   }
@@ -236,7 +395,6 @@ add_function(const gm_func_t *f, void *args)
   return 0;
 }
-/* C API call for adding functions from a gumath kernel table to  */
 int
 rb_gumath_add_functions(VALUE module, const gm_tbl_t *tbl)
 {
@@ -289,6 +447,9 @@ void Init_ruby_gumath(void)
   /* Instance methods */
   rb_define_method(cGumath_GufuncObject, "call", Gumath_GufuncObject_call,-1);
+  /* errors */
+  rb_eValueError = rb_define_class("ValueError", rb_eRuntimeError);
   Init_gumath_functions();
   Init_gumath_examples();