RubyGems - gumath - Versions diffs - 0.2.0dev5 → 0.2.0dev8 - Mend

gumath 0.2.0dev5 → 0.2.0dev8

Files changed (99) hide show

checksums.yaml +4 -4
data/CONTRIBUTING.md +7 -2
data/Gemfile +0 -3
data/ext/ruby_gumath/GPATH +0 -0
data/ext/ruby_gumath/GRTAGS +0 -0
data/ext/ruby_gumath/GTAGS +0 -0
data/ext/ruby_gumath/extconf.rb +0 -5
data/ext/ruby_gumath/functions.c +10 -2
data/ext/ruby_gumath/gufunc_object.c +15 -4
data/ext/ruby_gumath/gufunc_object.h +9 -3
data/ext/ruby_gumath/gumath/Makefile +63 -0
data/ext/ruby_gumath/gumath/Makefile.in +1 -0
data/ext/ruby_gumath/gumath/config.h +56 -0
data/ext/ruby_gumath/gumath/config.h.in +3 -0
data/ext/ruby_gumath/gumath/config.log +497 -0
data/ext/ruby_gumath/gumath/config.status +1034 -0
data/ext/ruby_gumath/gumath/configure +375 -4
data/ext/ruby_gumath/gumath/configure.ac +47 -3
data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
data/ext/ruby_gumath/gumath/setup.py +67 -6
data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
data/ext/ruby_gumath/include/gumath.h +55 -14
data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
data/ext/ruby_gumath/lib/libgumath.a +0 -0
data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
data/ext/ruby_gumath/ruby_gumath.c +231 -70
data/ext/ruby_gumath/ruby_gumath.h +4 -1
data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
data/ext/ruby_gumath/util.c +34 -0
data/ext/ruby_gumath/util.h +9 -0
data/gumath.gemspec +3 -2
data/lib/gumath.rb +55 -1
data/lib/gumath/version.rb +2 -2
data/lib/ruby_gumath.so +0 -0
metadata +63 -10
data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449

data/ext/ruby_gumath/gumath/python/randfloat.py ADDED

@@ -0,0 +1,177 @@
+# Copyright (c) 2010 Python Software Foundation. All Rights Reserved.
+# Adapted from Python's Lib/test/test_strtod.py (by Mark Dickinson)
+# More test cases for deccheck.py.
+import random
+TEST_SIZE = 2
+def test_short_halfway_cases():
+    # exact halfway cases with a small number of significant digits
+    for k in 0, 5, 10, 15, 20:
+        # upper = smallest integer >= 2**54/5**k
+        upper = -(-2**54//5**k)
+        # lower = smallest odd number >= 2**53/5**k
+        lower = -(-2**53//5**k)
+        if lower % 2 == 0:
+            lower += 1
+        for i in range(10 * TEST_SIZE):
+            # Select a random odd n in [2**53/5**k,
+            # 2**54/5**k). Then n * 10**k gives a halfway case
+            # with small number of significant digits.
+            n, e = random.randrange(lower, upper, 2), k
+            # Remove any additional powers of 5.
+            while n % 5 == 0:
+                n, e = n // 5, e + 1
+            assert n % 10 in (1, 3, 7, 9)
+            # Try numbers of the form n * 2**p2 * 10**e, p2 >= 0,
+            # until n * 2**p2 has more than 20 significant digits.
+            digits, exponent = n, e
+            while digits < 10**20:
+                s = '{}e{}'.format(digits, exponent)
+                yield s
+                # Same again, but with extra trailing zeros.
+                s = '{}e{}'.format(digits * 10**40, exponent - 40)
+                yield s
+                digits *= 2
+            # Try numbers of the form n * 5**p2 * 10**(e - p5), p5
+            # >= 0, with n * 5**p5 < 10**20.
+            digits, exponent = n, e
+            while digits < 10**20:
+                s = '{}e{}'.format(digits, exponent)
+                yield s
+                # Same again, but with extra trailing zeros.
+                s = '{}e{}'.format(digits * 10**40, exponent - 40)
+                yield s
+                digits *= 5
+                exponent -= 1
+def test_halfway_cases():
+    # test halfway cases for the round-half-to-even rule
+    for i in range(1000):
+        for j in range(TEST_SIZE):
+            # bit pattern for a random finite positive (or +0.0) float
+            bits = random.randrange(2047*2**52)
+            # convert bit pattern to a number of the form m * 2**e
+            e, m = divmod(bits, 2**52)
+            if e:
+                m, e = m + 2**52, e - 1
+            e -= 1074
+            # add 0.5 ulps
+            m, e = 2*m + 1, e - 1
+            # convert to a decimal string
+            if e >= 0:
+                digits = m << e
+                exponent = 0
+            else:
+                # m * 2**e = (m * 5**-e) * 10**e
+                digits = m * 5**-e
+                exponent = e
+            s = '{}e{}'.format(digits, exponent)
+            yield s
+def test_boundaries():
+    # boundaries expressed as triples (n, e, u), where
+    # n*10**e is an approximation to the boundary value and
+    # u*10**e is 1ulp
+    boundaries = [
+        (10000000000000000000, -19, 1110),   # a power of 2 boundary (1.0)
+        (17976931348623159077, 289, 1995),   # overflow boundary (2.**1024)
+        (22250738585072013831, -327, 4941),  # normal/subnormal (2.**-1022)
+        (0, -327, 4941),                     # zero
+        ]
+    for n, e, u in boundaries:
+        for j in range(1000):
+            for i in range(TEST_SIZE):
+                digits = n + random.randrange(-3*u, 3*u)
+                exponent = e
+                s = '{}e{}'.format(digits, exponent)
+                yield s
+            n *= 10
+            u *= 10
+            e -= 1
+def test_underflow_boundary():
+    # test values close to 2**-1075, the underflow boundary; similar
+    # to boundary_tests, except that the random error doesn't scale
+    # with n
+    for exponent in range(-400, -320):
+        base = 10**-exponent // 2**1075
+        for j in range(TEST_SIZE):
+            digits = base + random.randrange(-1000, 1000)
+            s = '{}e{}'.format(digits, exponent)
+            yield s
+def test_bigcomp():
+    for ndigs in 5, 10, 14, 15, 16, 17, 18, 19, 20, 40, 41, 50:
+        dig10 = 10**ndigs
+        for i in range(100 * TEST_SIZE):
+            digits = random.randrange(dig10)
+            exponent = random.randrange(-400, 400)
+            s = '{}e{}'.format(digits, exponent)
+            yield s
+def test_parsing():
+    # make '0' more likely to be chosen than other digits
+    digits = '000000123456789'
+    signs = ('+', '-', '')
+    # put together random short valid strings
+    # \d*[.\d*]?e
+    for i in range(1000):
+        for j in range(TEST_SIZE):
+            s = random.choice(signs)
+            intpart_len = random.randrange(5)
+            s += ''.join(random.choice(digits) for _ in range(intpart_len))
+            if random.choice([True, False]):
+                s += '.'
+                fracpart_len = random.randrange(5)
+                s += ''.join(random.choice(digits)
+                             for _ in range(fracpart_len))
+            else:
+                fracpart_len = 0
+            if random.choice([True, False]):
+                s += random.choice(['e', 'E'])
+                s += random.choice(signs)
+                exponent_len = random.randrange(1, 4)
+                s += ''.join(random.choice(digits)
+                             for _ in range(exponent_len))
+            if intpart_len + fracpart_len:
+                yield s
+TESTCASES = [
+      [x for x in test_short_halfway_cases()],
+      [x for x in test_halfway_cases()],
+      [x for x in test_boundaries()],
+      [x for x in test_underflow_boundary()],
+      [x for x in test_bigcomp()],
+      [x for x in test_parsing()],
+]
+def un_randfloat():
+    for i in range(2):
+        l = random.choice(TESTCASES[:6])
+        yield random.choice(l)
+def bin_randfloat():
+    for i in range(2):
+        l1 = random.choice(TESTCASES)
+        l2 = random.choice(TESTCASES)
+        yield random.choice(l1), random.choice(l2)
+def tern_randfloat():
+    for i in range(2):
+        l1 = random.choice(TESTCASES)
+        l2 = random.choice(TESTCASES)
+        l3 = random.choice(TESTCASES)
+        yield random.choice(l1), random.choice(l2), random.choice(l3)

data/ext/ruby_gumath/gumath/python/test_gumath.py CHANGED

@@ -35,37 +35,59 @@ import gumath.functions as fn
 import gumath.examples as ex
 from xnd import xnd
 from ndtypes import ndt
-from extending import Graph, bfloat16
+from extending import Graph
 import sys, time
+import platform
 import math
+import cmath
 import unittest
 import argparse
+from gumath_aux import *
+try:
+    import gumath.cuda as cd
+except ImportError:
+    cd = None
 try:
     import numpy as np
+    np.warnings.filterwarnings('ignore')
 except ImportError:
     np = None
+SKIP_LONG = True
+SKIP_BRUTE_FORCE = True
-TEST_CASES = [
-  ([float(i)/100.0 for i in range(2000)], "2000 * float64", "float64"),
+ARCH = platform.architecture()[0]
-  ([[float(i)/100.0 for i in range(1000)], [float(i+1) for i in range(1000)]],
-   "2 * 1000 * float64", "float64"),
-  (1000 * [[float(i+1) for i in range(2)]], "1000 * 2 * float64", "float64"),
+class TestAPI(unittest.TestCase):
-  ([float(i)/10.0 for i in range(2000)], "2000 * float32", "float32"),
+    def test_api(self):
-  ([[float(i)/10.0 for i in range(1000)], [float(i+1) for i in range(1000)]],
-  "2 * 1000 * float32", "float32"),
-  (1000 * [[float(i+1) for i in range(2)]], "1000 * 2 * float32", "float32"),
-]
+        self.assertIsInstance(fn.add, gm.gufunc)
+        self.assertRaises(TypeError, gm.gufunc.__new__)
+        self.assertRaises(TypeError, gm.gufunc.__new__, 1)
 class TestCall(unittest.TestCase):
+    def test_subclass(self):
+        class X(xnd):
+            pass
+        x = X([1, 2, 3])
+        y = X([1, 2, 3])
+        z = fn.multiply(x, y)
+        self.assertEqual(z, [1, 4, 9])
+        self.assertEqual(type(z), xnd)
+        z = fn.multiply(x, y, cls=X)
+        self.assertEqual(z, [1, 4, 9])
+        self.assertEqual(type(z), X)
     def test_sin_scalar(self):
         x1 = xnd(1.2, type="float64")
@@ -213,6 +235,293 @@ class TestMissingValues(unittest.TestCase):
         self.assertEqual(ans.value, [{'valid': 2, 'missing': 1}, {'valid': 1, 'missing': 2}])
+    def test_unary(self):
+        a = [0, None, 2]
+        ans = xnd([math.sin(x) if x is not None else None for x in a])
+        x = xnd(a, dtype="?float64")
+        y = fn.sin(x)
+        self.assertEqual(y.value, ans)
+    def test_binary(self):
+        a = [3, None, 3]
+        b = [100, 1, None]
+        ans = xnd([t[0] * t[1] if t[0] is not None and t[1] is not None else None
+                   for t in zip(a, b)])
+        x = xnd(a)
+        y = xnd(b)
+        z = fn.multiply(x, y)
+        self.assertEqual(z.value, ans)
+    def test_reduce(self):
+        a = [1, None, 2]
+        x = xnd(a)
+        y = gm.reduce(fn.add, x)
+        self.assertEqual(y, None)
+        y = gm.reduce(fn.multiply, x)
+        self.assertEqual(y, None)
+        y = gm.reduce(fn.subtract, x)
+        self.assertEqual(y, None)
+        x = xnd([], dtype="?int32")
+        y = gm.reduce(fn.add, x)
+        self.assertEqual(y, 0)
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_reduce_cuda(self):
+        a = [1, None, 2]
+        x = xnd(a, device="cuda:managed")
+        y = gm.reduce(cd.add, x)
+        self.assertEqual(y, None)
+        y = gm.reduce(cd.multiply, x)
+        self.assertEqual(y, None)
+        x = xnd([], dtype="?int32", device="cuda:managed")
+        y = gm.reduce(fn.add, x)
+        self.assertEqual(y, 0)
+    def test_comparisons(self):
+        a = [1, None, 3, 5]
+        b = [2, None, 3, 4]
+        x = xnd(a)
+        y = xnd(b)
+        ans = fn.equal(x, y)
+        self.assertEqual(ans.value, [False, None, True, False])
+        ans = fn.not_equal(x, y)
+        self.assertEqual(ans.value, [True, None, False, True])
+        ans = fn.less(x, y)
+        self.assertEqual(ans.value, [True, None, False, False])
+        ans = fn.less_equal(x, y)
+        self.assertEqual(ans.value, [True, None, True, False])
+        ans = fn.greater_equal(x, y)
+        self.assertEqual(ans.value, [False, None, True, True])
+        ans = fn.greater(x, y)
+        self.assertEqual(ans.value, [False, None, False, True])
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_comparisons_cuda(self):
+        a = [1, None, 3, 5]
+        b = [2, None, 3, 4]
+        x = xnd(a, device="cuda:managed")
+        y = xnd(b, device="cuda:managed")
+        ans = cd.equal(x, y)
+        self.assertEqual(ans.value, [False, None, True, False])
+        ans = cd.not_equal(x, y)
+        self.assertEqual(ans.value, [True, None, False, True])
+        ans = cd.less(x, y)
+        self.assertEqual(ans.value, [True, None, False, False])
+        ans = cd.less_equal(x, y)
+        self.assertEqual(ans.value, [True, None, True, False])
+        ans = cd.greater_equal(x, y)
+        self.assertEqual(ans.value, [False, None, True, True])
+        ans = cd.greater(x, y)
+        self.assertEqual(ans.value, [False, None, False, True])
+    def test_equaln(self):
+        a = [1, None, 3, 5]
+        b = [2, None, 3, 4]
+        x = xnd(a)
+        y = xnd(b)
+        z = fn.equaln(x, y)
+        self.assertEqual(z, [False, True, True, False])
+        self.assertEqual(z.dtype, ndt("bool"))
+        a = [1, None, 3, 5]
+        b = [2, 0, 3, 4]
+        x = xnd(a)
+        y = xnd(b)
+        z = fn.equaln(x, y)
+        self.assertEqual(z, [False, False, True, False])
+        self.assertEqual(z.dtype, ndt("bool"))
+        # NA eqn NA
+        a = [None]
+        b = [None]
+        x = xnd(a)
+        y = xnd(b)
+        z = fn.equaln(x, y)
+        self.assertEqual(z, [True])
+        # !(NA eqn 0)
+        a = [None]
+        b = [0.0]
+        x = xnd(a)
+        y = xnd(b)
+        z = fn.equaln(x, y)
+        self.assertEqual(z, [False])
+        # !(0 eqn NA)
+        a = [0.0]
+        b = [None]
+        x = xnd(a)
+        y = xnd(b)
+        z = fn.equaln(x, y)
+        self.assertEqual(z, [False])
+        # !(NA eqn NaN)
+        a = [None]
+        b = [float("nan")]
+        x = xnd(a)
+        y = xnd(b)
+        z = fn.equaln(x, y)
+        self.assertEqual(z, [False])
+        # !(NaN eqn NA)
+        a = [float("nan")]
+        b = [None]
+        x = xnd(a)
+        y = xnd(b)
+        z = fn.equaln(x, y)
+        self.assertEqual(z, [False])
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_equaln_cuda(self):
+        a = [1, None, 3, 5]
+        b = [2, None, 3, 4]
+        x = xnd(a, device="cuda:managed")
+        y = xnd(b, device="cuda:managed")
+        z = cd.equaln(x, y)
+        self.assertEqual(z, [False, True, True, False])
+        self.assertEqual(z.dtype, ndt("bool"))
+        a = [1, None, 3, 5]
+        b = [2, 0, 3, 4]
+        x = xnd(a, device="cuda:managed")
+        y = xnd(b, device="cuda:managed")
+        z = cd.equaln(x, y)
+        self.assertEqual(z, [False, False, True, False])
+        self.assertEqual(z.dtype, ndt("bool"))
+        # NA eqn NA
+        a = [None]
+        b = [None]
+        x = xnd(a, device="cuda:managed")
+        y = xnd(b, device="cuda:managed")
+        z = cd.equaln(x, y)
+        self.assertEqual(z, [True])
+        # !(NA eqn 0)
+        a = [None]
+        b = [0.0]
+        x = xnd(a, device="cuda:managed")
+        y = xnd(b, device="cuda:managed")
+        z = cd.equaln(x, y)
+        self.assertEqual(z, [False])
+        # !(0 eqn NA)
+        a = [0.0]
+        b = [None]
+        x = xnd(a, device="cuda:managed")
+        y = xnd(b, device="cuda:managed")
+        z = cd.equaln(x, y)
+        self.assertEqual(z, [False])
+        # !(NA eqn NaN)
+        a = [None]
+        b = [float("nan")]
+        x = xnd(a, device="cuda:managed")
+        y = xnd(b, device="cuda:managed")
+        z = cd.equaln(x, y)
+        self.assertEqual(z, [False])
+        # !(NaN eqn NA)
+        a = [float("nan")]
+        b = [None]
+        x = xnd(a, device="cuda:managed")
+        y = xnd(b, device="cuda:managed")
+        z = cd.equaln(x, y)
+        self.assertEqual(z, [False])
+class TestEqualN(unittest.TestCase):
+    def test_nan_float(self):
+        for dtype in "bfloat16", "float32", "float64":
+            x = xnd([0, float("nan"), 2], dtype=dtype)
+            y = xnd([0, float("nan"), 2], dtype=dtype)
+            z = fn.equaln(x, y)
+            self.assertEqual(z, [True, True, True])
+            y = xnd([0, 1, 2], dtype=dtype)
+            z = fn.equaln(x, y)
+            self.assertEqual(z, [True, False, True])
+    def test_nan_complex(self):
+        for dtype in "complex64", "complex128":
+            for a, b, ans in [
+                (complex(float("nan"), 1.2), complex(float("nan"), 1.2), True),
+                (complex(float("nan"), 1.2), complex(float("nan"), 1), False),
+                (complex(float("nan"), float("nan")), complex(float("nan"), 1.2), False),
+                (complex(1.2, float("nan")), complex(1.2, float("nan")), True),
+                (complex(1.2, float("nan")), complex(1, float("nan")), False),
+                (complex(float("nan"), float("nan")), complex(1.2, float("nan")), False),
+                (complex(float("nan"), float("nan")), complex(float("nan"), float("nan")), True)]:
+                x = xnd([0, a, 2], dtype=dtype)
+                y = xnd([0, b, 2], dtype=dtype)
+                z = fn.equaln(x, y)
+                self.assertEqual(z, [True, ans, True])
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_nan_float_cuda(self):
+        for dtype in "bfloat16", "float16", "float32", "float64":
+            x = xnd([0, float("nan"), 2], dtype=dtype, device="cuda:managed")
+            y = xnd([0, float("nan"), 2], dtype=dtype, device="cuda:managed")
+            z = cd.equaln(x, y)
+            self.assertEqual(z, [True, True, True])
+            y = xnd([0, 1, 2], dtype=dtype, device="cuda:managed")
+            z = cd.equaln(x, y)
+            self.assertEqual(z, [True, False, True])
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_nan_complex_cuda(self):
+        for dtype in "complex64", "complex128":
+            for a, b, ans in [
+                (complex(float("nan"), 1.2), complex(float("nan"), 1.2), True),
+                (complex(float("nan"), 1.2), complex(float("nan"), 1), False),
+                (complex(float("nan"), float("nan")), complex(float("nan"), 1.2), False),
+                (complex(1.2, float("nan")), complex(1.2, float("nan")), True),
+                (complex(1.2, float("nan")), complex(1, float("nan")), False),
+                (complex(float("nan"), float("nan")), complex(1.2, float("nan")), False),
+                (complex(float("nan"), float("nan")), complex(float("nan"), float("nan")), True)]:
+                x = xnd([0, a, 2], dtype=dtype, device="cuda:managed")
+                y = xnd([0, b, 2], dtype=dtype, device="cuda:managed")
+                z = cd.equaln(x, y)
+                self.assertEqual(z, [True, ans, True])
 class TestRaggedArrays(unittest.TestCase):
@@ -237,8 +546,59 @@ class TestRaggedArrays(unittest.TestCase):
         self.assertEqual(y.value, ans)
+class TestFlexibleArrays(unittest.TestCase):
+    def test_sin_var_compatible(self):
+        s = math.sin
+        lst = [[[1.0],
+                [2.0, 3.0],
+                [4.0, 5.0, 6.0]],
+               [[7.0],
+                [8.0, 9.0],
+                [10.0, 11.0, 12.0]]]
+        ans = [[[s(1.0)],
+                [s(2.0), s(3.0)],
+                [s(4.0), s(5.0), s(6.0)]],
+               [[s(7.0)],
+                [s(8.0), s(9.0)],
+                [s(10.0), s(11.0), s(12.0)]]]
+        x = xnd(lst, type="array * array * array * float64")
+        y = fn.sin(x)
+        self.assertEqual(y.value, ans)
+    def test_add(self):
+        a = [[[1.0],
+              [2.0, 3.0],
+              [4.0, 5.0, 6.0]],
+             [[7.0],
+              [8.0, 9.0],
+              [10.0, 11.0, 12.0]]]
+        b = [[[2.0],
+              [3.0, 4.0],
+              [5.0, 6.0, 7.0]],
+            [[-8.0],
+             [-9.0, -10.0],
+             [111.1, 121.2, 25.3]]]
+        ans = [[[1.0+2.0],
+                [2.0+3.0, 3.0+4.0],
+                [4.0+5.0, 5.0+6.0, 6.0+7.0]],
+               [[7.0-8.0],
+                [8.0-9.0, 9.0-10.0],
+                [10.0+111.1, 11.0+121.2, 12.0+25.3]]]
+        x = xnd(a, type="array * array * array * float64")
+        y = xnd(b, type="array * array * array * float64")
+        z = fn.add(x, y)
+        self.assertEqual(z.value, ans)
 class TestGraphs(unittest.TestCase):
+    @unittest.skipIf(True, "abstract return types are temporarily disabled")
     def test_shortest_path(self):
         graphs = [[[(1, 1.2), (2, 4.4)],
                    [(2, 2.2)],
@@ -274,17 +634,6 @@ class TestGraphs(unittest.TestCase):
         self.assertRaises(ValueError, Graph, lst)
-@unittest.skipIf(sys.platform == "win32", "unresolved external symbols")
-class TestBFloat16(unittest.TestCase):
-    def test_init(self):
-        lst = [1.2e10, 2.1121, -3e20]
-        ans = [11945377792.0, 2.109375, -2.997595911977802e+20]
-        x = bfloat16(lst)
-        self.assertEqual(x.value, ans)
 class TestPdist(unittest.TestCase):
     def test_exceptions(self):
@@ -373,15 +722,1142 @@ class TestNumba(unittest.TestCase):
         np.testing.assert_equal(z, c)
+class TestOut(unittest.TestCase):
+    def test_api_cpu(self):
+        # negative
+        x = xnd([1, 2, 3])
+        y = xnd.empty("3 * int64")
+        z = fn.negative(x, out=y)
+        self.assertIs(z, y)
+        self.assertEqual(y, xnd([-1, -2, -3]))
+        # divmod
+        x = xnd([10, 20, 30])
+        y = xnd([7, 8, 9])
+        a = xnd.empty("3 * int64")
+        b = xnd.empty("3 * int64")
+        q, r = fn.divmod(x, y, out=(a, b))
+        self.assertIs(q, a)
+        self.assertIs(r, b)
+        self.assertEqual(q, xnd([1, 2, 3]))
+        self.assertEqual(r, xnd([3, 4, 3]))
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_api_cuda(self):
+        # negative
+        x = xnd([1, 2, 3], device="cuda:managed")
+        y = xnd.empty("3 * int64", device="cuda:managed")
+        z = cd.negative(x, out=y)
+        self.assertIs(z, y)
+        self.assertEqual(y, xnd([-1, -2, -3]))
+        # divmod
+        x = xnd([10, 20, 30], device="cuda:managed")
+        y = xnd([7, 8, 9], device="cuda:managed")
+        a = xnd.empty("3 * int64", device="cuda:managed")
+        b = xnd.empty("3 * int64", device="cuda:managed")
+        q, r = cd.divmod(x, y, out=(a, b))
+        self.assertIs(q, a)
+        self.assertIs(r, b)
+        self.assertEqual(q, xnd([1, 2, 3]))
+        self.assertEqual(r, xnd([3, 4, 3]))
+    def test_broadcast_cpu(self):
+        # multiply
+        x = xnd([1, 2, 3])
+        y = xnd([2])
+        z = xnd.empty("3 * int64")
+        ans = fn.multiply(x, y, out=z)
+        self.assertIs(ans, z)
+        self.assertEqual(ans, xnd([2, 4, 6]))
+        x = xnd([1, 2, 3])
+        y = xnd(2)
+        z = xnd.empty("3 * int64")
+        ans = fn.multiply(x, y, out=z)
+        self.assertIs(ans, z)
+        self.assertEqual(ans, xnd([2, 4, 6]))
+        # divmod
+        x = xnd([10, 20, 30])
+        y = xnd([3])
+        a = xnd.empty("3 * int64")
+        b = xnd.empty("3 * int64")
+        q, r = fn.divmod(x, y, out=(a, b))
+        self.assertIs(q, a)
+        self.assertIs(r, b)
+        self.assertEqual(q, xnd([3, 6, 10]))
+        self.assertEqual(r, xnd([1, 2, 0]))
+        x = xnd([10, 20, 30])
+        y = xnd(3)
+        a = xnd.empty("3 * int64")
+        b = xnd.empty("3 * int64")
+        q, r = fn.divmod(x, y, out=(a, b))
+        self.assertIs(q, a)
+        self.assertIs(r, b)
+        self.assertEqual(q, xnd([3, 6, 10]))
+        self.assertEqual(r, xnd([1, 2, 0]))
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_broadcast_cuda(self):
+        # multiply
+        x = xnd([1, 2, 3], device="cuda:managed")
+        y = xnd([2], device="cuda:managed")
+        z = xnd.empty("3 * int64", device="cuda:managed")
+        ans = fn.multiply(x, y, out=z)
+        self.assertIs(ans, z)
+        self.assertEqual(ans, xnd([2, 4, 6]))
+class TestUnaryCPU(unittest.TestCase):
+    def test_acos(self):
+        a = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
+        b = [math.acos(x) for x in a]
+        x = xnd(a, dtype="float64")
+        y = fn.acos(x)
+        self.assertEqual(y, b)
+    def test_acos_opt(self):
+        a = [0, 0.1, 0.2, None, 0.4, 0.5, 0.6, None]
+        b = [math.acos(x) if x is not None else None for x in a]
+        x = xnd(a, dtype="?float64")
+        y = fn.acos(x)
+        self.assertEqual(y, b)
+    def test_inexact_cast(self):
+        a = [0, 1, 2, 3, 4, 5, 6, 7]
+        x = xnd(a, dtype="int64")
+        self.assertRaises(ValueError, fn.sin, x)
+@unittest.skipIf(cd is None, "test requires cuda")
+class TestUnaryCUDA(unittest.TestCase):
+    def test_cos(self):
+        a = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
+        b = [math.cos(x) for x in a]
+        x = xnd(a, dtype="float64", device="cuda:managed")
+        y = cd.cos(x)
+        self.assertEqual(y, b)
+    def test_cos_opt(self):
+        a = [0, 0.1, 0.2, None, 0.4, 0.5, 0.6, None]
+        b = [math.cos(x) if x is not None else None for x in a]
+        x = xnd(a, dtype="?float64", device="cuda:managed")
+        y = cd.cos(x)
+        self.assertEqual(y, b)
+    def test_inexact_cast(self):
+        a = [0, 1, 2, 3, 4, 5, 6, 7]
+        x = xnd(a, dtype="int64", device="cuda:managed")
+        self.assertRaises(ValueError, cd.sin, x)
+class TestBinaryCPU(unittest.TestCase):
+    def test_binary(self):
+        for t, u in implemented_sigs["binary"]["default"]:
+            w = implemented_sigs["binary"]["default"][(t, u)]
+            if t.cpu_noimpl() or u.cpu_noimpl():
+                continue
+            x = xnd([0, 1, 2, 3, 4, 5, 6, 7], dtype=t.type)
+            y = xnd([1, 2, 3, 4, 5, 6, 7, 8], dtype=u.type)
+            z = fn.add(x, y)
+            self.assertEqual(z, [1, 3, 5, 7, 9, 11, 13, 15])
+    def test_add_opt(self):
+        for t, u in implemented_sigs["binary"]["default"]:
+            w = implemented_sigs["binary"]["default"][(t, u)]
+            if t.cpu_noimpl() or u.cpu_noimpl():
+                continue
+            x = xnd([0, 1, None, 3, 4, 5, 6, 7], dtype="?" + t.type)
+            y = xnd([1, 2, 3, 4, 5, 6, None, 8], dtype="?" + u.type)
+            z = fn.add(x, y)
+            self.assertEqual(z, [1, 3, None, 7, 9, 11, None, 15])
+    def test_subtract(self):
+        for t, u in implemented_sigs["binary"]["default"]:
+            w = implemented_sigs["binary"]["default"][(t, u)]
+            if t.cpu_noimpl() or u.cpu_noimpl():
+                continue
+            x = xnd([2, 3, 4, 5, 6, 7, 8, 9], dtype=t.type)
+            y = xnd([1, 2, 3, 4, 5, 6, 7, 8], dtype=u.type)
+            z = fn.subtract(x, y)
+            self.assertEqual(z, [1, 1, 1, 1, 1, 1, 1, 1])
+    def test_multiply(self):
+        for t, u in implemented_sigs["binary"]["default"]:
+            w = implemented_sigs["binary"]["default"][(t, u)]
+            if t.cpu_noimpl() or u.cpu_noimpl():
+                continue
+            x = xnd([2, 3, 4, 5, 6, 7, 8, 9], dtype=t.type)
+            y = xnd([1, 2, 3, 4, 5, 6, 7, 8], dtype=u.type)
+            z = fn.multiply(x, y)
+            self.assertEqual(z, [2, 6, 12, 20, 30, 42, 56, 72])
+@unittest.skipIf(cd is None, "test requires cuda")
+class TestBinaryCUDA(unittest.TestCase):
+    # Runs the binary kernels exposed by `cd` on "cuda:managed" arrays for
+    # every operand type pair in implemented_sigs["binary"]["default"].
+    def _check_default_sigs(self, func, lhs, rhs, expected, optional=False):
+        """Assert func(x, y) == expected for each default binary signature.
+        lhs and rhs are the Python lists used to build the operands x and y.
+        With optional=True both dtypes are prefixed with "?", as done by the
+        tests whose inputs contain None entries.
+        """
+        prefix = "?" if optional else ""
+        for t, u in implemented_sigs["binary"]["default"]:
+            # NOTE(review): the skip condition consults cpu_noimpl() although
+            # the kernels run on CUDA; TestBitwiseCUDA uses cuda_noimpl().
+            # Kept as in the original -- confirm which one is intended.
+            if t.cpu_noimpl() or u.cpu_noimpl():
+                continue
+            x = xnd(lhs, dtype=prefix + t.type, device="cuda:managed")
+            y = xnd(rhs, dtype=prefix + u.type, device="cuda:managed")
+            self.assertEqual(func(x, y), expected)
+    def test_binary(self):
+        self._check_default_sigs(cd.add,
+                                 [0, 1, 2, 3, 4, 5, 6, 7],
+                                 [1, 2, 3, 4, 5, 6, 7, 8],
+                                 [1, 3, 5, 7, 9, 11, 13, 15])
+    def test_add_opt(self):
+        # A None in either operand yields None at the same position.
+        self._check_default_sigs(cd.add,
+                                 [0, 1, None, 3, 4, 5, 6, 7],
+                                 [1, 2, 3, 4, 5, 6, None, 8],
+                                 [1, 3, None, 7, 9, 11, None, 15],
+                                 optional=True)
+    def test_subtract(self):
+        self._check_default_sigs(cd.subtract,
+                                 [2, 3, 4, 5, 6, 7, 8, 9],
+                                 [1, 2, 3, 4, 5, 6, 7, 8],
+                                 [1, 1, 1, 1, 1, 1, 1, 1])
+    def test_multiply(self):
+        self._check_default_sigs(cd.multiply,
+                                 [2, 3, 4, 5, 6, 7, 8, 9],
+                                 [1, 2, 3, 4, 5, 6, 7, 8],
+                                 [2, 6, 12, 20, 30, 42, 56, 72])
+class TestBitwiseCPU(unittest.TestCase):
+    # fn.bitwise_and for every operand type pair (t, u) of the "bitwise"
+    # binary signature table; pairs where either operand type reports
+    # cpu_noimpl() are skipped.
+    def test_and(self):
+        for t, u in implemented_sigs["binary"]["bitwise"]:
+            if t.cpu_noimpl() or u.cpu_noimpl():
+                continue
+            # Only 0/1 inputs are used for both operands.
+            x = xnd([0, 1, 0, 1, 1, 1, 1, 0], dtype=t.type)
+            y = xnd([1, 0, 0, 0, 1, 1, 1, 1], dtype=u.type)
+            z = fn.bitwise_and(x, y)
+            self.assertEqual(z, [0, 0, 0, 0, 1, 1, 1, 0])
+    def test_and_opt(self):
+        # Inputs and expected result do not depend on the signature.  A None
+        # in either operand is expected to yield None at that position.
+        a = [0, 1, None, 1, 1, 1, 1, 0]
+        b = [1, 1, 1, 1, 1, 1, None, 0]
+        c = [0, 1, None, 1, 1, 1, None, 0]
+        for t, u in implemented_sigs["binary"]["bitwise"]:
+            if t.cpu_noimpl() or u.cpu_noimpl():
+                continue
+            x = xnd(a, dtype="?" + t.type)
+            y = xnd(b, dtype="?" + u.type)
+            z = fn.bitwise_and(x, y)
+            self.assertEqual(z, c)
+@unittest.skipIf(cd is None, "test requires cuda")
+class TestBitwiseCUDA(unittest.TestCase):
+    # cd.bitwise_and on "cuda:managed" arrays for every operand type pair
+    # (t, u) of the "bitwise" binary signature table; pairs where either
+    # operand type reports cuda_noimpl() are skipped.
+    def test_and(self):
+        for t, u in implemented_sigs["binary"]["bitwise"]:
+            if t.cuda_noimpl() or u.cuda_noimpl():
+                continue
+            # Only 0/1 inputs are used for both operands.
+            x = xnd([0, 1, 0, 1, 1, 1, 1, 0], dtype=t.type, device="cuda:managed")
+            y = xnd([1, 0, 0, 0, 1, 1, 1, 1], dtype=u.type, device="cuda:managed")
+            z = cd.bitwise_and(x, y)
+            self.assertEqual(z, [0, 0, 0, 0, 1, 1, 1, 0])
+    def test_and_opt(self):
+        # Inputs and expected result do not depend on the signature.  A None
+        # in either operand is expected to yield None at that position.
+        a = [0, 1, None, 1, 1, 1, 1, 0]
+        b = [1, 1, 1, 1, 1, 1, None, 0]
+        c = [0, 1, None, 1, 1, 1, None, 0]
+        for t, u in implemented_sigs["binary"]["bitwise"]:
+            if t.cuda_noimpl() or u.cuda_noimpl():
+                continue
+            x = xnd(a, dtype="?" + t.type, device="cuda:managed")
+            y = xnd(b, dtype="?" + u.type, device="cuda:managed")
+            z = cd.bitwise_and(x, y)
+            self.assertEqual(z, c)
+@unittest.skipIf(np is None, "test requires numpy")
+class TestFunctions(unittest.TestCase):
+    def assertRelErrorLess(self, calc, expected, maxerr, msg):
+        """Assert that calc lies within relative error maxerr of expected.
+        A NaN or an infinity in either operand ends the check without any
+        assertion.  If either magnitude is below 1e-5, both magnitudes must
+        be below 1e-5 and no relative error is computed.
+        """
+        if cmath.isnan(calc) or cmath.isnan(expected):
+            return
+        if cmath.isinf(calc) or cmath.isinf(expected):
+            return
+        if abs(expected) < 1e-5 or abs(calc) < 1e-5:
+            for magnitude in (abs(calc), abs(expected)):
+                self.assertLess(magnitude, 1e-5, msg)
+            return
+        relative_error = abs((calc-expected) / expected)
+        self.assertLess(relative_error, maxerr, msg)
+    def equal(self, calc, expected, msg):
+        """Assert calc == expected, accepting the case where both are NaN."""
+        both_nan = np.isnan(calc) and np.isnan(expected)
+        if not both_nan:
+            self.assertEqual(calc, expected, msg)
+    def assert_equal(self, f, z1, z2, w, msg, a=None, b=None):
+        """Compare the result z1 of function f with the reference value z2.
+        w is the result type and, together with f, selects the comparison:
+        bfloat16 results, the unary math functions, "power" and float16/32
+        "divide" are compared by relative error (1e-2); signed integer
+        "power" is not compared at all; everything else must be equal.
+        a and b are the operands; they are only used to recompute complex
+        "power" with Python's own complex arithmetic.  msg is attached to
+        every assertion.
+        """
+        if w.type == "bfloat16":
+            self.assertRelErrorLess(z1, z2, 1e-2, msg)
+        elif f == "power" and w.type in ("int8", "int16", "int32", "int64"):
+            pass # equal mod INTN_MAX
+        elif f == "power" and isinstance(z1, complex):
+            # multivalued function, compare against Python
+            try:
+                ans = complex(a) ** complex(b)
+            except (ZeroDivisionError, OverflowError):
+                # Python cannot produce a reference value: nothing to compare.
+                pass
+            else:
+                msg = "%s ans=%s" % (msg, ans)
+                self.assertRelErrorLess(z1.real, ans.real, 1e-2, msg)
+                self.assertRelErrorLess(z1.imag, ans.imag, 1e-2, msg)
+        elif isinstance(z1, complex):
+            if f not in ("add", "subtract"):
+                self.assertRelErrorLess(z1.real, z2.real, 1e-2, msg)
+                self.assertRelErrorLess(z1.imag, z2.imag, 1e-2, msg)
+            else:
+                # equal() returns None, so the two checks must not be chained
+                # with `and`: that form never evaluated the imaginary part.
+                self.equal(z1.real, z2.real, msg)
+                self.equal(z1.imag, z2.imag, msg)
+        elif f in functions["unary"]["real_math"] or \
+             f in functions["unary"]["real_math_with_half"] or \
+             f in functions["unary"]["complex_math"] or \
+             f in functions["unary"]["complex_math_with_half"] or \
+             f == "power":
+            self.assertRelErrorLess(z1, z2, 1e-2, msg)
+        elif f == "divide" and w.type in ("float16", "float32"):
+            self.assertRelErrorLess(z1, z2, 1e-2, msg)
+        else:
+            return self.equal(z1, z2, msg)
+    def create_xnd(self, a, t, dev=None):
+        """Build xnd([a]) with dtype t.type on device dev.
+        Asserts that the constructor raises OverflowError exactly when
+        struct_overflow(a, t) reports an overflow.  Returns the array, or
+        None if the constructor overflowed.
+        """
+        # Check that struct.pack(a) overflows iff xnd(a) overflows.
+        expected_overflow = struct_overflow(a, t)
+        try:
+            array = xnd([a], dtype=t.type, device=dev)
+        except OverflowError:
+            array = None
+        self.assertEqual(array is None, expected_overflow)
+        return array
+    def check_unary_not_implemented(self, f, a, t, mod=fn, dev=None):
+        """Expect function f of mod to raise NotImplementedError for value a.
+        Nothing is checked when a cannot be stored in an xnd array of type t.
+        """
+        operand = self.create_xnd(a, t, dev)
+        if operand is not None:
+            func = getattr(mod, f)
+            with self.assertRaises(NotImplementedError):
+                func(operand)
+    def check_unary_type_error(self, f, a, t, mod=fn, dev=None):
+        """Expect function f of mod to raise TypeError for value a.
+        Nothing is checked when a cannot be stored in an xnd array of type t.
+        """
+        operand = self.create_xnd(a, t, dev)
+        if operand is not None:
+            func = getattr(mod, f)
+            with self.assertRaises(TypeError):
+                func(operand)
+    def check_unary(self, f, a, t, u, mod=fn, dev=None):
+        """Compare unary function f of mod with the matching NumPy function.
+        a is the input value, t its type and u the expected result type.
+        Nothing is checked when a cannot be stored in an xnd array of type t.
+        """
+        xnd_in = self.create_xnd(a, t, dev)
+        if xnd_in is None:
+            return
+        xnd_out = getattr(mod, f)(xnd_in)
+        self.assertEqual(str(xnd_out[0].type), u.type)
+        calc = xnd_out[0].value
+        # For bfloat16 the NumPy input is built from the value stored in the
+        # xnd array, using float32 as the NumPy dtype.
+        if t.type == "bfloat16":
+            np_value, np_dtype = xnd_in.value, "float32"
+        else:
+            np_value, np_dtype = a, t.type
+        np_in = np.array([np_value], dtype=np_dtype)
+        np_out = getattr(np, np_function(f))(np_in)
+        reference = np_out[0]
+        msg = "%s(%s : %s) -> %s    xnd: %s    np: %s" % (f, a, t, u, xnd_out, np_out)
+        self.assert_equal(f, calc, reference, u, msg)
+    def check_binary_not_implemented(self, f, a, t, b, u, mod=fn, dev=None):
+        """Expect function f of mod to raise NotImplementedError for (a, b).
+        Nothing is checked when either value cannot be stored in an xnd
+        array of its type; the second operand is only built if the first
+        one could be.
+        """
+        lhs = self.create_xnd(a, t, dev)
+        if lhs is None:
+            return
+        rhs = self.create_xnd(b, u, dev)
+        if rhs is None:
+            return
+        func = getattr(mod, f)
+        with self.assertRaises(NotImplementedError):
+            func(lhs, rhs)
+    def check_binary_type_error(self, f, a, t, b, u, mod=fn, dev=None):
+        """Expect function f of mod to raise TypeError for operands (a, b).
+        Nothing is checked when either value cannot be stored in an xnd
+        array of its type; the second operand is only built if the first
+        one could be.
+        """
+        lhs = self.create_xnd(a, t, dev)
+        if lhs is None:
+            return
+        rhs = self.create_xnd(b, u, dev)
+        if rhs is None:
+            return
+        func = getattr(mod, f)
+        with self.assertRaises(TypeError):
+            func(lhs, rhs)
+    def check_binary(self, f, a, t, b, u, w, mod=fn, dev=None):
+        """Compare binary function f of mod with the NumPy function named f.
+        a/t and b/u are the operand values and their types, w is the
+        expected result type.  Nothing is checked when either value cannot
+        be stored in an xnd array of its type.  If either side raises, the
+        exception classes are compared instead of the results, except that a
+        NotImplementedError from the xnd side is accepted as is.
+        """
+        x1 = self.create_xnd(a, t, dev)
+        if x1 is None:
+            return
+        y1 = self.create_xnd(b, u, dev)
+        if y1 is None:
+            return
+        xnd_exc = z1 = None
+        try:
+            z1 = getattr(mod, f)(x1, y1)
+            # This assertion sits inside the try block, so a failing
+            # result-type check is recorded as xnd_exc (AssertionError)
+            # and is not raised here.
+            self.assertEqual(str(z1[0].type), w.type)
+            v1 = z1[0].value
+        except Exception as e:
+            xnd_exc = e.__class__
+        # bfloat16 operands are handed to NumPy as float32, built from the
+        # values stored in the xnd arrays.
+        dtype1 = "float32" if t.type == "bfloat16" else t.type
+        dtype2 = "float32" if u.type == "bfloat16" else u.type
+        value1 = x1.value if t.type == "bfloat16" else a
+        value2 = y1.value if u.type == "bfloat16" else b
+        x2 = np.array([value1], dtype=dtype1)
+        y2 = np.array([value2], dtype=dtype2)
+        np_exc = z2 = None
+        try:
+            z2 = getattr(np, f)(x2, y2)
+            v2 = z2[0]
+        except Exception as e:
+            np_exc = e.__class__
+        if xnd_exc or np_exc:
+            # Both sides must fail with the same exception class, unless the
+            # xnd side reported NotImplementedError.
+            if xnd_exc != NotImplementedError:
+                self.assertEqual(xnd_exc, np_exc)
+        else:
+            msg = "%s(%s : %s, %s : %s) -> %s    xnd: %s    np: %s" % \
+                  (f, a, t, b, u, w, z1, z2)
+            # The operands are passed as stored in the xnd arrays; assert_equal
+            # uses them to recompute complex "power".
+            self.assert_equal(f, v1, v2, w, msg, a=x1[0].value, b=y1[0].value)
+    def check_binary_mv(self, f, a, t, b, u, v, w, mod=fn, dev=None):
+        """Compare a two-output binary function f of mod with NumPy's f.
+        a/t and b/u are the operand values and their types; v and w are the
+        expected types of the first and second output.  Nothing is checked
+        when either value cannot be stored in an xnd array of its type.
+        """
+        x1 = self.create_xnd(a, t, dev)
+        if x1 is None:
+            return
+        y1 = self.create_xnd(b, u, dev)
+        if y1 is None:
+            return
+        c1, d1 = getattr(mod, f)(x1, y1)
+        self.assertEqual(str(c1[0].type), v.type)
+        self.assertEqual(str(d1[0].type), w.type)
+        cv1 = c1[0].value
+        dv1 = d1[0].value
+        x2 = np.array([a], dtype=t.type)
+        y2 = np.array([b], dtype=u.type)
+        c2, d2 = getattr(np, f)(x2, y2)
+        cv2 = c2[0]
+        dv2 = d2[0]
+        msg = "%s(%s : %s, %s : %s) -> %s, %s    xnd: %s    np: %s" % \
+              (f, a, t, b, u, v, w, (cv1, dv1), (cv2, dv2))
+        self.assert_equal(f, cv1, cv2, v, msg)
+        # The second output is compared with NumPy's second output under its
+        # own result type w (previously dv2 was compared with itself).
+        self.assert_equal(f, dv1, dv2, w, msg)
+    @unittest.skipIf(sys.platform == "darwin", "complex trigonometry errors too large")
+    @unittest.skipIf(sys.platform == "win32" and ARCH == "32bit", "complex trigonometry errors too large")
+    def test_unary_cpu(self):
+        # Runs every unary function of the listed groups over all signatures
+        # and test values, printing progress along the way.  Functions for
+        # which np_noimpl(f) is true are skipped.
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        print("\n", flush=True)
+        groups = (("default", "default"),
+                  ("complex_math", "float_result"),
+                  ("real_math", "float_result"))
+        for pattern, return_type in groups:
+            for f in functions["unary"][pattern]:
+                if np_noimpl(f):
+                    continue
+                print("testing %s ..." % f, flush=True)
+                sigs = implemented_sigs["unary"][return_type]
+                for (t,) in sigs:
+                    u = sigs[(t,)]
+                    print("    %s -> %s" % (t, u), flush=True)
+                    for a in t.testcases():
+                        if not (t.cpu_noimpl(f) or u.cpu_noimpl(f)):
+                            self.check_unary(f, a, t, u)
+                        else:
+                            self.check_unary_not_implemented(f, a, t)
+    def test_binary_cpu(self):
+        # Runs every binary function of the listed patterns over all
+        # signatures and all pairs of test values.  Depending on the
+        # cpu_nokern/cpu_noimpl answers of the three types, a TypeError, a
+        # NotImplementedError or a result matching NumPy is expected.
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        print("\n", flush=True)
+        for pattern in ("default", "float_result", "bool_result"):
+            for f in functions["binary"][pattern]:
+                print("testing %s ..." % f, flush=True)
+                sigs = implemented_sigs["binary"][pattern]
+                for t, u in sigs:
+                    w = sigs[(t, u)]
+                    print("    %s, %s -> %s" % (t, u, w), flush=True)
+                    for a in t.testcases():
+                        for b in u.testcases():
+                            if t.cpu_nokern(f) or u.cpu_nokern(f) or w.cpu_nokern(f):
+                                self.check_binary_type_error(f, a, t, b, u)
+                                continue
+                            if t.cpu_noimpl(f) or u.cpu_noimpl(f) or w.cpu_noimpl(f):
+                                self.check_binary_not_implemented(f, a, t, b, u)
+                                continue
+                            self.check_binary(f, a, t, b, u, w)
+    def test_binary_mv_cpu(self):
+        # Runs every two-output binary function over all signatures and all
+        # pairs of test values.
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        print("\n", flush=True)
+        for f in functions["binary_mv"]["default"]:
+            print("testing %s ..." % f, flush=True)
+            sigs = implemented_sigs["binary_mv"]["default"]
+            for t, u in sigs:
+                v, w = sigs[(t, u)]
+                print("    %s, %s -> %s, %s" % (t, u, v, w), flush=True)
+                for a in t.testcases():
+                    for b in u.testcases():
+                        self.check_binary_mv(f, a, t, b, u, v, w)
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_unary_cuda(self):
+        # CUDA counterpart of the unary sweep: every function of the listed
+        # groups is run through `cd` on "cuda:managed" arrays.  Functions for
+        # which np_noimpl(f) is true are skipped.
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        print("\n", flush=True)
+        groups = (("default", "default"),
+                  ("complex_math_with_half", "float_result"),
+                  ("complex_math", "float_result"),
+                  ("real_math_with_half", "float_result"),
+                  ("real_math", "float_result"))
+        for pattern, return_type in groups:
+            for f in functions["unary"][pattern]:
+                if np_noimpl(f):
+                    continue
+                print("testing %s ..." % f, flush=True)
+                sigs = implemented_sigs["unary"][return_type]
+                for (t,) in sigs:
+                    u = sigs[(t,)]
+                    print("    %s -> %s" % (t, u), flush=True)
+                    for a in t.testcases():
+                        if not (t.cuda_noimpl(f) or u.cuda_noimpl(f)):
+                            self.check_unary(f, a, t, u,
+                                mod=cd, dev="cuda:managed")
+                        else:
+                            self.check_unary_not_implemented(
+                                f, a, t, mod=cd, dev="cuda:managed")
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_binary_cuda(self):
+        # CUDA counterpart of the binary sweep, run through `cd` on
+        # "cuda:managed" arrays.  A complex32 operand, or a result type that
+        # reports cuda_noimpl(f), is expected to raise NotImplementedError.
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        print("\n", flush=True)
+        for pattern in ("default", "float_result", "bool_result"):
+            for f in functions["binary"][pattern]:
+                print("testing %s ..." % f, flush=True)
+                sigs = implemented_sigs["binary"][pattern]
+                for t, u in sigs:
+                    w = sigs[(t, u)]
+                    print("    %s, %s -> %s" % (t, u, w), flush=True)
+                    for a in t.testcases():
+                        for b in u.testcases():
+                            if t.cuda_nokern(f) or u.cuda_nokern(f) or w.cuda_nokern(f):
+                                self.check_binary_type_error(f, a, t, b, u,
+                                    mod=cd, dev="cuda:managed")
+                                continue
+                            if t.type == "complex32" or u.type == "complex32" or w.cuda_noimpl(f):
+                                self.check_binary_not_implemented(f, a, t, b, u,
+                                    mod=cd, dev="cuda:managed")
+                                continue
+                            self.check_binary(f, a, t, b, u, w,
+                                mod=cd, dev="cuda:managed")
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_binary_mv_cuda(self):
+        # CUDA counterpart of the two-output binary sweep, run through `cd`
+        # on "cuda:managed" arrays.
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        print("\n", flush=True)
+        for f in functions["binary_mv"]["default"]:
+            print("testing %s ..." % f, flush=True)
+            sigs = implemented_sigs["binary_mv"]["default"]
+            for t, u in sigs:
+                v, w = sigs[(t, u)]
+                print("    %s, %s -> %s, %s" % (t, u, v, w), flush=True)
+                for a in t.testcases():
+                    for b in u.testcases():
+                        self.check_binary_mv(f, a, t, b, u, v, w,
+                                             mod=cd, dev="cuda:managed")
+    def test_divide_inexact_cpu(self):
+        # "divide" with a uint8 and a uint64 operand must raise TypeError.
+        # Only the first test value of each type is used.
+        lhs_type, rhs_type = Tint("uint8"), Tint("uint64")
+        lhs = next(lhs_type.testcases())
+        rhs = next(rhs_type.testcases())
+        self.check_binary_type_error("divide", lhs, lhs_type, rhs, rhs_type)
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_divide_inexact_cuda(self):
+        # cd "divide" with a uint8 and a uint64 "cuda:managed" operand must
+        # raise TypeError.  Only the first test value of each type is used.
+        lhs_type, rhs_type = Tint("uint8"), Tint("uint64")
+        lhs = next(lhs_type.testcases())
+        rhs = next(rhs_type.testcases())
+        self.check_binary_type_error("divide", lhs, lhs_type, rhs, rhs_type,
+                                     mod=cd, dev="cuda:managed")
+    def test_divmod_type_error_cpu(self):
+        # "divmod" with a uint8 and a uint64 operand must raise TypeError.
+        # Only the first test value of each type is used.
+        lhs_type, rhs_type = Tint("uint8"), Tint("uint64")
+        lhs = next(lhs_type.testcases())
+        rhs = next(rhs_type.testcases())
+        self.check_binary_type_error("divmod", lhs, lhs_type, rhs, rhs_type)
+    @unittest.skipIf(cd is None, "test requires cuda")
+    def test_divmod_type_error_cuda(self):
+        # cd "divmod" with a uint8 and a uint64 "cuda:managed" operand must
+        # raise TypeError.  Only the first test value of each type is used.
+        t = Tint("uint8")
+        u = Tint("uint64")
+        a = next(t.testcases())
+        b = next(u.testcases())
+        # Run against the CUDA module, like test_divide_inexact_cuda does;
+        # without mod/dev this test repeated the CPU check.
+        self.check_binary_type_error("divmod", a, t, b, u,
+                                     mod=cd, dev="cuda:managed")
+@unittest.skipIf(cd is None, "test requires cuda")
+class TestCudaManaged(unittest.TestCase):
+    def test_mixed_functions(self):
+        # fn.multiply must give the same result for any mix of default-device
+        # and "cuda:managed" operands, whereas cd.multiply must raise
+        # ValueError unless both operands are "cuda:managed".
+        plain_x = xnd([1,2,3])
+        plain_y = xnd([1,2,3])
+        managed_a = xnd([1,2,3], device="cuda:managed")
+        managed_b = xnd([1,2,3], device="cuda:managed")
+        plain_product = fn.multiply(plain_x, plain_y)
+        expected = cd.multiply(managed_a, managed_b)
+        self.assertEqual(plain_product, expected)
+        accepted_by_fn = ((managed_a, managed_b),
+                          (plain_x, managed_b),
+                          (managed_a, plain_y))
+        for lhs, rhs in accepted_by_fn:
+            self.assertEqual(fn.multiply(lhs, rhs), expected)
+        rejected_by_cd = ((plain_x, plain_y),
+                          (plain_x, managed_b),
+                          (managed_a, plain_y))
+        for lhs, rhs in rejected_by_cd:
+            self.assertRaises(ValueError, cd.multiply, lhs, rhs)
+class TestSpec(unittest.TestCase):
+    def __init__(self, *, constr, ndarray, mod,
+                 values, value_generator,
+                 indices_generator, indices_generator_args):
+        """Store the configuration of one indexing/slicing comparison run.
+        constr builds the array under test and ndarray the reference
+        container; mod is the function module whose sin/multiply are applied
+        to the indexed results.  values is a fixed collection of inputs and
+        value_generator produces further ones; indices_generator is called
+        with indices_generator_args to produce the indices to apply.
+        """
+        super().__init__()
+        self.constr, self.ndarray, self.mod = constr, ndarray, mod
+        self.values, self.value_generator = values, value_generator
+        self.indices_generator = indices_generator
+        self.indices_generator_args = indices_generator_args
+        # One slot per nesting depth; holds the indices applied at that depth
+        # so that log_err() can reproduce the chain.
+        self.indices_stack = [None for _ in range(8)]
+    def log_err(self, value, depth):
+        """Dump an error as a Python script for debugging.
+        The script is written to sys.stderr.  It rebuilds value as an xnd
+        array and as an NDArray, then replays the indices recorded in
+        self.indices_stack for depths 0..depth on both.
+        """
+        dtype = "?int32" if have_none(value) else "int32"
+        write = sys.stderr.write
+        write("\n\nfrom xnd import *\n")
+        write("import gumath.functions as fn\n")
+        write("from test_gumath import NDArray\n")
+        # Wrap value in a tuple so that a tuple value is formatted as one
+        # object and not unpacked as the format arguments.
+        write("lst = %s\n\n" % (value,))
+        write("x0 = xnd(lst, dtype=\"%s\")\n" % dtype)
+        # This line has no placeholder, so nothing is formatted into it.
+        write("y0 = NDArray(lst)\n")
+        for i in range(depth+1):
+            indices = itos(self.indices_stack[i])
+            write("x%d = x%d[%s]\n" % (i+1, i, indices))
+            write("y%d = y%d[%s]\n" % (i+1, i, indices))
+        write("\n")
+    def run_reduce(self, nd, d):
+        """Compare gm.reduce on nd with the NumPy ufunc reduce on d.
+        Does nothing unless nd is an xnd array and d a NumPy array.  For
+        add, subtract and multiply the default reduction is compared first,
+        then one reduction per entry of gen_axes(d.ndim).  If either side
+        raises, the exception classes must be identical; otherwise the
+        values must be equal.
+        """
+        if not isinstance(nd, xnd) or not isinstance(d, np.ndarray):
+            return
+        for attr in ["add", "subtract", "multiply"]:
+            f = getattr(fn, attr)
+            g = getattr(np, attr)
+            # Reduction without an explicit axes argument.
+            x = nd_exception = None
+            try:
+                x = gm.reduce(f, nd, dtype=nd.dtype)
+            except Exception as e:
+                nd_exception =  e
+            y = np_exception = None
+            try:
+                y = g.reduce(d, dtype=d.dtype)
+            except Exception as e:
+                np_exception =  e
+            if nd_exception or np_exception:
+                # If only one side raised, the other class is NoneType and
+                # the identity check fails.
+                self.assertIs(nd_exception.__class__, np_exception.__class__,
+                              "f: %r nd: %r np: %r x: %r y: %r" % (attr, nd, d, x, y))
+            else:
+                self.assertEqual(x.value, y.tolist(),
+                                 "f: %r nd: %r np: %r x: %r y: %r" % (attr, nd, d, x, y))
+            # Reductions over explicit axes.  x and y are not reset here, so
+            # after an exception the message shows the previous results.
+            for axes in gen_axes(d.ndim):
+                nd_exception = None
+                try:
+                    x = gm.reduce(f, nd, axes=axes, dtype=nd.dtype)
+                except Exception as e:
+                    nd_exception =  e
+                np_exception = None
+                try:
+                    y = g.reduce(d, axis=axes, dtype=d.dtype)
+                except Exception as e:
+                    np_exception =  e
+                if nd_exception or np_exception:
+                    self.assertIs(nd_exception.__class__, np_exception.__class__,
+                                  "f: %r axes: %r nd: %r np: %r x: %r y: %r" % (attr, axes, nd, d, x, y))
+                else:
+                    self.assertEqual(x.value, y.tolist(),
+                                     "f: %r axes: %r nd: %r np: %r x: %r y: %r" % (attr, axes, nd, d, x, y))
+    def run_single(self, nd, d, indices):
+        """Run a single test case.
+        Applies indices to the array under test nd and to the reference
+        container d, then compares self.mod.sin and self.mod.multiply of the
+        indexed xnd result with the same operations on the reference result.
+        Returns (nd_result, def_result), or (None, None) if indexing raised
+        on either side.
+        """
+        self.assertEqual(len(nd), len(d))
+        nd_exception = None
+        try:
+            nd_result = nd[indices]
+        except Exception as e:
+            nd_exception =  e
+        def_exception = None
+        try:
+            def_result = d[indices]
+        except Exception as e:
+            def_exception = e
+        if nd_exception or def_exception:
+            if nd_exception is None and def_exception.__class__ is IndexError:
+                # Example: type = 0 * 0 * int64
+                if len(indices) <= nd.ndim:
+                    return None, None
+            self.assertIs(nd_exception.__class__, def_exception.__class__)
+            return None, None
+        self.assertIsInstance(nd_result, xnd)
+        x = self.mod.sin(nd_result)
+        y = self.mod.multiply(nd_result, nd_result)
+        # a/b are the plain Python reference values, aa/bb the reference
+        # objects handed to assert_allclose.  Every branch binds all four so
+        # that both comparison paths below are always usable.
+        if isinstance(def_result, NDArray):
+            aa = a = def_result.sin()
+            bb = b = def_result * def_result
+        elif isinstance(def_result, int):
+            aa = a = math.sin(def_result)
+            bb = b = def_result * def_result
+        elif def_result is None:
+            aa = a = None
+            bb = b = None
+        elif isinstance(def_result, (np.ndarray, np.int32)):
+            aa = np.sin(def_result)
+            a = aa.tolist()
+            bb = np.multiply(def_result, def_result)
+            b = bb.tolist()
+        else:
+            raise TypeError("unexpected def_result: %s : %s" % (def_result, type(def_result)))
+        if self.mod == cd:
+            # CUDA results are compared with a relative tolerance of 1e-6.
+            np.testing.assert_allclose(x, aa, 1e-6)
+            np.testing.assert_allclose(y, bb, 1e-6)
+        else:
+            self.assertEqual(x, a)
+            self.assertEqual(y, b)
+        if self.mod == fn:
+            self.run_reduce(nd_result, def_result)
+        return nd_result, def_result
+    def run(self):
+        # Drives the whole comparison: every value from self.values, then
+        # every value produced by self.value_generator for the ndim/shape
+        # ranges below, is built as an array under test and as a reference
+        # container and checked for all generated indices.
+        def check(nd, d, value, depth):
+            # Applies every generated index to (nd, d) via run_single and
+            # recurses into the results, up to depth 3.
+            if depth > 3: # adjust for longer tests
+                return
+            g = self.indices_generator(*self.indices_generator_args)
+            for indices in g:
+                # Record the indices so that log_err() can replay the chain.
+                self.indices_stack[depth] = indices
+                try:
+                    next_nd, next_d = self.run_single(nd, d, indices)
+                except Exception as e:
+                    self.log_err(value, depth)
+                    raise e
+                if isinstance(next_d, list): # possibly None or scalar
+                    check(next_nd, next_d, value, depth+1)
+        def check_buffer(nd, d, value, depth):
+            # Same as check(), but nd is rebuilt from d with
+            # xnd.from_buffer and d from that array with
+            # np.array(..., copy=False).  Only runs for xnd arrays that are
+            # not on "cuda:managed" paired with a NumPy reference.
+            if depth > 3: # adjust for longer tests
+                return
+            if not isinstance(nd, xnd) or nd.device == "cuda:managed" or \
+               not isinstance(d, np.ndarray):
+                return
+            nd = xnd.from_buffer(d)
+            d = np.array(nd, copy=False)
+            g = self.indices_generator(*self.indices_generator_args)
+            for indices in g:
+                self.indices_stack[depth] = indices
+                try:
+                    next_nd, next_d = self.run_single(nd, d, indices)
+                except Exception as e:
+                    self.log_err(value, depth)
+                    raise e
+                # NOTE(review): d is a NumPy array here, so next_d is
+                # presumably never a list and this recursion may never
+                # trigger -- confirm.
+                if isinstance(next_d, list): # possibly None or scalar
+                    check_buffer(next_nd, next_d, value, depth+1)
+        # Fixed test values.
+        for value in self.values:
+            dtype = "?int32" if have_none(value) else "int32"
+            if self.constr == xnd:
+                nd = xnd(value, dtype=dtype, device=None if self.mod==fn else "cuda:managed")
+            else:
+                nd = self.constr(value, dtype=dtype)
+            # NumPy does not support "?int32", NDArray does not need the dtype.
+            d = self.ndarray(value, dtype="int32")
+            check(nd, d, value, 0)
+        # Generated test values; these are additionally run through
+        # check_buffer().
+        for max_ndim in range(1, 5):
+            for min_shape in (0, 1):
+                for max_shape in range(1, 8):
+                    for value in self.value_generator(max_ndim, min_shape, max_shape):
+                        dtype = "?int32" if have_none(value) else "int32"
+                        if self.constr == xnd:
+                            nd = xnd(value, dtype=dtype, device=None if self.mod==fn else "cuda:managed")
+                        else:
+                            nd = self.constr(value, dtype=dtype)
+                        # See above.
+                        d = self.ndarray(value, dtype="int32")
+                        check(nd, d, value, 0)
+                        check_buffer(nd, d, value, 0)
+class LongIndexSliceTest(unittest.TestCase):
+    # Long-running indexing and slicing comparisons.  Every test builds one
+    # or two TestSpec configurations and runs them; all tests are skipped
+    # unless the corresponding SKIP_* flag has been cleared.
+    def test_subarray(self):
+        # Multidimensional indexing
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        for values, generator in (
+                (SUBSCRIPT_FIXED_TEST_CASES, gen_fixed),
+                (SUBSCRIPT_VAR_TEST_CASES, gen_var)):
+            spec = TestSpec(constr=xnd, ndarray=NDArray, mod=fn,
+                            values=values, value_generator=generator,
+                            indices_generator=genindices,
+                            indices_generator_args=())
+            spec.run()
+    @unittest.skipIf(cd is None or np is None, "cuda or numpy not found")
+    def test_subarray_cuda(self):
+        # Multidimensional indexing
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        spec = TestSpec(constr=xnd, ndarray=np.array, mod=cd,
+                        values=SUBSCRIPT_FIXED_TEST_CASES,
+                        value_generator=gen_fixed,
+                        indices_generator=genindices,
+                        indices_generator_args=())
+        spec.run()
+    def test_slices(self):
+        # Multidimensional slicing
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        for values, generator in (
+                (SUBSCRIPT_FIXED_TEST_CASES, gen_fixed),
+                (SUBSCRIPT_VAR_TEST_CASES, gen_var)):
+            spec = TestSpec(constr=xnd, ndarray=NDArray, mod=fn,
+                            values=values, value_generator=generator,
+                            indices_generator=randslices,
+                            indices_generator_args=(3,))
+            spec.run()
+    @unittest.skipIf(cd is None or np is None, "cuda or numpy not found")
+    def test_slices_cuda(self):
+        # Multidimensional slicing
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        spec = TestSpec(constr=xnd, ndarray=np.array, mod=cd,
+                        values=SUBSCRIPT_FIXED_TEST_CASES,
+                        value_generator=gen_fixed,
+                        indices_generator=randslices,
+                        indices_generator_args=(3,))
+        spec.run()
+    def test_chained_indices_slices(self):
+        # Multidimensional indexing and slicing, chained
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        for values, generator in (
+                (SUBSCRIPT_FIXED_TEST_CASES, gen_fixed),
+                (SUBSCRIPT_VAR_TEST_CASES, gen_var)):
+            spec = TestSpec(constr=xnd, ndarray=NDArray, mod=fn,
+                            values=values, value_generator=generator,
+                            indices_generator=gen_indices_or_slices,
+                            indices_generator_args=())
+            spec.run()
+    def test_fixed_mixed_indices_slices(self):
+        # Multidimensional indexing and slicing, mixed
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        spec = TestSpec(constr=xnd, ndarray=NDArray, mod=fn,
+                        values=SUBSCRIPT_FIXED_TEST_CASES,
+                        value_generator=gen_fixed,
+                        indices_generator=mixed_indices,
+                        indices_generator_args=(3,))
+        spec.run()
+    def test_var_mixed_indices_slices(self):
+        # Multidimensional indexing and slicing, mixed
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        spec = TestSpec(constr=xnd, ndarray=NDArray, mod=fn,
+                        values=SUBSCRIPT_VAR_TEST_CASES,
+                        value_generator=gen_var,
+                        indices_generator=mixed_indices,
+                        indices_generator_args=(5,))
+        spec.run()
+    def test_slices_brute_force(self):
+        # Test all possible slices for the given ndim and shape
+        skip_if(SKIP_BRUTE_FORCE, "use --all argument to enable these tests")
+        for values, generator in (
+                (SUBSCRIPT_FIXED_TEST_CASES, gen_fixed),
+                (SUBSCRIPT_VAR_TEST_CASES, gen_var)):
+            spec = TestSpec(constr=xnd, ndarray=NDArray, mod=fn,
+                            values=values, value_generator=generator,
+                            indices_generator=genslices_ndim,
+                            indices_generator_args=(3, [3,3,3]))
+            spec.run()
+    @unittest.skipIf(np is None, "numpy not found")
+    def test_fixed_mixed_indices_slices_np(self):
+        # Multidimensional indexing and slicing, mixed
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        spec = TestSpec(constr=xnd, ndarray=np.array, mod=fn,
+                        values=SUBSCRIPT_FIXED_TEST_CASES,
+                        value_generator=gen_fixed,
+                        indices_generator=mixed_indices,
+                        indices_generator_args=(3,))
+        spec.run()
+    @unittest.skipIf(np is None, "numpy not found")
+    def test_reduce(self):
+        # Same configuration as test_fixed_mixed_indices_slices_np.
+        skip_if(SKIP_LONG, "use --long argument to enable these tests")
+        spec = TestSpec(constr=xnd, ndarray=np.array, mod=fn,
+                        values=SUBSCRIPT_FIXED_TEST_CASES,
+                        value_generator=gen_fixed,
+                        indices_generator=mixed_indices,
+                        indices_generator_args=(3,))
+        spec.run()
+# Test case classes of this file.  NOTE(review): presumably the list fed to
+# the loader/suite built under __main__ below -- confirm against the runner.
 ALL_TESTS = [
+  TestAPI,
   TestCall,
   TestRaggedArrays,
+  TestFlexibleArrays,
   TestMissingValues,
+  TestEqualN,
   TestGraphs,
-  TestBFloat16,
   TestPdist,
   TestNumba,
+  TestOut,
+  TestUnaryCPU,
+  TestUnaryCUDA,
+  TestBinaryCPU,
+  TestBinaryCUDA,
+  TestBitwiseCPU,
+  TestBitwiseCUDA,
+  TestFunctions,
+  TestCudaManaged,
+  LongIndexSliceTest,
 ]
@@ -389,7 +1865,11 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("-f", "--failfast", action="store_true",
                         help="stop the test run on first error")
+    parser.add_argument('--long', action="store_true", help="run long slice tests")
+    parser.add_argument('--all', action="store_true", help="run brute force tests")
     args = parser.parse_args()
+    SKIP_LONG = not (args.long or args.all)
+    SKIP_BRUTE_FORCE = not args.all
     suite = unittest.TestSuite()
     loader = unittest.TestLoader()