warp-lang 0.10.1-py3-none-win_amd64.whl → 0.11.0-py3-none-win_amd64.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- warp/__init__.py +10 -4
- warp/__init__.pyi +1 -0
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +5 -3
- warp/build_dll.py +29 -9
- warp/builtins.py +868 -507
- warp/codegen.py +1074 -638
- warp/config.py +3 -3
- warp/constants.py +6 -0
- warp/context.py +715 -222
- warp/fabric.py +326 -0
- warp/fem/__init__.py +27 -0
- warp/fem/cache.py +389 -0
- warp/fem/dirichlet.py +181 -0
- warp/fem/domain.py +263 -0
- warp/fem/field/__init__.py +101 -0
- warp/fem/field/field.py +149 -0
- warp/fem/field/nodal_field.py +299 -0
- warp/fem/field/restriction.py +21 -0
- warp/fem/field/test.py +181 -0
- warp/fem/field/trial.py +183 -0
- warp/fem/geometry/__init__.py +19 -0
- warp/fem/geometry/closest_point.py +70 -0
- warp/fem/geometry/deformed_geometry.py +271 -0
- warp/fem/geometry/element.py +744 -0
- warp/fem/geometry/geometry.py +186 -0
- warp/fem/geometry/grid_2d.py +373 -0
- warp/fem/geometry/grid_3d.py +435 -0
- warp/fem/geometry/hexmesh.py +953 -0
- warp/fem/geometry/partition.py +376 -0
- warp/fem/geometry/quadmesh_2d.py +532 -0
- warp/fem/geometry/tetmesh.py +840 -0
- warp/fem/geometry/trimesh_2d.py +577 -0
- warp/fem/integrate.py +1616 -0
- warp/fem/operator.py +191 -0
- warp/fem/polynomial.py +213 -0
- warp/fem/quadrature/__init__.py +2 -0
- warp/fem/quadrature/pic_quadrature.py +245 -0
- warp/fem/quadrature/quadrature.py +294 -0
- warp/fem/space/__init__.py +292 -0
- warp/fem/space/basis_space.py +489 -0
- warp/fem/space/collocated_function_space.py +105 -0
- warp/fem/space/dof_mapper.py +236 -0
- warp/fem/space/function_space.py +145 -0
- warp/fem/space/grid_2d_function_space.py +267 -0
- warp/fem/space/grid_3d_function_space.py +306 -0
- warp/fem/space/hexmesh_function_space.py +352 -0
- warp/fem/space/partition.py +350 -0
- warp/fem/space/quadmesh_2d_function_space.py +369 -0
- warp/fem/space/restriction.py +160 -0
- warp/fem/space/shape/__init__.py +15 -0
- warp/fem/space/shape/cube_shape_function.py +738 -0
- warp/fem/space/shape/shape_function.py +103 -0
- warp/fem/space/shape/square_shape_function.py +611 -0
- warp/fem/space/shape/tet_shape_function.py +567 -0
- warp/fem/space/shape/triangle_shape_function.py +429 -0
- warp/fem/space/tetmesh_function_space.py +292 -0
- warp/fem/space/topology.py +295 -0
- warp/fem/space/trimesh_2d_function_space.py +221 -0
- warp/fem/types.py +77 -0
- warp/fem/utils.py +495 -0
- warp/native/array.h +147 -44
- warp/native/builtin.h +122 -149
- warp/native/bvh.cpp +73 -325
- warp/native/bvh.cu +406 -23
- warp/native/bvh.h +34 -43
- warp/native/clang/clang.cpp +13 -8
- warp/native/crt.h +2 -0
- warp/native/cuda_crt.h +5 -0
- warp/native/cuda_util.cpp +15 -3
- warp/native/cuda_util.h +3 -1
- warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
- warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
- warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
- warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
- warp/native/cutlass/tools/library/scripts/library.py +799 -0
- warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
- warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
- warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
- warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
- warp/native/cutlass/tools/library/scripts/rt.py +796 -0
- warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
- warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
- warp/native/cutlass_gemm.cu +5 -3
- warp/native/exports.h +1240 -952
- warp/native/fabric.h +228 -0
- warp/native/hashgrid.cpp +4 -4
- warp/native/hashgrid.h +22 -2
- warp/native/intersect.h +22 -7
- warp/native/intersect_adj.h +8 -8
- warp/native/intersect_tri.h +1 -1
- warp/native/marching.cu +157 -161
- warp/native/mat.h +80 -19
- warp/native/matnn.h +2 -2
- warp/native/mesh.cpp +33 -108
- warp/native/mesh.cu +114 -23
- warp/native/mesh.h +446 -46
- warp/native/noise.h +272 -329
- warp/native/quat.h +51 -8
- warp/native/rand.h +45 -35
- warp/native/range.h +6 -2
- warp/native/reduce.cpp +1 -1
- warp/native/reduce.cu +10 -12
- warp/native/runlength_encode.cu +6 -10
- warp/native/scan.cu +8 -11
- warp/native/sparse.cpp +4 -4
- warp/native/sparse.cu +164 -154
- warp/native/spatial.h +2 -2
- warp/native/temp_buffer.h +14 -30
- warp/native/vec.h +107 -23
- warp/native/volume.h +120 -0
- warp/native/warp.cpp +560 -30
- warp/native/warp.cu +431 -44
- warp/native/warp.h +13 -4
- warp/optim/__init__.py +1 -0
- warp/optim/linear.py +922 -0
- warp/optim/sgd.py +92 -0
- warp/render/render_opengl.py +335 -119
- warp/render/render_usd.py +11 -11
- warp/sim/__init__.py +2 -2
- warp/sim/articulation.py +385 -185
- warp/sim/collide.py +8 -0
- warp/sim/import_mjcf.py +297 -106
- warp/sim/import_urdf.py +389 -210
- warp/sim/import_usd.py +198 -97
- warp/sim/inertia.py +17 -18
- warp/sim/integrator_euler.py +14 -8
- warp/sim/integrator_xpbd.py +158 -16
- warp/sim/model.py +795 -291
- warp/sim/render.py +3 -3
- warp/sim/utils.py +3 -0
- warp/sparse.py +640 -150
- warp/stubs.py +606 -267
- warp/tape.py +61 -10
- warp/tests/__main__.py +3 -6
- warp/tests/assets/curlnoise_golden.npy +0 -0
- warp/tests/assets/pnoise_golden.npy +0 -0
- warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
- warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
- warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
- warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
- warp/tests/aux_test_unresolved_func.py +14 -0
- warp/tests/aux_test_unresolved_symbol.py +14 -0
- warp/tests/disabled_kinematics.py +239 -0
- warp/tests/run_coverage_serial.py +31 -0
- warp/tests/test_adam.py +103 -106
- warp/tests/test_arithmetic.py +128 -74
- warp/tests/test_array.py +212 -97
- warp/tests/test_array_reduce.py +57 -23
- warp/tests/test_atomic.py +64 -28
- warp/tests/test_bool.py +99 -0
- warp/tests/test_builtins_resolution.py +1292 -0
- warp/tests/test_bvh.py +42 -18
- warp/tests/test_closest_point_edge_edge.py +54 -57
- warp/tests/test_codegen.py +208 -130
- warp/tests/test_compile_consts.py +28 -20
- warp/tests/test_conditional.py +108 -24
- warp/tests/test_copy.py +10 -12
- warp/tests/test_ctypes.py +112 -88
- warp/tests/test_dense.py +21 -14
- warp/tests/test_devices.py +98 -0
- warp/tests/test_dlpack.py +75 -75
- warp/tests/test_examples.py +277 -0
- warp/tests/test_fabricarray.py +955 -0
- warp/tests/test_fast_math.py +15 -11
- warp/tests/test_fem.py +1271 -0
- warp/tests/test_fp16.py +53 -19
- warp/tests/test_func.py +187 -86
- warp/tests/test_generics.py +194 -49
- warp/tests/test_grad.py +178 -109
- warp/tests/test_grad_customs.py +176 -0
- warp/tests/test_hash_grid.py +52 -37
- warp/tests/test_import.py +10 -23
- warp/tests/test_indexedarray.py +32 -31
- warp/tests/test_intersect.py +18 -9
- warp/tests/test_large.py +141 -0
- warp/tests/test_launch.py +14 -41
- warp/tests/test_lerp.py +64 -65
- warp/tests/test_linear_solvers.py +154 -0
- warp/tests/test_lvalue.py +493 -0
- warp/tests/test_marching_cubes.py +12 -13
- warp/tests/test_mat.py +517 -2898
- warp/tests/test_mat_lite.py +115 -0
- warp/tests/test_mat_scalar_ops.py +2889 -0
- warp/tests/test_math.py +103 -9
- warp/tests/test_matmul.py +305 -69
- warp/tests/test_matmul_lite.py +410 -0
- warp/tests/test_mesh.py +71 -14
- warp/tests/test_mesh_query_aabb.py +41 -25
- warp/tests/test_mesh_query_point.py +140 -22
- warp/tests/test_mesh_query_ray.py +39 -22
- warp/tests/test_mlp.py +30 -22
- warp/tests/test_model.py +92 -89
- warp/tests/test_modules_lite.py +39 -0
- warp/tests/test_multigpu.py +88 -114
- warp/tests/test_noise.py +12 -11
- warp/tests/test_operators.py +16 -20
- warp/tests/test_options.py +11 -11
- warp/tests/test_pinned.py +17 -18
- warp/tests/test_print.py +32 -11
- warp/tests/test_quat.py +275 -129
- warp/tests/test_rand.py +18 -16
- warp/tests/test_reload.py +38 -34
- warp/tests/test_rounding.py +50 -43
- warp/tests/test_runlength_encode.py +168 -20
- warp/tests/test_smoothstep.py +9 -11
- warp/tests/test_snippet.py +143 -0
- warp/tests/test_sparse.py +261 -63
- warp/tests/test_spatial.py +276 -243
- warp/tests/test_streams.py +110 -85
- warp/tests/test_struct.py +268 -63
- warp/tests/test_tape.py +39 -21
- warp/tests/test_torch.py +118 -89
- warp/tests/test_transient_module.py +12 -13
- warp/tests/test_types.py +614 -0
- warp/tests/test_utils.py +494 -0
- warp/tests/test_vec.py +354 -2050
- warp/tests/test_vec_lite.py +73 -0
- warp/tests/test_vec_scalar_ops.py +2099 -0
- warp/tests/test_volume.py +457 -293
- warp/tests/test_volume_write.py +124 -134
- warp/tests/unittest_serial.py +35 -0
- warp/tests/unittest_suites.py +341 -0
- warp/tests/unittest_utils.py +568 -0
- warp/tests/unused_test_misc.py +71 -0
- warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
- warp/thirdparty/appdirs.py +36 -45
- warp/thirdparty/unittest_parallel.py +549 -0
- warp/torch.py +9 -6
- warp/types.py +1089 -366
- warp/utils.py +93 -387
- warp_lang-0.11.0.dist-info/METADATA +238 -0
- warp_lang-0.11.0.dist-info/RECORD +332 -0
- {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
- warp/tests/test_all.py +0 -219
- warp/tests/test_array_scan.py +0 -60
- warp/tests/test_base.py +0 -208
- warp/tests/test_unresolved_func.py +0 -7
- warp/tests/test_unresolved_symbol.py +0 -7
- warp_lang-0.10.1.dist-info/METADATA +0 -21
- warp_lang-0.10.1.dist-info/RECORD +0 -188
- /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
- /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
- /warp/tests/{test_square.py → aux_test_square.py} +0 -0
- {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
warp/native/cutlass/tools/library/scripts/rt.py (new file)
@@ -0,0 +1,796 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+# System imports
+import struct
+import io
+import ctypes
+
+# CUDA Python import
+from cuda import cuda
+from cuda import nvrtc
+
+# CUTLASS imports
+from library import *
+from gemm_operation import EmitGemmUniversalInstance
+
+#################################################################################################
+#
+# CUTLASS Py Runtime Components
+#
+#################################################################################################
+
+#
+def MaxAlignment(fmt):
+  align = 1
+  for x in fmt:
+    align = max(align, struct.calcsize(x))
+  return align
+
+#
+def AlignedOffset(offset, align):
+  remainder = (offset % align)
+  if remainder:
+    offset += (align - remainder)
+  return offset
+
+#
+def PackInteger(host_workspace, offset, value):
+  fmt = "i"
+  padding = AlignedOffset(offset, 4)
+  struct.pack_into(fmt, host_workspace, offset, value)
+  return padding + struct.calcsize(fmt)
+
+#
+def PackDevicePointer(host_workspace, offset, value):
+  fmt = "P"
+  offset = AlignedOffset(offset, 8)
+  struct.pack_into(fmt, host_workspace, offset, value)
+  return offset + struct.calcsize(fmt)
+
+#
+def ceil_div(a, b):
+  return -(a // -b)
+
+#################################################################################################
+
+#
+class PitchLinearCoord:
+  def __init__(self, contiguous, strided):
+    self.contiguous = contiguous
+    self.strided = strided
+
+#
+class GemmCoord:
+  def __init__(self, m = 1, n = 1, k = 1):
+    self.m = m
+    self.n = n
+    self.k = k
+    self.fmt = "iii"
+
+  #
+  def ceil_div(self, rhs):
+    return GemmCoord(ceil_div(self.m, rhs.m), ceil_div(self.n, rhs.n), ceil_div(self.k, rhs.k))
+
+  #
+  def size(self):
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  #
+  def pack_into(self, host_workspace, offset):
+
+    offset = AlignedOffset(offset, 4)
+
+    struct.pack_into(
+      self.fmt,
+      host_workspace,
+      offset,
+      self.m, self.n, self.k)
+
+    return offset + self.size()
+
+#
+class TensorRef:
+  def __init__(self, pointer = None, layout = 0):
+    self.pointer = pointer
+    self.layout = layout
+
+  def __str__(self):
+    return "(%x, %d)" % (self.pointer._ptr, self.layout)
+
+#################################################################################################
+
+#
+class PredicatedTileAccessIteratorDesc:
+  '''
+  '''
+
+  def __init__(
+    self,
+    element_size_bits,
+    advance_rank,
+    threadblock_shape,
+    threadmap_iterations,
+    threadmap_delta):
+
+    self.element_size_bits = element_size_bits
+    self.advance_rank = advance_rank
+    self.threadblock_shape = threadblock_shape
+    self.threadmap_iterations = threadmap_iterations
+    self.threadmap_delta = threadmap_delta
+
+#
+class PredicatedTileAccessIteratorParams:
+  '''
+  '''
+  #
+  def __init__(self, desc, label):
+    self.desc = desc
+    self.label = label
+    self.fmt = "qqqq"
+  #
+  def size(self):
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  #
+  def initialize(self, host_workspace, offset, stride):
+
+    offset = AlignedOffset(offset, self.alignment())
+
+    inc_strided = stride * \
+      self.desc.threadmap_delta.strided * \
+      self.desc.element_size_bits // 8
+
+    if self.desc.advance_rank:
+      inc_advance = self.desc.threadblock_shape.strided * \
+        stride * \
+        self.desc.element_size_bits // 8
+    else:
+      inc_advance = self.desc.threadblock_shape.contiguous * \
+        self.desc.element_size_bits // 8
+
+    inc_next = inc_advance - (self.desc.threadmap_iterations.strided - 1) * \
+      self.desc.threadmap_delta.strided * \
+      stride * \
+      self.desc.element_size_bits // 8
+
+    struct.pack_into(
+      self.fmt,
+      host_workspace,
+      offset,
+      stride, inc_strided, inc_next, inc_advance)
+
+    return offset + self.size()
+  #
+
+#################################################################################################
+
+#
+class EpilogueTileDesc:
+  '''
+  '''
+  def __init__(self, column, row, group, cluster, tile):
+    self.column = column
+    self.row = row
+    self.group = group
+    self.cluster = cluster
+    self.tile = tile
+
+#
+class EpilogueThreadMap:
+  '''
+  '''
+  def __init__(self, threads, elements_per_access, element_size_bits, shape, iterations, delta, count):
+    self.threads = threads
+    self.elements_per_access = elements_per_access
+    self.element_size_bits = element_size_bits
+    self.shape = shape
+    self.iterations = iterations
+    self.delta = delta
+    self.count = count
+    pass
+
+#
+class EpilogueTileIteratorParams:
+  '''
+  '''
+  #
+  def __init__(self, desc, label):
+    self.desc = desc
+    self.label = label
+    self.fmt = "qqqqqqqq"
+
+  #
+  def size(self):
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  #
+  def initialize(self, host_workspace, offset, stride):
+
+    stride = stride * self.desc.element_size_bits // 8
+
+    offset = AlignedOffset(offset, self.alignment())
+
+    increment_row = stride * self.desc.delta.row
+
+    increment_group = stride * self.desc.delta.group \
+      - stride * self.desc.delta.row * (self.desc.iterations.row - 1)
+
+    increment_cluster = stride * self.desc.delta.cluster \
+      - stride * self.desc.delta.group * (self.desc.iterations.group - 1) \
+      - stride * self.desc.delta.row * (self.desc.iterations.row - 1)
+
+    advance_row = stride * self.desc.shape.row
+
+    advance_group = stride * \
+      (self.desc.shape.group - 1) * \
+      self.desc.shape.row * \
+      self.desc.count.row
+
+    advance_cluster = stride * \
+      self.desc.count.group * \
+      self.desc.shape.group * \
+      self.desc.count.row * \
+      self.desc.shape.row
+
+    advance_tile = stride * \
+      self.desc.shape.group * \
+      self.desc.shape.row * \
+      self.desc.shape.cluster * \
+      self.desc.shape.tile
+
+    struct.pack_into(
+      self.fmt, \
+      host_workspace, \
+      offset, \
+      stride, \
+      increment_row, increment_group, increment_cluster, \
+      advance_row, advance_group, advance_cluster, advance_tile)
+
+    return offset + self.size()
+  #
+
+#################################################################################################
+#
+# Launch configuration
+#
+#################################################################################################
+
+class LaunchConfiguration:
+  def __init__(self, grid = [1,1,1], block = [1,1,1], smem = 0):
+    self.grid = grid
+    self.block = block
+    self.shared_memory_capacity = smem
+
+#################################################################################################
+#
+# Functors
+#
+#################################################################################################
+
+#
+class Functor:
+  def __init__(self):
+    self.decl = ''
+    self.definition = ''
+    self.fmt = ''
+    self.identifier = ''
+
+  #
+  def emit_declaration(self):
+    return self.decl
+
+  #
+  def emit_definition(self):
+    return self.definition
+
+  #
+  def size(self):
+    '''
+    Size of the packed Params structure
+    '''
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  #
+  def initialize(self, host_workspace, offset, arguments):
+    return offset + self.size()
+
+#################################################################################################
+
+#
+class LinearCombinationFunctorArguments:
+  def __init__(self, alpha = 1.0, beta = 0.0):
+    self.alpha = alpha
+    self.beta = beta
+    self.alpha_ptr = 0
+    self.beta_ptr = 0
+
+#
+class LinearCombinationFunctor(Functor):
+  def __init__(self):
+    super().__init__()
+
+    self.decl = """
+      cutlass::epilogue::thread::LinearCombination<
+        float,
+        1,
+        float,
+        float
+      >"""
+    self.identifier = 'linear_combination'
+    self.fmt = "ffPP"
+
+  #
+  def size(self):
+    '''
+    Size of the packed Params structure
+    '''
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  #
+  def initialize(self, host_workspace, offset, arguments):
+
+    offset = AlignedOffset(offset, self.alignment())
+
+    struct.pack_into(
+      self.fmt,
+      host_workspace, offset,
+      arguments.alpha, arguments.beta, arguments.alpha_ptr, arguments.beta_ptr)
+
+    return offset + self.size()
+
+#################################################################################################
+#
+# Base class for an executable operation
+#
+#################################################################################################
+
+#
+class ExecutableOperation:
+  '''
+  '''
+  def __init__(self, operation):
+    self.operation = operation
+    self.module = None
+    self.kernel = None
+
+  #
+  def name(self):
+    return self.operation.procedural_name()
+
+  #
+  def emit(self):
+    return ''
+
+  #
+  def can_implement(self, configuration, arguments):
+    return False
+
+  #
+  def get_host_workspace_size(self, arguments):
+    return 0
+
+  #
+  def get_device_workspace_size(self, arguments):
+    return 0
+
+  #
+  def plan(self, arguments):
+    return LaunchConfiguration()
+
+  #
+  def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
+    raise NotImplementedError()
+
+  #
+  def run(self, host_workspace, device_workspace, launch_config, stream = cuda.CUstream(0)):
+
+    cArg = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)
+    packed = (ctypes.c_void_p * 1)()
+    packed[0] = ctypes.addressof(cArg)
+
+    err, = cuda.cuLaunchKernel(
+      self.kernel,
+      launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
+      launch_config.block[0], launch_config.block[1], launch_config.block[2],
+      launch_config.shared_memory_capacity,
+      stream,
+      packed,
+      0)
+
+    return err
+
+#################################################################################################
+
+
+#
+class GemmArguments:
+  '''
+  '''
+  def __init__(self):
+    self.problem_size = GemmCoord(0, 0, 0)
+    self.A = TensorRef()
+    self.B = TensorRef()
+    self.C = TensorRef()
+    self.D = TensorRef()
+    self.output_op = LinearCombinationFunctorArguments()
+
+#
+class ThreadblockSwizzle:
+  def __init__(self, threadblock_shape, log_threadblock_cohort = 0):
+    self.threadblock_shape = threadblock_shape
+    self.log_threadblock_cohort = log_threadblock_cohort
+
+  def grid_tiled_shape(self, problem_size):
+    return GemmCoord(
+      ceil_div(problem_size.m, self.threadblock_shape.m),
+      ceil_div(problem_size.n, self.threadblock_shape.n),
+      1)
+
+#
+class Gemm(ExecutableOperation):
+  '''
+  GEMM manages the CUTLASS runtime components
+  '''
+  #
+  def __init__(self, operation):
+    super().__init__(operation)
+
+    self.emitter = EmitGemmUniversalInstance('_type')
+    self.threadblock_swizzle = ThreadblockSwizzle(GemmCoord(128, 128, 8))
+
+    self.threads = 256
+    self.shared_memory_capacity = (32 << 10)
+
+    self.params_A = PredicatedTileAccessIteratorParams(
+      PredicatedTileAccessIteratorDesc(
+        32,
+        1,
+        PitchLinearCoord(128, 8),
+        PitchLinearCoord(1, 4),
+        PitchLinearCoord(1, 2)), 'A')
+
+    self.params_B = PredicatedTileAccessIteratorParams(
+      PredicatedTileAccessIteratorDesc(
+        32,
+        1,
+        PitchLinearCoord(128, 8),
+        PitchLinearCoord(1, 4),
+        PitchLinearCoord(1, 2)), 'B')
+
+    self.params_C = EpilogueTileIteratorParams(
+      EpilogueThreadMap(
+        256,
+        1,
+        32,
+        EpilogueTileDesc(128, 1, 4, 4, 1),
+        EpilogueTileDesc(4, 1, 2, 1, 1),
+        EpilogueTileDesc(32, 1, 8, 1, 1),
+        EpilogueTileDesc(1, 4, 2, 1, 8)), 'C')
+
+    self.params_D = EpilogueTileIteratorParams(
+      EpilogueThreadMap(
+        256,
+        1,
+        32,
+        EpilogueTileDesc(128, 1, 4, 4, 1),
+        EpilogueTileDesc(4, 1, 2, 1, 1),
+        EpilogueTileDesc(32, 1, 8, 1, 1),
+        EpilogueTileDesc(1, 4, 2, 1, 8)), 'D')
+
+    self.output_op = LinearCombinationFunctor()
+
+  #
+  def emit(self):
+    return self.emitter.emit(self.operation)
+
+  #
+  def can_implement(self, configuration, arguments):
+    pass
+
+  #
+  def get_host_workspace_size(self, arguments):
+    return 336
+
+  #
+  def get_device_workspace_size(self, arguments):
+    return 0
+
+  #
+  def plan(self, arguments):
+    grid = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
+    return LaunchConfiguration([grid.m, grid.n, grid.k], [self.threads, 1, 1], self.shared_memory_capacity)
+
+  #
+  def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
+
+    offset = 0
+
+    # Compute intermediate results
+    swizzle_log_tile = 0
+    gemm_mode = 0
+    batch_count = 1
+    gemm_k_size = arguments.problem_size.k
+
+    # Pack into the host workspace buffer
+    offset = arguments.problem_size.pack_into(host_workspace, offset)
+
+    grid_tiled_shape = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
+    offset = grid_tiled_shape.pack_into(host_workspace, offset)
+
+    offset = PackInteger(host_workspace, offset, swizzle_log_tile)
+
+    offset = self.params_A.initialize(host_workspace, offset, arguments.A.layout)
+    offset = self.params_B.initialize(host_workspace, offset, arguments.B.layout)
+    offset = self.params_C.initialize(host_workspace, offset, arguments.C.layout)
+    offset = self.params_D.initialize(host_workspace, offset, arguments.D.layout)
+
+    offset = self.output_op.initialize(host_workspace, offset, arguments.output_op)
+
+    offset = PackInteger(host_workspace, offset, gemm_mode)
+    offset = PackInteger(host_workspace, offset, batch_count)
+    offset = PackInteger(host_workspace, offset, gemm_k_size)
+    offset = PackDevicePointer(host_workspace, offset, int(arguments.A.pointer))
+    offset = PackDevicePointer(host_workspace, offset, int(arguments.B.pointer))
+    offset = PackDevicePointer(host_workspace, offset, int(arguments.C.pointer))
+    offset = PackDevicePointer(host_workspace, offset, int(arguments.D.pointer))
+
+    return offset
+
+
+#################################################################################################
+#
+# Module represents a compilation unit
+#
+#################################################################################################
+
+#
+class CompilationOptions:
+  '''
+  Compilation options.
+  '''
+
+  #
+  def __init__(self, architectures = [80], include_paths = []):
+    self.includes = []
+    self.include_paths = include_paths
+    self.flags = ['-std=c++11', '-default-device']
+    self.architectures = architectures
+
+  #
+  def get(self):
+    options = []
+
+    for flag in self.flags:
+      options.append(bytes(str.encode(flag)))
+
+    for incl in self.include_paths:
+      options.append(bytes(str.encode('--include-path=%s' % incl)))
+
+    arch_list = "-arch="
+    for idx, arch in enumerate(self.architectures):
+      if idx:
+        arch_list += ","
+      arch_list += "sm_%d" % arch
+
+    options.append(bytes(str.encode(arch_list)))
+
+    return options
+
+IncludeTemplate = r'''#include "${include}"
+'''
+
+KernelTemplate = r'''
+extern "C"
+__global__ void
+${operation_name}(${operation_name}${operation_suffix}::Params params) {
+
+  // Dynamic shared memory base pointer
+  extern __shared__ int SharedStorageBase[];
+
+  // Declare pointer to dynamic shared memory.
+  ${operation_name}${operation_suffix}::SharedStorage *shared_storage =
+      reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
+
+  ${operation_name}${operation_suffix} op;
+
+  op(params, *shared_storage);
+}
+
+'''
+
+#
+class Module:
+  def __init__(self, name, operations, compilation_options):
+    self.name = name
+    self.operations = operations
+    self.module = None
+    self.log = None
+    self.cubin_image = None
+    self.source_buffer = ''
+
+    #
+    # Emit source
+    #
+    self.emit_()
+
+    #
+    # Compile
+    #
+    self.compile_(compilation_options)
+
+    #
+    # Load module
+    #
+    self.load_()
+
+    # Done
+    return
+
+  # Emit a source buffer
+  def emit_(self):
+
+    # 1. Includes
+    includes = []
+    for operation in self.operations:
+      for incl in operation.emitter.includes:
+        if incl not in includes:
+          includes.append(incl)
+
+    for incl in includes:
+      self.source_buffer += SubstituteTemplate(IncludeTemplate, { 'include': incl} )
+
+    # 2. Operations
+    for operation in self.operations:
+      self.source_buffer += operation.emit()
+      values = {
+        'operation_name': operation.name(),
+        'operation_suffix': operation.emitter.operation_suffix
+      }
+      self.source_buffer += SubstituteTemplate(KernelTemplate, values)
+
+    # Done
+    return
+
+  # Compile with NVRTC
+  def compile_(self, compilation_options):
+
+    err, program = nvrtc.nvrtcCreateProgram(
+      str.encode(self.source_buffer),
+      bytes(str.encode(self.name)),
+      0, [], [])
+
+    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+      raise RuntimeError('NVRTC Error: {}'.format(err))
+
+    # Compile program
+    options = compilation_options.get()
+
+    err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
+    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+
+      error_string = 'NVRTC Error: {}\n'.format(err)
+
+      # Get log from compilation
+      err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
+      if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+        raise RuntimeError('NVRTC Error: {}'.format(err))
+
+      self.log = b' ' * logSize
+      err, = nvrtc.nvrtcGetProgramLog(program, self.log)
+      if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+        raise RuntimeError('NVRTC Error: {}'.format(err))
+
+      raise RuntimeError(error_string + self.log.decode() + self.source_buffer)
+
+    # Get data from compilation
+    err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
+    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+      raise RuntimeError('NVRTC Error: {}'.format(err))
+
+    self.cubin_image = b' ' * dataSize
+    err, = nvrtc.nvrtcGetCUBIN(program, self.cubin_image)
+    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+      raise RuntimeError('NVRTC Error: {}'.format(err))
+
+    return
+
+  #
+  def load_(self):
+
+    # Load data as module data
+    err, self.module = cuda.cuModuleLoadData(self.cubin_image)
+    if err != cuda.CUresult.CUDA_SUCCESS:
+      raise RuntimeError('Cuda Error: {}'.format(err))
+
+    # Get functions
+    for operation in self.operations:
+      err, operation.kernel = cuda.cuModuleGetFunction(
+        self.module,
+        bytes(str.encode(operation.name())))
+
+      if err != cuda.CUresult.CUDA_SUCCESS:
+        raise RuntimeError('Cuda Error: {}'.format(err))
+
+      operation.module = self
+
+    return
+
+
+#################################################################################################
+#
+# Manifest represents an 'owner' for modules and operations
+#
+#################################################################################################
+
+#
+class Manifest:
+
+  #
+  def __init__(self):
+    self.operations = {}
+    self.modules = []
+    pass
+
+  #
+  def append_module(self, module):
+    '''
+    Appends a module and takes ownership of operations used to construct it.
+    '''
+
+    self.modules.append(module)
+
+    for operation in module.operations:
+      self.operations[operation.name()] = operation
+
+
+#################################################################################################