warp-lang 0.10.1__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic; see the registry's advisory page for more details.

Files changed (300)
  1. warp/__init__.py +10 -4
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +5 -3
  6. warp/build_dll.py +29 -9
  7. warp/builtins.py +868 -507
  8. warp/codegen.py +1074 -638
  9. warp/config.py +3 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +715 -222
  12. warp/fabric.py +326 -0
  13. warp/fem/__init__.py +27 -0
  14. warp/fem/cache.py +389 -0
  15. warp/fem/dirichlet.py +181 -0
  16. warp/fem/domain.py +263 -0
  17. warp/fem/field/__init__.py +101 -0
  18. warp/fem/field/field.py +149 -0
  19. warp/fem/field/nodal_field.py +299 -0
  20. warp/fem/field/restriction.py +21 -0
  21. warp/fem/field/test.py +181 -0
  22. warp/fem/field/trial.py +183 -0
  23. warp/fem/geometry/__init__.py +19 -0
  24. warp/fem/geometry/closest_point.py +70 -0
  25. warp/fem/geometry/deformed_geometry.py +271 -0
  26. warp/fem/geometry/element.py +744 -0
  27. warp/fem/geometry/geometry.py +186 -0
  28. warp/fem/geometry/grid_2d.py +373 -0
  29. warp/fem/geometry/grid_3d.py +435 -0
  30. warp/fem/geometry/hexmesh.py +953 -0
  31. warp/fem/geometry/partition.py +376 -0
  32. warp/fem/geometry/quadmesh_2d.py +532 -0
  33. warp/fem/geometry/tetmesh.py +840 -0
  34. warp/fem/geometry/trimesh_2d.py +577 -0
  35. warp/fem/integrate.py +1616 -0
  36. warp/fem/operator.py +191 -0
  37. warp/fem/polynomial.py +213 -0
  38. warp/fem/quadrature/__init__.py +2 -0
  39. warp/fem/quadrature/pic_quadrature.py +245 -0
  40. warp/fem/quadrature/quadrature.py +294 -0
  41. warp/fem/space/__init__.py +292 -0
  42. warp/fem/space/basis_space.py +489 -0
  43. warp/fem/space/collocated_function_space.py +105 -0
  44. warp/fem/space/dof_mapper.py +236 -0
  45. warp/fem/space/function_space.py +145 -0
  46. warp/fem/space/grid_2d_function_space.py +267 -0
  47. warp/fem/space/grid_3d_function_space.py +306 -0
  48. warp/fem/space/hexmesh_function_space.py +352 -0
  49. warp/fem/space/partition.py +350 -0
  50. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  51. warp/fem/space/restriction.py +160 -0
  52. warp/fem/space/shape/__init__.py +15 -0
  53. warp/fem/space/shape/cube_shape_function.py +738 -0
  54. warp/fem/space/shape/shape_function.py +103 -0
  55. warp/fem/space/shape/square_shape_function.py +611 -0
  56. warp/fem/space/shape/tet_shape_function.py +567 -0
  57. warp/fem/space/shape/triangle_shape_function.py +429 -0
  58. warp/fem/space/tetmesh_function_space.py +292 -0
  59. warp/fem/space/topology.py +295 -0
  60. warp/fem/space/trimesh_2d_function_space.py +221 -0
  61. warp/fem/types.py +77 -0
  62. warp/fem/utils.py +495 -0
  63. warp/native/array.h +147 -44
  64. warp/native/builtin.h +122 -149
  65. warp/native/bvh.cpp +73 -325
  66. warp/native/bvh.cu +406 -23
  67. warp/native/bvh.h +34 -43
  68. warp/native/clang/clang.cpp +13 -8
  69. warp/native/crt.h +2 -0
  70. warp/native/cuda_crt.h +5 -0
  71. warp/native/cuda_util.cpp +15 -3
  72. warp/native/cuda_util.h +3 -1
  73. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  74. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  75. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  76. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  77. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  78. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  79. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  80. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  133. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  134. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  135. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  136. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  137. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  138. warp/native/cutlass_gemm.cu +5 -3
  139. warp/native/exports.h +1240 -952
  140. warp/native/fabric.h +228 -0
  141. warp/native/hashgrid.cpp +4 -4
  142. warp/native/hashgrid.h +22 -2
  143. warp/native/intersect.h +22 -7
  144. warp/native/intersect_adj.h +8 -8
  145. warp/native/intersect_tri.h +1 -1
  146. warp/native/marching.cu +157 -161
  147. warp/native/mat.h +80 -19
  148. warp/native/matnn.h +2 -2
  149. warp/native/mesh.cpp +33 -108
  150. warp/native/mesh.cu +114 -23
  151. warp/native/mesh.h +446 -46
  152. warp/native/noise.h +272 -329
  153. warp/native/quat.h +51 -8
  154. warp/native/rand.h +45 -35
  155. warp/native/range.h +6 -2
  156. warp/native/reduce.cpp +1 -1
  157. warp/native/reduce.cu +10 -12
  158. warp/native/runlength_encode.cu +6 -10
  159. warp/native/scan.cu +8 -11
  160. warp/native/sparse.cpp +4 -4
  161. warp/native/sparse.cu +164 -154
  162. warp/native/spatial.h +2 -2
  163. warp/native/temp_buffer.h +14 -30
  164. warp/native/vec.h +107 -23
  165. warp/native/volume.h +120 -0
  166. warp/native/warp.cpp +560 -30
  167. warp/native/warp.cu +431 -44
  168. warp/native/warp.h +13 -4
  169. warp/optim/__init__.py +1 -0
  170. warp/optim/linear.py +922 -0
  171. warp/optim/sgd.py +92 -0
  172. warp/render/render_opengl.py +335 -119
  173. warp/render/render_usd.py +11 -11
  174. warp/sim/__init__.py +2 -2
  175. warp/sim/articulation.py +385 -185
  176. warp/sim/collide.py +8 -0
  177. warp/sim/import_mjcf.py +297 -106
  178. warp/sim/import_urdf.py +389 -210
  179. warp/sim/import_usd.py +198 -97
  180. warp/sim/inertia.py +17 -18
  181. warp/sim/integrator_euler.py +14 -8
  182. warp/sim/integrator_xpbd.py +158 -16
  183. warp/sim/model.py +795 -291
  184. warp/sim/render.py +3 -3
  185. warp/sim/utils.py +3 -0
  186. warp/sparse.py +640 -150
  187. warp/stubs.py +606 -267
  188. warp/tape.py +61 -10
  189. warp/tests/__main__.py +3 -6
  190. warp/tests/assets/curlnoise_golden.npy +0 -0
  191. warp/tests/assets/pnoise_golden.npy +0 -0
  192. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  193. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  194. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  195. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  196. warp/tests/aux_test_unresolved_func.py +14 -0
  197. warp/tests/aux_test_unresolved_symbol.py +14 -0
  198. warp/tests/disabled_kinematics.py +239 -0
  199. warp/tests/run_coverage_serial.py +31 -0
  200. warp/tests/test_adam.py +103 -106
  201. warp/tests/test_arithmetic.py +128 -74
  202. warp/tests/test_array.py +212 -97
  203. warp/tests/test_array_reduce.py +57 -23
  204. warp/tests/test_atomic.py +64 -28
  205. warp/tests/test_bool.py +99 -0
  206. warp/tests/test_builtins_resolution.py +1292 -0
  207. warp/tests/test_bvh.py +42 -18
  208. warp/tests/test_closest_point_edge_edge.py +54 -57
  209. warp/tests/test_codegen.py +208 -130
  210. warp/tests/test_compile_consts.py +28 -20
  211. warp/tests/test_conditional.py +108 -24
  212. warp/tests/test_copy.py +10 -12
  213. warp/tests/test_ctypes.py +112 -88
  214. warp/tests/test_dense.py +21 -14
  215. warp/tests/test_devices.py +98 -0
  216. warp/tests/test_dlpack.py +75 -75
  217. warp/tests/test_examples.py +277 -0
  218. warp/tests/test_fabricarray.py +955 -0
  219. warp/tests/test_fast_math.py +15 -11
  220. warp/tests/test_fem.py +1271 -0
  221. warp/tests/test_fp16.py +53 -19
  222. warp/tests/test_func.py +187 -86
  223. warp/tests/test_generics.py +194 -49
  224. warp/tests/test_grad.py +178 -109
  225. warp/tests/test_grad_customs.py +176 -0
  226. warp/tests/test_hash_grid.py +52 -37
  227. warp/tests/test_import.py +10 -23
  228. warp/tests/test_indexedarray.py +32 -31
  229. warp/tests/test_intersect.py +18 -9
  230. warp/tests/test_large.py +141 -0
  231. warp/tests/test_launch.py +14 -41
  232. warp/tests/test_lerp.py +64 -65
  233. warp/tests/test_linear_solvers.py +154 -0
  234. warp/tests/test_lvalue.py +493 -0
  235. warp/tests/test_marching_cubes.py +12 -13
  236. warp/tests/test_mat.py +517 -2898
  237. warp/tests/test_mat_lite.py +115 -0
  238. warp/tests/test_mat_scalar_ops.py +2889 -0
  239. warp/tests/test_math.py +103 -9
  240. warp/tests/test_matmul.py +305 -69
  241. warp/tests/test_matmul_lite.py +410 -0
  242. warp/tests/test_mesh.py +71 -14
  243. warp/tests/test_mesh_query_aabb.py +41 -25
  244. warp/tests/test_mesh_query_point.py +140 -22
  245. warp/tests/test_mesh_query_ray.py +39 -22
  246. warp/tests/test_mlp.py +30 -22
  247. warp/tests/test_model.py +92 -89
  248. warp/tests/test_modules_lite.py +39 -0
  249. warp/tests/test_multigpu.py +88 -114
  250. warp/tests/test_noise.py +12 -11
  251. warp/tests/test_operators.py +16 -20
  252. warp/tests/test_options.py +11 -11
  253. warp/tests/test_pinned.py +17 -18
  254. warp/tests/test_print.py +32 -11
  255. warp/tests/test_quat.py +275 -129
  256. warp/tests/test_rand.py +18 -16
  257. warp/tests/test_reload.py +38 -34
  258. warp/tests/test_rounding.py +50 -43
  259. warp/tests/test_runlength_encode.py +168 -20
  260. warp/tests/test_smoothstep.py +9 -11
  261. warp/tests/test_snippet.py +143 -0
  262. warp/tests/test_sparse.py +261 -63
  263. warp/tests/test_spatial.py +276 -243
  264. warp/tests/test_streams.py +110 -85
  265. warp/tests/test_struct.py +268 -63
  266. warp/tests/test_tape.py +39 -21
  267. warp/tests/test_torch.py +118 -89
  268. warp/tests/test_transient_module.py +12 -13
  269. warp/tests/test_types.py +614 -0
  270. warp/tests/test_utils.py +494 -0
  271. warp/tests/test_vec.py +354 -2050
  272. warp/tests/test_vec_lite.py +73 -0
  273. warp/tests/test_vec_scalar_ops.py +2099 -0
  274. warp/tests/test_volume.py +457 -293
  275. warp/tests/test_volume_write.py +124 -134
  276. warp/tests/unittest_serial.py +35 -0
  277. warp/tests/unittest_suites.py +341 -0
  278. warp/tests/unittest_utils.py +568 -0
  279. warp/tests/unused_test_misc.py +71 -0
  280. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  281. warp/thirdparty/appdirs.py +36 -45
  282. warp/thirdparty/unittest_parallel.py +549 -0
  283. warp/torch.py +9 -6
  284. warp/types.py +1089 -366
  285. warp/utils.py +93 -387
  286. warp_lang-0.11.0.dist-info/METADATA +238 -0
  287. warp_lang-0.11.0.dist-info/RECORD +332 -0
  288. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  289. warp/tests/test_all.py +0 -219
  290. warp/tests/test_array_scan.py +0 -60
  291. warp/tests/test_base.py +0 -208
  292. warp/tests/test_unresolved_func.py +0 -7
  293. warp/tests/test_unresolved_symbol.py +0 -7
  294. warp_lang-0.10.1.dist-info/METADATA +0 -21
  295. warp_lang-0.10.1.dist-info/RECORD +0 -188
  296. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  297. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  298. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  299. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/LICENSE.md +0 -0
  300. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1026 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ ################################################################################
32
+
33
+ from ast import Num
34
+ from audioop import mul
35
+ from pipes import Template
36
+ import struct
37
+ from pycutlass.library import DataTypeTag
38
+ from pycutlass import *
39
+ import cutlass
40
+ from scipy.special import erf
41
+
42
+ from pycutlass.c_types import MatrixCoord_
43
+ from pycutlass.frontend import NumpyFrontend
44
+
45
+ from cuda import cuda
46
+ from cuda import cudart
47
+
48
# Map cutlass numeric types to the ctypes type with a matching in-memory
# layout. float16 has no native ctypes equivalent, so its 16 storage bits
# are carried in a c_uint16.
dtype2ctype = {
    cutlass.float16: ctypes.c_uint16,
    cutlass.float32: ctypes.c_float,
    cutlass.float64: ctypes.c_double,
    cutlass.int32: ctypes.c_int32,
}
54
+
55
+
56
+ #################################################################################################
57
+ #
58
+ # Epilogue Functors
59
+ #
60
+ #################################################################################################
61
+
62
class EpilogueFunctorBase:
    """Base class for thread-level epilogue functors.

    Subclasses define a ``tag`` (the fully qualified C++ functor name) and a
    list of already-emitted template arguments; :meth:`emit` renders the
    instantiated C++ type name.
    """

    def __init__(self) -> None:
        pass

    def emit(self, tag, template_argument):
        """Return ``tag<arg0, arg1, ...>`` as C++ source text.

        :param tag: fully qualified C++ functor name
        :param template_argument: iterable of template-argument strings
        """
        template = """${tag}<${arguments}>"""
        values = {
            "tag": tag,
            # ", ".join replaces the original manual index loop that
            # appended separators by hand.
            "arguments": ", ".join(template_argument),
        }
        return SubstituteTemplate(template, values)
82
+
83
+
84
+
85
class LinearCombination(EpilogueFunctorBase):
    """Apply a linear combination operator to an array of elements:
        D = alpha * accumulator + beta * source

    :param element_output: data type used to load and store tensors
    :param epilogue_vector_length: number of elements computed per operation.
        Usually 128/sizeof_bits<ElementOutput_>, but 64 and 32 are used when
        there are not enough data to store
    :param element_accumulator: accumulator data type (defaults to the
        output type)
    :param element_epilogue: data type used to compute the linear
        combination (defaults to the output type)
    """

    tag = "cutlass::epilogue::thread::LinearCombination"

    def __init__(self, element_output, epilogue_vector_length,
                 element_accumulator=None, element_epilogue=None) -> None:
        # TODO bind ScaleType
        super().__init__()

        if element_accumulator is None:
            element_accumulator = element_output
        if element_epilogue is None:
            element_epilogue = element_output

        self.element_output = element_output
        self.element_accumulator = element_accumulator
        self.element_epilogue = element_epilogue

        self.template_arguments = [
            DataTypeTag[element_output],
            str(epilogue_vector_length),
            DataTypeTag[element_accumulator],
            DataTypeTag[element_epilogue],
        ]

        # Build the ctypes struct passed as the epilogue output-op params;
        # close over the epilogue dtype so instances can convert alpha/beta.
        epilogue_ctype = dtype2ctype[self.element_epilogue]
        epilogue_dtype = self.element_epilogue

        class _EpilogueOutputOpParams(ctypes.Structure):
            _fields_ = [
                ("alpha_data", ctypes.c_longlong * 2),
                ("beta_data", ctypes.c_longlong * 2),
                ("alpha", epilogue_ctype),
                ("beta", epilogue_ctype),
                ("alpha_ptr", ctypes.c_void_p),
                ("beta_ptr", ctypes.c_void_p),
            ]

            def __init__(self, alpha, beta, *args) -> None:
                # .storage yields the raw bit pattern the C++ side expects
                self.alpha = epilogue_dtype(alpha).storage
                self.beta = epilogue_dtype(beta).storage

        self.epilogue_type = _EpilogueOutputOpParams

    def emit(self):
        """Render the instantiated C++ type name for this functor."""
        return super().emit(self.tag, self.template_arguments)
140
+
141
+
142
class LinearCombinationClamp(LinearCombination):
    """Linear combination followed by a clamp before converting to the
    output element type:
        D = alpha * accumulator + beta * source + uniform

    :param element_output: data type used to load and store tensors
    :param epilogue_vector_length: number of elements computed per operation.
        Usually 128/sizeof_bits<ElementOutput_>, but 64 and 32 are used when
        there are not enough data to store
    :param element_accumulator: accumulator data type
    :param element_epilogue: data type used to compute the linear combination
    """

    tag = "cutlass::epilogue::thread::LinearCombinationClamp"

    def __init__(self, element_output, epilogue_vector_length,
                 element_accumulator=None, element_epilogue=None) -> None:
        # The base constructor resolves element-type defaults and builds the
        # template argument list.
        super().__init__(element_output, epilogue_vector_length,
                         element_accumulator, element_epilogue)

        epilogue_ctype = dtype2ctype[self.element_epilogue]
        epilogue_dtype = self.element_epilogue

        # Override the base parameter struct with one that omits the
        # alpha_data/beta_data payload fields.
        class _EpilogueOutputOpParams(ctypes.Structure):
            _fields_ = [
                ("alpha", epilogue_ctype),
                ("beta", epilogue_ctype),
                ("alpha_ptr", ctypes.c_void_p),
                ("beta_ptr", ctypes.c_void_p),
            ]

            def __init__(self, alpha, beta, *args) -> None:
                self.alpha = epilogue_dtype(alpha).storage
                self.beta = epilogue_dtype(beta).storage

        self.epilogue_type = _EpilogueOutputOpParams
182
+
183
+
184
class FastLinearCombinationClamp(EpilogueFunctorBase):
    """Linear combination then clamp, using the fast int8 path.

    Note: only valid when problem_size_K <= 256 for signed int8 gemm or
    problem_size_K <= 128 for unsigned int8 gemm; the default approach is
    LinearCombinationClamp.

    :param element_output: data type used to load and store tensors
    :param epilogue_vector_length: number of elements computed per operation.
        Usually 128/sizeof_bits<ElementOutput_>, but 64 and 32 are used when
        there are not enough data to store
    """

    tag = "cutlass::epilogue::thread::FastLinearCombinationClamp"

    def __init__(self, element_output, epilogue_vector_length, *args) -> None:
        super().__init__()

        self.template_arguments = [
            DataTypeTag[element_output],
            str(epilogue_vector_length),
        ]

        # This fast path fixes the accumulator and epilogue compute types.
        self.element_accumulator = cutlass.int32
        self.element_epilogue = cutlass.float32

        epilogue_ctype = dtype2ctype[self.element_epilogue]
        epilogue_dtype = self.element_epilogue

        class _EpilogueOutputOpParams(ctypes.Structure):
            _fields_ = [
                ("alpha", epilogue_ctype),
                ("beta", epilogue_ctype),
                ("alpha_ptr", ctypes.c_void_p),
                ("beta_ptr", ctypes.c_void_p),
            ]

            def __init__(self, alpha, beta, *args) -> None:
                self.alpha = epilogue_dtype(alpha).storage
                self.beta = epilogue_dtype(beta).storage

        self.epilogue_type = _EpilogueOutputOpParams

    def emit(self):
        """Render the instantiated C++ type name for this functor."""
        return super().emit(self.tag, self.template_arguments)
230
+
231
+
232
class LinearCombinationGeneric(LinearCombination):
    """Linear combination followed by an activation function:
        D = activation(alpha * accumulator + beta * source)

    :param activation_functor: input activation functor
    :param element_output: data type used to load and store tensors
    :param epilogue_vector_length: number of elements computed per operation.
        Usually 128/sizeof_bits<ElementOutput_>, but 64 and 32 are used when
        there are not enough data to store
    :param element_accumulator: accumulator data type
    :param element_epilogue: data type used to compute linear combination
    """

    tag = "cutlass::epilogue::thread::LinearCombinationGeneric"

    def __init__(self, activation_functor,
                 element_output, epilogue_vector_length,
                 element_accumulator=None, element_epilogue=None) -> None:
        super().__init__(element_output, epilogue_vector_length,
                         element_accumulator, element_epilogue)

        # The activation's C++ tag leads the template argument list.
        self.template_arguments = [activation_functor.emit()] + self.template_arguments

        self.activation_functor = activation_functor
        # NOTE(review): unlike the base class, this stores the raw
        # caller-supplied value (possibly None), not the resolved default —
        # preserved as-is.
        self.element_epilogue = element_epilogue

        # The parameter struct is provided by the activation functor.
        self.epilogue_type = self.activation_functor.epilogue_output_op(self.element_epilogue)
268
+
269
+
270
class ActivationFunctor:
    """Base class for frequently used activation functions."""

    def __init__(self, element_compute) -> None:
        pass

    @staticmethod
    def numpy(x: np.ndarray):
        """Numpy reference implementation; must be overridden."""
        raise NotImplementedError()

    def emit(self):
        # Activations emit their bare C++ tag (no template arguments).
        return self.tag

    @staticmethod
    def epilogue_output_op(element_epilogue):
        """Build the default ctypes parameter struct (alpha/beta scalars
        plus their device-pointer slots)."""
        c_element_epilogue = dtype2ctype[element_epilogue]

        class _EpilogueOutputOpParams(ctypes.Structure):
            _fields_ = [
                ("alpha", c_element_epilogue),
                ("beta", c_element_epilogue),
                ("alpha_ptr", ctypes.c_void_p),
                ("beta_ptr", ctypes.c_void_p),
            ]

            def __init__(self, alpha, beta, *args) -> None:
                self.alpha = element_epilogue(alpha).storage
                self.beta = element_epilogue(beta).storage

        return _EpilogueOutputOpParams
298
+
299
+ # identity operator
300
class identity(ActivationFunctor):
    """Identity activation: returns its input unchanged."""

    # @staticmethod added to match the base-class declaration and every
    # sibling functor; without it, calling numpy() on an *instance* would
    # bind the array argument as `self`.
    @staticmethod
    def numpy(x: np.ndarray):
        return x
303
+
304
+ # ReLu operator,
305
class relu(ActivationFunctor):
    """ReLu activation functor."""

    tag = "cutlass::epilogue::thread::ReLu"

    def __init__(self, element_compute):
        super().__init__(element_compute)

        class _Arguments(ctypes.Structure):
            # `threshold` is the cutoff below which outputs are zeroed.
            _fields_ = [("threshold", dtype2ctype[element_compute])]

            def __init__(self, threshold=0.) -> None:
                self.threshold = element_compute(threshold).storage

        self.argument_type = _Arguments

    def emit_visitor(self):
        return "cutlass::ReLUVisitor"

    @staticmethod
    def numpy(x: np.ndarray):
        """Numpy reference: elementwise max(x, 0)."""
        return np.maximum(x, 0)
324
+
325
+ # Leaky ReLu operator
326
class leaky_relu(ActivationFunctor):
    """Leaky ReLu activation functor."""

    tag = "cutlass::epilogue::thread::LeakyReLU"

    def __init__(self, element_compute) -> None:
        super().__init__(element_compute)

        class _Arguments(ctypes.Structure):
            # Slope applied to the negative half of the input.
            _fields_ = [("leaky_alpha", dtype2ctype[element_compute])]

            def __init__(self, leaky_alpha) -> None:
                self.leaky_alpha = element_compute(leaky_alpha).storage

        self.argument_type = _Arguments

    def emit_visitor(self):
        return "cutlass::LeakyReLUVisitor"

    @staticmethod
    def numpy(x: np.ndarray, leaky_alpha):
        """Numpy reference: x where x > 0, leaky_alpha * x elsewhere."""
        return np.maximum(x, 0) + np.minimum(x, 0) * leaky_alpha

    def epilogue_output_op(self, element_epilogue):
        """Parameter struct extended with the extra `leaky_alpha` scalar.

        Overrides the base staticmethod as an instance method; unlike the
        base struct this one also zero-initializes alpha_ptr/beta_ptr.
        """
        c_element_epilogue = dtype2ctype[element_epilogue]

        class _EpilogueOutputOpParams(ctypes.Structure):
            _fields_ = [
                ("alpha", c_element_epilogue),
                ("beta", c_element_epilogue),
                ("alpha_ptr", ctypes.c_void_p),
                ("beta_ptr", ctypes.c_void_p),
                ("leaky_alpha", c_element_epilogue),
            ]

            def __init__(self, alpha, beta, leaky_alpha=0.2, *args) -> None:
                self.alpha = element_epilogue(alpha).storage
                self.beta = element_epilogue(beta).storage
                self.alpha_ptr = 0
                self.beta_ptr = 0
                self.leaky_alpha = element_epilogue(leaky_alpha).storage

        return _EpilogueOutputOpParams
363
+
364
+ # Tanh operator
365
class tanh(ActivationFunctor):
    """Tanh activation functor."""

    tag = "cutlass::epilogue::thread::Tanh"

    def __init__(self, element_compute) -> None:
        super().__init__(element_compute)

        class _Arguments(ctypes.Structure):
            # Tanh takes no runtime parameters; `tmp` keeps the struct
            # non-empty.
            _fields_ = [("tmp", ctypes.c_int)]

            def __init__(self, *args) -> None:
                self.tmp = 0

        self.argument_type = _Arguments

    def emit_visitor(self):
        return "cutlass::TanhVisitor"

    @staticmethod
    def numpy(x: np.ndarray):
        """Numpy reference: elementwise tanh."""
        return np.tanh(x)
384
+
385
def sigmoid_op(x: np.ndarray):
    """Numpy reference of the logistic sigmoid: 1 / (1 + e^-x)."""
    exp_neg = np.exp(-x)
    return 1. / (1. + exp_neg)
387
+
388
+ # Sigmoid operator
389
class sigmoid(ActivationFunctor):
    """Sigmoid activation functor."""

    tag = "cutlass::epilogue::thread::Sigmoid"

    @staticmethod
    def numpy(x: np.ndarray):
        """Numpy reference; delegates to the module-level sigmoid_op."""
        return sigmoid_op(x)
395
+
396
+ # SiLu operator
397
class silu(ActivationFunctor):
    """SiLu (sigmoid-weighted linear) activation functor."""

    tag = "cutlass::epilogue::thread::SiLu"

    @staticmethod
    def numpy(x: np.ndarray):
        """Numpy reference: x * sigmoid(x)."""
        return sigmoid_op(x) * x
403
+
404
+ # Hardswish operator
405
class hardswish(ActivationFunctor):
    """Hardswish activation functor."""

    tag = "cutlass::epilogue::thread::HardSwish"

    @staticmethod
    def numpy(x: np.ndarray):
        """Numpy reference: x * relu6(x + 3) / 6."""
        # np.clip(a, 0, 6) is equivalent to min(max(a, 0), 6).
        relu6 = np.clip(x + 3., 0., 6.)
        return x * relu6 / 6.
412
+
413
+ # GELU operator
414
# GELU operator
class gelu(ActivationFunctor):
    """Exact (erf-based) GELU activation functor."""

    tag = "cutlass::epilogue::thread::GELU"

    @staticmethod
    def numpy(x: np.ndarray):
        # GELU(x) = x * Phi(x), with Phi the standard normal CDF.
        return 0.5 * x * (1 + erf(x / np.sqrt(2.)))
420
+
421
+ # reduction operator
422
# reduction operator
def reduction_op(tensor, direction, math, factor):
    """Host reference for the epilogue partial-reduction visitors.

    :param tensor: (batch, m, n) array to reduce.
    :param direction: "row" or "column" — axis along which partials are formed.
    :param math: reduction operator; only "Add" is implemented.
    :param factor: tile extent along the reduced dimension; one partial
        sum is produced per ``factor``-wide slice.
    :return: flattened array of partial sums.
    :raises NotImplementedError: for unsupported ``math`` or ``direction``.
    """
    batch, m, n = tensor.shape
    if math != "Add":
        raise NotImplementedError
    if direction == "row":
        # One partial per threadblock tile along n, laid out tile-major.
        num_cta_n = (n + factor - 1) // factor
        partial = np.sum(tensor.reshape(batch, m, num_cta_n, factor), axis=-1)
        return np.transpose(partial, axes=[0, 2, 1]).flatten()
    if direction == "column":
        # One partial per threadblock tile along m.
        num_cta_m = (m + factor - 1) // factor
        return np.sum(tensor.reshape(batch, num_cta_m, factor, n), axis=-2).flatten()
    raise NotImplementedError
439
+
440
+ # # GELU operator implemented using the taylor series approximation
441
+ # class GELU_taylor(ActivationFunctor):
442
+ # tag = "cutlass::epilogue::thread::GELU_taylor"
443
+
444
+ # # Computes backwards pass for GELU operator
445
+ # class dGELU(ActivationFunctor):
446
+ # tag = "cutlass::epilogue::thread::dGELU"
447
+
448
+ ################################################################################
449
+ # Epilogue Visitor
450
+ ################################################################################
451
+
452
+
453
class LayerNorm(EpilogueFunctorBase):
    """
    Epilogue visitor that fuses layer-norm statistics into the GEMM epilogue.

    Applies the wrapped elementwise functor
    (D = alpha * accumulator + beta * source) and additionally produces
    per-row variance / mean (and shifted-K) partials consumed by a
    follow-up layer-norm kernel.

    :param elementwise_functor: functor providing the element-wise epilogue;
        its ``element_epilogue`` / ``element_output`` types are reused here.
    :param element_variance: storage type of the variance partials
        (defaults to the functor's output type).
    :param element_mean: storage type of the mean partials
        (defaults to the functor's output type).
    :param element_layer_norm_compute: compute type for the layer-norm
        statistics (defaults to the functor's compute type).
    :param shifted_k: whether the shifted-K variance formulation is used.
    """
    KernelTemplate = """
cutlass::epilogue::threadblock::EpilogueVisitorLayerNorm<
    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
    ${operation_name}_default::kThreadCount,
    ${operation_name}_default::Epilogue::OutputTileIterator,
    ${operation_name}_default::Epilogue::AccumulatorFragmentIterator::AccumulatorTile,
    ${element_compute}, // element_compute
    ${element_variance}, // element_variance
    ${element_mean}, // element_mean
    ${element_layer_norm_compute}, // element_layer_norm_compute
    ${epilogue_functor},
    ${shifted_k}>;
"""
    headers = ["gemm/gemm_universal_with_visitor.h",
               "epilogue/epilogue_visitor_with_layernorm.h"]

    def __init__(
            self, elementwise_functor,
            element_variance=None, element_mean=None,
            element_layer_norm_compute=None, shifted_k=True) -> None:  # TODO bind ScaleType
        super().__init__()

        self.elementwise_functor = elementwise_functor
        self.element_compute = elementwise_functor.element_epilogue
        self.element_output = elementwise_functor.element_output

        # BUGFIX: explicit overrides used to be silently dropped — the
        # attribute was only assigned in the None branch, so emit() raised
        # AttributeError whenever a caller passed a non-default type.
        self.element_variance = self.element_output if element_variance is None else element_variance
        self.element_mean = self.element_output if element_mean is None else element_mean
        self.element_layer_norm_compute = (
            self.element_compute if element_layer_norm_compute is None else element_layer_norm_compute)
        # The template expects a C++ boolean literal.
        self.shifted_k = "true" if shifted_k else "false"

        # get epilogue output op
        elementwise_params_type = self.elementwise_functor.epilogue_type

        class _EpilogueVisitorParams(ctypes.Structure):
            # Layout must match the C++ visitor's params struct; do not reorder.
            _fields_ = [
                ("element_wise", elementwise_params_type),
                ("ptr_Variance", ctypes.c_void_p),
                ("ptr_Mean_", ctypes.c_void_p),
                ("ptr_Shifted_K_", ctypes.c_void_p),
                ("extent", MatrixCoord_)
            ]

            def __init__(self, elementwise_params, variance, mean, shift_k, extent) -> None:
                self.element_wise = elementwise_params
                # NOTE(review): device buffers and extent are only initialized
                # for numpy inputs; other argument kinds leave the pointer
                # fields unset — confirm callers always pass ndarrays here.
                if isinstance(variance, np.ndarray):
                    self.buffer_variance = NumpyFrontend.argument(variance, False)
                    self.buffer_mean = NumpyFrontend.argument(mean, False)
                    self.buffer_shift_k = NumpyFrontend.argument(shift_k, False)
                    self.ptr_Variance = int(self.buffer_variance.ptr)
                    self.ptr_Mean_ = int(self.buffer_mean.ptr)
                    self.ptr_Shifted_K_ = int(self.buffer_shift_k.ptr)
                    self.extent = MatrixCoord_(extent[0], extent[1])

                    # Keep host arrays so sync() can copy results back.
                    self.host_variance = variance
                    self.host_mean = mean
                    self.host_shift_k = shift_k

            def sync(self, stream_sync=True):
                """Copy variance/mean/shifted-K partials back to the host arrays."""
                if stream_sync:
                    err, = cudart.cudaDeviceSynchronize()
                    if err != cuda.CUresult.CUDA_SUCCESS:
                        raise RuntimeError("CUDA Error %s" % str(err))

                # NOTE(review): only the status of the last copy is checked,
                # and cudart's return code is compared against cuda.CUresult —
                # both use 0 for success, but the enums are distinct.
                err, = cuda.cuMemcpyDtoH(
                    self.host_variance, cuda.CUdeviceptr(self.ptr_Variance),
                    self.host_variance.size * self.host_variance.itemsize)
                err, = cuda.cuMemcpyDtoH(
                    self.host_mean, cuda.CUdeviceptr(self.ptr_Mean_),
                    self.host_mean.size * self.host_mean.itemsize)
                err, = cuda.cuMemcpyDtoH(
                    self.host_shift_k, cuda.CUdeviceptr(self.ptr_Shifted_K_),
                    self.host_shift_k.size * self.host_shift_k.itemsize)
                if err != cuda.CUresult.CUDA_SUCCESS:
                    raise RuntimeError("CUDA Error %s" % str(err))

        self.epilogue_type = _EpilogueVisitorParams

    def emit(self, operation):
        """Render the C++ visitor type for ``operation`` from KernelTemplate."""
        values = {
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            'operation_name': operation.procedural_name(),
            'element_compute': DataTypeTag[self.element_compute],
            'element_variance': DataTypeTag[self.element_variance],
            'element_mean': DataTypeTag[self.element_mean],
            'element_layer_norm_compute': DataTypeTag[self.element_layer_norm_compute],
            'epilogue_functor': self.elementwise_functor.emit(),
            'shifted_k': self.shifted_k
        }
        return SubstituteTemplate(self.KernelTemplate, values)
566
+
567
+
568
+
569
class AccumulatorOp:
    """Leaf visitor: yields the raw GEMM accumulator fragment."""

    Template = """
using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpAccumulator<${element_accumulator}, ${elements_per_access}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, elements_per_access) -> None:
        self.element_accumulator = element_accumulator
        self.elements_per_access = elements_per_access

        self.instance_name = f"AccumulatorOp{AccumulatorOp.counter}"
        AccumulatorOp.counter += 1

        class _Arguments(ctypes.Structure):
            # No runtime state; dummy field keeps the struct non-empty.
            _fields_ = [("tmp", ctypes.c_int)]

            def __init__(self):
                self.tmp = 0

        self.argument_type = _Arguments

    def emit(self, *args):
        """Render the C++ type alias for this visitor."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "elements_per_access": str(self.elements_per_access),
        })
598
+
599
+
600
class LinearCombinationOp:
    """Visitor node computing alpha * A + beta * B over two child visitors."""

    Template = """
${visitor_a}

${visitor_b}

using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpLinearCombination<
    ${element_accumulator}, ${element_compute},
    ${elements_per_access}, ${visitor_a_name}, ${visitor_b_name}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, element_compute,
                 elements_per_access, visitor_a, visitor_b) -> None:
        self.element_accumulator = element_accumulator
        self.element_compute = element_compute
        self.elements_per_access = elements_per_access
        self.visitor_a = visitor_a
        self.visitor_b = visitor_b

        self.instance_name = f"LinearCombinationOp{LinearCombinationOp.counter}"
        LinearCombinationOp.counter += 1

        class _Arguments(ctypes.Structure):
            # Scalars first, then both child argument structs (C layout).
            _fields_ = [
                ("alpha", dtype2ctype[self.element_compute]),
                ("beta", dtype2ctype[self.element_compute]),
                ("visitor_a", self.visitor_a.argument_type),
                ("visitor_b", self.visitor_b.argument_type)
            ]

            def __init__(self, alpha, beta, visitor_a_arg, visitor_b_arg) -> None:
                # Scalars pass through the compute type's storage representation.
                self.alpha = element_compute(alpha).storage
                self.beta = element_compute(beta).storage
                self.visitor_a = visitor_a_arg
                self.visitor_b = visitor_b_arg

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render this node and both children into C++ type aliases."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "element_compute": DataTypeTag[self.element_compute],
            "elements_per_access": str(self.elements_per_access),
            "visitor_a_name": self.visitor_a.instance_name,
            "visitor_b_name": self.visitor_b.instance_name,
            "visitor_a": self.visitor_a.emit(operation),
            "visitor_b": self.visitor_b.emit(operation),
        })
650
+
651
class VectorAdd:
    """Binary-op tag for element-wise addition (no runtime parameters)."""

    def __init__(self, *args) -> None:
        class _Arguments(ctypes.Structure):
            # Dummy field keeps the struct non-empty for the C ABI.
            _fields_ = [("tmp", ctypes.c_int)]

            def __init__(self, *args) -> None:
                self.tmp = 0

        self.argument_type = _Arguments

    def emit(self):
        # C++ functor implementing the addition.
        return "cutlass::VectorAdd"
663
+
664
class VectorMult:
    """Binary-op tag for element-wise multiplication (no runtime parameters)."""

    def __init__(self, *args) -> None:
        class _Arguments(ctypes.Structure):
            # Dummy field keeps the struct non-empty for the C ABI.
            _fields_ = [("tmp", ctypes.c_int)]

            def __init__(self, *args) -> None:
                self.tmp = 0

        self.argument_type = _Arguments

    def emit(self):
        # C++ functor implementing the multiplication.
        return "cutlass::VectorMult"
676
+
677
+
678
class BinaryOp:
    """Visitor node applying a binary functor to two child visitors."""

    Template = """
${visitor_a}

${visitor_b}

using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpBinary<
    ${element_accumulator}, ${element_compute},
    ${elements_per_access}, ${visitor_a_name}, ${visitor_b_name}, ${binary_op}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, element_compute,
                 elements_per_access, visitor_a, visitor_b, binary_op) -> None:
        self.element_accumulator = element_accumulator
        self.element_compute = element_compute
        self.elements_per_access = elements_per_access
        self.visitor_a = visitor_a
        self.visitor_b = visitor_b
        self.binary_op = binary_op

        self.instance_name = f"BinaryOp{BinaryOp.counter}"
        BinaryOp.counter += 1

        class _Arguments(ctypes.Structure):
            # Functor params first, then both child argument structs.
            _fields_ = [
                ("binary_param", binary_op.argument_type),
                ("visitor_a", self.visitor_a.argument_type),
                ("visitor_b", self.visitor_b.argument_type)
            ]

            def __init__(self, binary_param, visitor_a_arg, visitor_b_arg) -> None:
                self.binary_param = binary_param
                self.visitor_a = visitor_a_arg
                self.visitor_b = visitor_b_arg

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render this node and both children into C++ type aliases."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "element_compute": DataTypeTag[self.element_compute],
            "elements_per_access": str(self.elements_per_access),
            "visitor_a_name": self.visitor_a.instance_name,
            "visitor_b_name": self.visitor_b.instance_name,
            "visitor_a": self.visitor_a.emit(operation),
            "visitor_b": self.visitor_b.emit(operation),
            "binary_op": self.binary_op.emit(),
        })
727
+
728
+
729
class Mult:
    """Unary-op tag scaling each element by a constant alpha."""

    def __init__(self, element_compute) -> None:
        class _Arguments(ctypes.Structure):
            # Single scalar parameter in the compute type's storage repr.
            _fields_ = [("alpha", dtype2ctype[element_compute])]

            def __init__(self, alpha) -> None:
                self.alpha = element_compute(alpha).storage

        self.argument_type = _Arguments

    def emit_visitor(self):
        # C++ functor implementing the scaling.
        return "cutlass::Mult"
742
+
743
class UnaryOp:
    """Visitor node applying a unary functor to one child visitor."""

    Template = """
${visitor}

using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpUnary<
    ${element_accumulator}, ${element_compute},
    ${elements_per_access}, ${visitor_name}, ${unary_op}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, element_compute,
                 elements_per_access, visitor, unary_op) -> None:
        self.element_accumulator = element_accumulator
        self.element_compute = element_compute
        self.elements_per_access = elements_per_access
        self.visitor = visitor
        self.unary_op = unary_op

        self.instance_name = f"UnaryOp{UnaryOp.counter}"
        UnaryOp.counter += 1

        class _Arguments(ctypes.Structure):
            # Functor params first, then the child argument struct.
            _fields_ = [
                ("unary_param", unary_op.argument_type),
                ("visitor_arg", self.visitor.argument_type)
            ]

            def __init__(self, unary_param, visitor_arg) -> None:
                self.unary_param = unary_param
                self.visitor_arg = visitor_arg

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render this node and its child into C++ type aliases."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "element_compute": DataTypeTag[self.element_compute],
            "elements_per_access": str(self.elements_per_access),
            "visitor_name": self.visitor.instance_name,
            "unary_op": self.unary_op.emit_visitor(),
            "visitor": self.visitor.emit(operation),
        })
786
+
787
+
788
+
789
class RowBroadcastOp:
    """Leaf visitor broadcasting a row vector across the output tile."""

    Template = """
using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpRowBroadcast<
    ${element_accumulator}, ${element_fragment}, ${input_tile_iterator}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, element_fragment) -> None:
        self.element_accumulator = element_accumulator
        self.element_fragment = element_fragment

        self.instance_name = f"RowBroadcastOp{RowBroadcastOp.counter}"
        RowBroadcastOp.counter += 1

        class _Arguments(ctypes.Structure):
            _fields_ = [
                ("broadcast_ptr", ctypes.c_void_p),
                ("batch_stride", ctypes.c_longlong)
            ]

            def __init__(self, broadcast_ptr, batch_stride=0):
                self.broadcast_ptr = int(broadcast_ptr)
                self.batch_stride = batch_stride

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render the C++ type alias for this visitor."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "element_fragment": DataTypeTag[self.element_fragment],
            "input_tile_iterator": operation.procedural_name() + "_default::Epilogue::OutputTileIterator",
        })
821
+
822
+
823
class ColumnBroadcastOp:
    """Leaf visitor broadcasting a column vector across the output tile."""

    Template = """
using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpColumnBroadcast<
    ${element_accumulator}, ${element_fragment}, ${input_tile_iterator}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, element_fragment) -> None:
        self.element_accumulator = element_accumulator
        self.element_fragment = element_fragment

        self.instance_name = f"ColumnBroadcastOp{ColumnBroadcastOp.counter}"
        ColumnBroadcastOp.counter += 1

        class _Arguments(ctypes.Structure):
            _fields_ = [
                ("broadcast_ptr", ctypes.c_void_p),
                ("batch_stride", ctypes.c_longlong)
            ]

            def __init__(self, broadcast_ptr, batch_stride=0):
                self.broadcast_ptr = int(broadcast_ptr)
                self.batch_stride = batch_stride

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render the C++ type alias for this visitor."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "element_fragment": DataTypeTag[self.element_fragment],
            "input_tile_iterator": operation.procedural_name() + "_default::Epilogue::OutputTileIterator",
        })
855
+
856
+
857
class TensorInputOp:
    """Leaf visitor reading an auxiliary input tensor."""

    Template = """
using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpTensorInput<
    ${element_accumulator}, ${input_tile_iterator}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator) -> None:
        self.element_accumulator = element_accumulator

        self.instance_name = f"TensorInputOp{TensorInputOp.counter}"
        TensorInputOp.counter += 1

        class _Arguments(ctypes.Structure):
            _fields_ = [
                ("input_ptr", ctypes.c_void_p),
                ("ldt", ctypes.c_int),
                ("batch_stride", ctypes.c_longlong)
            ]

            def __init__(self, input_ptr, ldt, batch_stride=0) -> None:
                self.input_ptr = int(input_ptr)
                self.ldt = ldt
                self.batch_stride = batch_stride

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render the C++ type alias for this visitor."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "input_tile_iterator": operation.procedural_name() + "_default::Epilogue::OutputTileIterator",
        })
889
+
890
class TensorOutputOp:
    """Visitor node writing its child's result to an output tensor."""

    Template = """
${visitor}

using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpTensorOutput<
    ${element_accumulator}, ${output_tile_iterator}, ${visitor_name}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, visitor) -> None:
        self.element_accumulator = element_accumulator
        self.visitor = visitor

        self.instance_name = f"TensorOutputOp{TensorOutputOp.counter}"
        TensorOutputOp.counter += 1

        class _Arguments(ctypes.Structure):
            # Output description first, then the child argument struct.
            _fields_ = [
                ("output_ptr", ctypes.c_void_p),
                ("ldt", ctypes.c_int),
                ("batch_stride", ctypes.c_longlong),
                ("visitor_arg", self.visitor.argument_type)
            ]

            def __init__(self, output_ptr, ldt, visitor_arg, batch_stride=0) -> None:
                self.output_ptr = int(output_ptr)
                self.ldt = int(ldt)
                self.visitor_arg = visitor_arg
                self.batch_stride = batch_stride

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render this node and its child into C++ type aliases."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "output_tile_iterator": operation.procedural_name() + "_default::Epilogue::OutputTileIterator",
            "visitor_name": self.visitor.instance_name,
            "visitor": self.visitor.emit(operation),
        })
929
+
930
+
931
class ColumnReductionOp:
    """Visitor node reducing its child's result along columns."""

    Template = """
${visitor}

using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpColumnReduction<
    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
    ${element_accumulator}, ${element_reduction}, ${element_reduction_accumulator},
    ${output_tile_iterator}, ${visitor_name}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, element_reduction,
                 element_reduction_accumulator, visitor) -> None:
        self.element_accumulator = element_accumulator
        self.element_reduction = element_reduction
        self.element_reduction_accumulator = element_reduction_accumulator
        self.visitor = visitor

        self.instance_name = f"ColumnReductionOp{ColumnReductionOp.counter}"
        ColumnReductionOp.counter += 1

        class _Arguments(ctypes.Structure):
            # Reduction buffer first, then the child argument struct.
            _fields_ = [
                ("reduction_ptr", ctypes.c_void_p),
                ("batch_stride", ctypes.c_longlong),
                ("visitor_arg", self.visitor.argument_type)
            ]

            def __init__(self, reduction_ptr, visitor_arg, batch_stride=0) -> None:
                self.reduction_ptr = reduction_ptr
                self.batch_stride = batch_stride
                self.visitor_arg = visitor_arg

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render this node and its child into C++ type aliases."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "element_reduction": DataTypeTag[self.element_reduction],
            "element_reduction_accumulator": DataTypeTag[self.element_reduction_accumulator],
            "output_tile_iterator": operation.procedural_name() + "_default::Epilogue::OutputTileIterator",
            "visitor_name": self.visitor.instance_name,
            "visitor": self.visitor.emit(operation),
        })
978
+
979
+
980
class RowReductionOp:
    """Visitor node reducing its child's result along rows."""

    Template = """
${visitor}

using ${instance_name} = cutlass::epilogue::threadblock::VisitorOpRowReduction<
    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
    ${element_accumulator}, ${element_reduction}, ${element_reduction_accumulator},
    ${output_tile_iterator}, ${visitor_name}>;
"""
    # Monotonic id so each emitted instance gets a unique C++ type alias.
    counter = 0

    def __init__(self, element_accumulator, element_reduction,
                 element_reduction_accumulator, visitor) -> None:
        self.element_accumulator = element_accumulator
        self.element_reduction = element_reduction
        self.element_reduction_accumulator = element_reduction_accumulator
        self.visitor = visitor

        self.instance_name = f"RowReductionOp{RowReductionOp.counter}"
        RowReductionOp.counter += 1

        class _Arguments(ctypes.Structure):
            # Reduction buffer first, then the child argument struct.
            _fields_ = [
                ("reduction_ptr", ctypes.c_void_p),
                ("batch_stride", ctypes.c_longlong),
                ("visitor_arg", self.visitor.argument_type)
            ]

            def __init__(self, reduction_ptr, visitor_arg, batch_stride=0) -> None:
                self.reduction_ptr = reduction_ptr
                self.visitor_arg = visitor_arg
                self.batch_stride = batch_stride

        self.argument_type = _Arguments

    def emit(self, operation):
        """Render this node and its child into C++ type aliases."""
        return SubstituteTemplate(self.Template, {
            "instance_name": self.instance_name,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            "element_accumulator": DataTypeTag[self.element_accumulator],
            "element_reduction": DataTypeTag[self.element_reduction],
            "element_reduction_accumulator": DataTypeTag[self.element_reduction_accumulator],
            "output_tile_iterator": operation.procedural_name() + "_default::Epilogue::OutputTileIterator",
            "visitor_name": self.visitor.instance_name,
            "visitor": self.visitor.emit(operation),
        })