tilelang-rocm 0.1.4.post5__cp310-cp310-manylinux1_x86_64.whl → 0.1.4.post9__cp310-cp310-manylinux1_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1011) hide show
  1. tilelang/3rdparty/tvm/python/tvm/_ffi/runtime_ctypes.py +7 -1
  2. tilelang/3rdparty/tvm/python/tvm/contrib/tvmjs.py +7 -0
  3. tilelang/3rdparty/tvm/python/tvm/runtime/ndarray.py +8 -0
  4. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/ir.py +7 -0
  5. tilelang/3rdparty/tvm/src/target/llvm/codegen_llvm.cc +2 -1
  6. tilelang/3rdparty/tvm/src/target/source/codegen_cuda.cc +4 -1
  7. tilelang/3rdparty/tvm/src/target/source/ptx.cc +3 -0
  8. tilelang/3rdparty/tvm/src/tir/ir/index_map.cc +2 -1
  9. tilelang/3rdparty/tvm/src/tir/ir/stmt.cc +0 -6
  10. tilelang/3rdparty/tvm/src/tir/op/op.cc +6 -0
  11. tilelang/3rdparty/tvm/src/tir/transforms/dtype_conversion.h +3 -0
  12. tilelang/README.md +3 -1
  13. tilelang/VERSION +1 -1
  14. tilelang/__init__.py +2 -63
  15. tilelang/autotuner/__init__.py +334 -277
  16. tilelang/autotuner/param.py +329 -0
  17. tilelang/cache/kernel_cache.py +6 -11
  18. tilelang/cache/tuner_cache.py +356 -0
  19. tilelang/carver/arch/driver/__init__.py +2 -0
  20. tilelang/carver/arch/driver/cuda_driver.py +28 -0
  21. tilelang/contrib/dlpack.py +1 -1
  22. tilelang/contrib/nvcc.py +27 -3
  23. tilelang/engine/phase.py +44 -11
  24. tilelang/intrinsics/mfma_macro_generator.py +12 -3
  25. tilelang/intrinsics/mma_layout.py +33 -51
  26. tilelang/jit/__init__.py +215 -93
  27. tilelang/jit/adapter/cython/cython_wrapper.pyx +13 -10
  28. tilelang/jit/adapter/libgen.py +3 -1
  29. tilelang/jit/adapter/wrapper.py +91 -6
  30. tilelang/jit/kernel.py +52 -1
  31. tilelang/jit/param.py +45 -0
  32. tilelang/language/__init__.py +83 -1
  33. tilelang/language/builtin.py +90 -1
  34. tilelang/language/copy.py +13 -11
  35. tilelang/language/customize.py +13 -0
  36. tilelang/language/print.py +27 -0
  37. tilelang/language/reduce.py +16 -5
  38. tilelang/language/tir/op.py +19 -0
  39. tilelang/language/warpgroup.py +7 -2
  40. tilelang/lib/libtilelang.so +0 -0
  41. tilelang/lib/libtilelang_module.so +0 -0
  42. tilelang/lib/libtvm.so +0 -0
  43. tilelang/lib/libtvm_runtime.so +0 -0
  44. tilelang/primitives/gemm/base.py +61 -24
  45. tilelang/profiler/__init__.py +41 -2
  46. tilelang/quantize/__init__.py +18 -0
  47. tilelang/quantize/lop3.py +1202 -0
  48. tilelang/quantize/quantization.py +234 -0
  49. tilelang/quantize/utils.py +126 -0
  50. tilelang/src/tl_templates/cuda/common.h +23 -0
  51. tilelang/src/tl_templates/cuda/cuda_fp8.h +42 -13
  52. tilelang/src/tl_templates/cuda/debug.h +41 -3
  53. tilelang/src/tl_templates/cuda/gemm_sm80.h +25 -13
  54. tilelang/src/tl_templates/cuda/gemm_sm89.h +25 -14
  55. tilelang/src/tl_templates/cuda/gemm_sm90.h +28 -24
  56. tilelang/src/tl_templates/hip/hip_fp8.h +18 -0
  57. tilelang/transform/__init__.py +17 -0
  58. tilelang/transform/pass_config.py +3 -0
  59. tilelang/utils/tensor.py +1 -0
  60. tilelang/version.py +21 -0
  61. {tilelang_rocm-0.1.4.post5.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/METADATA +4 -2
  62. {tilelang_rocm-0.1.4.post5.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/RECORD +65 -1003
  63. {tilelang_rocm-0.1.4.post5.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/WHEEL +1 -1
  64. tilelang/3rdparty/tvm/python/tvm/__pycache__/__init__.cpython-310.pyc +0 -0
  65. tilelang/3rdparty/tvm/python/tvm/__pycache__/error.cpython-310.pyc +0 -0
  66. tilelang/3rdparty/tvm/python/tvm/__pycache__/parser.cpython-310.pyc +0 -0
  67. tilelang/3rdparty/tvm/python/tvm/__pycache__/support.cpython-310.pyc +0 -0
  68. tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/__init__.cpython-310.pyc +0 -0
  69. tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/_pyversion.cpython-310.pyc +0 -0
  70. tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/base.cpython-310.pyc +0 -0
  71. tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/libinfo.cpython-310.pyc +0 -0
  72. tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/registry.cpython-310.pyc +0 -0
  73. tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/runtime_ctypes.cpython-310.pyc +0 -0
  74. tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/__init__.cpython-310.pyc +0 -0
  75. tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/ndarray.cpython-310.pyc +0 -0
  76. tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/object.cpython-310.pyc +0 -0
  77. tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/packed_func.cpython-310.pyc +0 -0
  78. tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/types.cpython-310.pyc +0 -0
  79. tilelang/3rdparty/tvm/python/tvm/_ffi/_cy3/__pycache__/__init__.cpython-310.pyc +0 -0
  80. tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/__init__.cpython-310.pyc +0 -0
  81. tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  82. tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/analyzer.cpython-310.pyc +0 -0
  83. tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/bound.cpython-310.pyc +0 -0
  84. tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/int_set.cpython-310.pyc +0 -0
  85. tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/int_solver.cpython-310.pyc +0 -0
  86. tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/iter_affine_map.cpython-310.pyc +0 -0
  87. tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/pattern.cpython-310.pyc +0 -0
  88. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/__init__.cpython-310.pyc +0 -0
  89. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  90. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/compute_dag.cpython-310.pyc +0 -0
  91. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/dispatcher.cpython-310.pyc +0 -0
  92. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/feature.cpython-310.pyc +0 -0
  93. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/loop_state.cpython-310.pyc +0 -0
  94. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/measure.cpython-310.pyc +0 -0
  95. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/measure_record.cpython-310.pyc +0 -0
  96. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/relay_integration.cpython-310.pyc +0 -0
  97. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/search_policy.cpython-310.pyc +0 -0
  98. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/search_task.cpython-310.pyc +0 -0
  99. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/task_scheduler.cpython-310.pyc +0 -0
  100. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/utils.cpython-310.pyc +0 -0
  101. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/workload_registry.cpython-310.pyc +0 -0
  102. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/cost_model/__pycache__/__init__.cpython-310.pyc +0 -0
  103. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/cost_model/__pycache__/cost_model.cpython-310.pyc +0 -0
  104. tilelang/3rdparty/tvm/python/tvm/auto_scheduler/cost_model/__pycache__/xgb_model.cpython-310.pyc +0 -0
  105. tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/__init__.cpython-310.pyc +0 -0
  106. tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/database.cpython-310.pyc +0 -0
  107. tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/env.cpython-310.pyc +0 -0
  108. tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/feature.cpython-310.pyc +0 -0
  109. tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/record.cpython-310.pyc +0 -0
  110. tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/tophub.cpython-310.pyc +0 -0
  111. tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/utils.cpython-310.pyc +0 -0
  112. tilelang/3rdparty/tvm/python/tvm/autotvm/measure/__pycache__/__init__.cpython-310.pyc +0 -0
  113. tilelang/3rdparty/tvm/python/tvm/autotvm/measure/__pycache__/executor.cpython-310.pyc +0 -0
  114. tilelang/3rdparty/tvm/python/tvm/autotvm/measure/__pycache__/measure.cpython-310.pyc +0 -0
  115. tilelang/3rdparty/tvm/python/tvm/autotvm/measure/__pycache__/measure_methods.cpython-310.pyc +0 -0
  116. tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/__init__.cpython-310.pyc +0 -0
  117. tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/code_hash.cpython-310.pyc +0 -0
  118. tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/dispatcher.cpython-310.pyc +0 -0
  119. tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/relay_integration.cpython-310.pyc +0 -0
  120. tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/space.cpython-310.pyc +0 -0
  121. tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/task.cpython-310.pyc +0 -0
  122. tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/topi_integration.cpython-310.pyc +0 -0
  123. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/__init__.cpython-310.pyc +0 -0
  124. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/callback.cpython-310.pyc +0 -0
  125. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/droplet_tuner.cpython-310.pyc +0 -0
  126. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/ga_tuner.cpython-310.pyc +0 -0
  127. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/index_based_tuner.cpython-310.pyc +0 -0
  128. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/metric.cpython-310.pyc +0 -0
  129. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/model_based_tuner.cpython-310.pyc +0 -0
  130. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/sa_model_optimizer.cpython-310.pyc +0 -0
  131. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/tuner.cpython-310.pyc +0 -0
  132. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/xgboost_cost_model.cpython-310.pyc +0 -0
  133. tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/xgboost_tuner.cpython-310.pyc +0 -0
  134. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/__init__.cpython-310.pyc +0 -0
  135. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/cblas.cpython-310.pyc +0 -0
  136. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/cc.cpython-310.pyc +0 -0
  137. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/coreml_runtime.cpython-310.pyc +0 -0
  138. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/cublas.cpython-310.pyc +0 -0
  139. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/cudnn.cpython-310.pyc +0 -0
  140. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/dnnl.cpython-310.pyc +0 -0
  141. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/download.cpython-310.pyc +0 -0
  142. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/graph_executor.cpython-310.pyc +0 -0
  143. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/hipcc.cpython-310.pyc +0 -0
  144. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/miopen.cpython-310.pyc +0 -0
  145. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/mkl.cpython-310.pyc +0 -0
  146. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/mrvl.cpython-310.pyc +0 -0
  147. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/ndk.cpython-310.pyc +0 -0
  148. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/nnpack.cpython-310.pyc +0 -0
  149. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/nvcc.cpython-310.pyc +0 -0
  150. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/pickle_memoize.cpython-310.pyc +0 -0
  151. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/popen_pool.cpython-310.pyc +0 -0
  152. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/rocblas.cpython-310.pyc +0 -0
  153. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/rocm.cpython-310.pyc +0 -0
  154. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/sdaccel.cpython-310.pyc +0 -0
  155. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/stackvm.cpython-310.pyc +0 -0
  156. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/tar.cpython-310.pyc +0 -0
  157. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/thrust.cpython-310.pyc +0 -0
  158. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/utils.cpython-310.pyc +0 -0
  159. tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/xcode.cpython-310.pyc +0 -0
  160. tilelang/3rdparty/tvm/python/tvm/contrib/target/__pycache__/__init__.cpython-310.pyc +0 -0
  161. tilelang/3rdparty/tvm/python/tvm/contrib/target/__pycache__/coreml.cpython-310.pyc +0 -0
  162. tilelang/3rdparty/tvm/python/tvm/dlight/__pycache__/__init__.cpython-310.pyc +0 -0
  163. tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/__init__.cpython-310.pyc +0 -0
  164. tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/analysis.cpython-310.pyc +0 -0
  165. tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/common_schedules.cpython-310.pyc +0 -0
  166. tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/schedule_rule.cpython-310.pyc +0 -0
  167. tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/transform.cpython-310.pyc +0 -0
  168. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/__init__.cpython-310.pyc +0 -0
  169. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/base.cpython-310.pyc +0 -0
  170. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/fallback.cpython-310.pyc +0 -0
  171. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/gemv.cpython-310.pyc +0 -0
  172. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/general_reduction.cpython-310.pyc +0 -0
  173. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/low_batch_gemv.cpython-310.pyc +0 -0
  174. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/matmul.cpython-310.pyc +0 -0
  175. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/reduction.cpython-310.pyc +0 -0
  176. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/rmsnorm.cpython-310.pyc +0 -0
  177. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/transpose.cpython-310.pyc +0 -0
  178. tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/utils.cpython-310.pyc +0 -0
  179. tilelang/3rdparty/tvm/python/tvm/driver/__pycache__/__init__.cpython-310.pyc +0 -0
  180. tilelang/3rdparty/tvm/python/tvm/driver/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  181. tilelang/3rdparty/tvm/python/tvm/driver/__pycache__/build_module.cpython-310.pyc +0 -0
  182. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/__init__.cpython-310.pyc +0 -0
  183. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  184. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/_ffi_instrument_api.cpython-310.pyc +0 -0
  185. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/_ffi_transform_api.cpython-310.pyc +0 -0
  186. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/adt.cpython-310.pyc +0 -0
  187. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/affine_type.cpython-310.pyc +0 -0
  188. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/attrs.cpython-310.pyc +0 -0
  189. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/base.cpython-310.pyc +0 -0
  190. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/container.cpython-310.pyc +0 -0
  191. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/expr.cpython-310.pyc +0 -0
  192. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/function.cpython-310.pyc +0 -0
  193. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/global_info.cpython-310.pyc +0 -0
  194. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/instrument.cpython-310.pyc +0 -0
  195. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/json_compact.cpython-310.pyc +0 -0
  196. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/memory_pools.cpython-310.pyc +0 -0
  197. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/module.cpython-310.pyc +0 -0
  198. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/op.cpython-310.pyc +0 -0
  199. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/tensor_type.cpython-310.pyc +0 -0
  200. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/transform.cpython-310.pyc +0 -0
  201. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/type.cpython-310.pyc +0 -0
  202. tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/type_relation.cpython-310.pyc +0 -0
  203. tilelang/3rdparty/tvm/python/tvm/ir/diagnostics/__pycache__/__init__.cpython-310.pyc +0 -0
  204. tilelang/3rdparty/tvm/python/tvm/ir/diagnostics/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  205. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/__init__.cpython-310.pyc +0 -0
  206. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  207. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/arg_info.cpython-310.pyc +0 -0
  208. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/extracted_task.cpython-310.pyc +0 -0
  209. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/logging.cpython-310.pyc +0 -0
  210. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/profiler.cpython-310.pyc +0 -0
  211. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/relax_integration.cpython-310.pyc +0 -0
  212. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/relay_integration.cpython-310.pyc +0 -0
  213. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/tir_integration.cpython-310.pyc +0 -0
  214. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/trace_apply.cpython-310.pyc +0 -0
  215. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/tune.cpython-310.pyc +0 -0
  216. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/tune_context.cpython-310.pyc +0 -0
  217. tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/utils.cpython-310.pyc +0 -0
  218. tilelang/3rdparty/tvm/python/tvm/meta_schedule/builder/__pycache__/__init__.cpython-310.pyc +0 -0
  219. tilelang/3rdparty/tvm/python/tvm/meta_schedule/builder/__pycache__/builder.cpython-310.pyc +0 -0
  220. tilelang/3rdparty/tvm/python/tvm/meta_schedule/builder/__pycache__/local_builder.cpython-310.pyc +0 -0
  221. tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/__init__.cpython-310.pyc +0 -0
  222. tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/cost_model.cpython-310.pyc +0 -0
  223. tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/metric.cpython-310.pyc +0 -0
  224. tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/random_model.cpython-310.pyc +0 -0
  225. tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/xgb_model.cpython-310.pyc +0 -0
  226. tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/__init__.cpython-310.pyc +0 -0
  227. tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/database.cpython-310.pyc +0 -0
  228. tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/json_database.cpython-310.pyc +0 -0
  229. tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/memory_database.cpython-310.pyc +0 -0
  230. tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/ordered_union_database.cpython-310.pyc +0 -0
  231. tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/schedule_fn_database.cpython-310.pyc +0 -0
  232. tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/union_database.cpython-310.pyc +0 -0
  233. tilelang/3rdparty/tvm/python/tvm/meta_schedule/feature_extractor/__pycache__/__init__.cpython-310.pyc +0 -0
  234. tilelang/3rdparty/tvm/python/tvm/meta_schedule/feature_extractor/__pycache__/feature_extractor.cpython-310.pyc +0 -0
  235. tilelang/3rdparty/tvm/python/tvm/meta_schedule/feature_extractor/__pycache__/per_store_feature.cpython-310.pyc +0 -0
  236. tilelang/3rdparty/tvm/python/tvm/meta_schedule/feature_extractor/__pycache__/random_feature_extractor.cpython-310.pyc +0 -0
  237. tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/__init__.cpython-310.pyc +0 -0
  238. tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/add_to_database.cpython-310.pyc +0 -0
  239. tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/measure_callback.cpython-310.pyc +0 -0
  240. tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/remove_build_artifact.cpython-310.pyc +0 -0
  241. tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/update_cost_model.cpython-310.pyc +0 -0
  242. tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/__init__.cpython-310.pyc +0 -0
  243. tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_compute_location.cpython-310.pyc +0 -0
  244. tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_parallel.cpython-310.pyc +0 -0
  245. tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_thread_binding.cpython-310.pyc +0 -0
  246. tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_tile_size.cpython-310.pyc +0 -0
  247. tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_unroll.cpython-310.pyc +0 -0
  248. tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutator.cpython-310.pyc +0 -0
  249. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/__init__.cpython-310.pyc +0 -0
  250. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/disallow_async_strided_mem_copy.cpython-310.pyc +0 -0
  251. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/disallow_dynamic_loop.cpython-310.pyc +0 -0
  252. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/postproc.cpython-310.pyc +0 -0
  253. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_cooperative_fetch.cpython-310.pyc +0 -0
  254. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_layout.cpython-310.pyc +0 -0
  255. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_parallel_vectorize_unroll.cpython-310.pyc +0 -0
  256. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_reduction_block.cpython-310.pyc +0 -0
  257. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_tensorize.cpython-310.pyc +0 -0
  258. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_unbound_block.cpython-310.pyc +0 -0
  259. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/verify_gpu_code.cpython-310.pyc +0 -0
  260. tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/verify_vtcm_limit.cpython-310.pyc +0 -0
  261. tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/__init__.cpython-310.pyc +0 -0
  262. tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/config.cpython-310.pyc +0 -0
  263. tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/local_runner.cpython-310.pyc +0 -0
  264. tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/rpc_runner.cpython-310.pyc +0 -0
  265. tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/runner.cpython-310.pyc +0 -0
  266. tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/utils.cpython-310.pyc +0 -0
  267. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/__pycache__/__init__.cpython-310.pyc +0 -0
  268. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/cpu/__pycache__/__init__.cpython-310.pyc +0 -0
  269. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/cuda/__pycache__/__init__.cpython-310.pyc +0 -0
  270. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/cuda/__pycache__/layout_transform.cpython-310.pyc +0 -0
  271. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/generic/__pycache__/__init__.cpython-310.pyc +0 -0
  272. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/x86/__pycache__/__init__.cpython-310.pyc +0 -0
  273. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/__init__.cpython-310.pyc +0 -0
  274. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/add_rfactor.cpython-310.pyc +0 -0
  275. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/apply_custom_rule.cpython-310.pyc +0 -0
  276. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/auto_bind.cpython-310.pyc +0 -0
  277. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/auto_inline.cpython-310.pyc +0 -0
  278. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/cross_thread_reduction.cpython-310.pyc +0 -0
  279. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/multi_level_tiling.cpython-310.pyc +0 -0
  280. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/parallel_vectorize_unroll.cpython-310.pyc +0 -0
  281. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/random_compute_location.cpython-310.pyc +0 -0
  282. tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/schedule_rule.cpython-310.pyc +0 -0
  283. tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/__init__.cpython-310.pyc +0 -0
  284. tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/evolutionary_search.cpython-310.pyc +0 -0
  285. tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/replay_func.cpython-310.pyc +0 -0
  286. tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/replay_trace.cpython-310.pyc +0 -0
  287. tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/search_strategy.cpython-310.pyc +0 -0
  288. tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/__init__.cpython-310.pyc +0 -0
  289. tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/post_order_apply.cpython-310.pyc +0 -0
  290. tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/schedule_fn.cpython-310.pyc +0 -0
  291. tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/space_generator.cpython-310.pyc +0 -0
  292. tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/space_generator_union.cpython-310.pyc +0 -0
  293. tilelang/3rdparty/tvm/python/tvm/meta_schedule/task_scheduler/__pycache__/__init__.cpython-310.pyc +0 -0
  294. tilelang/3rdparty/tvm/python/tvm/meta_schedule/task_scheduler/__pycache__/gradient_based.cpython-310.pyc +0 -0
  295. tilelang/3rdparty/tvm/python/tvm/meta_schedule/task_scheduler/__pycache__/round_robin.cpython-310.pyc +0 -0
  296. tilelang/3rdparty/tvm/python/tvm/meta_schedule/task_scheduler/__pycache__/task_scheduler.cpython-310.pyc +0 -0
  297. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/__init__.cpython-310.pyc +0 -0
  298. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  299. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/binding_rewrite.cpython-310.pyc +0 -0
  300. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/block_builder.cpython-310.pyc +0 -0
  301. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/exec_builder.cpython-310.pyc +0 -0
  302. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/expr.cpython-310.pyc +0 -0
  303. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/expr_functor.cpython-310.pyc +0 -0
  304. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/pipeline.cpython-310.pyc +0 -0
  305. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/struct_info.cpython-310.pyc +0 -0
  306. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/ty.cpython-310.pyc +0 -0
  307. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/utils.cpython-310.pyc +0 -0
  308. tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/vm_build.cpython-310.pyc +0 -0
  309. tilelang/3rdparty/tvm/python/tvm/relax/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
  310. tilelang/3rdparty/tvm/python/tvm/relax/analysis/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  311. tilelang/3rdparty/tvm/python/tvm/relax/analysis/__pycache__/analysis.cpython-310.pyc +0 -0
  312. tilelang/3rdparty/tvm/python/tvm/relax/analysis/__pycache__/estimate_memory_usage.cpython-310.pyc +0 -0
  313. tilelang/3rdparty/tvm/python/tvm/relax/backend/__pycache__/__init__.cpython-310.pyc +0 -0
  314. tilelang/3rdparty/tvm/python/tvm/relax/backend/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  315. tilelang/3rdparty/tvm/python/tvm/relax/backend/__pycache__/dispatch_sort_scan.cpython-310.pyc +0 -0
  316. tilelang/3rdparty/tvm/python/tvm/relax/backend/__pycache__/pattern_registry.cpython-310.pyc +0 -0
  317. tilelang/3rdparty/tvm/python/tvm/relax/backend/contrib/__pycache__/__init__.cpython-310.pyc +0 -0
  318. tilelang/3rdparty/tvm/python/tvm/relax/distributed/__pycache__/__init__.cpython-310.pyc +0 -0
  319. tilelang/3rdparty/tvm/python/tvm/relax/distributed/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  320. tilelang/3rdparty/tvm/python/tvm/relax/distributed/__pycache__/global_info.cpython-310.pyc +0 -0
  321. tilelang/3rdparty/tvm/python/tvm/relax/distributed/__pycache__/struct_info.cpython-310.pyc +0 -0
  322. tilelang/3rdparty/tvm/python/tvm/relax/distributed/transform/__pycache__/__init__.cpython-310.pyc +0 -0
  323. tilelang/3rdparty/tvm/python/tvm/relax/distributed/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  324. tilelang/3rdparty/tvm/python/tvm/relax/distributed/transform/__pycache__/transform.cpython-310.pyc +0 -0
  325. tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/__init__.cpython-310.pyc +0 -0
  326. tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/_ffi.cpython-310.pyc +0 -0
  327. tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/context.cpython-310.pyc +0 -0
  328. tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/pattern.cpython-310.pyc +0 -0
  329. tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/rewrite.cpython-310.pyc +0 -0
  330. tilelang/3rdparty/tvm/python/tvm/relax/frontend/__pycache__/__init__.cpython-310.pyc +0 -0
  331. tilelang/3rdparty/tvm/python/tvm/relax/frontend/__pycache__/common.cpython-310.pyc +0 -0
  332. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/__init__.cpython-310.pyc +0 -0
  333. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/_tensor_op.cpython-310.pyc +0 -0
  334. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/core.cpython-310.pyc +0 -0
  335. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/exporter.cpython-310.pyc +0 -0
  336. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/extern.cpython-310.pyc +0 -0
  337. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/modules.cpython-310.pyc +0 -0
  338. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/op.cpython-310.pyc +0 -0
  339. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/spec.cpython-310.pyc +0 -0
  340. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/subroutine.cpython-310.pyc +0 -0
  341. tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/visitor.cpython-310.pyc +0 -0
  342. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/__init__.cpython-310.pyc +0 -0
  343. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  344. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/_op_gradient.cpython-310.pyc +0 -0
  345. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/base.cpython-310.pyc +0 -0
  346. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/binary.cpython-310.pyc +0 -0
  347. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/create.cpython-310.pyc +0 -0
  348. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/datatype.cpython-310.pyc +0 -0
  349. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/index.cpython-310.pyc +0 -0
  350. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/linear_algebra.cpython-310.pyc +0 -0
  351. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/manipulate.cpython-310.pyc +0 -0
  352. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/mask.cpython-310.pyc +0 -0
  353. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/op_attrs.cpython-310.pyc +0 -0
  354. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/qdq.cpython-310.pyc +0 -0
  355. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/search.cpython-310.pyc +0 -0
  356. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/set.cpython-310.pyc +0 -0
  357. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/sorting.cpython-310.pyc +0 -0
  358. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/statistical.cpython-310.pyc +0 -0
  359. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/ternary.cpython-310.pyc +0 -0
  360. tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/unary.cpython-310.pyc +0 -0
  361. tilelang/3rdparty/tvm/python/tvm/relax/op/builtin/__pycache__/__init__.cpython-310.pyc +0 -0
  362. tilelang/3rdparty/tvm/python/tvm/relax/op/builtin/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  363. tilelang/3rdparty/tvm/python/tvm/relax/op/builtin/__pycache__/builtin.cpython-310.pyc +0 -0
  364. tilelang/3rdparty/tvm/python/tvm/relax/op/ccl/__pycache__/__init__.cpython-310.pyc +0 -0
  365. tilelang/3rdparty/tvm/python/tvm/relax/op/ccl/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  366. tilelang/3rdparty/tvm/python/tvm/relax/op/ccl/__pycache__/ccl.cpython-310.pyc +0 -0
  367. tilelang/3rdparty/tvm/python/tvm/relax/op/distributed/__pycache__/__init__.cpython-310.pyc +0 -0
  368. tilelang/3rdparty/tvm/python/tvm/relax/op/distributed/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  369. tilelang/3rdparty/tvm/python/tvm/relax/op/distributed/__pycache__/distributed.cpython-310.pyc +0 -0
  370. tilelang/3rdparty/tvm/python/tvm/relax/op/grad/__pycache__/__init__.cpython-310.pyc +0 -0
  371. tilelang/3rdparty/tvm/python/tvm/relax/op/grad/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  372. tilelang/3rdparty/tvm/python/tvm/relax/op/grad/__pycache__/grad.cpython-310.pyc +0 -0
  373. tilelang/3rdparty/tvm/python/tvm/relax/op/image/__pycache__/__init__.cpython-310.pyc +0 -0
  374. tilelang/3rdparty/tvm/python/tvm/relax/op/image/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  375. tilelang/3rdparty/tvm/python/tvm/relax/op/image/__pycache__/image.cpython-310.pyc +0 -0
  376. tilelang/3rdparty/tvm/python/tvm/relax/op/memory/__pycache__/__init__.cpython-310.pyc +0 -0
  377. tilelang/3rdparty/tvm/python/tvm/relax/op/memory/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  378. tilelang/3rdparty/tvm/python/tvm/relax/op/memory/__pycache__/memory.cpython-310.pyc +0 -0
  379. tilelang/3rdparty/tvm/python/tvm/relax/op/nn/__pycache__/__init__.cpython-310.pyc +0 -0
  380. tilelang/3rdparty/tvm/python/tvm/relax/op/nn/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  381. tilelang/3rdparty/tvm/python/tvm/relax/op/nn/__pycache__/nn.cpython-310.pyc +0 -0
  382. tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/__init__.cpython-310.pyc +0 -0
  383. tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  384. tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/loss.cpython-310.pyc +0 -0
  385. tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/optimizer.cpython-310.pyc +0 -0
  386. tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/setup_trainer.cpython-310.pyc +0 -0
  387. tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/trainer.cpython-310.pyc +0 -0
  388. tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/utils.cpython-310.pyc +0 -0
  389. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/__init__.cpython-310.pyc +0 -0
  390. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  391. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/attach_external_modules.cpython-310.pyc +0 -0
  392. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/fast_math.cpython-310.pyc +0 -0
  393. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/ipc_allreduce_rewrite.cpython-310.pyc +0 -0
  394. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/lazy_transform_params.cpython-310.pyc +0 -0
  395. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/lower_gpu_ipc_alloc_storage.cpython-310.pyc +0 -0
  396. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/optimize_layout_transform.cpython-310.pyc +0 -0
  397. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/remove_redundant_reshape.cpython-310.pyc +0 -0
  398. tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/transform.cpython-310.pyc +0 -0
  399. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/__init__.cpython-310.pyc +0 -0
  400. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/binary.cpython-310.pyc +0 -0
  401. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/ccl.cpython-310.pyc +0 -0
  402. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/common.cpython-310.pyc +0 -0
  403. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/create.cpython-310.pyc +0 -0
  404. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/datatype.cpython-310.pyc +0 -0
  405. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/distributed.cpython-310.pyc +0 -0
  406. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/grad.cpython-310.pyc +0 -0
  407. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/image.cpython-310.pyc +0 -0
  408. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/index.cpython-310.pyc +0 -0
  409. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/inspect_op.cpython-310.pyc +0 -0
  410. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/linear_algebra.cpython-310.pyc +0 -0
  411. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/manipulate.cpython-310.pyc +0 -0
  412. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/nn.cpython-310.pyc +0 -0
  413. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/qdq.cpython-310.pyc +0 -0
  414. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/search.cpython-310.pyc +0 -0
  415. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/statistical.cpython-310.pyc +0 -0
  416. tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/unary.cpython-310.pyc +0 -0
  417. tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/__init__.cpython-310.pyc +0 -0
  418. tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  419. tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/database.cpython-310.pyc +0 -0
  420. tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/default_functions.cpython-310.pyc +0 -0
  421. tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/primitives.cpython-310.pyc +0 -0
  422. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/__init__.cpython-310.pyc +0 -0
  423. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/_build_module.cpython-310.pyc +0 -0
  424. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  425. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/_ffi_api_parser.cpython-310.pyc +0 -0
  426. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/_make.cpython-310.pyc +0 -0
  427. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/adt.cpython-310.pyc +0 -0
  428. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/base.cpython-310.pyc +0 -0
  429. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/build_module.cpython-310.pyc +0 -0
  430. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/debug.cpython-310.pyc +0 -0
  431. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/expr.cpython-310.pyc +0 -0
  432. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/expr_functor.cpython-310.pyc +0 -0
  433. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/function.cpython-310.pyc +0 -0
  434. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/loops.cpython-310.pyc +0 -0
  435. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/param_dict.cpython-310.pyc +0 -0
  436. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/parser.cpython-310.pyc +0 -0
  437. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/prelude.cpython-310.pyc +0 -0
  438. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/scope_builder.cpython-310.pyc +0 -0
  439. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/ty.cpython-310.pyc +0 -0
  440. tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/type_functor.cpython-310.pyc +0 -0
  441. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
  442. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  443. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/analysis.cpython-310.pyc +0 -0
  444. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/annotated_regions.cpython-310.pyc +0 -0
  445. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/call_graph.cpython-310.pyc +0 -0
  446. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/count_layers.cpython-310.pyc +0 -0
  447. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/feature.cpython-310.pyc +0 -0
  448. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/sparse_conv2d.cpython-310.pyc +0 -0
  449. tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/sparse_dense.cpython-310.pyc +0 -0
  450. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/__init__.cpython-310.pyc +0 -0
  451. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/_backend.cpython-310.pyc +0 -0
  452. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/_vm.cpython-310.pyc +0 -0
  453. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/executor.cpython-310.pyc +0 -0
  454. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/executor_factory.cpython-310.pyc +0 -0
  455. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/interpreter.cpython-310.pyc +0 -0
  456. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/runtime.cpython-310.pyc +0 -0
  457. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/te_compiler.cpython-310.pyc +0 -0
  458. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/utils.cpython-310.pyc +0 -0
  459. tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/vm.cpython-310.pyc +0 -0
  460. tilelang/3rdparty/tvm/python/tvm/relay/collage/__pycache__/__init__.cpython-310.pyc +0 -0
  461. tilelang/3rdparty/tvm/python/tvm/relay/collage/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  462. tilelang/3rdparty/tvm/python/tvm/relay/collage/__pycache__/collage.cpython-310.pyc +0 -0
  463. tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/__init__.cpython-310.pyc +0 -0
  464. tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/bsr_conv2d.cpython-310.pyc +0 -0
  465. tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/bsr_dense.cpython-310.pyc +0 -0
  466. tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/simplify_fc_transpose.cpython-310.pyc +0 -0
  467. tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/utils.cpython-310.pyc +0 -0
  468. tilelang/3rdparty/tvm/python/tvm/relay/dataflow_pattern/__pycache__/__init__.cpython-310.pyc +0 -0
  469. tilelang/3rdparty/tvm/python/tvm/relay/dataflow_pattern/__pycache__/_ffi.cpython-310.pyc +0 -0
  470. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/__init__.cpython-310.pyc +0 -0
  471. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/caffe.cpython-310.pyc +0 -0
  472. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/caffe2.cpython-310.pyc +0 -0
  473. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/change_datatype.cpython-310.pyc +0 -0
  474. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/common.cpython-310.pyc +0 -0
  475. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/coreml.cpython-310.pyc +0 -0
  476. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/darknet.cpython-310.pyc +0 -0
  477. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/keras.cpython-310.pyc +0 -0
  478. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/mxnet.cpython-310.pyc +0 -0
  479. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/mxnet_qnn_op_utils.cpython-310.pyc +0 -0
  480. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/nnvm_common.cpython-310.pyc +0 -0
  481. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/oneflow.cpython-310.pyc +0 -0
  482. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/onnx.cpython-310.pyc +0 -0
  483. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/paddlepaddle.cpython-310.pyc +0 -0
  484. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/pytorch.cpython-310.pyc +0 -0
  485. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/pytorch_utils.cpython-310.pyc +0 -0
  486. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/qnn_torch.cpython-310.pyc +0 -0
  487. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/tensorflow.cpython-310.pyc +0 -0
  488. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/tensorflow_ops.cpython-310.pyc +0 -0
  489. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/tflite.cpython-310.pyc +0 -0
  490. tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/tflite_flexbuffer.cpython-310.pyc +0 -0
  491. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/__init__.cpython-310.pyc +0 -0
  492. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_algorithm.cpython-310.pyc +0 -0
  493. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_make.cpython-310.pyc +0 -0
  494. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_math.cpython-310.pyc +0 -0
  495. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_reduce.cpython-310.pyc +0 -0
  496. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_tensor.cpython-310.pyc +0 -0
  497. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_tensor_grad.cpython-310.pyc +0 -0
  498. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_transform.cpython-310.pyc +0 -0
  499. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/algorithm.cpython-310.pyc +0 -0
  500. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/op.cpython-310.pyc +0 -0
  501. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/op_attrs.cpython-310.pyc +0 -0
  502. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/reduce.cpython-310.pyc +0 -0
  503. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/tensor.cpython-310.pyc +0 -0
  504. tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/transform.cpython-310.pyc +0 -0
  505. tilelang/3rdparty/tvm/python/tvm/relay/op/annotation/__pycache__/__init__.cpython-310.pyc +0 -0
  506. tilelang/3rdparty/tvm/python/tvm/relay/op/annotation/__pycache__/_make.cpython-310.pyc +0 -0
  507. tilelang/3rdparty/tvm/python/tvm/relay/op/annotation/__pycache__/annotation.cpython-310.pyc +0 -0
  508. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/__init__.cpython-310.pyc +0 -0
  509. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/_ethosn.cpython-310.pyc +0 -0
  510. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/arm_compute_lib.cpython-310.pyc +0 -0
  511. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/bnns.cpython-310.pyc +0 -0
  512. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/clml.cpython-310.pyc +0 -0
  513. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/coreml.cpython-310.pyc +0 -0
  514. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/cutlass.cpython-310.pyc +0 -0
  515. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/dnnl.cpython-310.pyc +0 -0
  516. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/ethosn.cpython-310.pyc +0 -0
  517. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/libtorch.cpython-310.pyc +0 -0
  518. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/mrvl.cpython-310.pyc +0 -0
  519. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/register.cpython-310.pyc +0 -0
  520. tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/tensorrt.cpython-310.pyc +0 -0
  521. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/__init__.cpython-310.pyc +0 -0
  522. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/_algorithm.cpython-310.pyc +0 -0
  523. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/_make.cpython-310.pyc +0 -0
  524. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/_tensor.cpython-310.pyc +0 -0
  525. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/_transform.cpython-310.pyc +0 -0
  526. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/image/__pycache__/__init__.cpython-310.pyc +0 -0
  527. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/image/__pycache__/_image.cpython-310.pyc +0 -0
  528. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/image/__pycache__/_make.cpython-310.pyc +0 -0
  529. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/nn/__pycache__/__init__.cpython-310.pyc +0 -0
  530. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/nn/__pycache__/_make.cpython-310.pyc +0 -0
  531. tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/nn/__pycache__/_nn.cpython-310.pyc +0 -0
  532. tilelang/3rdparty/tvm/python/tvm/relay/op/image/__pycache__/__init__.cpython-310.pyc +0 -0
  533. tilelang/3rdparty/tvm/python/tvm/relay/op/image/__pycache__/_image.cpython-310.pyc +0 -0
  534. tilelang/3rdparty/tvm/python/tvm/relay/op/image/__pycache__/_make.cpython-310.pyc +0 -0
  535. tilelang/3rdparty/tvm/python/tvm/relay/op/image/__pycache__/image.cpython-310.pyc +0 -0
  536. tilelang/3rdparty/tvm/python/tvm/relay/op/memory/__pycache__/__init__.cpython-310.pyc +0 -0
  537. tilelang/3rdparty/tvm/python/tvm/relay/op/memory/__pycache__/_make.cpython-310.pyc +0 -0
  538. tilelang/3rdparty/tvm/python/tvm/relay/op/memory/__pycache__/memory.cpython-310.pyc +0 -0
  539. tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/__init__.cpython-310.pyc +0 -0
  540. tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/_make.cpython-310.pyc +0 -0
  541. tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/_nn.cpython-310.pyc +0 -0
  542. tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/nn.cpython-310.pyc +0 -0
  543. tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/utils.cpython-310.pyc +0 -0
  544. tilelang/3rdparty/tvm/python/tvm/relay/op/random/__pycache__/__init__.cpython-310.pyc +0 -0
  545. tilelang/3rdparty/tvm/python/tvm/relay/op/random/__pycache__/_kernel.cpython-310.pyc +0 -0
  546. tilelang/3rdparty/tvm/python/tvm/relay/op/random/__pycache__/_make.cpython-310.pyc +0 -0
  547. tilelang/3rdparty/tvm/python/tvm/relay/op/random/__pycache__/kernel.cpython-310.pyc +0 -0
  548. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/__init__.cpython-310.pyc +0 -0
  549. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/adreno.cpython-310.pyc +0 -0
  550. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/arm_cpu.cpython-310.pyc +0 -0
  551. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/bifrost.cpython-310.pyc +0 -0
  552. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/cuda.cpython-310.pyc +0 -0
  553. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/generic.cpython-310.pyc +0 -0
  554. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/hexagon.cpython-310.pyc +0 -0
  555. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/hls.cpython-310.pyc +0 -0
  556. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/intel_graphics.cpython-310.pyc +0 -0
  557. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/mali.cpython-310.pyc +0 -0
  558. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/rocm.cpython-310.pyc +0 -0
  559. tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/x86.cpython-310.pyc +0 -0
  560. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/__init__.cpython-310.pyc +0 -0
  561. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/_make.cpython-310.pyc +0 -0
  562. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/_rcnn.cpython-310.pyc +0 -0
  563. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/_vision.cpython-310.pyc +0 -0
  564. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/_yolo.cpython-310.pyc +0 -0
  565. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/multibox.cpython-310.pyc +0 -0
  566. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/nms.cpython-310.pyc +0 -0
  567. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/rcnn.cpython-310.pyc +0 -0
  568. tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/yolo.cpython-310.pyc +0 -0
  569. tilelang/3rdparty/tvm/python/tvm/relay/op/vm/__pycache__/__init__.cpython-310.pyc +0 -0
  570. tilelang/3rdparty/tvm/python/tvm/relay/op/vm/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  571. tilelang/3rdparty/tvm/python/tvm/relay/op/vm/__pycache__/vm.cpython-310.pyc +0 -0
  572. tilelang/3rdparty/tvm/python/tvm/relay/qnn/__pycache__/__init__.cpython-310.pyc +0 -0
  573. tilelang/3rdparty/tvm/python/tvm/relay/qnn/__pycache__/transform.cpython-310.pyc +0 -0
  574. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/__init__.cpython-310.pyc +0 -0
  575. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/_make.cpython-310.pyc +0 -0
  576. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/_qnn.cpython-310.pyc +0 -0
  577. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/_requantize.cpython-310.pyc +0 -0
  578. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/canonicalizations.cpython-310.pyc +0 -0
  579. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/layout_conversions.cpython-310.pyc +0 -0
  580. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/legalizations.cpython-310.pyc +0 -0
  581. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/op.cpython-310.pyc +0 -0
  582. tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/qnn.cpython-310.pyc +0 -0
  583. tilelang/3rdparty/tvm/python/tvm/relay/qnn/strategy/__pycache__/__init__.cpython-310.pyc +0 -0
  584. tilelang/3rdparty/tvm/python/tvm/relay/qnn/strategy/__pycache__/arm_cpu.cpython-310.pyc +0 -0
  585. tilelang/3rdparty/tvm/python/tvm/relay/qnn/strategy/__pycache__/generic.cpython-310.pyc +0 -0
  586. tilelang/3rdparty/tvm/python/tvm/relay/qnn/strategy/__pycache__/hexagon.cpython-310.pyc +0 -0
  587. tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/__init__.cpython-310.pyc +0 -0
  588. tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_annotate.cpython-310.pyc +0 -0
  589. tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_calibrate.cpython-310.pyc +0 -0
  590. tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_partition.cpython-310.pyc +0 -0
  591. tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_partition_conversions.cpython-310.pyc +0 -0
  592. tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_quantize.cpython-310.pyc +0 -0
  593. tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/kl_divergence.cpython-310.pyc +0 -0
  594. tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/quantize.cpython-310.pyc +0 -0
  595. tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/__init__.cpython-310.pyc +0 -0
  596. tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  597. tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/fake_quantization_to_integer.cpython-310.pyc +0 -0
  598. tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/flexible_shape.cpython-310.pyc +0 -0
  599. tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/memory_plan.cpython-310.pyc +0 -0
  600. tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/mixed_precision.cpython-310.pyc +0 -0
  601. tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/recast.cpython-310.pyc +0 -0
  602. tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/transform.cpython-310.pyc +0 -0
  603. tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/__init__.cpython-310.pyc +0 -0
  604. tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  605. tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/base.cpython-310.pyc +0 -0
  606. tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/client.cpython-310.pyc +0 -0
  607. tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/minrpc.cpython-310.pyc +0 -0
  608. tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/server.cpython-310.pyc +0 -0
  609. tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/testing.cpython-310.pyc +0 -0
  610. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/__init__.cpython-310.pyc +0 -0
  611. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  612. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/_ffi_node_api.cpython-310.pyc +0 -0
  613. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/container.cpython-310.pyc +0 -0
  614. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/module.cpython-310.pyc +0 -0
  615. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/name_transforms.cpython-310.pyc +0 -0
  616. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/ndarray.cpython-310.pyc +0 -0
  617. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/object.cpython-310.pyc +0 -0
  618. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/object_generic.cpython-310.pyc +0 -0
  619. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/object_path.cpython-310.pyc +0 -0
  620. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/packed_func.cpython-310.pyc +0 -0
  621. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/params.cpython-310.pyc +0 -0
  622. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/relax_vm.cpython-310.pyc +0 -0
  623. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/script_printer.cpython-310.pyc +0 -0
  624. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/support.cpython-310.pyc +0 -0
  625. tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/vm.cpython-310.pyc +0 -0
  626. tilelang/3rdparty/tvm/python/tvm/runtime/disco/__pycache__/__init__.cpython-310.pyc +0 -0
  627. tilelang/3rdparty/tvm/python/tvm/runtime/disco/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  628. tilelang/3rdparty/tvm/python/tvm/runtime/disco/__pycache__/process_pool.cpython-310.pyc +0 -0
  629. tilelang/3rdparty/tvm/python/tvm/runtime/disco/__pycache__/session.cpython-310.pyc +0 -0
  630. tilelang/3rdparty/tvm/python/tvm/runtime/executor/__pycache__/__init__.cpython-310.pyc +0 -0
  631. tilelang/3rdparty/tvm/python/tvm/runtime/executor/__pycache__/aot_executor.cpython-310.pyc +0 -0
  632. tilelang/3rdparty/tvm/python/tvm/runtime/profiling/__pycache__/__init__.cpython-310.pyc +0 -0
  633. tilelang/3rdparty/tvm/python/tvm/runtime/profiling/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  634. tilelang/3rdparty/tvm/python/tvm/script/__pycache__/__init__.cpython-310.pyc +0 -0
  635. tilelang/3rdparty/tvm/python/tvm/script/__pycache__/tir.cpython-310.pyc +0 -0
  636. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/__pycache__/__init__.cpython-310.pyc +0 -0
  637. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  638. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/__pycache__/base.cpython-310.pyc +0 -0
  639. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/ir/__pycache__/__init__.cpython-310.pyc +0 -0
  640. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/ir/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  641. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/ir/__pycache__/frame.cpython-310.pyc +0 -0
  642. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/ir/__pycache__/ir.cpython-310.pyc +0 -0
  643. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/__pycache__/__init__.cpython-310.pyc +0 -0
  644. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  645. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/__pycache__/frame.cpython-310.pyc +0 -0
  646. tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/__pycache__/ir.cpython-310.pyc +0 -0
  647. tilelang/3rdparty/tvm/python/tvm/script/parser/__pycache__/__init__.cpython-310.pyc +0 -0
  648. tilelang/3rdparty/tvm/python/tvm/script/parser/__pycache__/_core.cpython-310.pyc +0 -0
  649. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/__init__.cpython-310.pyc +0 -0
  650. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/diagnostics.cpython-310.pyc +0 -0
  651. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/dispatch.cpython-310.pyc +0 -0
  652. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/doc.cpython-310.pyc +0 -0
  653. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/doc_core.cpython-310.pyc +0 -0
  654. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/entry.cpython-310.pyc +0 -0
  655. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/error.cpython-310.pyc +0 -0
  656. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/evaluator.cpython-310.pyc +0 -0
  657. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/parser.cpython-310.pyc +0 -0
  658. tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/utils.cpython-310.pyc +0 -0
  659. tilelang/3rdparty/tvm/python/tvm/script/parser/ir/__pycache__/__init__.cpython-310.pyc +0 -0
  660. tilelang/3rdparty/tvm/python/tvm/script/parser/ir/__pycache__/entry.cpython-310.pyc +0 -0
  661. tilelang/3rdparty/tvm/python/tvm/script/parser/ir/__pycache__/parser.cpython-310.pyc +0 -0
  662. tilelang/3rdparty/tvm/python/tvm/script/parser/tir/__pycache__/__init__.cpython-310.pyc +0 -0
  663. tilelang/3rdparty/tvm/python/tvm/script/parser/tir/__pycache__/entry.cpython-310.pyc +0 -0
  664. tilelang/3rdparty/tvm/python/tvm/script/parser/tir/__pycache__/operation.cpython-310.pyc +0 -0
  665. tilelang/3rdparty/tvm/python/tvm/script/parser/tir/__pycache__/parser.cpython-310.pyc +0 -0
  666. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/__init__.cpython-310.pyc +0 -0
  667. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  668. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/codegen.cpython-310.pyc +0 -0
  669. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/compilation_config.cpython-310.pyc +0 -0
  670. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/datatype.cpython-310.pyc +0 -0
  671. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/generic_func.cpython-310.pyc +0 -0
  672. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/tag.cpython-310.pyc +0 -0
  673. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/target.cpython-310.pyc +0 -0
  674. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/virtual_device.cpython-310.pyc +0 -0
  675. tilelang/3rdparty/tvm/python/tvm/target/__pycache__/x86.cpython-310.pyc +0 -0
  676. tilelang/3rdparty/tvm/python/tvm/te/__pycache__/__init__.cpython-310.pyc +0 -0
  677. tilelang/3rdparty/tvm/python/tvm/te/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  678. tilelang/3rdparty/tvm/python/tvm/te/__pycache__/autodiff.cpython-310.pyc +0 -0
  679. tilelang/3rdparty/tvm/python/tvm/te/__pycache__/operation.cpython-310.pyc +0 -0
  680. tilelang/3rdparty/tvm/python/tvm/te/__pycache__/schedule.cpython-310.pyc +0 -0
  681. tilelang/3rdparty/tvm/python/tvm/te/__pycache__/tag.cpython-310.pyc +0 -0
  682. tilelang/3rdparty/tvm/python/tvm/te/__pycache__/tensor.cpython-310.pyc +0 -0
  683. tilelang/3rdparty/tvm/python/tvm/te/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
  684. tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/__init__.cpython-310.pyc +0 -0
  685. tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/calls.cpython-310.pyc +0 -0
  686. tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/module.cpython-310.pyc +0 -0
  687. tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/parser.cpython-310.pyc +0 -0
  688. tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/preprocessor.cpython-310.pyc +0 -0
  689. tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/runtime.cpython-310.pyc +0 -0
  690. tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/utils.cpython-310.pyc +0 -0
  691. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/__init__.cpython-310.pyc +0 -0
  692. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  693. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/block_dependence_info.cpython-310.pyc +0 -0
  694. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/block_scope.cpython-310.pyc +0 -0
  695. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/buffer.cpython-310.pyc +0 -0
  696. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/data_layout.cpython-310.pyc +0 -0
  697. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/expr.cpython-310.pyc +0 -0
  698. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/function.cpython-310.pyc +0 -0
  699. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/generic.cpython-310.pyc +0 -0
  700. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/ir_builder.cpython-310.pyc +0 -0
  701. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/op.cpython-310.pyc +0 -0
  702. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/stmt.cpython-310.pyc +0 -0
  703. tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/stmt_functor.cpython-310.pyc +0 -0
  704. tilelang/3rdparty/tvm/python/tvm/tir/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
  705. tilelang/3rdparty/tvm/python/tvm/tir/analysis/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  706. tilelang/3rdparty/tvm/python/tvm/tir/analysis/__pycache__/analysis.cpython-310.pyc +0 -0
  707. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/__init__.cpython-310.pyc +0 -0
  708. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  709. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/_type_checker.cpython-310.pyc +0 -0
  710. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/analysis.cpython-310.pyc +0 -0
  711. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/instruction.cpython-310.pyc +0 -0
  712. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/schedule.cpython-310.pyc +0 -0
  713. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/state.cpython-310.pyc +0 -0
  714. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/trace.cpython-310.pyc +0 -0
  715. tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/transform.cpython-310.pyc +0 -0
  716. tilelang/3rdparty/tvm/python/tvm/tir/transform/__pycache__/__init__.cpython-310.pyc +0 -0
  717. tilelang/3rdparty/tvm/python/tvm/tir/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  718. tilelang/3rdparty/tvm/python/tvm/tir/transform/__pycache__/function_pass.cpython-310.pyc +0 -0
  719. tilelang/3rdparty/tvm/python/tvm/tir/transform/__pycache__/transform.cpython-310.pyc +0 -0
  720. tilelang/3rdparty/tvm/python/tvm/tir/usmp/__pycache__/__init__.cpython-310.pyc +0 -0
  721. tilelang/3rdparty/tvm/python/tvm/tir/usmp/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  722. tilelang/3rdparty/tvm/python/tvm/tir/usmp/__pycache__/utils.cpython-310.pyc +0 -0
  723. tilelang/3rdparty/tvm/python/tvm/tir/usmp/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
  724. tilelang/3rdparty/tvm/python/tvm/tir/usmp/analysis/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  725. tilelang/3rdparty/tvm/python/tvm/tir/usmp/analysis/__pycache__/analysis.cpython-310.pyc +0 -0
  726. tilelang/3rdparty/tvm/python/tvm/tir/usmp/transform/__pycache__/__init__.cpython-310.pyc +0 -0
  727. tilelang/3rdparty/tvm/python/tvm/tir/usmp/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
  728. tilelang/3rdparty/tvm/python/tvm/tir/usmp/transform/__pycache__/transform.cpython-310.pyc +0 -0
  729. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/__init__.cpython-310.pyc +0 -0
  730. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/argwhere.cpython-310.pyc +0 -0
  731. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/broadcast.cpython-310.pyc +0 -0
  732. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/einsum.cpython-310.pyc +0 -0
  733. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/generic_op_impl.cpython-310.pyc +0 -0
  734. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/math.cpython-310.pyc +0 -0
  735. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/reduction.cpython-310.pyc +0 -0
  736. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/scan.cpython-310.pyc +0 -0
  737. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/scatter.cpython-310.pyc +0 -0
  738. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/scatter_elements.cpython-310.pyc +0 -0
  739. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/searchsorted.cpython-310.pyc +0 -0
  740. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/signal.cpython-310.pyc +0 -0
  741. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/sort.cpython-310.pyc +0 -0
  742. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/sparse_fill_empty_rows.cpython-310.pyc +0 -0
  743. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/sparse_reshape.cpython-310.pyc +0 -0
  744. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/tag.cpython-310.pyc +0 -0
  745. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/tensor.cpython-310.pyc +0 -0
  746. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/transform.cpython-310.pyc +0 -0
  747. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/unique.cpython-310.pyc +0 -0
  748. tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/utils.cpython-310.pyc +0 -0
  749. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/__init__.cpython-310.pyc +0 -0
  750. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
  751. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_nchw.cpython-310.pyc +0 -0
  752. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_nchw_winograd.cpython-310.pyc +0 -0
  753. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_nhwc.cpython-310.pyc +0 -0
  754. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_nhwc_winograd.cpython-310.pyc +0 -0
  755. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_transpose_alter_op.cpython-310.pyc +0 -0
  756. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_transpose_nchw.cpython-310.pyc +0 -0
  757. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_winograd_common.cpython-310.pyc +0 -0
  758. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/depthwise_conv2d_nchw.cpython-310.pyc +0 -0
  759. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/depthwise_conv2d_nhwc.cpython-310.pyc +0 -0
  760. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/injective.cpython-310.pyc +0 -0
  761. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/pooling.cpython-310.pyc +0 -0
  762. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/reduction.cpython-310.pyc +0 -0
  763. tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/utils.cpython-310.pyc +0 -0
  764. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/__init__.cpython-310.pyc +0 -0
  765. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/arm_utils.cpython-310.pyc +0 -0
  766. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/bitserial_conv2d.cpython-310.pyc +0 -0
  767. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/bitserial_dense.cpython-310.pyc +0 -0
  768. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv1d.cpython-310.pyc +0 -0
  769. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d.cpython-310.pyc +0 -0
  770. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
  771. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_gemm.cpython-310.pyc +0 -0
  772. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_int8.cpython-310.pyc +0 -0
  773. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_spatial_pack.cpython-310.pyc +0 -0
  774. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_transpose.cpython-310.pyc +0 -0
  775. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/dense.cpython-310.pyc +0 -0
  776. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
  777. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/group_conv2d.cpython-310.pyc +0 -0
  778. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/injective.cpython-310.pyc +0 -0
  779. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/pooling.cpython-310.pyc +0 -0
  780. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/qnn.cpython-310.pyc +0 -0
  781. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/qnn_alter_op.cpython-310.pyc +0 -0
  782. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/qnn_legalize.cpython-310.pyc +0 -0
  783. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
  784. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/__pycache__/__init__.cpython-310.pyc +0 -0
  785. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/__init__.cpython-310.pyc +0 -0
  786. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/conv1d.cpython-310.pyc +0 -0
  787. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/conv2d.cpython-310.pyc +0 -0
  788. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/dense.cpython-310.pyc +0 -0
  789. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
  790. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/pool.cpython-310.pyc +0 -0
  791. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/__init__.cpython-310.pyc +0 -0
  792. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/avg_pool.cpython-310.pyc +0 -0
  793. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/common.cpython-310.pyc +0 -0
  794. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/gemm.cpython-310.pyc +0 -0
  795. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/max_pool.cpython-310.pyc +0 -0
  796. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/multi_channel_convolve.cpython-310.pyc +0 -0
  797. tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/tensordot.cpython-310.pyc +0 -0
  798. tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/__init__.cpython-310.pyc +0 -0
  799. tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/conv2d.cpython-310.pyc +0 -0
  800. tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/dense.cpython-310.pyc +0 -0
  801. tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
  802. tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/gemm.cpython-310.pyc +0 -0
  803. tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/transforms.cpython-310.pyc +0 -0
  804. tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/__init__.cpython-310.pyc +0 -0
  805. tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/cuda.cpython-310.pyc +0 -0
  806. tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/generic.cpython-310.pyc +0 -0
  807. tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/impl.cpython-310.pyc +0 -0
  808. tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/nn.cpython-310.pyc +0 -0
  809. tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/rocm.cpython-310.pyc +0 -0
  810. tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/utils.cpython-310.pyc +0 -0
  811. tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/x86.cpython-310.pyc +0 -0
  812. tilelang/3rdparty/tvm/python/tvm/topi/cpp/vision/__pycache__/__init__.cpython-310.pyc +0 -0
  813. tilelang/3rdparty/tvm/python/tvm/topi/cpp/vision/__pycache__/yolo.cpython-310.pyc +0 -0
  814. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/__init__.cpython-310.pyc +0 -0
  815. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/argwhere.cpython-310.pyc +0 -0
  816. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/batch_matmul.cpython-310.pyc +0 -0
  817. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/batch_matmul_tensorcore.cpython-310.pyc +0 -0
  818. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv1d.cpython-310.pyc +0 -0
  819. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv1d_transpose_ncw.cpython-310.pyc +0 -0
  820. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d.cpython-310.pyc +0 -0
  821. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
  822. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_direct.cpython-310.pyc +0 -0
  823. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_hwcn.cpython-310.pyc +0 -0
  824. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_hwnc_tensorcore.cpython-310.pyc +0 -0
  825. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_int8.cpython-310.pyc +0 -0
  826. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_nhwc_tensorcore.cpython-310.pyc +0 -0
  827. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_nhwc_winograd.cpython-310.pyc +0 -0
  828. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_transpose.cpython-310.pyc +0 -0
  829. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_winograd.cpython-310.pyc +0 -0
  830. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d.cpython-310.pyc +0 -0
  831. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_alter_op.cpython-310.pyc +0 -0
  832. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_direct.cpython-310.pyc +0 -0
  833. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_ndhwc_tensorcore.cpython-310.pyc +0 -0
  834. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_transpose_ncdhw.cpython-310.pyc +0 -0
  835. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_winograd.cpython-310.pyc +0 -0
  836. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/correlation.cpython-310.pyc +0 -0
  837. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/deformable_conv2d.cpython-310.pyc +0 -0
  838. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/dense.cpython-310.pyc +0 -0
  839. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/dense_tensorcore.cpython-310.pyc +0 -0
  840. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
  841. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/group_conv2d_nchw.cpython-310.pyc +0 -0
  842. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/injective.cpython-310.pyc +0 -0
  843. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/nms.cpython-310.pyc +0 -0
  844. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/nn.cpython-310.pyc +0 -0
  845. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/pooling.cpython-310.pyc +0 -0
  846. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/reduction.cpython-310.pyc +0 -0
  847. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/scan.cpython-310.pyc +0 -0
  848. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/scatter.cpython-310.pyc +0 -0
  849. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/scatter_elements.cpython-310.pyc +0 -0
  850. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/searchsorted.cpython-310.pyc +0 -0
  851. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/signal.cpython-310.pyc +0 -0
  852. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/softmax.cpython-310.pyc +0 -0
  853. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/sort.cpython-310.pyc +0 -0
  854. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/sparse.cpython-310.pyc +0 -0
  855. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/sparse_reshape.cpython-310.pyc +0 -0
  856. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
  857. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/tensorcore_alter_op.cpython-310.pyc +0 -0
  858. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/transform.cpython-310.pyc +0 -0
  859. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/unique.cpython-310.pyc +0 -0
  860. tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/vision.cpython-310.pyc +0 -0
  861. tilelang/3rdparty/tvm/python/tvm/topi/cuda/rcnn/__pycache__/__init__.cpython-310.pyc +0 -0
  862. tilelang/3rdparty/tvm/python/tvm/topi/cuda/rcnn/__pycache__/proposal.cpython-310.pyc +0 -0
  863. tilelang/3rdparty/tvm/python/tvm/topi/cuda/ssd/__pycache__/__init__.cpython-310.pyc +0 -0
  864. tilelang/3rdparty/tvm/python/tvm/topi/cuda/ssd/__pycache__/multibox.cpython-310.pyc +0 -0
  865. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/__init__.cpython-310.pyc +0 -0
  866. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/conv2d.cpython-310.pyc +0 -0
  867. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/default.cpython-310.pyc +0 -0
  868. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/extern.cpython-310.pyc +0 -0
  869. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/image.cpython-310.pyc +0 -0
  870. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/injective.cpython-310.pyc +0 -0
  871. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/math.cpython-310.pyc +0 -0
  872. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/nn.cpython-310.pyc +0 -0
  873. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/search.cpython-310.pyc +0 -0
  874. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/sort.cpython-310.pyc +0 -0
  875. tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/vision.cpython-310.pyc +0 -0
  876. tilelang/3rdparty/tvm/python/tvm/topi/gpu/__pycache__/__init__.cpython-310.pyc +0 -0
  877. tilelang/3rdparty/tvm/python/tvm/topi/gpu/__pycache__/conv2d.cpython-310.pyc +0 -0
  878. tilelang/3rdparty/tvm/python/tvm/topi/gpu/__pycache__/conv2d_nhwc.cpython-310.pyc +0 -0
  879. tilelang/3rdparty/tvm/python/tvm/topi/gpu/__pycache__/dense.cpython-310.pyc +0 -0
  880. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/__init__.cpython-310.pyc +0 -0
  881. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/batch_matmul.cpython-310.pyc +0 -0
  882. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/compute_poolarea.cpython-310.pyc +0 -0
  883. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/conv2d.cpython-310.pyc +0 -0
  884. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
  885. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/dense.cpython-310.pyc +0 -0
  886. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/dense_alter_op.cpython-310.pyc +0 -0
  887. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/injective.cpython-310.pyc +0 -0
  888. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/pad.cpython-310.pyc +0 -0
  889. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/pooling.cpython-310.pyc +0 -0
  890. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/reduce.cpython-310.pyc +0 -0
  891. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/resize2d.cpython-310.pyc +0 -0
  892. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
  893. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/utils.cpython-310.pyc +0 -0
  894. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/__init__.cpython-310.pyc +0 -0
  895. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/adaptive_avg_pool1d.cpython-310.pyc +0 -0
  896. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/avg_pool2d.cpython-310.pyc +0 -0
  897. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
  898. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/dense_alter_op.cpython-310.pyc +0 -0
  899. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/dequantize.cpython-310.pyc +0 -0
  900. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/global_avg_pool2d.cpython-310.pyc +0 -0
  901. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/nn.cpython-310.pyc +0 -0
  902. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/qadd_qsub_qmul.cpython-310.pyc +0 -0
  903. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/qdense.cpython-310.pyc +0 -0
  904. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/qdepthwise_conv2d_slice.cpython-310.pyc +0 -0
  905. tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/quantize.cpython-310.pyc +0 -0
  906. tilelang/3rdparty/tvm/python/tvm/topi/hls/__pycache__/__init__.cpython-310.pyc +0 -0
  907. tilelang/3rdparty/tvm/python/tvm/topi/hls/__pycache__/injective.cpython-310.pyc +0 -0
  908. tilelang/3rdparty/tvm/python/tvm/topi/hls/__pycache__/nn.cpython-310.pyc +0 -0
  909. tilelang/3rdparty/tvm/python/tvm/topi/image/__pycache__/__init__.cpython-310.pyc +0 -0
  910. tilelang/3rdparty/tvm/python/tvm/topi/image/__pycache__/dilation2d.cpython-310.pyc +0 -0
  911. tilelang/3rdparty/tvm/python/tvm/topi/image/__pycache__/grid_sample.cpython-310.pyc +0 -0
  912. tilelang/3rdparty/tvm/python/tvm/topi/image/__pycache__/resize.cpython-310.pyc +0 -0
  913. tilelang/3rdparty/tvm/python/tvm/topi/intel_graphics/__pycache__/__init__.cpython-310.pyc +0 -0
  914. tilelang/3rdparty/tvm/python/tvm/topi/intel_graphics/__pycache__/conv2d.cpython-310.pyc +0 -0
  915. tilelang/3rdparty/tvm/python/tvm/topi/intel_graphics/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
  916. tilelang/3rdparty/tvm/python/tvm/topi/intel_graphics/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
  917. tilelang/3rdparty/tvm/python/tvm/topi/mali/__pycache__/__init__.cpython-310.pyc +0 -0
  918. tilelang/3rdparty/tvm/python/tvm/topi/mali/__pycache__/conv2d.cpython-310.pyc +0 -0
  919. tilelang/3rdparty/tvm/python/tvm/topi/mali/__pycache__/dense.cpython-310.pyc +0 -0
  920. tilelang/3rdparty/tvm/python/tvm/topi/mali/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
  921. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/__init__.cpython-310.pyc +0 -0
  922. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/batch_matmul.cpython-310.pyc +0 -0
  923. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/batch_norm.cpython-310.pyc +0 -0
  924. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/batch_to_space_nd.cpython-310.pyc +0 -0
  925. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/bitserial_conv2d.cpython-310.pyc +0 -0
  926. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/bitserial_dense.cpython-310.pyc +0 -0
  927. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/bitserial_util.cpython-310.pyc +0 -0
  928. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/bnn.cpython-310.pyc +0 -0
  929. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv1d.cpython-310.pyc +0 -0
  930. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv1d_transpose.cpython-310.pyc +0 -0
  931. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv2d.cpython-310.pyc +0 -0
  932. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv2d_transpose.cpython-310.pyc +0 -0
  933. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv3d.cpython-310.pyc +0 -0
  934. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv3d_transpose.cpython-310.pyc +0 -0
  935. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/correlation.cpython-310.pyc +0 -0
  936. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/deformable_conv2d.cpython-310.pyc +0 -0
  937. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/dense.cpython-310.pyc +0 -0
  938. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/depth_to_space.cpython-310.pyc +0 -0
  939. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
  940. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/dilate.cpython-310.pyc +0 -0
  941. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/elemwise.cpython-310.pyc +0 -0
  942. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/fifo_buffer.cpython-310.pyc +0 -0
  943. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/flatten.cpython-310.pyc +0 -0
  944. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/group_norm.cpython-310.pyc +0 -0
  945. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/instance_norm.cpython-310.pyc +0 -0
  946. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/layer_norm.cpython-310.pyc +0 -0
  947. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/local_response_norm.cpython-310.pyc +0 -0
  948. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/loss.cpython-310.pyc +0 -0
  949. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/lstm.cpython-310.pyc +0 -0
  950. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/mapping.cpython-310.pyc +0 -0
  951. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/pad.cpython-310.pyc +0 -0
  952. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/pooling.cpython-310.pyc +0 -0
  953. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/qnn.cpython-310.pyc +0 -0
  954. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/rms_norm.cpython-310.pyc +0 -0
  955. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/softmax.cpython-310.pyc +0 -0
  956. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/space_to_batch_nd.cpython-310.pyc +0 -0
  957. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/space_to_depth.cpython-310.pyc +0 -0
  958. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/sparse.cpython-310.pyc +0 -0
  959. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/upsampling.cpython-310.pyc +0 -0
  960. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/utils.cpython-310.pyc +0 -0
  961. tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/winograd_util.cpython-310.pyc +0 -0
  962. tilelang/3rdparty/tvm/python/tvm/topi/random/__pycache__/__init__.cpython-310.pyc +0 -0
  963. tilelang/3rdparty/tvm/python/tvm/topi/random/__pycache__/kernel.cpython-310.pyc +0 -0
  964. tilelang/3rdparty/tvm/python/tvm/topi/rocm/__pycache__/__init__.cpython-310.pyc +0 -0
  965. tilelang/3rdparty/tvm/python/tvm/topi/rocm/__pycache__/batch_matmul.cpython-310.pyc +0 -0
  966. tilelang/3rdparty/tvm/python/tvm/topi/rocm/__pycache__/conv2d.cpython-310.pyc +0 -0
  967. tilelang/3rdparty/tvm/python/tvm/topi/rocm/__pycache__/dense.cpython-310.pyc +0 -0
  968. tilelang/3rdparty/tvm/python/tvm/topi/sparse/__pycache__/__init__.cpython-310.pyc +0 -0
  969. tilelang/3rdparty/tvm/python/tvm/topi/sparse/__pycache__/csrmm.cpython-310.pyc +0 -0
  970. tilelang/3rdparty/tvm/python/tvm/topi/sparse/__pycache__/csrmv.cpython-310.pyc +0 -0
  971. tilelang/3rdparty/tvm/python/tvm/topi/sparse/__pycache__/dense.cpython-310.pyc +0 -0
  972. tilelang/3rdparty/tvm/python/tvm/topi/vision/__pycache__/__init__.cpython-310.pyc +0 -0
  973. tilelang/3rdparty/tvm/python/tvm/topi/vision/__pycache__/nms.cpython-310.pyc +0 -0
  974. tilelang/3rdparty/tvm/python/tvm/topi/vision/__pycache__/nms_util.cpython-310.pyc +0 -0
  975. tilelang/3rdparty/tvm/python/tvm/topi/vision/__pycache__/reorg.cpython-310.pyc +0 -0
  976. tilelang/3rdparty/tvm/python/tvm/topi/vision/rcnn/__pycache__/__init__.cpython-310.pyc +0 -0
  977. tilelang/3rdparty/tvm/python/tvm/topi/vision/rcnn/__pycache__/proposal.cpython-310.pyc +0 -0
  978. tilelang/3rdparty/tvm/python/tvm/topi/vision/rcnn/__pycache__/roi_align.cpython-310.pyc +0 -0
  979. tilelang/3rdparty/tvm/python/tvm/topi/vision/rcnn/__pycache__/roi_pool.cpython-310.pyc +0 -0
  980. tilelang/3rdparty/tvm/python/tvm/topi/vision/ssd/__pycache__/__init__.cpython-310.pyc +0 -0
  981. tilelang/3rdparty/tvm/python/tvm/topi/vision/ssd/__pycache__/multibox.cpython-310.pyc +0 -0
  982. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/__init__.cpython-310.pyc +0 -0
  983. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/batch_matmul.cpython-310.pyc +0 -0
  984. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/binarize_pack.cpython-310.pyc +0 -0
  985. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/binary_dense.cpython-310.pyc +0 -0
  986. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/bitserial_conv2d.cpython-310.pyc +0 -0
  987. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/bitserial_dense.cpython-310.pyc +0 -0
  988. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/concat.cpython-310.pyc +0 -0
  989. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv1d.cpython-310.pyc +0 -0
  990. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d.cpython-310.pyc +0 -0
  991. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
  992. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_avx_1x1.cpython-310.pyc +0 -0
  993. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_avx_common.cpython-310.pyc +0 -0
  994. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_int8.cpython-310.pyc +0 -0
  995. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_transpose.cpython-310.pyc +0 -0
  996. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv3d.cpython-310.pyc +0 -0
  997. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv3d_transpose.cpython-310.pyc +0 -0
  998. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/dense.cpython-310.pyc +0 -0
  999. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/dense_alter_op.cpython-310.pyc +0 -0
  1000. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
  1001. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/group_conv2d.cpython-310.pyc +0 -0
  1002. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/injective.cpython-310.pyc +0 -0
  1003. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/math_alter_op.cpython-310.pyc +0 -0
  1004. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/nn.cpython-310.pyc +0 -0
  1005. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/pooling.cpython-310.pyc +0 -0
  1006. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/reduction.cpython-310.pyc +0 -0
  1007. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/roi_align.cpython-310.pyc +0 -0
  1008. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/sparse.cpython-310.pyc +0 -0
  1009. tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
  1010. {tilelang_rocm-0.1.4.post5.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/licenses/LICENSE +0 -0
  1011. {tilelang_rocm-0.1.4.post5.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1202 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+ from typing import Dict, Literal
4
+
5
# CUDA device source, kept as a Python string, that decodes packed 4-bit
# integers into fp16 values.
#
# The string defines one shared template, decode_i4b_to_f16, and two thin
# entry points that differ only in the isSigned template flag:
#   decode_i4s_to_f16 -> isSigned = true  (subtracts 0x6408 per lane)
#   decode_i4u_to_f16 -> isSigned = false (subtracts 0x6400 per lane)
#
# Each loop iteration issues one lop3.b32 that masks two nibbles of the input
# word into the two fp16 lanes of h[i], followed by a packed fp16 subtract.
# NOTE(review): the input is read through a single `uint` load and shifted by
# 4 * i, so N presumably must not exceed 8 -- confirm against callers.
+ decode_i4_to_f16 = """
6
+ template <typename T1, typename T2, bool isSigned = false>
7
+ __device__ void decode_i4b_to_f16(T1 *_i4s, T2 *B_local_decode, const int N = 8)
8
+ {
9
+ uint *h = reinterpret_cast<uint *>(B_local_decode);
10
+
11
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
12
+ static constexpr uint BOTTOM_MASK = 0x000f000f;
13
+ static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
14
+ static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
15
+ uint const i4s = *reinterpret_cast<uint *>(_i4s);
16
+ #pragma unroll
17
+ for (int i = 0; i < (N / 2); i++)
18
+ {
19
+
20
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
21
+ : "=r"(h[i])
22
+ : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
23
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
24
+ }
25
+ }
26
+
27
+ template <typename T1, typename T2>
28
+ __device__ void decode_i4s_to_f16(T1 *_i4s, T2 *B_local_decode, const int N = 8)
29
+ {
30
+ decode_i4b_to_f16<T1, T2, true>(_i4s, B_local_decode, N);
31
+ }
32
+
33
+ template <typename T1, typename T2>
34
+ __device__ void decode_i4u_to_f16(T1 *_i4u, T2 *B_local_decode, const int N = 8)
35
+ {
36
+ decode_i4b_to_f16<T1, T2, false>(_i4u, B_local_decode, N);
37
+ }
38
+ """
39
+
40
# CUDA device source: decode packed 4-bit integers to fp16 and multiply every
# decoded value by a single scale read from `*scale`.
# Entry points: decode_i4s_to_f16_scale (signed) and decode_i4u_to_f16_scale
# (unsigned); both forward to decode_i4b_to_f16_scale.
# NOTE(review): `scale` defaults to nullptr but is dereferenced
# unconditionally, so callers must always pass a valid pointer.
decode_i4_to_f16_scale = """
template <typename T1, typename T2, typename T3, bool isSigned = false, bool withScaling = false>
__device__ void decode_i4b_to_f16_scale(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Subtract the 1024 magic bias; signed inputs subtract 1032 (1024 + 8) so that [0, 15] maps to [-8, 7]
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);

    #pragma unroll
    // decode 2 elems at one time.
    for (int i = 0; i < (N / 2); i++)
    {

        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
    }
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i4s_to_f16_scale(T1 *_i4s, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
{
    decode_i4b_to_f16_scale<T1, T2, T3, true, true>(_i4s, B_local_decode, N, scale);
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i4u_to_f16_scale(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
{
    decode_i4b_to_f16_scale<T1, T2, T3, false, true>(_i4u, B_local_decode, N, scale);
}

"""
81
+
82
# CUDA device source: decode packed 4-bit integers to fp16 using two scales.
# The first N/4 output registers are multiplied by `*scale`, the remaining
# ones by `*(scale + offset)`.
# Entry points: decode_i4s_to_f16_scale_offset / decode_i4u_to_f16_scale_offset.
decode_i4_to_f16_scale_offset = """
template <typename T1, typename T2, typename T3, bool isSigned = false, bool withScaling = false>
__device__ void decode_i4b_to_f16_scale_offset(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const int offset = 0)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Subtract the 1024 magic bias; signed inputs subtract 1032 (1024 + 8) so that [0, 15] maps to [-8, 7]
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);

    #pragma unroll
    // decode 2 elems at one time.
    for (int i = 0; i < (N / 2); i++)
    {

        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(0));
    }
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(0));
    }
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i4s_to_f16_scale_offset(T1 *_i4s, T2 *B_local_decode, T3 *scale = nullptr, const int offset = 0, const int N = 8)
{
    decode_i4b_to_f16_scale_offset<T1, T2, T3, true, true>(_i4s, B_local_decode, N, scale, offset);
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i4u_to_f16_scale_offset(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, const int offset = 0, const int N = 8)
{
    decode_i4b_to_f16_scale_offset<T1, T2, T3, false, true>(_i4u, B_local_decode, N, scale, offset);
}

"""
134
+
135
# CUDA device source: decode packed 4-bit integers to fp16 with a zero point
# and a scale, in "original" order: (value - zeros) * scale.
# Entry point: decode_i4u_to_f16_scale_zeros_original (unsigned only).
decode_i4_to_f16_scale_zeros_original = """
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_zeros_original(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Subtract the 1024 magic bias; signed inputs subtract 1032 (1024 + 8) so that [0, 15] maps to [-8, 7]
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);
    // input zeros maybe int32(qzeros) or half format
    T4 const zero_r = *zeros;
    uint const packed_zeros = __pack_half2(zero_r, zero_r);


    #pragma unroll
    // decode 2 elems at one time.
    for (int i = 0; i < (N / 2); i++)
    {

        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
    }
}

template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i4u_to_f16_scale_zeros_original(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
{
    decode_i4b_to_f16_zeros_original<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros);
}
"""
176
+
177
# CUDA device source: "original" zero-point decode, (value - zeros) * scale,
# with two (scale, zeros) pairs. The first N/4 output registers use `*scale`
# and `*zeros`; the rest use the entries `offset` elements further on.
# Entry point: decode_i4u_to_f16_scale_zeros_original_offset (unsigned only).
# NOTE(review): the zeros are loaded into T3 (the scale type) although the
# pointer is T4 -- confirm this conversion is intended.
decode_i4_to_f16_scale_zeros_original_offset = """
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_zeros_original_offset(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr, const int offset = 0)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Subtract the 1024 magic bias; signed inputs subtract 1032 (1024 + 8) so that [0, 15] maps to [-8, 7]
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);
    // input zeros maybe int32(qzeros) or half format
    T3 const zeros_l = *zeros;
    T3 const zeros_r = *(zeros + offset);
    uint const packed_zeros_l = __pack_half2(zeros_l, zeros_l);
    uint const packed_zeros_r = __pack_half2(zeros_r, zeros_r);

    #pragma unroll
    // decode 2 elems at one time.
    for (int i = 0; i < (N / 2); i++)
    {

        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }

    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros_l));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(0));
    }
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros_r));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(0));
    }
}

template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i4u_to_f16_scale_zeros_original_offset(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int offset = 0, const int N = 8)
{
    decode_i4b_to_f16_zeros_original_offset<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros, offset);
}
"""
231
+
232
# CUDA device source: decode packed 4-bit integers to fp16 with a zero point
# and a scale, in "rescale" order: value * scale - zeros, computed as a single
# fma whose addend is the negated zero point.
# Entry point: decode_i4u_to_f16_scale_zeros_rescale (unsigned only).
#
# The zero point is negated by flipping the fp16 sign bit of both lanes (XOR
# with 0x80008000). OR-ing the sign bit in, as before, yields -|zeros| and is
# only correct for non-negative zero points.
decode_i4_to_f16_scale_zeros_rescale = """
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_scale_zeros_rescale(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Subtract the 1024 magic bias; signed inputs subtract 1032 (1024 + 8) so that [0, 15] maps to [-8, 7]
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);
    T4 const zero_r = *zeros;
    // Flip the sign bit of both fp16 lanes so the fma below adds -zeros.
    uint const packed_zeros = 0x80008000 ^ __pack_half2(zero_r, zero_r);

    #pragma unroll
    // decode 2 elems at one time.
    for (int i = 0; i < (N / 2); i++)
    {

        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(packed_zeros));
    }
}

template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i4u_to_f16_scale_zeros_rescale(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
{
    decode_i4b_to_f16_scale_zeros_rescale<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros);
}

"""
271
+
272
# CUDA device source: "rescale" zero-point decode, value * scale - zeros, with
# two (scale, zeros) pairs. The first N/4 output registers use `*scale` and
# `*zeros`; the rest use the entries `offset` elements further on.
# Entry point: decode_i4u_to_f16_scale_zeros_rescale_offset (unsigned only).
#
# The zero points are negated by flipping the fp16 sign bit of both lanes (XOR
# with 0x80008000). OR-ing the sign bit in, as before, yields -|zeros| and is
# only correct for non-negative zero points.
decode_i4_to_f16_scale_zeros_rescale_offset = """
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_scale_zeros_rescale_offset(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr, const int offset = 0)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Subtract the 1024 magic bias; signed inputs subtract 1032 (1024 + 8) so that [0, 15] maps to [-8, 7]
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);
    // input zeros maybe int32(qzeros) or half format
    T3 const zeros_l = *zeros;
    T3 const zeros_r = *(zeros + offset);
    // Flip the sign bit of both fp16 lanes so the fma below adds -zeros.
    uint const packed_zeros_l = 0x80008000 ^ __pack_half2(zeros_l, zeros_l);
    uint const packed_zeros_r = 0x80008000 ^ __pack_half2(zeros_r, zeros_r);

    #pragma unroll
    // decode 2 elems at one time.
    for (int i = 0; i < (N / 2); i++)
    {

        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(packed_zeros_l));
    }
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(packed_zeros_r));
    }
}

template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i4u_to_f16_scale_zeros_rescale_offset(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int offset = 0, const int N = 8)
{
    decode_i4b_to_f16_scale_zeros_rescale_offset<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros, offset);
}

"""
324
+
325
# CUDA device source: decode packed 4-bit integers to fp16 with an integer
# ("quantized") zero point: (value - zero) * scale. The zero point is read as
# an int16 from `zeros` and folded into the bias constant that is added to
# each decoded pair.
# Entry point: decode_i4u_to_f16_scale_zeros_quantized (unsigned only).
decode_i4_to_f16_scale_zeros_quantized = """
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_scale_zeros_quantized(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);
    // input zeros maybe int32(qzeros) or half format
    int16_t const zero_r = *((int16_t*)zeros);
    // 0xe400 is -1024 in fp16; OR-ing a small non-negative zero point into the
    // mantissa gives -(1024 + zero), so one add removes the magic bias and the zero point.
    uint median_num = ((0xe400 | zero_r) << 16) | (0xe400 | zero_r);

    #pragma unroll
    // decode 2 elems at one time.
    for (int i = 0; i < (N / 2); i++)
    {

        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(median_num));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
    }
}

template <typename storage_dtype, typename target_dtype, typename scale_dtype, typename zero_dtype>
__device__ void decode_i4u_to_f16_scale_zeros_quantized(storage_dtype *_i4u, target_dtype *B_local_decode, scale_dtype *scale = nullptr, zero_dtype *zeros = nullptr, const int N = 8)
{
    decode_i4b_to_f16_scale_zeros_quantized<storage_dtype, target_dtype, scale_dtype, zero_dtype, false>(_i4u, B_local_decode, N, scale, zeros);
}
"""
363
+
364
# CUDA device source: "quantized" zero-point decode, (value - zero) * scale,
# with two scales and two packed zero-point words. The first N/4 output
# registers use `*scale` and `*qzeros`; the rest use `*(scale + scale_offset)`
# and `*(qzeros + qzeros_offset)`. Each 4-bit zero point is extracted from its
# packed word at nibble index `group_offset`.
# Entry point: decode_i4u_to_f16_scale_zeros_quantized_offset (unsigned only).
decode_i4_to_f16_scale_zeros_quantized_offset = """
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i4b_to_f16_scale_zeros_quantized_offset(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T1 *qzeros = nullptr, const int scale_offset = 0, const int qzeros_offset = 0, const int group_offset = 0)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);

    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + scale_offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);

    T1 const qzeros_l = *qzeros;
    T1 const qzeros_r = *(qzeros + qzeros_offset);
    int16_t const zero_l = (qzeros_l >> (group_offset * 4) & 0xf);
    int16_t const zero_r = (qzeros_r >> (group_offset * 4) & 0xf);

    // 0xe400 is -1024 in fp16; OR-ing the 4-bit zero point into the mantissa gives
    // -(1024 + zero), so one add removes the magic bias and the zero point.
    uint median_num_l = ((0xe400 | zero_l) << 16) | (0xe400 | zero_l);
    uint median_num_r = ((0xe400 | zero_r) << 16) | (0xe400 | zero_r);

    #pragma unroll
    // decode 2 elems at one time.
    for (int i = 0; i < (N / 2); i++)
    {

        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
    }
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(median_num_l));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(0));
    }
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(median_num_r));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(0));
    }
}

template <typename storage_dtype, typename target_dtype, typename scale_dtype>
__device__ void decode_i4u_to_f16_scale_zeros_quantized_offset(storage_dtype *_i4u, target_dtype *B_local_decode, scale_dtype *scale = nullptr, storage_dtype *qzeros = nullptr, const int scale_offset = 0, const int zero_offset = 0, const int group_offset = 0, const int N = 8)
{
    decode_i4b_to_f16_scale_zeros_quantized_offset<storage_dtype, target_dtype, scale_dtype, false>(_i4u, B_local_decode, N, scale, qzeros, scale_offset, zero_offset, group_offset);
}
"""
422
+
423
# CUDA device source: decode packed 2-bit integers to fp16.
# Entry points: decode_i2s_to_f16 (signed, subtracts 0x6402 per lane) and
# decode_i2u_to_f16 (unsigned, subtracts 0x6400 per lane); both forward to
# decode_i2b_to_f16. Only 16 bits of the input are read per call.
decode_i2_to_f16 = """
template <typename T1, typename T2, bool isSigned = false>
__device__ void decode_i2b_to_f16(T1 *_i2s, T2 *B_local_decode, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // to decode the remaining elements the caller must advance the _i2s pointer
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }
}

template <typename T1, typename T2>
__device__ void decode_i2s_to_f16(T1 *_i2s, T2 *B_local_decode, const int N = 8)
{
    decode_i2b_to_f16<T1, T2, true>(_i2s, B_local_decode, N);
}

template <typename T1, typename T2>
__device__ void decode_i2u_to_f16(T1 *_i2u, T2 *B_local_decode, const int N = 8)
{
    decode_i2b_to_f16<T1, T2, false>(_i2u, B_local_decode, N);
}
"""
463
+
464
# CUDA device source: decode packed 2-bit integers to fp16 and multiply every
# decoded value by the scale read from `*scale`.
# Entry points: decode_i2s_to_f16_scale / decode_i2u_to_f16_scale; both
# forward to decode_i2b_to_f16_scale.
decode_i2_to_f16_scale = """
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale(T1 *_i2s, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // to decode the remaining elements the caller must advance the _i2s pointer
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*scale, *scale)), "r"(0));
    }
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i2s_to_f16_scale(T1 *_i2s, T2 *B_local_decode, T3 *scale, const int N = 8)
{
    decode_i2b_to_f16_scale<T1, T2, T3, true>(_i2s, B_local_decode, scale, N);
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i2u_to_f16_scale(T1 *_i2u, T2 *B_local_decode, T3 *scale, const int N = 8)
{
    decode_i2b_to_f16_scale<T1, T2, T3, false>(_i2u, B_local_decode, scale, N);
}
"""
505
+
506
# CUDA device source: decode packed 2-bit integers to fp16 in "original"
# zero-point order, (value - zeros) * scale, with two (scale, zeros) pairs.
# The first N/4 output registers use `*scale` / `*zeros`; the rest use the
# entries `offset` elements further on.
# Entry point: decode_i2u_to_f16_scale_zeros_original_offset (unsigned only).
#
# The entry point forwards to decode_i2b_to_f16_scale_zeros_original_offset,
# the implementation defined in this same string. It previously named
# decode_i2b_to_f16_scale_zeros_original, which is not defined here and takes
# one argument fewer.
decode_i2_to_f16_scale_zeros_original_offset = """
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale_zeros_original_offset(T1 *_i2s, T2 *B_local_decode, T3 *scale = nullptr, T3 *zeros = nullptr, const int offset = 0, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // to decode the remaining elements the caller must advance the _i2s pointer
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    T3 const zeros_l = *zeros;
    T3 const zeros_r = *(zeros + offset);
    uint const packed_zeros_l = __pack_half2(zeros_l, zeros_l);
    uint const packed_zeros_r = __pack_half2(zeros_r, zeros_r);

    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros_l));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(0));
    }
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros_r));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(0));
    }
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i2u_to_f16_scale_zeros_original_offset(T1 *_i2u, T2 *B_local_decode, T3 *scale, T3 *zeros, const int offset = 0, const int N = 8)
{
    decode_i2b_to_f16_scale_zeros_original_offset<T1, T2, T3, false>(_i2u, B_local_decode, scale, zeros, offset, N);
}
"""
562
+
563
# CUDA device source: decode packed 2-bit integers to fp16 in "original"
# zero-point order: (value - zeros) * scale, using one scale and one zero.
# Entry point: decode_i2u_to_f16_scale_zeros_original (unsigned only).
decode_i2_to_f16_scale_zeros_original = """
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale_zeros_original(T1 *_i2s, T2 *B_local_decode, T3 *scale = nullptr, T3 *zeros = nullptr, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // to decode the remaining elements the caller must advance the _i2s pointer
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*zeros, *zeros)));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*scale, *scale)), "r"(0));
    }
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i2u_to_f16_scale_zeros_original(T1 *_i2u, T2 *B_local_decode, T3 *scale, T3 *zeros, const int N = 8)
{
    decode_i2b_to_f16_scale_zeros_original<T1, T2, T3, false>(_i2u, B_local_decode, scale, zeros, N);
}
"""
599
+
600
# CUDA device source: decode packed 2-bit integers to fp16 in "rescale"
# zero-point order: value * scale - zeros (scale first, then subtract zeros).
# Entry point: decode_i2u_to_f16_scale_zeros_rescale (unsigned only).
decode_i2_to_f16_scale_zeros_rescale = """
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale_zeros_rescale(T1 *_i2s, T2 *B_local_decode, T3 *scale = nullptr, T3 *zeros = nullptr, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // to decode the remaining elements the caller must advance the _i2s pointer
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*scale, *scale)), "r"(0));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*zeros, *zeros)));
    }
}

template <typename T1, typename T2, typename T3>
__device__ void decode_i2u_to_f16_scale_zeros_rescale(T1 *_i2u, T2 *B_local_decode, T3 *scale, T3 *zeros, const int N = 8)
{
    decode_i2b_to_f16_scale_zeros_rescale<T1, T2, T3, false>(_i2u, B_local_decode, scale, zeros, N);
}
"""
636
+
637
# CUDA device source: decode packed 2-bit integers to fp16 with an integer
# ("quantized") zero point: (value - zero) * scale. The zero point is read as
# an int16 from `zeros` and folded into the bias constant that is added to
# each decoded pair.
# Entry point: decode_i2u_to_f16_scale_zeros_quantized (unsigned only).
decode_i2_to_f16_scale_zeros_quantized = """
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale_zeros_quantized(T1 *_i2s, T2 *B_local_decode, const int N = 8, T3 *scale = nullptr, T4 *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);
    int16_t const zero_r = *((int16_t*)zeros);
    // 0xe400 is -1024 in fp16; OR-ing a small non-negative zero point into the
    // mantissa gives -(1024 + zero), so one add removes the magic bias and the zero point.
    uint median_num = ((0xe400 | zero_r) << 16) | (0xe400 | zero_r);

    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // to decode the remaining elements the caller must advance the _i2s pointer
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(median_num));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
    }
}
template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i2u_to_f16_scale_zeros_quantized(T1 *_i2u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
{
    decode_i2b_to_f16_scale_zeros_quantized<T1, T2, T3, T4, false>(_i2u, B_local_decode, N, scale, zeros);
}
"""
677
+
678
# CUDA source text (kept as a Python string) for the 1-bit -> fp16 LOP3 decoders.
# It defines the device template decode_i1b_to_f16 and two thin wrappers,
# decode_i1s_to_f16 (isSigned = true) and decode_i1u_to_f16 (isSigned = false).
# get_lop3_intrin_group registers this string under the key "i1_to_f16".
# The literal is not a raw string, so every "\\n" below is stored as a backslash
# followed by 'n', i.e. a newline escape inside the embedded asm string literal.
# NOTE(review): both arms of the MEDIAN_NUM ternary are 0x64006400; the signed
# case is handled afterwards by the "2x - 1" add/add sequence.
# NOTE(review): scale and zeros default to nullptr and are only dereferenced in the
# withScaling / withZeros branches; the two wrappers never enable those branches.
+ decode_i1_to_f16 = """
679
+ /*
680
+ Kind 0: original
681
+ Kind 1: rescale
682
+ Kind 2: quantized
683
+ # documents for zeros_mode:
684
+ # original: target = (dequantize_weight - zero_point) * scale
685
+ # rescale: target = dequantize_weight * scale - zero_point
686
+ # quantized: target = (dequantize_weight - dequantize_zeros) * scale
687
+ # Notice: only support "original" and "rescale" now
688
+ zeros_mode: Literal["original", "rescale", "quantized"] = "original"
689
+ */
690
+ template <typename T1, typename T2, bool isSigned = false, bool withScaling = false, bool withZeros = false, int ZerosKind = 1>
691
+ __device__ void decode_i1b_to_f16(T1 *_i1s, T2 *B_local_decode, const int N = 8, half *scale = nullptr, half *zeros = nullptr)
692
+ {
693
+ uint *h = reinterpret_cast<uint *>(B_local_decode);
694
+
695
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
696
+ static constexpr uint BOTTOM_MASK = 0x00010001;
697
+ static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
698
+ static constexpr uint MEDIAN_NUM = isSigned ? 0x64006400 : 0x64006400;
699
+ static constexpr uint TRANSFORM_SUBTRACT = 0xbc00bc00; // for signed int 2x - 1
700
+ // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
701
+ // only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
702
+ int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
703
+ int i1s = (i1s_i16 & 0x0f);
704
+ i1s |= ((i1s_i16 & 0xf0) << 12);
705
+ #pragma unroll
706
+ // decode 2 elems at one time.
707
+ for (int i = 0; i < (N / 2); i++)
708
+ {
709
+
710
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
711
+ : "=r"(h[i])
712
+ : "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
713
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
714
+ if constexpr (isSigned)
715
+ {
716
+ asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(h[i]));
717
+ asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(TRANSFORM_SUBTRACT));
718
+ }
719
+ if constexpr (withZeros && ZerosKind == 0)
720
+ {
721
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*zeros, *zeros)));
722
+ }
723
+ if constexpr (withScaling)
724
+ {
725
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*scale, *scale)), "r"(0));
726
+ }
727
+ if constexpr (withZeros && ZerosKind == 1)
728
+ {
729
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*zeros, *zeros)));
730
+ }
731
+ }
732
+ }
733
+
734
+ template <typename T1, typename T2>
735
+ __device__ void decode_i1s_to_f16(T1 *_i1s, T2 *B_local_decode, const int N = 8)
736
+ {
737
+ decode_i1b_to_f16<T1, T2, true>(_i1s, B_local_decode, N);
738
+ }
739
+
740
+ template <typename T1, typename T2>
741
+ __device__ void decode_i1u_to_f16(T1 *_i1u, T2 *B_local_decode, const int N = 8)
742
+ {
743
+ decode_i1b_to_f16<T1, T2, false>(_i1u, B_local_decode, N);
744
+ }
745
+ """
746
+
747
# CUDA source text for the 1-bit -> fp16 LOP3 decoders that also apply a scale.
# It defines two standalone device templates: decode_i1u_to_f16_scale (unsigned)
# and decode_i1s_to_f16_scale (signed, with the extra "2x - 1" add/add sequence).
# get_lop3_intrin_group registers this string under the key "i1_to_f16_scale".
# Both functions read *scale once, pack it into both halves with __pack_half2 and
# multiply each decoded pair through an fma whose addend is 0.
# NOTE(review): scale defaults to nullptr but is dereferenced unconditionally, so
# callers must always pass a valid pointer.
+ decode_i1_to_f16_scale = """
748
+ template <typename T1, typename T2, typename T3>
749
+ __device__ void decode_i1u_to_f16_scale(T1 *_i1s, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
750
+ {
751
+ uint *h = reinterpret_cast<uint *>(B_local_decode);
752
+
753
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
754
+ static constexpr uint BOTTOM_MASK = 0x00010001;
755
+ static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
756
+ static constexpr uint MEDIAN_NUM = 0x64006400;
757
+ // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
758
+ // only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
759
+ int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
760
+ int i1s = (i1s_i16 & 0x0f);
761
+ i1s |= ((i1s_i16 & 0xf0) << 12);
762
+ T3 const scale_r = *scale;
763
+ uint const packed_scales = __pack_half2(scale_r, scale_r);
764
+ #pragma unroll
765
+ // decode 2 elems at one time.
766
+ for (int i = 0; i < (N / 2); i++)
767
+ {
768
+
769
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
770
+ : "=r"(h[i])
771
+ : "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
772
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
773
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
774
+ }
775
+ }
776
+
777
+ template <typename T1, typename T2, typename T3>
778
+ __device__ void decode_i1s_to_f16_scale(T1 *_i1s, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
779
+ {
780
+ uint *h = reinterpret_cast<uint *>(B_local_decode);
781
+
782
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
783
+ static constexpr uint BOTTOM_MASK = 0x00010001;
784
+ static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
785
+ static constexpr uint MEDIAN_NUM = 0x64006400;
786
+ static constexpr uint TRANSFORM_SUBTRACT = 0xbc00bc00; // for signed int 2x - 1
787
+ // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
788
+ // only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
789
+
790
+ int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
791
+ int i1s = (i1s_i16 & 0x0f);
792
+ i1s |= ((i1s_i16 & 0xf0) << 12);
793
+ T3 const scale_r = *scale;
794
+ uint const packed_scales = __pack_half2(scale_r, scale_r);
795
+ #pragma unroll
796
+ // decode 2 elems at one time.
797
+ for (int i = 0; i < (N / 2); i++)
798
+ {
799
+
800
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
801
+ : "=r"(h[i])
802
+ : "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
803
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
804
+ asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(h[i]));
805
+ asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(TRANSFORM_SUBTRACT));
806
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
807
+ }
808
+ }
809
+ """
810
+
811
# CUDA source text for the 1-bit -> fp16 LOP3 decoder with scale and zero point in
# "original" mode: each decoded pair has the packed zeros subtracted and is then
# multiplied by the packed scales, i.e. (weight - zero) * scale.
# It defines decode_i1b_to_f16_zeros_original and the unsigned wrapper
# decode_i1u_to_f16_scale_zeros_original, which forwards with isSigned = false.
# get_lop3_intrin_group registers this string under "i1_to_f16_scale_zeros_original".
# NOTE(review): the isSigned template parameter is not referenced in the body, and
# scale / zeros default to nullptr but are dereferenced unconditionally.
+ decode_i1_to_f16_scale_zeros_original = """
812
+ template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
813
+ __device__ void decode_i1b_to_f16_zeros_original(T1 *_i1s, T2 *B_local_decode, const int N = 8, T3 *scale = nullptr, T4 *zeros = nullptr)
814
+ {
815
+ uint *h = reinterpret_cast<uint *>(B_local_decode);
816
+
817
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
818
+ static constexpr uint BOTTOM_MASK = 0x00010001;
819
+ static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
820
+ static constexpr uint MEDIAN_NUM = 0x64006400;
821
+ // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
822
+ // only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
823
+ int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
824
+ int i1s = (i1s_i16 & 0x0f);
825
+ i1s |= ((i1s_i16 & 0xf0) << 12);
826
+ T3 const scale_r = *scale;
827
+ uint const packed_scales = __pack_half2(scale_r, scale_r);
828
+ // input zeros maybe int32(qzeros) or half format
829
+ T4 const zero_r = *zeros;
830
+ uint const packed_zeros = __pack_half2(zero_r, zero_r);
831
+
832
+ #pragma unroll
833
+ // decode 2 elems at one time.
834
+ for (int i = 0; i < (N / 2); i++)
835
+ {
836
+
837
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
838
+ : "=r"(h[i])
839
+ : "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
840
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
841
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros));
842
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
843
+ }
844
+ }
845
+ template <typename T1, typename T2, typename T3, typename T4>
846
+ __device__ void decode_i1u_to_f16_scale_zeros_original(T1 *_i1u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
847
+ {
848
+ decode_i1b_to_f16_zeros_original<T1, T2, T3, T4, false>(_i1u, B_local_decode, N, scale, zeros);
849
+ }
850
+ """
851
+
852
# CUDA source text for the 1-bit -> fp16 LOP3 decoder with scale and zero point in
# "rescale" mode: a single fma computes weight * scale + packed_zeros per pair.
# packed_zeros is the packed zero value OR-ed with 0x80008000, which forces bit 15
# of each 16-bit half on before it is used as the fma addend.
# NOTE(review): presumably bit 15 is the fp16 sign bit, making the addend
# -|zero|; confirm this is the intended "weight * scale - zero" for negative zeros.
# It defines decode_i1b_to_f16_scale_zeros_rescale and the unsigned wrapper
# decode_i1u_to_f16_scale_zeros_rescale, which forwards with isSigned = false.
# get_lop3_intrin_group registers this string under "i1_to_f16_scale_zeros_rescale".
# NOTE(review): the isSigned template parameter is not referenced in the body, and
# scale / zeros default to nullptr but are dereferenced unconditionally.
+ decode_i1_to_f16_scale_zeros_rescale = """
853
+ template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
854
+ __device__ void decode_i1b_to_f16_scale_zeros_rescale(T1 *_i1s, T2 *B_local_decode, const int N = 8, T3 *scale = nullptr, T4 *zeros = nullptr)
855
+ {
856
+ uint *h = reinterpret_cast<uint *>(B_local_decode);
857
+
858
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
859
+ static constexpr uint BOTTOM_MASK = 0x00010001;
860
+ static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
861
+ static constexpr uint MEDIAN_NUM = 0x64006400;
862
+ // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
863
+ // only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
864
+ int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
865
+ int i1s = (i1s_i16 & 0x0f);
866
+ i1s |= ((i1s_i16 & 0xf0) << 12);
867
+ T3 const scale_r = *scale;
868
+ uint const packed_scales = __pack_half2(scale_r, scale_r);
869
+ T4 const zero_r = *zeros;
870
+ uint const packed_zeros = 0x80008000 | __pack_half2(zero_r, zero_r);
871
+
872
+ #pragma unroll
873
+ // decode 2 elems at one time.
874
+ for (int i = 0; i < (N / 2); i++)
875
+ {
876
+
877
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
878
+ : "=r"(h[i])
879
+ : "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
880
+ asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
881
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(packed_zeros));
882
+ }
883
+ }
884
+
885
+ template <typename T1, typename T2, typename T3, typename T4>
886
+ __device__ void decode_i1u_to_f16_scale_zeros_rescale(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
887
+ {
888
+ decode_i1b_to_f16_scale_zeros_rescale<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros);
889
+ }
890
+ """
891
+
892
# CUDA source text for the 1-bit -> int8 LOP3 decoders. Despite the constant's name
# it defines both decode_i1s_to_i8s (signed) and decode_i1u_to_i8s (unsigned).
# get_lop3_intrin_group registers this string under the key "i1_to_i8".
# The signed variant works on a local int[4] that is loaded from and stored back to
# _i8s through int4 accesses, and maps each extracted bit with two __vadd4 calls
# (x + x, then + 0xffffffff) as described by its "2x - 1" comment.
# The unsigned variant writes the lop3 results straight into _i8s.
# NOTE(review): MEDIAN_NUM in decode_i1u_to_i8s is declared but never used, and the
# in-string comments mentioning an "intermediate fp16 number" do not match the
# integer-only operations in these two functions.
+ decode_i1s_to_i8s = """template <typename T1, typename T2>
893
+ __device__ void decode_i1s_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16)
894
+ {
895
+ int i8s[4];
896
+ // vector load
897
+ *reinterpret_cast<int4 *>(i8s) = *reinterpret_cast<int4 *>(_i8s);
898
+ int16_t i1b_i16 = *reinterpret_cast<int16_t *>(_i1b);
899
+ // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15}
900
+ // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x}
901
+ int i1b = (i1b_i16 & 0x0f0f);
902
+ i1b |= ((i1b_i16 & 0xf0f0) << 12);
903
+ // i1b {0..,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0}
904
+ // interleave {0..,e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
905
+ // First, we extract the i1b and construct an intermediate fp16 number.
906
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
907
+ static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1
908
+ static constexpr uint I8s_MAGIC_NUM = 0x00000000;
909
+ static constexpr uint TRANSFORM_SUBTRACT = 0xffffffff; // for signed int 2x - 1
910
+
911
+ for (int i = 0; i < N / 4; i++)
912
+ {
913
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
914
+ : "=r"(i8s[i])
915
+ : "r"(i1b >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
916
+ i8s[i] = __vadd4(i8s[i], i8s[i]);
917
+ i8s[i] = __vadd4(i8s[i], TRANSFORM_SUBTRACT);
918
+ }
919
+ *reinterpret_cast<int4 *>(_i8s) = *reinterpret_cast<int4 *>(i8s);
920
+ }
921
+
922
+ template <typename T1, typename T2>
923
+ __device__ void decode_i1u_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16)
924
+ {
925
+ int *i8s = reinterpret_cast<int *>(_i8s);
926
+ int16_t i1b_i16 = *reinterpret_cast<int16_t *>(_i1b);
927
+ // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15}
928
+ // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x}
929
+ int i1b = (i1b_i16 & 0x0f0f);
930
+ i1b |= ((i1b_i16 & 0xf0f0) << 12);
931
+ // i1b {0..,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0}
932
+ // interleave {0..,e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
933
+ // First, we extract the i1b and construct an intermediate fp16 number.
934
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
935
+ static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1
936
+ static constexpr uint I8s_MAGIC_NUM = 0x00000000;
937
+ static constexpr uint MEDIAN_NUM = 0x00000000;
938
+
939
+ for (int i = 0; i < N / 4; i++)
940
+ {
941
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
942
+ : "=r"(i8s[i])
943
+ : "r"(i1b >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
944
+ }
945
+ }
946
+
947
+ """
948
+
949
# CUDA source text for the 2-bit -> int8 LOP3 decoders. Despite the constant's name
# it defines both decode_i2s_to_i8s (signed) and decode_i2u_to_i8s (unsigned).
# get_lop3_intrin_group registers this string under the key "i2_to_i8".
# Both functions read one 32-bit word from _i2b and, for each of N / 4 output words,
# mask (i2b >> 2 * i) with 0x03030303 through lop3; the signed variant additionally
# subtracts MEDIAN_NUM (0x02020202) per byte with __vsub4.
# NOTE(review): the in-string comments that mention "i4s" and an "intermediate fp16
# number" do not match the 2-bit, integer-only operations performed here.
+ decode_i2s_to_i8s = """template <typename T1, typename T2>
950
+ __device__ void decode_i2s_to_i8s(T1 *_i2b, T2 *_i8s, const int N = 16)
951
+ {
952
+ // convert 8 int2b_t to 8 int8b_t -> 2 int32
953
+ uint *i8s = reinterpret_cast<uint *>(_i8s);
954
+
955
+ // i2b = {e7,e6,e5,e4,e3,e2,e1,e0}
956
+ // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0}
957
+ uint const i2b = *reinterpret_cast<uint *>(_i2b);
958
+
959
+ // First, we extract the i4s and construct an intermediate fp16 number.
960
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
961
+ static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3
962
+ static constexpr uint I8s_MAGIC_NUM = 0x00000000; // 1024
963
+ static constexpr uint MEDIAN_NUM = 0x02020202;
964
+ #pragma unroll
965
+ for (int i = 0; i < (N / 4); i++)
966
+ {
967
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
968
+ : "=r"(i8s[i])
969
+ : "r"(i2b >> (2 * i)), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
970
+ i8s[i] = __vsub4(i8s[i], MEDIAN_NUM);
971
+ }
972
+ }
973
+ template <typename T1, typename T2>
974
+ __device__ void decode_i2u_to_i8s(T1 *_i2b, T2 *_i8s, const int N = 16)
975
+ {
976
+ // convert 8 int2b_t to 8 int8b_t -> 2 int32
977
+ uint *i8s = reinterpret_cast<uint *>(_i8s);
978
+
979
+ // i2b = {e7,e6,e5,e4,e3,e2,e1,e0}
980
+ // also require interleave {e7,e3,e6,e2,e5,e1,e4,e0}
981
+ uint const i2b = *reinterpret_cast<uint *>(_i2b);
982
+
983
+ // First, we extract the i4s and construct an intermediate fp16 number.
984
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
985
+ static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3
986
+ static constexpr uint I8s_MAGIC_NUM = 0x00000000; // 1024
987
+
988
+ #pragma unroll
989
+ for (int i = 0; i < (N / 4); i++)
990
+ {
991
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
992
+ : "=r"(i8s[i])
993
+ : "r"(i2b >> (2 * i)), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
994
+ }
995
+ }
996
+ """
997
+
998
# CUDA source text for the 4-bit -> int8 LOP3 decoders. Despite the constant's name
# it defines both decode_i4s_to_i8s (signed) and decode_i4u_to_i8s (unsigned).
# get_lop3_intrin_group registers this string under the key "i4_to_i8".
# Each loop iteration i masks (i4b[0] >> 4 * i) into i8s[i] and (i4b[1] >> 4 * i)
# into i8s[i + 2] with 0x0f0f0f0f; the signed variant then subtracts MEDIAN_NUM
# (0x07070707) per byte with the saturating __vsubss4.
# NOTE(review): the "Extract elt_01 - (i4s & 0x000f000f) | 0x64006400" comments inside
# the string refer to fp16 constants that are not used by these functions.
+ decode_i4s_to_i8s = """template <typename T1, typename T2>
999
+ __device__ void decode_i4s_to_i8s(T1 *_i4b, T2 *_i8s, const int N = 16)
1000
+ {
1001
+ uint *i8s = reinterpret_cast<uint *>(_i8s);
1002
+ uint *i4b = reinterpret_cast<uint *>(_i4b);
1003
+ // First, we extract the i4s and construct an intermediate i8 number.
1004
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
1005
+ static constexpr uint BOTTOM_MASK = 0x0f0f0f0f; // 0xf -> 0b1111 select 0,4,8,12
1006
+ static constexpr uint I4b_TO_I8s_MAGIC_NUM = 0x00000000; // 0
1007
+ static constexpr uint MEDIAN_NUM = 0x07070707;
1008
+ #pragma unroll
1009
+ for (int i = 0; i < (N / 8); i++)
1010
+ {
1011
+ // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
1012
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
1013
+ : "=r"(i8s[i])
1014
+ : "r"(i4b[0] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
1015
+
1016
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
1017
+ : "=r"(i8s[i + 2])
1018
+ : "r"(i4b[1] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
1019
+ i8s[i] = __vsubss4(i8s[i], MEDIAN_NUM);
1020
+ i8s[i + 2] = __vsubss4(i8s[i + 2], MEDIAN_NUM);
1021
+ }
1022
+ }
1023
+
1024
+ template <typename T1, typename T2>
1025
+ __device__ void decode_i4u_to_i8s(T1 *_i4b, T2 *_i8s, const int N = 16)
1026
+ {
1027
+ uint *i8s = reinterpret_cast<uint *>(_i8s);
1028
+ uint *i4b = reinterpret_cast<uint *>(_i4b);
1029
+ // First, we extract the i4s and construct an intermediate i8 number.
1030
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
1031
+ static constexpr uint BOTTOM_MASK = 0x0f0f0f0f; // 0xf -> 0b1111 select 0,4,8,12
1032
+ static constexpr uint I4b_TO_I8s_MAGIC_NUM = 0x00000000; // 0
1033
+ #pragma unroll
1034
+ for (int i = 0; i < (N / 8); i++)
1035
+ {
1036
+ // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
1037
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
1038
+ : "=r"(i8s[i])
1039
+ : "r"(i4b[0] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
1040
+
1041
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
1042
+ : "=r"(i8s[i + 2])
1043
+ : "r"(i4b[1] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
1044
+ }
1045
+ }
1046
+ """
1047
+
1048
# CUDA source text for the 2-bit -> int4 LOP3 decoders. It defines the template
# decode_i2b_to_i4s plus the wrappers decode_i2s_to_i4s (isSigned = true) and
# decode_i2u_to_i4s (isSigned = false).
# get_lop3_intrin_group registers this string under the key "i2_to_i4".
# Unlike the other source constants here this one is a raw string (r"""), so the
# asm text is written with a single-backslash "\n".
# NOTE(review): the `if constexpr (isSigned)` branch contains only comments (a TODO)
# and MEDIAN_NUM is never used, so the signed and unsigned wrappers currently
# perform the same operations.
+ decode_i2s_to_i4s = r"""
1049
+ template <typename T1, typename T2, bool isSigned>
1050
+ __device__ void decode_i2b_to_i4s(T1 *_i2b, T2 *_i4s, const int N = 16)
1051
+ {
1052
+ uint *i4s = reinterpret_cast<uint *>(_i4s);
1053
+ uint *i2b = reinterpret_cast<uint *>(_i2b);
1054
+ // First, we extract the i4s and construct an intermediate i8 number.
1055
+ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
1056
+ static constexpr uint BOTTOM_MASK = 0x33333333; // 0xf -> 0b1111 select 0,2,4,6,8,10,12
1057
+ static constexpr uint I4b_TO_I8s_MAGIC_NUM = 0x00000000; // 0
1058
+ static constexpr uint MEDIAN_NUM = isSigned ? 0x33333333 : 0x00000000;
1059
+
1060
+ #pragma unroll
1061
+ for (int i = 0; i < (N / 8); i++)
1062
+ {
1063
+ // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
1064
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
1065
+ : "=r"(i4s[i])
1066
+ : "r"(i2b[i / 2] >> (2 * (i % 2))), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
1067
+ if constexpr (isSigned)
1068
+ {
1069
+ // TODO(lei): uint4 sub should be enhanced.
1070
+ // 0x03 0x03 0x03 0x03
1071
+ // i4s[i] = (((i4s[i] << 1) | i4s[i]) << 1) | i4s[i];
1072
+ }
1073
+ }
1074
+ }
1075
+
1076
+ template <typename T1, typename T2>
1077
+ __device__ void decode_i2s_to_i4s(T1 *_i4s, T2 *B_local_decode, const int N = 16)
1078
+ {
1079
+ decode_i2b_to_i4s<T1, T2, true>(_i4s, B_local_decode, N);
1080
+ }
1081
+
1082
+ template <typename T1, typename T2>
1083
+ __device__ void decode_i2u_to_i4s(T1 *_i4u, T2 *B_local_decode, const int N = 16)
1084
+ {
1085
+ decode_i2b_to_i4s<T1, T2, false>(_i4u, B_local_decode, N);
1086
+ }
1087
+ """
1088
+
1089
+
1090
def get_lop3_intrin_group(
    out_dtype: Literal["float16", "int8", "int4"],
    source_format: Literal["int", "uint"] = "uint",
    source_bit: int = 4,
    storage_dtype: Literal["int32", "int8"] = "int8",
    with_scaling: bool = False,
    with_zeros: bool = False,
    zeros_mode: Literal["original", "rescale", "quantized"] = "original",
    storage_scope: str = "local",
) -> Dict[str, str]:
    """
    Look up the LOP3 fast-decoding helper for a quantization configuration.

    The configuration is turned into a lookup key of the form
    ``i{source_bit}_to_{f16|i8|i4}[_scale][_zeros_{zeros_mode}][_offset]`` that selects one of
    the CUDA source strings defined in this module, and into the name of the device function
    (``decode_i{source_bit}{u|s}_to_{f16|i8s|i4s}`` plus the same suffix) the caller should
    invoke from that source.

    Parameters
    ----------
    out_dtype : Literal["float16", "int8", "int4"]
        The data type the packed values are decoded to.

    source_format : Literal["int", "uint"], optional
        Whether the packed source values are signed ("int") or unsigned ("uint").
        By default, it is "uint". "int" cannot be combined with ``with_zeros``.

    source_bit : int, optional
        The bit width of one packed source value. By default, it is 4.

    storage_dtype : Literal["int32", "int8"], optional
        Accepted for interface compatibility; this function does not read it.

    with_scaling : bool, optional
        Whether the decoder applies a scale; appends "_scale" to the key and the function
        name. By default, it is False.

    with_zeros : bool, optional
        Whether the decoder applies zero points; appends "_zeros_{zeros_mode}" to the key
        and the function name. By default, it is False.

    zeros_mode : Literal["original", "rescale", "quantized"], optional
        The zero-point mode; only used when ``with_zeros`` is True. By default, it is
        "original".

    storage_scope : str, optional
        When it is "warp" and ``with_scaling`` is True, the "_offset" variants are
        selected. By default, it is "local".

    Returns
    -------
    Dict[str, str]
        ``{"func_name": <device function name>, "c_source": <CUDA source string>}``.

    Raises
    ------
    AssertionError
        If ``out_dtype`` is not one of "float16", "int8", "int4".
    ValueError
        If ``source_format`` is invalid, or zeros are requested for signed sources.
    KeyError
        If no source string is registered for the requested combination.
    """
    assert out_dtype in [
        "float16", "int8", "int4"
    ], (f"Invalid out_dtype: {out_dtype}. Expected 'float16' or 'int8' or 'int4' .")

    dtype_mapping = {"float16": "f16", "int4": "i4", "int8": "i8", "int32": "i32"}
    target_dtype = dtype_mapping[out_dtype]

    if source_format not in ["int", "uint"]:
        raise ValueError(
            f"Invalid source_format. Expected 'int' or 'uint', but got {source_format}.")
    if with_zeros and source_format == "int":
        raise ValueError(f"Zeros are not supported for signed integers, but got {source_format}")

    import_c_map = {
        "i4_to_f16": decode_i4_to_f16,
        "i2_to_f16": decode_i2_to_f16,
        "i1_to_f16": decode_i1_to_f16,
        "i4_to_f16_scale": decode_i4_to_f16_scale,
        "i4_to_f16_scale_offset": decode_i4_to_f16_scale_offset,
        "i2_to_f16_scale": decode_i2_to_f16_scale,
        "i1_to_f16_scale": decode_i1_to_f16_scale,
        "i4_to_f16_scale_zeros_original": decode_i4_to_f16_scale_zeros_original,
        "i4_to_f16_scale_zeros_original_offset": decode_i4_to_f16_scale_zeros_original_offset,
        "i2_to_f16_scale_zeros_original": decode_i2_to_f16_scale_zeros_original,
        "i1_to_f16_scale_zeros_original": decode_i1_to_f16_scale_zeros_original,
        "i4_to_f16_scale_zeros_rescale": decode_i4_to_f16_scale_zeros_rescale,
        "i4_to_f16_scale_zeros_rescale_offset": decode_i4_to_f16_scale_zeros_rescale_offset,
        "i2_to_f16_scale_zeros_rescale": decode_i2_to_f16_scale_zeros_rescale,
        "i1_to_f16_scale_zeros_rescale": decode_i1_to_f16_scale_zeros_rescale,
        "i4_to_f16_scale_zeros_quantized": decode_i4_to_f16_scale_zeros_quantized,
        "i2_to_f16_scale_zeros_quantized": decode_i2_to_f16_scale_zeros_quantized,
        "i4_to_f16_scale_zeros_quantized_offset": decode_i4_to_f16_scale_zeros_quantized_offset,
        "i1_to_i8": decode_i1s_to_i8s,
        "i2_to_i8": decode_i2s_to_i8s,
        "i4_to_i8": decode_i4s_to_i8s,
        "i2_to_i4": decode_i2s_to_i4s,
    }

    # The lookup key and the device function name share the same option suffix, so it is
    # built once instead of through two parallel chains of appends.
    is_ladder_stage3 = (storage_scope == "warp") and with_scaling
    option_suffix = ""
    if with_scaling:
        option_suffix += "_scale"
    if with_zeros:
        option_suffix += f"_zeros_{zeros_mode}"
    if is_ladder_stage3:
        option_suffix += "_offset"

    # Device function names use "i8s"/"i4s" where the lookup key uses "i8"/"i4".
    func_dtype_names = {"float16": "f16", "int8": "i8s", "int4": "i4s"}
    if out_dtype not in func_dtype_names:
        # Only reachable when the assertion above is stripped (python -O).
        raise ValueError("Unsupported target dtype: {}".format(target_dtype))

    key = f"i{source_bit}_to_{target_dtype}{option_suffix}"
    if key not in import_c_map:
        # Same exception type as the plain dict lookup, but with enough context to see
        # which combination of options has no registered decoder.
        raise KeyError(
            f"No LOP3 decode source is registered for '{key}' "
            f"(out_dtype={out_dtype!r}, source_bit={source_bit}, "
            f"with_scaling={with_scaling}, with_zeros={with_zeros}, "
            f"zeros_mode={zeros_mode!r}, storage_scope={storage_scope!r}).")

    source_symbol = "u" if source_format == "uint" else "s"
    func_name = f"decode_i{source_bit}{source_symbol}_to_{func_dtype_names[out_dtype]}"
    func_name += option_suffix

    return {
        "func_name": func_name,
        "c_source": import_c_map[key],
    }