tilelang-rocm 0.1.4.post4__cp310-cp310-manylinux1_x86_64.whl → 0.1.4.post9__cp310-cp310-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tilelang/3rdparty/tvm/python/tvm/_ffi/runtime_ctypes.py +7 -1
- tilelang/3rdparty/tvm/python/tvm/contrib/tvmjs.py +7 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/ndarray.py +8 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/ir.py +7 -0
- tilelang/3rdparty/tvm/src/target/llvm/codegen_llvm.cc +2 -1
- tilelang/3rdparty/tvm/src/target/source/codegen_cuda.cc +4 -1
- tilelang/3rdparty/tvm/src/target/source/ptx.cc +3 -0
- tilelang/3rdparty/tvm/src/tir/ir/index_map.cc +2 -1
- tilelang/3rdparty/tvm/src/tir/ir/stmt.cc +0 -6
- tilelang/3rdparty/tvm/src/tir/op/op.cc +6 -0
- tilelang/3rdparty/tvm/src/tir/transforms/dtype_conversion.h +3 -0
- tilelang/README.md +3 -1
- tilelang/VERSION +1 -1
- tilelang/__init__.py +2 -1
- tilelang/autotuner/__init__.py +334 -277
- tilelang/autotuner/param.py +329 -0
- tilelang/cache/kernel_cache.py +6 -11
- tilelang/cache/tuner_cache.py +356 -0
- tilelang/carver/arch/driver/__init__.py +2 -0
- tilelang/carver/arch/driver/cuda_driver.py +28 -0
- tilelang/contrib/dlpack.py +1 -1
- tilelang/contrib/nvcc.py +27 -3
- tilelang/engine/phase.py +44 -11
- tilelang/intrinsics/mfma_macro_generator.py +12 -3
- tilelang/intrinsics/mma_layout.py +33 -51
- tilelang/jit/__init__.py +215 -93
- tilelang/jit/adapter/cython/cython_wrapper.pyx +13 -10
- tilelang/jit/adapter/libgen.py +3 -1
- tilelang/jit/adapter/wrapper.py +91 -6
- tilelang/jit/kernel.py +52 -1
- tilelang/jit/param.py +45 -0
- tilelang/language/__init__.py +83 -1
- tilelang/language/builtin.py +90 -1
- tilelang/language/copy.py +13 -11
- tilelang/language/customize.py +13 -0
- tilelang/language/print.py +27 -0
- tilelang/language/reduce.py +16 -5
- tilelang/language/tir/op.py +19 -0
- tilelang/language/warpgroup.py +7 -2
- tilelang/lib/libtilelang.so +0 -0
- tilelang/lib/libtilelang_module.so +0 -0
- tilelang/lib/libtvm.so +0 -0
- tilelang/lib/libtvm_runtime.so +0 -0
- tilelang/primitives/gemm/base.py +61 -24
- tilelang/profiler/__init__.py +41 -2
- tilelang/quantize/__init__.py +18 -0
- tilelang/quantize/lop3.py +1202 -0
- tilelang/quantize/quantization.py +234 -0
- tilelang/quantize/utils.py +126 -0
- tilelang/src/tl_templates/cuda/common.h +23 -0
- tilelang/src/tl_templates/cuda/cuda_fp8.h +42 -13
- tilelang/src/tl_templates/cuda/debug.h +41 -3
- tilelang/src/tl_templates/cuda/gemm_sm80.h +25 -13
- tilelang/src/tl_templates/cuda/gemm_sm89.h +25 -14
- tilelang/src/tl_templates/cuda/gemm_sm90.h +28 -24
- tilelang/src/tl_templates/hip/hip_fp8.h +18 -0
- tilelang/transform/__init__.py +17 -0
- tilelang/transform/pass_config.py +3 -0
- tilelang/utils/tensor.py +1 -0
- tilelang/version.py +21 -0
- {tilelang_rocm-0.1.4.post4.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/METADATA +4 -2
- {tilelang_rocm-0.1.4.post4.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/RECORD +65 -1003
- {tilelang_rocm-0.1.4.post4.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/WHEEL +1 -1
- tilelang/3rdparty/tvm/python/tvm/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/__pycache__/error.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/__pycache__/parser.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/__pycache__/support.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/_pyversion.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/base.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/libinfo.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/registry.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/__pycache__/runtime_ctypes.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/ndarray.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/object.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/packed_func.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/_ctypes/__pycache__/types.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/_ffi/_cy3/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/analyzer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/bound.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/int_set.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/int_solver.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/iter_affine_map.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/arith/__pycache__/pattern.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/compute_dag.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/dispatcher.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/feature.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/loop_state.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/measure.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/measure_record.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/relay_integration.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/search_policy.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/search_task.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/task_scheduler.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/__pycache__/workload_registry.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/cost_model/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/cost_model/__pycache__/cost_model.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/auto_scheduler/cost_model/__pycache__/xgb_model.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/env.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/feature.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/record.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/tophub.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/measure/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/measure/__pycache__/executor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/measure/__pycache__/measure.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/measure/__pycache__/measure_methods.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/code_hash.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/dispatcher.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/relay_integration.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/space.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/task.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/task/__pycache__/topi_integration.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/callback.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/droplet_tuner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/ga_tuner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/index_based_tuner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/metric.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/model_based_tuner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/sa_model_optimizer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/tuner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/xgboost_cost_model.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/autotvm/tuner/__pycache__/xgboost_tuner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/cblas.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/cc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/coreml_runtime.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/cublas.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/cudnn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/dnnl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/download.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/graph_executor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/hipcc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/miopen.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/mkl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/mrvl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/ndk.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/nnpack.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/nvcc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/pickle_memoize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/popen_pool.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/rocblas.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/rocm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/sdaccel.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/stackvm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/tar.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/thrust.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/__pycache__/xcode.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/target/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/contrib/target/__pycache__/coreml.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/analysis.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/common_schedules.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/schedule_rule.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/base/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/base.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/fallback.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/gemv.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/general_reduction.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/low_batch_gemv.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/matmul.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/reduction.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/rmsnorm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/dlight/gpu/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/driver/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/driver/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/driver/__pycache__/build_module.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/_ffi_instrument_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/_ffi_transform_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/adt.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/affine_type.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/attrs.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/base.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/container.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/expr.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/function.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/global_info.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/instrument.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/json_compact.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/memory_pools.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/module.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/tensor_type.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/type.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/__pycache__/type_relation.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/diagnostics/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/ir/diagnostics/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/arg_info.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/extracted_task.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/logging.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/profiler.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/relax_integration.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/relay_integration.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/tir_integration.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/trace_apply.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/tune.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/tune_context.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/builder/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/builder/__pycache__/builder.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/builder/__pycache__/local_builder.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/cost_model.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/metric.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/random_model.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/cost_model/__pycache__/xgb_model.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/json_database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/memory_database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/ordered_union_database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/schedule_fn_database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/database/__pycache__/union_database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/feature_extractor/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/feature_extractor/__pycache__/feature_extractor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/feature_extractor/__pycache__/per_store_feature.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/feature_extractor/__pycache__/random_feature_extractor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/add_to_database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/measure_callback.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/remove_build_artifact.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/measure_callback/__pycache__/update_cost_model.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_compute_location.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_parallel.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_thread_binding.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_tile_size.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutate_unroll.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/mutator/__pycache__/mutator.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/disallow_async_strided_mem_copy.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/disallow_dynamic_loop.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/postproc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_cooperative_fetch.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_layout.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_parallel_vectorize_unroll.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_reduction_block.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_tensorize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/rewrite_unbound_block.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/verify_gpu_code.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/postproc/__pycache__/verify_vtcm_limit.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/config.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/local_runner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/rpc_runner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/runner.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/runner/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/cpu/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/cuda/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/cuda/__pycache__/layout_transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/generic/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule/x86/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/add_rfactor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/apply_custom_rule.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/auto_bind.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/auto_inline.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/cross_thread_reduction.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/multi_level_tiling.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/parallel_vectorize_unroll.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/random_compute_location.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/schedule_rule/__pycache__/schedule_rule.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/evolutionary_search.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/replay_func.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/replay_trace.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/search_strategy/__pycache__/search_strategy.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/post_order_apply.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/schedule_fn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/space_generator.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/space_generator/__pycache__/space_generator_union.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/task_scheduler/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/task_scheduler/__pycache__/gradient_based.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/task_scheduler/__pycache__/round_robin.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/meta_schedule/task_scheduler/__pycache__/task_scheduler.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/binding_rewrite.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/block_builder.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/exec_builder.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/expr.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/expr_functor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/pipeline.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/struct_info.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/ty.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/__pycache__/vm_build.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/analysis/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/analysis/__pycache__/analysis.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/analysis/__pycache__/estimate_memory_usage.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/backend/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/backend/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/backend/__pycache__/dispatch_sort_scan.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/backend/__pycache__/pattern_registry.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/backend/contrib/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/distributed/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/distributed/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/distributed/__pycache__/global_info.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/distributed/__pycache__/struct_info.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/distributed/transform/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/distributed/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/distributed/transform/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/_ffi.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/context.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/pattern.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/dpl/__pycache__/rewrite.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/__pycache__/common.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/_tensor_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/core.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/exporter.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/extern.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/modules.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/spec.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/subroutine.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/frontend/nn/__pycache__/visitor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/_op_gradient.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/base.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/binary.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/create.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/datatype.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/index.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/linear_algebra.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/manipulate.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/mask.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/op_attrs.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/qdq.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/search.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/set.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/sorting.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/statistical.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/ternary.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/__pycache__/unary.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/builtin/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/builtin/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/builtin/__pycache__/builtin.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/ccl/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/ccl/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/ccl/__pycache__/ccl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/distributed/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/distributed/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/distributed/__pycache__/distributed.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/grad/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/grad/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/grad/__pycache__/grad.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/image/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/image/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/image/__pycache__/image.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/memory/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/memory/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/memory/__pycache__/memory.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/nn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/nn/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/op/nn/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/loss.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/optimizer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/setup_trainer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/trainer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/training/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/attach_external_modules.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/fast_math.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/ipc_allreduce_rewrite.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/lazy_transform_params.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/lower_gpu_ipc_alloc_storage.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/optimize_layout_transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/remove_redundant_reshape.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/binary.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/ccl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/common.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/create.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/datatype.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/distributed.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/grad.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/image.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/index.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/inspect_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/linear_algebra.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/manipulate.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/qdq.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/search.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/statistical.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/legalize_ops/__pycache__/unary.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/database.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/default_functions.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relax/transform/tuning_api/__pycache__/primitives.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/_build_module.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/_ffi_api_parser.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/adt.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/base.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/build_module.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/debug.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/expr.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/expr_functor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/function.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/loops.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/param_dict.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/parser.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/prelude.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/scope_builder.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/ty.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/__pycache__/type_functor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/analysis.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/annotated_regions.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/call_graph.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/count_layers.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/feature.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/sparse_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/analysis/__pycache__/sparse_dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/_backend.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/_vm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/executor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/executor_factory.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/interpreter.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/runtime.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/te_compiler.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/backend/__pycache__/vm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/collage/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/collage/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/collage/__pycache__/collage.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/bsr_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/bsr_dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/simplify_fc_transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/data_dep_optimization/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/dataflow_pattern/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/dataflow_pattern/__pycache__/_ffi.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/caffe.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/caffe2.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/change_datatype.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/common.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/coreml.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/darknet.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/keras.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/mxnet.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/mxnet_qnn_op_utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/nnvm_common.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/oneflow.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/onnx.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/paddlepaddle.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/pytorch.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/pytorch_utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/qnn_torch.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/tensorflow.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/tensorflow_ops.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/tflite.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/frontend/__pycache__/tflite_flexbuffer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_algorithm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_math.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_reduce.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_tensor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_tensor_grad.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/_transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/algorithm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/op_attrs.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/reduce.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/tensor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/annotation/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/annotation/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/annotation/__pycache__/annotation.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/_ethosn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/arm_compute_lib.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/bnns.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/clml.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/coreml.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/cutlass.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/dnnl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/ethosn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/libtorch.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/mrvl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/register.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/contrib/__pycache__/tensorrt.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/_algorithm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/_tensor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/__pycache__/_transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/image/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/image/__pycache__/_image.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/image/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/nn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/nn/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/dyn/nn/__pycache__/_nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/image/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/image/__pycache__/_image.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/image/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/image/__pycache__/image.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/memory/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/memory/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/memory/__pycache__/memory.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/_nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/nn/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/random/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/random/__pycache__/_kernel.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/random/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/random/__pycache__/kernel.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/adreno.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/arm_cpu.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/bifrost.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/cuda.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/generic.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/hexagon.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/hls.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/intel_graphics.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/mali.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/rocm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/strategy/__pycache__/x86.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/_rcnn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/_vision.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/_yolo.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/multibox.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/nms.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/rcnn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vision/__pycache__/yolo.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vm/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vm/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/op/vm/__pycache__/vm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/_make.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/_qnn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/_requantize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/canonicalizations.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/layout_conversions.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/legalizations.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/op/__pycache__/qnn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/strategy/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/strategy/__pycache__/arm_cpu.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/strategy/__pycache__/generic.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/qnn/strategy/__pycache__/hexagon.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_annotate.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_calibrate.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_partition.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_partition_conversions.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/_quantize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/kl_divergence.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/quantize/__pycache__/quantize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/fake_quantization_to_integer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/flexible_shape.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/memory_plan.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/mixed_precision.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/recast.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/relay/transform/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/base.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/client.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/minrpc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/server.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/rpc/__pycache__/testing.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/_ffi_node_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/container.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/module.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/name_transforms.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/ndarray.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/object.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/object_generic.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/object_path.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/packed_func.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/params.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/relax_vm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/script_printer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/support.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/__pycache__/vm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/disco/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/disco/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/disco/__pycache__/process_pool.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/disco/__pycache__/session.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/executor/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/executor/__pycache__/aot_executor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/profiling/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/runtime/profiling/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/__pycache__/tir.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/__pycache__/base.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/ir/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/ir/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/ir/__pycache__/frame.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/ir/__pycache__/ir.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/__pycache__/frame.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/ir_builder/tir/__pycache__/ir.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/__pycache__/_core.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/diagnostics.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/dispatch.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/doc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/doc_core.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/entry.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/error.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/evaluator.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/parser.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/core/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/ir/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/ir/__pycache__/entry.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/ir/__pycache__/parser.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/tir/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/tir/__pycache__/entry.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/tir/__pycache__/operation.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/script/parser/tir/__pycache__/parser.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/codegen.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/compilation_config.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/datatype.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/generic_func.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/tag.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/target.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/virtual_device.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/target/__pycache__/x86.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/__pycache__/autodiff.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/__pycache__/operation.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/__pycache__/schedule.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/__pycache__/tag.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/__pycache__/tensor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/calls.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/module.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/parser.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/preprocessor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/runtime.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/te/hybrid/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/block_dependence_info.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/block_scope.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/buffer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/data_layout.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/expr.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/function.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/generic.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/ir_builder.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/stmt.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/__pycache__/stmt_functor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/analysis/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/analysis/__pycache__/analysis.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/_type_checker.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/analysis.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/instruction.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/schedule.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/state.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/trace.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/schedule/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/transform/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/transform/__pycache__/function_pass.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/transform/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/analysis/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/analysis/__pycache__/analysis.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/transform/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/transform/__pycache__/_ffi_api.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/tir/usmp/transform/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/argwhere.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/broadcast.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/einsum.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/generic_op_impl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/math.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/reduction.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/scan.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/scatter.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/scatter_elements.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/searchsorted.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/signal.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/sort.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/sparse_fill_empty_rows.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/sparse_reshape.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/tag.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/tensor.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/unique.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_nchw.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_nchw_winograd.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_nhwc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_nhwc_winograd.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_transpose_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_transpose_nchw.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/conv2d_winograd_common.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/depthwise_conv2d_nchw.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/depthwise_conv2d_nhwc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/injective.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/pooling.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/reduction.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/adreno/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/arm_utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/bitserial_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/bitserial_dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv1d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_gemm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_int8.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_spatial_pack.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/conv2d_transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/group_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/injective.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/pooling.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/qnn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/qnn_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/qnn_legalize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/conv1d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/__pycache__/pool.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/avg_pool.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/common.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/gemm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/max_pool.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/multi_channel_convolve.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__pycache__/tensordot.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/gemm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/bifrost/__pycache__/transforms.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/cuda.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/generic.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/impl.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/rocm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/__pycache__/x86.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/vision/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cpp/vision/__pycache__/yolo.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/argwhere.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/batch_matmul.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/batch_matmul_tensorcore.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv1d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv1d_transpose_ncw.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_direct.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_hwcn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_hwnc_tensorcore.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_int8.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_nhwc_tensorcore.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_nhwc_winograd.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv2d_winograd.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_direct.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_ndhwc_tensorcore.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_transpose_ncdhw.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/conv3d_winograd.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/correlation.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/deformable_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/dense_tensorcore.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/group_conv2d_nchw.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/injective.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/nms.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/pooling.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/reduction.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/scan.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/scatter.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/scatter_elements.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/searchsorted.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/signal.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/softmax.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/sort.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/sparse.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/sparse_reshape.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/tensorcore_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/transform.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/unique.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/__pycache__/vision.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/rcnn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/rcnn/__pycache__/proposal.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/ssd/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/cuda/ssd/__pycache__/multibox.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/default.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/extern.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/image.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/injective.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/math.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/search.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/sort.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/generic/__pycache__/vision.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/gpu/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/gpu/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/gpu/__pycache__/conv2d_nhwc.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/gpu/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/batch_matmul.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/compute_poolarea.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/dense_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/injective.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/pad.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/pooling.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/reduce.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/resize2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/adaptive_avg_pool1d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/avg_pool2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/dense_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/dequantize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/global_avg_pool2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/qadd_qsub_qmul.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/qdense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/qdepthwise_conv2d_slice.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hexagon/qnn/__pycache__/quantize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hls/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hls/__pycache__/injective.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/hls/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/image/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/image/__pycache__/dilation2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/image/__pycache__/grid_sample.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/image/__pycache__/resize.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/intel_graphics/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/intel_graphics/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/intel_graphics/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/intel_graphics/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/mali/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/mali/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/mali/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/mali/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/batch_matmul.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/batch_norm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/batch_to_space_nd.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/bitserial_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/bitserial_dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/bitserial_util.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/bnn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv1d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv1d_transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv2d_transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv3d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/conv3d_transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/correlation.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/deformable_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/depth_to_space.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/dilate.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/elemwise.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/fifo_buffer.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/flatten.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/group_norm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/instance_norm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/layer_norm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/local_response_norm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/loss.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/lstm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/mapping.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/pad.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/pooling.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/qnn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/rms_norm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/softmax.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/space_to_batch_nd.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/space_to_depth.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/sparse.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/upsampling.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/utils.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/nn/__pycache__/winograd_util.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/random/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/random/__pycache__/kernel.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/rocm/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/rocm/__pycache__/batch_matmul.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/rocm/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/rocm/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/sparse/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/sparse/__pycache__/csrmm.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/sparse/__pycache__/csrmv.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/sparse/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/__pycache__/nms.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/__pycache__/nms_util.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/__pycache__/reorg.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/rcnn/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/rcnn/__pycache__/proposal.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/rcnn/__pycache__/roi_align.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/rcnn/__pycache__/roi_pool.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/ssd/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/vision/ssd/__pycache__/multibox.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/__init__.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/batch_matmul.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/binarize_pack.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/binary_dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/bitserial_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/bitserial_dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/concat.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv1d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_avx_1x1.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_avx_common.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_int8.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv2d_transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv3d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/conv3d_transpose.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/dense.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/dense_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/depthwise_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/group_conv2d.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/injective.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/math_alter_op.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/nn.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/pooling.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/reduction.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/roi_align.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/sparse.cpython-310.pyc +0 -0
- tilelang/3rdparty/tvm/python/tvm/topi/x86/__pycache__/tensor_intrin.cpython-310.pyc +0 -0
- {tilelang_rocm-0.1.4.post4.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/licenses/LICENSE +0 -0
- {tilelang_rocm-0.1.4.post4.dist-info → tilelang_rocm-0.1.4.post9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1202 @@
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
2
|
+
# Licensed under the MIT License.
|
3
|
+
from typing import Dict, Literal
|
4
|
+
|
5
|
+
decode_i4_to_f16 = """
|
6
|
+
template <typename T1, typename T2, bool isSigned = false>
|
7
|
+
__device__ void decode_i4b_to_f16(T1 *_i4s, T2 *B_local_decode, const int N = 8)
|
8
|
+
{
|
9
|
+
uint *h = reinterpret_cast<uint *>(B_local_decode);
|
10
|
+
|
11
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
12
|
+
static constexpr uint BOTTOM_MASK = 0x000f000f;
|
13
|
+
static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
|
14
|
+
static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
|
15
|
+
uint const i4s = *reinterpret_cast<uint *>(_i4s);
|
16
|
+
#pragma unroll
|
17
|
+
for (int i = 0; i < (N / 2); i++)
|
18
|
+
{
|
19
|
+
|
20
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
21
|
+
: "=r"(h[i])
|
22
|
+
: "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
|
23
|
+
asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
template <typename T1, typename T2>
|
28
|
+
__device__ void decode_i4s_to_f16(T1 *_i4s, T2 *B_local_decode, const int N = 8)
|
29
|
+
{
|
30
|
+
decode_i4b_to_f16<T1, T2, true>(_i4s, B_local_decode, N);
|
31
|
+
}
|
32
|
+
|
33
|
+
template <typename T1, typename T2>
|
34
|
+
__device__ void decode_i4u_to_f16(T1 *_i4u, T2 *B_local_decode, const int N = 8)
|
35
|
+
{
|
36
|
+
decode_i4b_to_f16<T1, T2, false>(_i4u, B_local_decode, N);
|
37
|
+
}
|
38
|
+
"""
|
39
|
+
|
40
|
+
decode_i4_to_f16_scale = """
|
41
|
+
template <typename T1, typename T2, typename T3, bool isSigned = false, bool withScaling = false>
|
42
|
+
__device__ void decode_i4b_to_f16_scale(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr)
|
43
|
+
{
|
44
|
+
uint *h = reinterpret_cast<uint *>(B_local_decode);
|
45
|
+
|
46
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
47
|
+
static constexpr uint BOTTOM_MASK = 0x000f000f;
|
48
|
+
static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
|
49
|
+
// Minus 7 to scale the value to signed
|
50
|
+
static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
|
51
|
+
uint const i4s = *reinterpret_cast<uint *>(_i4s);
|
52
|
+
T3 const scale_r = *scale;
|
53
|
+
uint const packed_scales = __pack_half2(scale_r, scale_r);
|
54
|
+
|
55
|
+
#pragma unroll
|
56
|
+
// decode 2 elems at one time.
|
57
|
+
for (int i = 0; i < (N / 2); i++)
|
58
|
+
{
|
59
|
+
|
60
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
61
|
+
: "=r"(h[i])
|
62
|
+
: "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
|
63
|
+
asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
|
64
|
+
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
template <typename T1, typename T2, typename T3>
|
69
|
+
__device__ void decode_i4s_to_f16_scale(T1 *_i4s, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
|
70
|
+
{
|
71
|
+
decode_i4b_to_f16_scale<T1, T2, T3, true, true>(_i4s, B_local_decode, N, scale);
|
72
|
+
}
|
73
|
+
|
74
|
+
template <typename T1, typename T2, typename T3>
|
75
|
+
__device__ void decode_i4u_to_f16_scale(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
|
76
|
+
{
|
77
|
+
decode_i4b_to_f16_scale<T1, T2, T3, false, true>(_i4u, B_local_decode, N, scale);
|
78
|
+
}
|
79
|
+
|
80
|
+
"""
|
81
|
+
|
82
|
+
# CUDA source: decode packed 4-bit integers to fp16 with two per-half scales
# (`*scale` for the first half of the outputs, `*(scale + offset)` for the rest).
decode_i4_to_f16_scale_offset = """
// Decode N 4-bit values packed in one 32-bit word into N fp16 values and scale them.
//   _i4s           : packed input, read as a single uint.
//   B_local_decode : output, written as N / 2 packed f16x2 words.
//   scale          : must be non-null despite the default; *scale scales output words
//                    [0, N/4), *(scale + offset) scales output words [N/4, N/2).
// NOTE: withScaling is not consulted by the body; scaling is always applied.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, bool isSigned = false, bool withScaling = false>
__device__ void decode_i4b_to_f16_scale_offset(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const int offset = 0)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    // 0x6400 is 1024.0 in fp16; OR-ing a nibble into the mantissa yields 1024 + nibble.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1032.0 (0x6408), i.e. remove the 1024 bias and a further 8.
    // Unsigned: subtract only the 1024 bias.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);

    // decode 2 elems at one time.
    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i4s >> 4i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }
    // First half of the output words uses the left scale (h * scale + 0).
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(0));
    }
    // Second half uses the scale found `offset` elements further on.
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(0));
    }
}

// Signed 4-bit entry point.
template <typename T1, typename T2, typename T3>
__device__ void decode_i4s_to_f16_scale_offset(T1 *_i4s, T2 *B_local_decode, T3 *scale = nullptr, const int offset = 0, const int N = 8)
{
    decode_i4b_to_f16_scale_offset<T1, T2, T3, true, true>(_i4s, B_local_decode, N, scale, offset);
}

// Unsigned 4-bit entry point.
template <typename T1, typename T2, typename T3>
__device__ void decode_i4u_to_f16_scale_offset(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, const int offset = 0, const int N = 8)
{
    decode_i4b_to_f16_scale_offset<T1, T2, T3, false, true>(_i4u, B_local_decode, N, scale, offset);
}

"""
|
134
|
+
|
135
|
+
# CUDA source: 4-bit -> fp16 decode computing (value - zero) * scale.
decode_i4_to_f16_scale_zeros_original = """
// Decode N 4-bit values packed in one 32-bit word to fp16 as (value - *zeros) * *scale.
//   _i4s           : packed input, read as a single uint.
//   B_local_decode : output, written as N / 2 packed f16x2 words.
//   scale, zeros   : must be non-null despite the defaults; both are dereferenced once.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_zeros_original(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    // 0x6400 is 1024.0 in fp16; OR-ing a nibble into the mantissa yields 1024 + nibble.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1032.0 (0x6408), i.e. remove the 1024 bias and a further 8.
    // Unsigned: subtract only the 1024 bias.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);
    // input zeros maybe int32(qzeros) or half format
    T4 const zero_r = *zeros;
    uint const packed_zeros = __pack_half2(zero_r, zero_r);

    // decode 2 elems at one time.
    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i4s >> 4i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));

        // Zero point is removed before scaling.
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
    }
}

// Unsigned 4-bit entry point.
template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i4u_to_f16_scale_zeros_original(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
{
    decode_i4b_to_f16_zeros_original<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros);
}
"""
|
176
|
+
|
177
|
+
# CUDA source: 4-bit -> fp16 decode computing (value - zero) * scale with a
# separate scale/zero pair for each half of the outputs.
decode_i4_to_f16_scale_zeros_original_offset = """
// Decode N 4-bit values packed in one 32-bit word to fp16 as (value - zero) * scale.
//   Output words [0, N/4)   use *scale and *zeros.
//   Output words [N/4, N/2) use *(scale + offset) and *(zeros + offset).
//   scale, zeros must be non-null despite the defaults.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_zeros_original_offset(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr, const int offset = 0)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    // 0x6400 is 1024.0 in fp16; OR-ing a nibble into the mantissa yields 1024 + nibble.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1032.0 (0x6408), i.e. remove the 1024 bias and a further 8.
    // Unsigned: subtract only the 1024 bias.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);
    // Zeros are loaded with their own element type (T4), matching the non-offset kernel.
    T4 const zeros_l = *zeros;
    T4 const zeros_r = *(zeros + offset);
    uint const packed_zeros_l = __pack_half2(zeros_l, zeros_l);
    uint const packed_zeros_r = __pack_half2(zeros_r, zeros_r);

    // decode 2 elems at one time.
    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i4s >> 4i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }

    // First half: left zero point and scale.
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros_l));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(0));
    }
    // Second half: zero point and scale found `offset` elements further on.
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros_r));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(0));
    }
}

// Unsigned 4-bit entry point.
template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i4u_to_f16_scale_zeros_original_offset(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int offset = 0, const int N = 8)
{
    decode_i4b_to_f16_zeros_original_offset<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros, offset);
}
"""
|
231
|
+
|
232
|
+
# CUDA source: 4-bit -> fp16 decode using a single FMA: value * scale + (-|zero|).
decode_i4_to_f16_scale_zeros_rescale = """
// Decode N 4-bit values packed in one 32-bit word to fp16 with a fused scale and
// zero-point step.
//   scale, zeros must be non-null despite the defaults; both are dereferenced once.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_scale_zeros_rescale(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    // 0x6400 is 1024.0 in fp16; OR-ing a nibble into the mantissa yields 1024 + nibble.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1032.0 (0x6408), i.e. remove the 1024 bias and a further 8.
    // Unsigned: subtract only the 1024 bias.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);
    T4 const zero_r = *zeros;
    // Force the fp16 sign bits on so the FMA addend below is -|zero| in both lanes.
    uint const packed_zeros = 0x80008000 | __pack_half2(zero_r, zero_r);

    // decode 2 elems at one time.
    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i4s >> 4i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));

        // h = h * scale + packed_zeros
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(packed_zeros));
    }
}

// Unsigned 4-bit entry point.
template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i4u_to_f16_scale_zeros_rescale(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
{
    decode_i4b_to_f16_scale_zeros_rescale<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros);
}

"""
|
271
|
+
|
272
|
+
# CUDA source: fused scale/zero 4-bit -> fp16 decode with a separate scale/zero
# pair for each half of the outputs.
decode_i4_to_f16_scale_zeros_rescale_offset = """
// Decode N 4-bit values packed in one 32-bit word to fp16 via value * scale + (-|zero|).
//   Output words [0, N/4)   use *scale and *zeros.
//   Output words [N/4, N/2) use *(scale + offset) and *(zeros + offset).
//   scale, zeros must be non-null despite the defaults.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_scale_zeros_rescale_offset(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr, const int offset = 0)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    // 0x6400 is 1024.0 in fp16; OR-ing a nibble into the mantissa yields 1024 + nibble.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1032.0 (0x6408), i.e. remove the 1024 bias and a further 8.
    // Unsigned: subtract only the 1024 bias.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64086408 : 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);
    // Zeros are loaded with their own element type (T4), matching the non-offset kernel.
    T4 const zeros_l = *zeros;
    T4 const zeros_r = *(zeros + offset);
    // Force the fp16 sign bits on so the FMA addends below are -|zero| in both lanes.
    uint const packed_zeros_l = 0x80008000 | __pack_half2(zeros_l, zeros_l);
    uint const packed_zeros_r = 0x80008000 | __pack_half2(zeros_r, zeros_r);

    // decode 2 elems at one time.
    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i4s >> 4i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }
    // First half: left scale and zero point.
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(packed_zeros_l));
    }
    // Second half: scale and zero point found `offset` elements further on.
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(packed_zeros_r));
    }
}

// Unsigned 4-bit entry point.
template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i4u_to_f16_scale_zeros_rescale_offset(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int offset = 0, const int N = 8)
{
    decode_i4b_to_f16_scale_zeros_rescale_offset<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros, offset);
}

"""
|
324
|
+
|
325
|
+
# CUDA source: 4-bit -> fp16 decode with an integer zero point: (value - zero) * scale.
decode_i4_to_f16_scale_zeros_quantized = """
// Decode N 4-bit values packed in one 32-bit word to fp16 as (value - zero) * *scale,
// where the zero point is read from `zeros` as a 16-bit integer.
//   scale, zeros must be non-null despite the defaults; both are dereferenced once.
// NOTE: isSigned is not consulted by the body.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i4b_to_f16_scale_zeros_quantized(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T4 *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    // 0x6400 is 1024.0 in fp16; OR-ing a nibble into the mantissa yields 1024 + nibble.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);
    // input zeros maybe int32(qzeros) or half format
    int16_t const zero_r = *((int16_t*)zeros);
    // 0xe400 is -1024.0 in fp16; OR-ing the zero point into its mantissa gives
    // -(1024 + zero), so a single add removes both the bias and the zero point.
    uint median_num = ((0xe400 | zero_r) << 16) | (0xe400 | zero_r);

    // decode 2 elems at one time.
    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i4s >> 4i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));

        asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(median_num));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
    }
}

// Unsigned 4-bit entry point.
template <typename storage_dtype, typename target_dtype, typename scale_dtype, typename zero_dtype>
__device__ void decode_i4u_to_f16_scale_zeros_quantized(storage_dtype *_i4u, target_dtype *B_local_decode, scale_dtype *scale = nullptr, zero_dtype *zeros = nullptr, const int N = 8)
{
    decode_i4b_to_f16_scale_zeros_quantized<storage_dtype, target_dtype, scale_dtype, zero_dtype, false>(_i4u, B_local_decode, N, scale, zeros);
}
"""
|
363
|
+
|
364
|
+
# CUDA source: 4-bit -> fp16 decode with packed 4-bit zero points and a separate
# scale/zero pair for each half of the outputs.
decode_i4_to_f16_scale_zeros_quantized_offset = """
// Decode N 4-bit values packed in one 32-bit word to fp16 as (value - zero) * scale.
//   scale  : *scale for output words [0, N/4), *(scale + scale_offset) for [N/4, N/2).
//   qzeros : packed 4-bit zero points; nibble `group_offset` of *qzeros is the left
//            zero point, the same nibble of *(qzeros + qzeros_offset) the right one.
//   scale, qzeros must be non-null despite the defaults.
// NOTE: isSigned is not consulted by the body.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i4b_to_f16_scale_zeros_quantized_offset(T1 *_i4s, T2 *B_local_decode, const int N = 8, const T3 *scale = nullptr, const T1 *qzeros = nullptr, const int scale_offset = 0, const int qzeros_offset = 0, const int group_offset = 0)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x000f000f;
    // 0x6400 is 1024.0 in fp16; OR-ing a nibble into the mantissa yields 1024 + nibble.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    uint const i4s = *reinterpret_cast<uint *>(_i4s);

    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + scale_offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);

    T1 const qzeros_l = *qzeros;
    T1 const qzeros_r = *(qzeros + qzeros_offset);
    int16_t const zero_l = (qzeros_l >> (group_offset * 4) & 0xf);
    int16_t const zero_r = (qzeros_r >> (group_offset * 4) & 0xf);

    // 0xe400 is -1024.0 in fp16; OR-ing the zero point into its mantissa gives
    // -(1024 + zero), so a single add removes both the bias and the zero point.
    uint median_num_l = ((0xe400 | zero_l) << 16) | (0xe400 | zero_l);
    uint median_num_r = ((0xe400 | zero_r) << 16) | (0xe400 | zero_r);

    // decode 2 elems at one time.
    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i4s >> 4i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i4s >> (4 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
    }
    // First half: left zero point and scale.
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(median_num_l));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(0));
    }
    // Second half: right zero point and scale.
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(median_num_r));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(0));
    }
}

// Unsigned 4-bit entry point.
template <typename storage_dtype, typename target_dtype, typename scale_dtype>
__device__ void decode_i4u_to_f16_scale_zeros_quantized_offset(storage_dtype *_i4u, target_dtype *B_local_decode, scale_dtype *scale = nullptr, storage_dtype *qzeros = nullptr, const int scale_offset = 0, const int zero_offset = 0, const int group_offset = 0, const int N = 8)
{
    decode_i4b_to_f16_scale_zeros_quantized_offset<storage_dtype, target_dtype, scale_dtype, false>(_i4u, B_local_decode, N, scale, qzeros, scale_offset, zero_offset, group_offset);
}
"""
|
422
|
+
|
423
|
+
# CUDA source: decode packed 2-bit integers to fp16 (no scale, no zero point).
decode_i2_to_f16 = """
// Decode 2-bit values read from a 16-bit word into fp16, two per output word.
//   _i2s           : packed input, read as a single int16_t.
//   B_local_decode : output, written as N / 2 packed f16x2 words.
template <typename T1, typename T2, bool isSigned = false>
__device__ void decode_i2b_to_f16(T1 *_i2s, T2 *B_local_decode, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    // 0x6400 is 1024.0 in fp16; OR-ing a 2-bit field into the mantissa yields 1024 + value.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1026.0 (0x6402), i.e. remove the 1024 bias and a further 2.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // NOTE(review): the original comment here was truncated; presumably the caller
    // must advance _i2s to decode the remaining elements.
    // Low byte stays in bits 0-7, high byte moves to bits 16-23.
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i2s >> 2i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }
}

// Signed 2-bit entry point.
template <typename T1, typename T2>
__device__ void decode_i2s_to_f16(T1 *_i2s, T2 *B_local_decode, const int N = 8)
{
    decode_i2b_to_f16<T1, T2, true>(_i2s, B_local_decode, N);
}

// Unsigned 2-bit entry point.
template <typename T1, typename T2>
__device__ void decode_i2u_to_f16(T1 *_i2u, T2 *B_local_decode, const int N = 8)
{
    decode_i2b_to_f16<T1, T2, false>(_i2u, B_local_decode, N);
}
"""
|
463
|
+
|
464
|
+
# CUDA source: decode packed 2-bit integers to fp16 and multiply by one scale.
decode_i2_to_f16_scale = """
// Decode 2-bit values read from a 16-bit word into fp16 and multiply by *scale.
//   _i2s           : packed input, read as a single int16_t.
//   B_local_decode : output, written as N / 2 packed f16x2 words.
//   scale          : must be non-null despite the default; dereferenced in the loop.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale(T1 *_i2s, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    // 0x6400 is 1024.0 in fp16; OR-ing a 2-bit field into the mantissa yields 1024 + value.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1026.0 (0x6402), i.e. remove the 1024 bias and a further 2.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // NOTE(review): the original comment here was truncated; presumably the caller
    // must advance _i2s to decode the remaining elements.
    // Low byte stays in bits 0-7, high byte moves to bits 16-23.
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i2s >> 2i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
        // h = h * scale + 0
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*scale, *scale)), "r"(0));
    }
}

// Signed 2-bit entry point.
template <typename T1, typename T2, typename T3>
__device__ void decode_i2s_to_f16_scale(T1 *_i2s, T2 *B_local_decode, T3 *scale, const int N = 8)
{
    decode_i2b_to_f16_scale<T1, T2, T3, true>(_i2s, B_local_decode, scale, N);
}

// Unsigned 2-bit entry point.
template <typename T1, typename T2, typename T3>
__device__ void decode_i2u_to_f16_scale(T1 *_i2u, T2 *B_local_decode, T3 *scale, const int N = 8)
{
    decode_i2b_to_f16_scale<T1, T2, T3, false>(_i2u, B_local_decode, scale, N);
}
"""
|
505
|
+
|
506
|
+
# CUDA source: 2-bit -> fp16 decode computing (value - zero) * scale with a
# separate scale/zero pair for each half of the outputs.
decode_i2_to_f16_scale_zeros_original_offset = """
// Decode 2-bit values read from a 16-bit word into fp16 as (value - zero) * scale.
//   Output words [0, N/4)   use *scale and *zeros.
//   Output words [N/4, N/2) use *(scale + offset) and *(zeros + offset).
//   scale, zeros must be non-null despite the defaults.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale_zeros_original_offset(T1 *_i2s, T2 *B_local_decode, T3 *scale = nullptr, T3 *zeros = nullptr, const int offset = 0, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    // 0x6400 is 1024.0 in fp16; OR-ing a 2-bit field into the mantissa yields 1024 + value.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1026.0 (0x6402), i.e. remove the 1024 bias and a further 2.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // NOTE(review): the original comment here was truncated; presumably the caller
    // must advance _i2s to decode the remaining elements.
    // Low byte stays in bits 0-7, high byte moves to bits 16-23.
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    T3 const zeros_l = *zeros;
    T3 const zeros_r = *(zeros + offset);
    uint const packed_zeros_l = __pack_half2(zeros_l, zeros_l);
    uint const packed_zeros_r = __pack_half2(zeros_r, zeros_r);

    T3 const scale_l = *scale;
    T3 const scale_r = *(scale + offset);
    uint const packed_scales_l = __pack_half2(scale_l, scale_l);
    uint const packed_scales_r = __pack_half2(scale_r, scale_r);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i2s >> 2i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
    }
    // First half: left zero point and scale.
    #pragma unroll
    for (int i = 0; i < (N / 4); i++)
    {
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros_l));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_l), "r"(0));
    }
    // Second half: zero point and scale found `offset` elements further on.
    #pragma unroll
    for (int i = (N / 4); i < (N / 2); i++)
    {
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros_r));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales_r), "r"(0));
    }
}

// Unsigned 2-bit entry point. Forwards to the _offset kernel defined above; the
// previous callee name (without _offset) is not defined in this source string and
// does not accept an offset argument.
template <typename T1, typename T2, typename T3>
__device__ void decode_i2u_to_f16_scale_zeros_original_offset(T1 *_i2u, T2 *B_local_decode, T3 *scale, T3 *zeros, const int offset = 0, const int N = 8)
{
    decode_i2b_to_f16_scale_zeros_original_offset<T1, T2, T3, false>(_i2u, B_local_decode, scale, zeros, offset, N);
}
"""
|
562
|
+
|
563
|
+
# CUDA source: 2-bit -> fp16 decode computing (value - zero) * scale.
decode_i2_to_f16_scale_zeros_original = """
// Decode 2-bit values read from a 16-bit word into fp16 as (value - *zeros) * *scale.
//   _i2s           : packed input, read as a single int16_t.
//   B_local_decode : output, written as N / 2 packed f16x2 words.
//   scale, zeros   : must be non-null despite the defaults; dereferenced in the loop.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale_zeros_original(T1 *_i2s, T2 *B_local_decode, T3 *scale = nullptr, T3 *zeros = nullptr, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    // 0x6400 is 1024.0 in fp16; OR-ing a 2-bit field into the mantissa yields 1024 + value.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1026.0 (0x6402), i.e. remove the 1024 bias and a further 2.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // NOTE(review): the original comment here was truncated; presumably the caller
    // must advance _i2s to decode the remaining elements.
    // Low byte stays in bits 0-7, high byte moves to bits 16-23.
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i2s >> 2i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
        // Zero point is removed before scaling.
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*zeros, *zeros)));
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*scale, *scale)), "r"(0));
    }
}

// Unsigned 2-bit entry point.
template <typename T1, typename T2, typename T3>
__device__ void decode_i2u_to_f16_scale_zeros_original(T1 *_i2u, T2 *B_local_decode, T3 *scale, T3 *zeros, const int N = 8)
{
    decode_i2b_to_f16_scale_zeros_original<T1, T2, T3, false>(_i2u, B_local_decode, scale, zeros, N);
}
"""
|
599
|
+
|
600
|
+
# CUDA source: 2-bit -> fp16 decode computing value * scale - zero.
decode_i2_to_f16_scale_zeros_rescale = """
// Decode 2-bit values read from a 16-bit word into fp16 as value * *scale - *zeros.
//   _i2s           : packed input, read as a single int16_t.
//   B_local_decode : output, written as N / 2 packed f16x2 words.
//   scale, zeros   : must be non-null despite the defaults; dereferenced in the loop.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale_zeros_rescale(T1 *_i2s, T2 *B_local_decode, T3 *scale = nullptr, T3 *zeros = nullptr, const int N = 8)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    // 0x6400 is 1024.0 in fp16; OR-ing a 2-bit field into the mantissa yields 1024 + value.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Signed: subtract 1026.0 (0x6402), i.e. remove the 1024 bias and a further 2.
    static constexpr uint MEDIAN_NUM = isSigned ? 0x64026402 : 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // NOTE(review): the original comment here was truncated; presumably the caller
    // must advance _i2s to decode the remaining elements.
    // Low byte stays in bits 0-7, high byte moves to bits 16-23.
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i2s >> 2i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
        // Scale first, then remove the zero point.
        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*scale, *scale)), "r"(0));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*zeros, *zeros)));
    }
}

// Unsigned 2-bit entry point.
template <typename T1, typename T2, typename T3>
__device__ void decode_i2u_to_f16_scale_zeros_rescale(T1 *_i2u, T2 *B_local_decode, T3 *scale, T3 *zeros, const int N = 8)
{
    decode_i2b_to_f16_scale_zeros_rescale<T1, T2, T3, false>(_i2u, B_local_decode, scale, zeros, N);
}
"""
|
636
|
+
|
637
|
+
# CUDA source: 2-bit -> fp16 decode with an integer zero point: (value - zero) * scale.
decode_i2_to_f16_scale_zeros_quantized = """
// Decode 2-bit values read from a 16-bit word into fp16 as (value - zero) * *scale,
// where the zero point is read from `zeros` as a 16-bit integer.
//   scale, zeros must be non-null despite the defaults; both are dereferenced once.
// NOTE: isSigned is not consulted by the body.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
__device__ void decode_i2b_to_f16_scale_zeros_quantized(T1 *_i2s, T2 *B_local_decode, const int N = 8, T3 *scale = nullptr, T4 *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00030003;
    // 0x6400 is 1024.0 in fp16; OR-ing a 2-bit field into the mantissa yields 1024 + value.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    int16_t const i2s_i16 = *reinterpret_cast<int16_t *>(_i2s);
    T3 const scale_r = *scale;
    uint const packed_scales = __pack_half2(scale_r, scale_r);
    int16_t const zero_r = *((int16_t*)zeros);
    // 0xe400 is -1024.0 in fp16; OR-ing the zero point into its mantissa gives
    // -(1024 + zero), so a single add removes both the bias and the zero point.
    uint median_num = ((0xe400 | zero_r) << 16) | (0xe400 | zero_r);

    // decode 2 elems at one time.
    // interleave {e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode for {x,x,x,x,e7,e5,e3,e1,x,x,x,x,e6,e4,e2,e0}
    // NOTE(review): the original comment here was truncated; presumably the caller
    // must advance _i2s to decode the remaining elements.
    // Low byte stays in bits 0-7, high byte moves to bits 16-23.
    int i2s = (i2s_i16 & 0x00ff);
    i2s |= ((i2s_i16 & 0xff00) << 8);

    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i2s >> 2i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i2s >> (2 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(median_num));

        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
    }
}

// Unsigned 2-bit entry point.
template <typename T1, typename T2, typename T3, typename T4>
__device__ void decode_i2u_to_f16_scale_zeros_quantized(T1 *_i2u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
{
    decode_i2b_to_f16_scale_zeros_quantized<T1, T2, T3, T4, false>(_i2u, B_local_decode, N, scale, zeros);
}
"""
|
677
|
+
|
678
|
+
# CUDA source: decode packed 1-bit integers to fp16, with optional scale and
# zero-point handling selected by template flags.
decode_i1_to_f16 = """
/*
ZerosKind selects how the zero point is applied when withZeros is true:
  Kind 0 (original):  target = (dequantize_weight - zero_point) * scale
  Kind 1 (rescale):   target = dequantize_weight * scale - zero_point
  Kind 2 (quantized): target = (dequantize_weight - dequantize_zeros) * scale
Notice: only "original" and "rescale" are supported now; the body has no
branch for Kind 2.
*/
// Decode 1-bit values read from a single byte into fp16, two per output word.
//   _i1s           : packed input, read as a single int8_t.
//   B_local_decode : output, written as N / 2 packed f16x2 words.
//   scale, zeros   : dereferenced only when withScaling / withZeros are set.
// With isSigned, each bit b is mapped to 2 * b - 1.
// __pack_half2 is expected to be provided by the including translation unit.
template <typename T1, typename T2, bool isSigned = false, bool withScaling = false, bool withZeros = false, int ZerosKind = 1>
__device__ void decode_i1b_to_f16(T1 *_i1s, T2 *B_local_decode, const int N = 8, half *scale = nullptr, half *zeros = nullptr)
{
    uint *h = reinterpret_cast<uint *>(B_local_decode);

    // lop3 lookup table for (a & b) | c.
    static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
    static constexpr uint BOTTOM_MASK = 0x00010001;
    // 0x6400 is 1024.0 in fp16; OR-ing a bit into the mantissa yields 1024 + bit.
    static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
    // Only the 1024 bias is removed here for both signednesses; the signed
    // mapping is applied separately below.
    static constexpr uint MEDIAN_NUM = 0x64006400;
    static constexpr uint TRANSFORM_SUBTRACT = 0xbc00bc00; // for signed int 2x - 1
    // interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
    // only decode e7,e5,e3,e1,e6,e4,e2,e0
    int8_t const i1s_i8 = *reinterpret_cast<int8_t *>(_i1s);
    // Low nibble stays in bits 0-3, high nibble moves to bits 16-19.
    int i1s = (i1s_i8 & 0x0f);
    i1s |= ((i1s_i8 & 0xf0) << 12);
    // decode 2 elems at one time.
    #pragma unroll
    for (int i = 0; i < (N / 2); i++)
    {
        // h[i] = ((i1s >> i) & BOTTOM_MASK) | FP16_TOP_MAGIC_NUM
        asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                     : "=r"(h[i])
                     : "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
        asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
        if constexpr (isSigned)
        {
            // h = h + h, then h = h + (-1.0)
            asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(h[i]));
            asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(TRANSFORM_SUBTRACT));
        }
        if constexpr (withZeros && ZerosKind == 0)
        {
            // original: zero point removed before scaling.
            asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*zeros, *zeros)));
        }
        if constexpr (withScaling)
        {
            asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*scale, *scale)), "r"(0));
        }
        if constexpr (withZeros && ZerosKind == 1)
        {
            // rescale: zero point removed after scaling.
            asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(__pack_half2(*zeros, *zeros)));
        }
    }
}

// Signed 1-bit entry point (no scale, no zeros).
template <typename T1, typename T2>
__device__ void decode_i1s_to_f16(T1 *_i1s, T2 *B_local_decode, const int N = 8)
{
    decode_i1b_to_f16<T1, T2, true>(_i1s, B_local_decode, N);
}

// Unsigned 1-bit entry point (no scale, no zeros).
template <typename T1, typename T2>
__device__ void decode_i1u_to_f16(T1 *_i1u, T2 *B_local_decode, const int N = 8)
{
    decode_i1b_to_f16<T1, T2, false>(_i1u, B_local_decode, N);
}
"""
|
746
|
+
|
747
|
+
# CUDA device-code text for decoding packed 1-bit values to float16 with a
# scale factor. It defines two templates: decode_i1u_to_f16_scale (unsigned)
# and decode_i1s_to_f16_scale (signed).
# Each loop iteration fills one 32-bit word of the output (two packed halves):
#   * lop3.b32 combines the shifted input with BOTTOM_MASK (0x00010001) and
#     FP16_TOP_MAGIC_NUM (0x64006400) using the immLut truth table,
#   * sub.f16x2 subtracts MEDIAN_NUM (the same 0x64006400 constant),
#   * the signed variant additionally adds the value to itself and then adds
#     TRANSFORM_SUBTRACT (0xbc00bc00) -- "2x - 1" per the in-source comment,
#   * fma.rn.f16x2 multiplies by the packed *scale with a zero addend.
# This is a non-raw Python string, so "\\n" is what puts the two characters
# backslash-n inside the emitted asm literal.
# Registered under the key "i1_to_f16_scale" in get_lop3_intrin_group.
# NOTE(review): `scale` defaults to nullptr but is dereferenced
# unconditionally; callers presumably always pass it -- confirm.
decode_i1_to_f16_scale = """
|
748
|
+
template <typename T1, typename T2, typename T3>
|
749
|
+
__device__ void decode_i1u_to_f16_scale(T1 *_i1s, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
|
750
|
+
{
|
751
|
+
uint *h = reinterpret_cast<uint *>(B_local_decode);
|
752
|
+
|
753
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
754
|
+
static constexpr uint BOTTOM_MASK = 0x00010001;
|
755
|
+
static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
|
756
|
+
static constexpr uint MEDIAN_NUM = 0x64006400;
|
757
|
+
// interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
|
758
|
+
// only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
|
759
|
+
int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
|
760
|
+
int i1s = (i1s_i16 & 0x0f);
|
761
|
+
i1s |= ((i1s_i16 & 0xf0) << 12);
|
762
|
+
T3 const scale_r = *scale;
|
763
|
+
uint const packed_scales = __pack_half2(scale_r, scale_r);
|
764
|
+
#pragma unroll
|
765
|
+
// decode 2 elems at one time.
|
766
|
+
for (int i = 0; i < (N / 2); i++)
|
767
|
+
{
|
768
|
+
|
769
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
770
|
+
: "=r"(h[i])
|
771
|
+
: "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
|
772
|
+
asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
|
773
|
+
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
|
774
|
+
}
|
775
|
+
}
|
776
|
+
|
777
|
+
template <typename T1, typename T2, typename T3>
|
778
|
+
__device__ void decode_i1s_to_f16_scale(T1 *_i1s, T2 *B_local_decode, T3 *scale = nullptr, const int N = 8)
|
779
|
+
{
|
780
|
+
uint *h = reinterpret_cast<uint *>(B_local_decode);
|
781
|
+
|
782
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
783
|
+
static constexpr uint BOTTOM_MASK = 0x00010001;
|
784
|
+
static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
|
785
|
+
static constexpr uint MEDIAN_NUM = 0x64006400;
|
786
|
+
static constexpr uint TRANSFORM_SUBTRACT = 0xbc00bc00; // for signed int 2x - 1
|
787
|
+
// interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
|
788
|
+
// only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
|
789
|
+
|
790
|
+
int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
|
791
|
+
int i1s = (i1s_i16 & 0x0f);
|
792
|
+
i1s |= ((i1s_i16 & 0xf0) << 12);
|
793
|
+
T3 const scale_r = *scale;
|
794
|
+
uint const packed_scales = __pack_half2(scale_r, scale_r);
|
795
|
+
#pragma unroll
|
796
|
+
// decode 2 elems at one time.
|
797
|
+
for (int i = 0; i < (N / 2); i++)
|
798
|
+
{
|
799
|
+
|
800
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
801
|
+
: "=r"(h[i])
|
802
|
+
: "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
|
803
|
+
asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
|
804
|
+
asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(h[i]));
|
805
|
+
asm volatile("add.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(TRANSFORM_SUBTRACT));
|
806
|
+
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
|
807
|
+
}
|
808
|
+
}
|
809
|
+
"""
|
810
|
+
|
811
|
+
# CUDA device-code text for decoding packed 1-bit values to float16 with a
# scale and a zero point ("original" zeros mode).
# It defines decode_i1b_to_f16_zeros_original and the thin wrapper
# decode_i1u_to_f16_scale_zeros_original, which forwards with isSigned=false.
# Per output word the base template runs lop3.b32, subtracts MEDIAN_NUM,
# subtracts the packed *zeros, then multiplies by the packed *scale with
# fma.rn.f16x2 (zero addend) -- i.e. the zero point is removed before scaling.
# Argument order differs between the two: the base template takes
# (..., N, scale, zeros) while the wrapper takes (..., scale, zeros, N).
# The isSigned template parameter is declared but never referenced in the body.
# Registered under the key "i1_to_f16_scale_zeros_original" in
# get_lop3_intrin_group.
# NOTE(review): `scale` and `zeros` default to nullptr but are dereferenced
# unconditionally -- confirm callers always supply both.
decode_i1_to_f16_scale_zeros_original = """
|
812
|
+
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
|
813
|
+
__device__ void decode_i1b_to_f16_zeros_original(T1 *_i1s, T2 *B_local_decode, const int N = 8, T3 *scale = nullptr, T4 *zeros = nullptr)
|
814
|
+
{
|
815
|
+
uint *h = reinterpret_cast<uint *>(B_local_decode);
|
816
|
+
|
817
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
818
|
+
static constexpr uint BOTTOM_MASK = 0x00010001;
|
819
|
+
static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
|
820
|
+
static constexpr uint MEDIAN_NUM = 0x64006400;
|
821
|
+
// interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
|
822
|
+
// only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
|
823
|
+
int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
|
824
|
+
int i1s = (i1s_i16 & 0x0f);
|
825
|
+
i1s |= ((i1s_i16 & 0xf0) << 12);
|
826
|
+
T3 const scale_r = *scale;
|
827
|
+
uint const packed_scales = __pack_half2(scale_r, scale_r);
|
828
|
+
// input zeros maybe int32(qzeros) or half format
|
829
|
+
T4 const zero_r = *zeros;
|
830
|
+
uint const packed_zeros = __pack_half2(zero_r, zero_r);
|
831
|
+
|
832
|
+
#pragma unroll
|
833
|
+
// decode 2 elems at one time.
|
834
|
+
for (int i = 0; i < (N / 2); i++)
|
835
|
+
{
|
836
|
+
|
837
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
838
|
+
: "=r"(h[i])
|
839
|
+
: "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
|
840
|
+
asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
|
841
|
+
asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_zeros));
|
842
|
+
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(0));
|
843
|
+
}
|
844
|
+
}
|
845
|
+
template <typename T1, typename T2, typename T3, typename T4>
|
846
|
+
__device__ void decode_i1u_to_f16_scale_zeros_original(T1 *_i1u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
|
847
|
+
{
|
848
|
+
decode_i1b_to_f16_zeros_original<T1, T2, T3, T4, false>(_i1u, B_local_decode, N, scale, zeros);
|
849
|
+
}
|
850
|
+
"""
|
851
|
+
|
852
|
+
# CUDA device-code text for decoding packed 1-bit values to float16 with a
# scale and a zero point ("rescale" zeros mode).
# It defines decode_i1b_to_f16_scale_zeros_rescale and the thin wrapper
# decode_i1u_to_f16_scale_zeros_rescale, which forwards with isSigned=false.
# Unlike the "original" mode, the zero point is folded into the fma addend:
# packed_zeros is OR-ed with 0x80008000 (the top bit of each 16-bit half), and
# a single fma.rn.f16x2 computes value * scale + packed_zeros.
# NOTE(review): setting the top bit presumably negates a non-negative zero
# point so the fma yields value * scale - zeros -- confirm for negative zeros.
# The isSigned template parameter is declared but never referenced in the body.
# Registered under the key "i1_to_f16_scale_zeros_rescale" in
# get_lop3_intrin_group.
decode_i1_to_f16_scale_zeros_rescale = """
|
853
|
+
template <typename T1, typename T2, typename T3, typename T4, bool isSigned = false>
|
854
|
+
__device__ void decode_i1b_to_f16_scale_zeros_rescale(T1 *_i1s, T2 *B_local_decode, const int N = 8, T3 *scale = nullptr, T4 *zeros = nullptr)
|
855
|
+
{
|
856
|
+
uint *h = reinterpret_cast<uint *>(B_local_decode);
|
857
|
+
|
858
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
859
|
+
static constexpr uint BOTTOM_MASK = 0x00010001;
|
860
|
+
static constexpr uint FP16_TOP_MAGIC_NUM = 0x64006400;
|
861
|
+
static constexpr uint MEDIAN_NUM = 0x64006400;
|
862
|
+
// interleave {e31,e29,e27,e25,e23,e21,e19,e17,e15,e13,e11,e9,e7,e5,e3,e1,e30,e28,e26,e24,e22,e20,e18,e16,e14,e12,e10,e8,e6,e4,e2,e0}
|
863
|
+
// only decode e7,e5,e3,e1,e8,e6,e4,e2,e0
|
864
|
+
int8_t const i1s_i16 = *reinterpret_cast<int8_t *>(_i1s);
|
865
|
+
int i1s = (i1s_i16 & 0x0f);
|
866
|
+
i1s |= ((i1s_i16 & 0xf0) << 12);
|
867
|
+
T3 const scale_r = *scale;
|
868
|
+
uint const packed_scales = __pack_half2(scale_r, scale_r);
|
869
|
+
T4 const zero_r = *zeros;
|
870
|
+
uint const packed_zeros = 0x80008000 | __pack_half2(zero_r, zero_r);
|
871
|
+
|
872
|
+
#pragma unroll
|
873
|
+
// decode 2 elems at one time.
|
874
|
+
for (int i = 0; i < (N / 2); i++)
|
875
|
+
{
|
876
|
+
|
877
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
878
|
+
: "=r"(h[i])
|
879
|
+
: "r"(i1s >> (1 * i)), "n"(BOTTOM_MASK), "n"(FP16_TOP_MAGIC_NUM), "n"(immLut));
|
880
|
+
asm volatile("sub.f16x2 %0, %1, %2;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(MEDIAN_NUM));
|
881
|
+
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\\n" : "=r"(h[i]) : "r"(h[i]), "r"(packed_scales), "r"(packed_zeros));
|
882
|
+
}
|
883
|
+
}
|
884
|
+
|
885
|
+
template <typename T1, typename T2, typename T3, typename T4>
|
886
|
+
__device__ void decode_i1u_to_f16_scale_zeros_rescale(T1 *_i4u, T2 *B_local_decode, T3 *scale = nullptr, T4 *zeros = nullptr, const int N = 8)
|
887
|
+
{
|
888
|
+
decode_i1b_to_f16_scale_zeros_rescale<T1, T2, T3, T4, false>(_i4u, B_local_decode, N, scale, zeros);
|
889
|
+
}
|
890
|
+
"""
|
891
|
+
|
892
|
+
# CUDA device-code text for decoding packed 1-bit values to int8.
# It defines decode_i1s_to_i8s (signed) and decode_i1u_to_i8s (unsigned).
# Both read one int16 from the input and spread it into a 32-bit word as
# (x & 0x0f0f) | ((x & 0xf0f0) << 12). Each of the N / 4 loop iterations then
# runs lop3.b32 on (i1b >> i) with BOTTOM_MASK 0x01010101, producing one
# 32-bit word of four bytes.
# The signed variant works on a local int[4] copy that is loaded from and
# stored back to _i8s as an int4, and maps each byte with two __vadd4 calls:
# value + value, then + 0xffffffff ("2x - 1" per the in-source comment).
# The unsigned variant writes straight through the output pointer; its
# MEDIAN_NUM constant is declared but not used.
# Registered under the key "i1_to_i8" in get_lop3_intrin_group.
# NOTE(review): the in-source comment mentioning an "intermediate fp16 number"
# looks carried over from the f16 decoders; no fp16 constant is used here.
decode_i1s_to_i8s = """template <typename T1, typename T2>
|
893
|
+
__device__ void decode_i1s_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16)
|
894
|
+
{
|
895
|
+
int i8s[4];
|
896
|
+
// vector load
|
897
|
+
*reinterpret_cast<int4 *>(i8s) = *reinterpret_cast<int4 *>(_i8s);
|
898
|
+
int16_t i1b_i16 = *reinterpret_cast<int16_t *>(_i1b);
|
899
|
+
// permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15}
|
900
|
+
// into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x}
|
901
|
+
int i1b = (i1b_i16 & 0x0f0f);
|
902
|
+
i1b |= ((i1b_i16 & 0xf0f0) << 12);
|
903
|
+
// i1b {0..,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0}
|
904
|
+
// interleave {0..,e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
|
905
|
+
// First, we extract the i1b and construct an intermediate fp16 number.
|
906
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
|
907
|
+
static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1
|
908
|
+
static constexpr uint I8s_MAGIC_NUM = 0x00000000;
|
909
|
+
static constexpr uint TRANSFORM_SUBTRACT = 0xffffffff; // for signed int 2x - 1
|
910
|
+
|
911
|
+
for (int i = 0; i < N / 4; i++)
|
912
|
+
{
|
913
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
914
|
+
: "=r"(i8s[i])
|
915
|
+
: "r"(i1b >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
|
916
|
+
i8s[i] = __vadd4(i8s[i], i8s[i]);
|
917
|
+
i8s[i] = __vadd4(i8s[i], TRANSFORM_SUBTRACT);
|
918
|
+
}
|
919
|
+
*reinterpret_cast<int4 *>(_i8s) = *reinterpret_cast<int4 *>(i8s);
|
920
|
+
}
|
921
|
+
|
922
|
+
template <typename T1, typename T2>
|
923
|
+
__device__ void decode_i1u_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16)
|
924
|
+
{
|
925
|
+
int *i8s = reinterpret_cast<int *>(_i8s);
|
926
|
+
int16_t i1b_i16 = *reinterpret_cast<int16_t *>(_i1b);
|
927
|
+
// permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15}
|
928
|
+
// into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x}
|
929
|
+
int i1b = (i1b_i16 & 0x0f0f);
|
930
|
+
i1b |= ((i1b_i16 & 0xf0f0) << 12);
|
931
|
+
// i1b {0..,e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0}
|
932
|
+
// interleave {0..,e15,e13,e11,e9,e7,e5,e3,e1,e14,e12,e10,e8,e6,e4,e2,e0}
|
933
|
+
// First, we extract the i1b and construct an intermediate fp16 number.
|
934
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
|
935
|
+
static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1
|
936
|
+
static constexpr uint I8s_MAGIC_NUM = 0x00000000;
|
937
|
+
static constexpr uint MEDIAN_NUM = 0x00000000;
|
938
|
+
|
939
|
+
for (int i = 0; i < N / 4; i++)
|
940
|
+
{
|
941
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
942
|
+
: "=r"(i8s[i])
|
943
|
+
: "r"(i1b >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
|
944
|
+
}
|
945
|
+
}
|
946
|
+
|
947
|
+
"""
|
948
|
+
|
949
|
+
# CUDA device-code text for decoding packed 2-bit values to int8.
# It defines decode_i2s_to_i8s (signed) and decode_i2u_to_i8s (unsigned).
# Both read one 32-bit word from the input; each of the N / 4 loop iterations
# runs lop3.b32 on (i2b >> (2 * i)) with BOTTOM_MASK 0x03030303, producing one
# output word of four bytes.
# The signed variant then subtracts MEDIAN_NUM (0x02020202, i.e. 2 per byte)
# with __vsub4; the unsigned variant leaves the masked value as is.
# Registered under the key "i2_to_i8" in get_lop3_intrin_group.
# NOTE(review): the in-source comments mentioning "i4s", "fp16" and "1024"
# look carried over from other decoders; I8s_MAGIC_NUM here is 0x00000000.
decode_i2s_to_i8s = """template <typename T1, typename T2>
|
950
|
+
__device__ void decode_i2s_to_i8s(T1 *_i2b, T2 *_i8s, const int N = 16)
|
951
|
+
{
|
952
|
+
// convert 8 int2b_t to 8 int8b_t -> 2 int32
|
953
|
+
uint *i8s = reinterpret_cast<uint *>(_i8s);
|
954
|
+
|
955
|
+
// i2b = {e7,e6,e5,e4,e3,e2,e1,e0}
|
956
|
+
// also require interleave {e7,e3,e6,e2,e5,e1,e4,e0}
|
957
|
+
uint const i2b = *reinterpret_cast<uint *>(_i2b);
|
958
|
+
|
959
|
+
// First, we extract the i4s and construct an intermediate fp16 number.
|
960
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
|
961
|
+
static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3
|
962
|
+
static constexpr uint I8s_MAGIC_NUM = 0x00000000; // 1024
|
963
|
+
static constexpr uint MEDIAN_NUM = 0x02020202;
|
964
|
+
#pragma unroll
|
965
|
+
for (int i = 0; i < (N / 4); i++)
|
966
|
+
{
|
967
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
968
|
+
: "=r"(i8s[i])
|
969
|
+
: "r"(i2b >> (2 * i)), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
|
970
|
+
i8s[i] = __vsub4(i8s[i], MEDIAN_NUM);
|
971
|
+
}
|
972
|
+
}
|
973
|
+
template <typename T1, typename T2>
|
974
|
+
__device__ void decode_i2u_to_i8s(T1 *_i2b, T2 *_i8s, const int N = 16)
|
975
|
+
{
|
976
|
+
// convert 8 int2b_t to 8 int8b_t -> 2 int32
|
977
|
+
uint *i8s = reinterpret_cast<uint *>(_i8s);
|
978
|
+
|
979
|
+
// i2b = {e7,e6,e5,e4,e3,e2,e1,e0}
|
980
|
+
// also require interleave {e7,e3,e6,e2,e5,e1,e4,e0}
|
981
|
+
uint const i2b = *reinterpret_cast<uint *>(_i2b);
|
982
|
+
|
983
|
+
// First, we extract the i4s and construct an intermediate fp16 number.
|
984
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
|
985
|
+
static constexpr uint BOTTOM_MASK = 0x03030303; // 0xf -> 0b11 select 0,3
|
986
|
+
static constexpr uint I8s_MAGIC_NUM = 0x00000000; // 1024
|
987
|
+
|
988
|
+
#pragma unroll
|
989
|
+
for (int i = 0; i < (N / 4); i++)
|
990
|
+
{
|
991
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
992
|
+
: "=r"(i8s[i])
|
993
|
+
: "r"(i2b >> (2 * i)), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
|
994
|
+
}
|
995
|
+
}
|
996
|
+
"""
|
997
|
+
|
998
|
+
# CUDA device-code text for decoding packed 4-bit values to int8.
# It defines decode_i4s_to_i8s (signed) and decode_i4u_to_i8s (unsigned).
# Both view the input as 32-bit words and use i4b[0] and i4b[1]. Each of the
# N / 8 loop iterations runs lop3.b32 twice with BOTTOM_MASK 0x0f0f0f0f:
# i4b[0] >> (4 * i) fills i8s[i] and i4b[1] >> (4 * i) fills i8s[i + 2].
# The signed variant then subtracts MEDIAN_NUM (0x07070707, i.e. 7 per byte)
# from both words with __vsubss4; the unsigned variant leaves them as is.
# Registered under the key "i4_to_i8" in get_lop3_intrin_group.
# NOTE(review): the in-loop comment mentioning 0x000f000f / 0x64006400 looks
# carried over from the f16 decoder; the constants used here differ.
decode_i4s_to_i8s = """template <typename T1, typename T2>
|
999
|
+
__device__ void decode_i4s_to_i8s(T1 *_i4b, T2 *_i8s, const int N = 16)
|
1000
|
+
{
|
1001
|
+
uint *i8s = reinterpret_cast<uint *>(_i8s);
|
1002
|
+
uint *i4b = reinterpret_cast<uint *>(_i4b);
|
1003
|
+
// First, we extract the i4s and construct an intermediate i8 number.
|
1004
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
1005
|
+
static constexpr uint BOTTOM_MASK = 0x0f0f0f0f; // 0xf -> 0b1111 select 0,4,8,12
|
1006
|
+
static constexpr uint I4b_TO_I8s_MAGIC_NUM = 0x00000000; // 0
|
1007
|
+
static constexpr uint MEDIAN_NUM = 0x07070707;
|
1008
|
+
#pragma unroll
|
1009
|
+
for (int i = 0; i < (N / 8); i++)
|
1010
|
+
{
|
1011
|
+
// Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
|
1012
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
1013
|
+
: "=r"(i8s[i])
|
1014
|
+
: "r"(i4b[0] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
|
1015
|
+
|
1016
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
1017
|
+
: "=r"(i8s[i + 2])
|
1018
|
+
: "r"(i4b[1] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
|
1019
|
+
i8s[i] = __vsubss4(i8s[i], MEDIAN_NUM);
|
1020
|
+
i8s[i + 2] = __vsubss4(i8s[i + 2], MEDIAN_NUM);
|
1021
|
+
}
|
1022
|
+
}
|
1023
|
+
|
1024
|
+
template <typename T1, typename T2>
|
1025
|
+
__device__ void decode_i4u_to_i8s(T1 *_i4b, T2 *_i8s, const int N = 16)
|
1026
|
+
{
|
1027
|
+
uint *i8s = reinterpret_cast<uint *>(_i8s);
|
1028
|
+
uint *i4b = reinterpret_cast<uint *>(_i4b);
|
1029
|
+
// First, we extract the i4s and construct an intermediate i8 number.
|
1030
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
1031
|
+
static constexpr uint BOTTOM_MASK = 0x0f0f0f0f; // 0xf -> 0b1111 select 0,4,8,12
|
1032
|
+
static constexpr uint I4b_TO_I8s_MAGIC_NUM = 0x00000000; // 0
|
1033
|
+
#pragma unroll
|
1034
|
+
for (int i = 0; i < (N / 8); i++)
|
1035
|
+
{
|
1036
|
+
// Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
|
1037
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
1038
|
+
: "=r"(i8s[i])
|
1039
|
+
: "r"(i4b[0] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
|
1040
|
+
|
1041
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
|
1042
|
+
: "=r"(i8s[i + 2])
|
1043
|
+
: "r"(i4b[1] >> (4 * i)), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
|
1044
|
+
}
|
1045
|
+
}
|
1046
|
+
"""
|
1047
|
+
|
1048
|
+
# CUDA device-code text for decoding packed 2-bit values to packed 4-bit
# values. Unlike the neighbouring constants this is a raw string, so the
# single "\n" below is what reaches the emitted asm literal.
# It defines decode_i2b_to_i4s<T1, T2, isSigned> plus two wrappers,
# decode_i2s_to_i4s (isSigned=true) and decode_i2u_to_i4s (isSigned=false).
# Each of the N / 8 loop iterations runs lop3.b32 on
# i2b[i / 2] >> (2 * (i % 2)) with BOTTOM_MASK 0x33333333.
# The `if constexpr (isSigned)` branch contains only comments (a TODO), and
# MEDIAN_NUM is declared but not used, so the signed and unsigned wrappers
# currently produce the same output.
# Registered under the key "i2_to_i4" in get_lop3_intrin_group.
decode_i2s_to_i4s = r"""
|
1049
|
+
template <typename T1, typename T2, bool isSigned>
|
1050
|
+
__device__ void decode_i2b_to_i4s(T1 *_i2b, T2 *_i4s, const int N = 16)
|
1051
|
+
{
|
1052
|
+
uint *i4s = reinterpret_cast<uint *>(_i4s);
|
1053
|
+
uint *i2b = reinterpret_cast<uint *>(_i2b);
|
1054
|
+
// First, we extract the i4s and construct an intermediate i8 number.
|
1055
|
+
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa;
|
1056
|
+
static constexpr uint BOTTOM_MASK = 0x33333333; // 0xf -> 0b1111 select 0,2,4,6,8,10,12
|
1057
|
+
static constexpr uint I4b_TO_I8s_MAGIC_NUM = 0x00000000; // 0
|
1058
|
+
static constexpr uint MEDIAN_NUM = isSigned ? 0x33333333 : 0x00000000;
|
1059
|
+
|
1060
|
+
#pragma unroll
|
1061
|
+
for (int i = 0; i < (N / 8); i++)
|
1062
|
+
{
|
1063
|
+
// Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
|
1064
|
+
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
|
1065
|
+
: "=r"(i4s[i])
|
1066
|
+
: "r"(i2b[i / 2] >> (2 * (i % 2))), "n"(BOTTOM_MASK), "n"(I4b_TO_I8s_MAGIC_NUM), "n"(immLut));
|
1067
|
+
if constexpr (isSigned)
|
1068
|
+
{
|
1069
|
+
// TODO(lei): uint4 sub should be enhanced.
|
1070
|
+
// 0x03 0x03 0x03 0x03
|
1071
|
+
// i4s[i] = (((i4s[i] << 1) | i4s[i]) << 1) | i4s[i];
|
1072
|
+
}
|
1073
|
+
}
|
1074
|
+
}
|
1075
|
+
|
1076
|
+
template <typename T1, typename T2>
|
1077
|
+
__device__ void decode_i2s_to_i4s(T1 *_i4s, T2 *B_local_decode, const int N = 16)
|
1078
|
+
{
|
1079
|
+
decode_i2b_to_i4s<T1, T2, true>(_i4s, B_local_decode, N);
|
1080
|
+
}
|
1081
|
+
|
1082
|
+
template <typename T1, typename T2>
|
1083
|
+
__device__ void decode_i2u_to_i4s(T1 *_i4u, T2 *B_local_decode, const int N = 16)
|
1084
|
+
{
|
1085
|
+
decode_i2b_to_i4s<T1, T2, false>(_i4u, B_local_decode, N);
|
1086
|
+
}
|
1087
|
+
"""
|
1088
|
+
|
1089
|
+
|
1090
|
+
def get_lop3_intrin_group(
    out_dtype: Literal["float16", "int8", "int4"],
    source_format: Literal["int", "uint"] = "uint",
    source_bit: int = 4,
    storage_dtype: Literal["int32", "int8"] = "int8",
    with_scaling: bool = False,
    with_zeros: bool = False,
    zeros_mode: Literal["original", "rescale", "quantized"] = "original",
    storage_scope: str = "local",
) -> Dict[str, str]:
    """
    Select the LOP3-based fast-decoding routine for a quantized source format.

    LOP3 is a logic operation that takes three inputs. This function builds the
    name of the CUDA decode function matching the requested options and looks
    up the CUDA source text that defines it.

    Parameters
    ----------
    out_dtype : Literal["float16", "int8", "int4"]
        The data type of the decoded output.

    source_format : Literal["int", "uint"], optional
        Whether the packed source values are signed ("int") or unsigned
        ("uint"). By default, it is "uint".

    source_bit : int, optional
        The bit width of one packed source value. By default, it is 4.

    storage_dtype : Literal["int32", "int8"], optional
        Accepted for interface compatibility; this function does not read it.
        By default, it is "int8".

    with_scaling : bool, optional
        Whether the decode routine applies a scale. By default, it is False.

    with_zeros : bool, optional
        Whether the decode routine applies zero points. Only valid with
        ``source_format="uint"``. By default, it is False.

    zeros_mode : Literal["original", "rescale", "quantized"], optional
        The mode of zeros; only used when ``with_zeros`` is True.
        By default, it is "original".

    storage_scope : str, optional
        The scope of the storage. "warp" combined with ``with_scaling`` selects
        the "_offset" variant of the routine. By default, it is "local".

    Returns
    -------
    Dict[str, str]
        ``"func_name"`` holds the name of the decode function and
        ``"c_source"`` holds the CUDA source text that defines it.

    Raises
    ------
    AssertionError
        If ``out_dtype`` is not "float16", "int8" or "int4".
    ValueError
        If ``source_format`` is neither "int" nor "uint", or if zeros are
        requested for a signed source.
    KeyError
        If no decode source is registered for the requested combination.
    """
    # Kept as an assert so existing callers keep seeing AssertionError.
    assert out_dtype in [
        "float16", "int8", "int4"
    ], (f"Invalid out_dtype: {out_dtype}. Expected 'float16' or 'int8' or 'int4' .")

    dtype_mapping = {"float16": "f16", "int4": "i4", "int8": "i8", "int32": "i32"}
    target_dtype = dtype_mapping[out_dtype]

    if source_format not in ["int", "uint"]:
        raise ValueError(
            f"Invalid source_format. Expected 'int' or 'uint', but got {source_format}.")
    if with_zeros and source_format == "int":
        raise ValueError(f"Zeros are not supported for signed integers, but got {source_format}")

    import_c_map = {
        "i4_to_f16": decode_i4_to_f16,
        "i2_to_f16": decode_i2_to_f16,
        "i1_to_f16": decode_i1_to_f16,
        "i4_to_f16_scale": decode_i4_to_f16_scale,
        "i4_to_f16_scale_offset": decode_i4_to_f16_scale_offset,
        "i2_to_f16_scale": decode_i2_to_f16_scale,
        "i1_to_f16_scale": decode_i1_to_f16_scale,
        "i4_to_f16_scale_zeros_original": decode_i4_to_f16_scale_zeros_original,
        "i4_to_f16_scale_zeros_original_offset": decode_i4_to_f16_scale_zeros_original_offset,
        "i2_to_f16_scale_zeros_original": decode_i2_to_f16_scale_zeros_original,
        "i1_to_f16_scale_zeros_original": decode_i1_to_f16_scale_zeros_original,
        "i4_to_f16_scale_zeros_rescale": decode_i4_to_f16_scale_zeros_rescale,
        "i4_to_f16_scale_zeros_rescale_offset": decode_i4_to_f16_scale_zeros_rescale_offset,
        "i2_to_f16_scale_zeros_rescale": decode_i2_to_f16_scale_zeros_rescale,
        "i1_to_f16_scale_zeros_rescale": decode_i1_to_f16_scale_zeros_rescale,
        "i4_to_f16_scale_zeros_quantized": decode_i4_to_f16_scale_zeros_quantized,
        "i2_to_f16_scale_zeros_quantized": decode_i2_to_f16_scale_zeros_quantized,
        "i4_to_f16_scale_zeros_quantized_offset": decode_i4_to_f16_scale_zeros_quantized_offset,
        "i1_to_i8": decode_i1s_to_i8s,
        "i2_to_i8": decode_i2s_to_i8s,
        "i4_to_i8": decode_i4s_to_i8s,
        "i2_to_i4": decode_i2s_to_i4s,
    }

    # The lookup key and the function name carry the same option suffix, so it
    # is built once and appended to both.
    is_ladder_stage3 = (storage_scope == "warp") and with_scaling
    suffix = ""
    if with_scaling:
        suffix += "_scale"
    if with_zeros:
        suffix += f"_zeros_{zeros_mode}"
    if is_ladder_stage3:
        suffix += "_offset"

    key = f"i{source_bit}_to_{target_dtype}" + suffix

    if out_dtype == "float16":
        d4f = "f16"
    elif out_dtype == "int8":
        d4f = "i8s"
    elif out_dtype == "int4":
        d4f = "i4s"
    else:
        # Only reachable when the assert above is disabled (python -O).
        raise ValueError(f"Unsupported target dtype: {target_dtype}")

    source_symbol = "u" if source_format == "uint" else "s"
    func_name = f"decode_i{source_bit}{source_symbol}_to_{d4f}" + suffix

    if key not in import_c_map:
        # Same exception type as the plain dict lookup, but with a message that
        # names the unsupported combination.
        raise KeyError(
            f"No LOP3 decode source is registered for '{key}' "
            f"(requested function '{func_name}').")

    return {
        "func_name": func_name,
        "c_source": import_c_map[key],
    }
|