@fugood/llama.node 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +243 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +14 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  136. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  138. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
  143. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  144. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  145. package/src/llama.cpp/include/llama.h +30 -11
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  147. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  149. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  150. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  151. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  152. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  153. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  154. package/src/llama.cpp/src/llama-arch.cpp +161 -17
  155. package/src/llama.cpp/src/llama-arch.h +16 -0
  156. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  157. package/src/llama.cpp/src/llama-chat.h +6 -2
  158. package/src/llama.cpp/src/llama-context.cpp +108 -92
  159. package/src/llama.cpp/src/llama-context.h +1 -2
  160. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  161. package/src/llama.cpp/src/llama-graph.h +26 -6
  162. package/src/llama.cpp/src/llama-hparams.h +13 -0
  163. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  164. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  165. package/src/llama.cpp/src/llama-memory.h +1 -1
  166. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  167. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  168. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  169. package/src/llama.cpp/src/llama-model.cpp +1544 -291
  170. package/src/llama.cpp/src/llama-model.h +13 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  172. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  173. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  174. package/src/llama.cpp/src/llama.cpp +1 -1
  175. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  176. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  177. package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
  178. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  179. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  180. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  181. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  182. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  183. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  184. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  185. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  186. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  188. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  189. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  190. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  191. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  192. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  193. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  203. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -1,30 +0,0 @@
- file(GLOB SRC_FILES
-     get_row_f32.cpp
-     get_row_f16.cpp
-     get_row_q4_0.cpp
-     get_row_q8_0.cpp
-     quantize_f32_q8_0.cpp
-     quantize_f16_q8_0.cpp
-     quantize_float_to_q4_0.cpp
-     dup.cpp
- )
-
- set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
- set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
-
- if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-     set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
- elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
-     set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
- else()
-     message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
- endif()
- include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
-
- ascendc_library(ascendc_kernels STATIC
-     ${SRC_FILES}
- )
-
- message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
- ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
- # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -1,19 +0,0 @@
- #ifndef ASCENDC_KERNELS_H
- #define ASCENDC_KERNELS_H
-
- #include "aclrtlaunch_ascendc_get_row_f32.h"
- #include "aclrtlaunch_ascendc_get_row_f16.h"
- #include "aclrtlaunch_ascendc_get_row_q8_0.h"
- #include "aclrtlaunch_ascendc_get_row_q4_0.h"
-
- #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
- #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
- #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
- #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
-
- #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
- #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
- #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
- #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
-
- #endif // ASCENDC_KERNELS_H
package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp
@@ -1,234 +0,0 @@
- #include "kernel_operator.h"
-
- using namespace AscendC;
-
- #define BUFFER_NUM 2
- const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535
-
- template <typename SRC_T, typename DST_T>
- class DupByRows {
-   public:
-     __aicore__ inline DupByRows() {}
-     __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
-                                 size_t *input_nb_ub) {
-         /* Dup by rows when src is contigous on first dimension and dst is
-         contiguous, each kernel process one row.
-         */
-
-         // Input has four dims.
-         int64_t op_block_num = GetBlockNum();
-         int64_t op_block_idx = GetBlockIdx();
-
-         // param
-         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
-         num_elem = input_ne_ub[0];
-
-         // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
-         idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
-         idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
-                   / (input_ne_ub[1]);
-         idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
-                   - idx_ne2 * input_ne_ub[1];
-
-         // src may not contiguous in dim [1,2,3], so stride decited by ne&nb
-         src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
-                      + input_nb_ub[1] * idx_ne1;
-
-         // dst is contiguous
-         dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
-
-         src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
-                                                                 src_stride));
-         dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
-                                                                 dst_stride));
-
-         pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
-                                                 32 - 1) / 32 * 32);
-         pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
-                                                 32 - 1) / 32 * 32);
-     }
-
-     __aicore__ inline void copy_in() {
-         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-         const size_t elem_per_block = 32 / sizeof(SRC_T);
-         size_t tail = num_elem % elem_per_block;
-         size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
-         DataCopy(src_local, src_gm, cpy_elements_len);
-         src_queue.EnQue(src_local);
-     }
-
-     __aicore__ inline void copy_out() {
-         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
- #ifdef ASCEND_310P
-         const size_t elem_per_block = 32 / sizeof(DST_T);
-         size_t tail = num_elem % elem_per_block;
-         size_t len = num_elem & ~(elem_per_block - 1);
-         if (len > 0) {
-             DataCopy(dst_gm, dst_local, len);
-         }
-         if(tail != 0) {
-             for (size_t i = tail; i < elem_per_block; i++) {
-                 dst_local[len + i].SetValue(0, 0);
-             }
-             SetAtomicAdd<float>();
-             DataCopy(dst_gm[len], dst_local[len], elem_per_block);
-             SetAtomicNone();
-         }
- #else
-         DataCopyExtParams dataCopyParams;
-         dataCopyParams.blockCount = 1;
-         dataCopyParams.blockLen = num_elem * sizeof(DST_T);
-         DataCopyPad(dst_gm, dst_local, dataCopyParams);
- #endif
-         dst_queue.FreeTensor(dst_local);
-     }
-
-     __aicore__ inline void dup() {
-         // main process, copy one row data from src to dst.
-         copy_in();
-
-         LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
-         LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
-         int32_t BLOCK_NUM = 32 / sizeof(DST_T);
-         DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
-                                         / BLOCK_NUM * BLOCK_NUM);
-         dst_queue.EnQue<DST_T>(dst_local);
-
-         src_queue.FreeTensor(src_local);
-         copy_out();
-     }
-
-     __aicore__ inline void dup_with_cast() {
-         // main process, copy one row data from src to dst.
-         // cast dtype from src to dst.
-         copy_in();
-
-         LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
-         LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
-         Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
-         dst_queue.EnQue<DST_T>(dst_local);
-
-         src_queue.FreeTensor(src_local);
-         copy_out();
-     }
-
-   private:
-
-     TPipe pipe;
-     GlobalTensor<SRC_T> src_gm;
-     GlobalTensor<DST_T> dst_gm;
-
-     int64_t num_rows;
-     int64_t num_elem;
-     int64_t idx_ne3;
-     int64_t idx_ne2;
-     int64_t idx_ne1;
-     int64_t src_stride;
-     int64_t dst_stride;
-
-     TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
-     TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
- };
-
- template <typename T>
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-     auto gm_ptr = (__gm__ uint8_t *)gm;
-     auto ub_ptr = (uint8_t *)(ub);
-     for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-         *ub_ptr = *gm_ptr;
-     }
- }
-
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
-     GM_ADDR src_gm,
-     GM_ADDR dst_gm,
-     GM_ADDR input_ne_gm,
-     GM_ADDR input_nb_gm,
-     GM_ADDR output_ne_gm,
-     GM_ADDR output_nb_gm) {
-
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     DupByRows<half, half> op;
-     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-     op.dup();
- }
-
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
-     GM_ADDR src_gm,
-     GM_ADDR dst_gm,
-     GM_ADDR input_ne_gm,
-     GM_ADDR input_nb_gm,
-     GM_ADDR output_ne_gm,
-     GM_ADDR output_nb_gm) {
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     DupByRows<float, float> op;
-     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-     op.dup();
- }
-
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
-     GM_ADDR src_gm,
-     GM_ADDR dst_gm,
-     GM_ADDR input_ne_gm,
-     GM_ADDR input_nb_gm,
-     GM_ADDR output_ne_gm,
-     GM_ADDR output_nb_gm) {
-
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     DupByRows<float, half> op;
-     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-     op.dup_with_cast();
- }
-
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
-     GM_ADDR src_gm,
-     GM_ADDR dst_gm,
-     GM_ADDR input_ne_gm,
-     GM_ADDR input_nb_gm,
-     GM_ADDR output_ne_gm,
-     GM_ADDR output_nb_gm) {
-
-     // copy params from gm to ub.
-     int64_t input_ne_ub[4];
-     size_t input_nb_ub[4];
-     int64_t output_ne_ub[4];
-     size_t output_nb_ub[4];
-
-     copy_to_ub(input_ne_gm, input_ne_ub, 32);
-     copy_to_ub(input_nb_gm, input_nb_ub, 32);
-     copy_to_ub(output_ne_gm, output_ne_ub, 32);
-     copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-     DupByRows<half, float> op;
-     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-     op.dup_with_cast();
- }
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- class GET_ROW_F16 {
9
- public:
10
- __aicore__ inline GET_ROW_F16() {}
11
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
12
- int64_t *input_ne_ub, size_t *input_nb_ub,
13
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
- int64_t *output_ne_ub, size_t *output_nb_ub) {
15
- // TODO, use template for F16/f32
16
- int64_t op_block_num = GetBlockNum();
17
- op_block_idx = GetBlockIdx();
18
-
19
- for (int i = 0; i < 4; i++) {
20
- input_ne[i] = input_ne_ub[i];
21
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
22
-
23
- indices_ne[i] = indices_ne_ub[i];
24
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
25
-
26
- output_ne[i] = output_ne_ub[i];
27
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
28
- }
29
-
30
- // Indices has two dims. n_elements = all rows should get.
31
- // dr, all rows should this thread get.
32
- uint64_t n_elements =
33
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
34
- dr = n_elements / op_block_num;
35
-
36
- uint64_t tails = n_elements % op_block_num;
37
- if (op_block_idx < tails) {
38
- dr += 1;
39
- ir = dr * op_block_idx;
40
- } else {
41
- ir = dr * op_block_idx + tails;
42
- }
43
-
44
- input_gm.SetGlobalBuffer((__gm__ half *)input);
45
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
46
- output_gm.SetGlobalBuffer((__gm__ float *)output);
47
-
48
- uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
49
- & ~31);
50
- uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
51
- & ~31);
52
-
53
- local_buffer_elems = input_local_buffer_size / sizeof(half);
54
-
55
- // TODO, consider long row that can't put in UB.
56
- // All data should asign to 32. It's ok because all data is align to 32.
57
- pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
58
- pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
59
- }
60
-
61
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
62
- size_t origin_len = len;
63
- LocalTensor<half> input_local = input_queue.AllocTensor<half>();
64
- const size_t elem_per_block = 32 / sizeof(half);
65
- size_t tail = len % elem_per_block;
66
- len = len & ~(elem_per_block - 1);
67
- if(tail != 0) {
68
- len += elem_per_block;
69
- }
70
- DataCopy(input_local, input_gm[offset], len);
71
- input_queue.EnQue(input_local);
72
- }
73
-
74
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
75
- LocalTensor<float> output_local = output_queue.DeQue<float>();
76
- const size_t elem_per_block = 32 / sizeof(float);
77
- size_t tail = len % elem_per_block;
78
- len = len & ~(elem_per_block - 1);
79
- if (len > 0) {
80
- DataCopy(output_gm[offset], output_local, len);
81
- }
82
-
83
- if(tail != 0) {
84
- #ifdef ASCEND_310P
85
- for (size_t i = tail; i < elem_per_block; i++) {
86
- output_local[len + i].SetValue(0, 0);
87
- }
88
- SetAtomicAdd<float>();
89
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
90
- SetAtomicNone();
91
- #else
92
- DataCopyExtParams dataCopyParams;
93
- dataCopyParams.blockCount = 1;
94
- dataCopyParams.blockLen = tail * sizeof(float);
95
- DataCopyPad(output_gm[offset + len], output_local[len],
96
- dataCopyParams);
97
- #endif
98
- }
99
- output_queue.FreeTensor(output_local);
100
- }
101
-
102
- __aicore__ inline void calculate_row(int64_t idx) {
103
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
104
- const int64_t indices_ne1_idx =
105
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
106
- indices_ne[0];
107
- const int64_t indices_ne0_idx =
108
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
109
- indices_ne1_idx * indices_ne[0]);
110
-
111
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
112
- indices_ne1_idx * indices_stride[1] +
113
- indices_ne2_idx * indices_stride[2];
114
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
115
-
116
- const int64_t input_offset = selected_row_idx * input_stride[1] +
117
- indices_ne1_idx * input_stride[2] +
118
- indices_ne2_idx * input_stride[3];
119
-
120
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
121
- indices_ne1_idx * output_stride[2] +
122
- indices_ne2_idx * output_stride[3];
123
-
124
- copy_in(input_offset, input_ne[0]);
125
- LocalTensor<half> input_local = input_queue.DeQue<half>();
126
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
127
-
128
- Cast(output_local, input_local, RoundMode::CAST_NONE,
129
- local_buffer_elems);
130
- output_queue.EnQue(output_local);
131
- copy_out(output_offset, input_ne[0]);
132
-
133
- input_queue.FreeTensor(input_local);
134
- }
135
-
136
- __aicore__ inline void calculate() {
137
- for (int64_t i = ir; i < ir + dr; i++) {
138
- calculate_row(i);
139
- }
140
- }
141
-
142
- private:
143
- int64_t input_ne[4];
144
- size_t input_stride[4];
145
-
146
- int64_t indices_ne[4];
147
- size_t indices_stride[4];
148
-
149
- int64_t output_ne[4];
150
- size_t output_stride[4];
151
-
152
- size_t local_buffer_elems;
153
-
154
- int64_t ir;
155
- int64_t dr;
156
-
157
- TPipe pipe;
158
- GlobalTensor<half> input_gm;
159
- GlobalTensor<int32_t> indices_gm;
160
- GlobalTensor<float> output_gm;
161
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
162
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
163
- int64_t op_block_idx;
164
- };
165
-
166
- template <typename T>
167
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
168
- auto gm_ptr = (__gm__ uint8_t *)gm;
169
- auto ub_ptr = (uint8_t *)(ub);
170
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
171
- *ub_ptr = *gm_ptr;
172
- }
173
- }
174
-
175
- extern "C" __global__ __aicore__ void ascendc_get_row_f16(
176
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
177
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
178
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
179
- int64_t input_ne_ub[4];
180
- size_t input_nb_ub[4];
181
- int64_t indices_ne_ub[4];
182
- size_t indices_nb_ub[4];
183
- int64_t output_ne_ub[4];
184
- size_t output_nb_ub[4];
185
-
186
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
187
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
188
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
189
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
190
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
191
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
192
-
193
- GET_ROW_F16 op;
194
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
195
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
196
- op.calculate();
197
- }
@@ -1,190 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- class GET_ROW_F32 {
9
- public:
10
- __aicore__ inline GET_ROW_F32() {}
11
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
12
- int64_t *input_ne_ub, size_t *input_nb_ub,
13
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
- int64_t *output_ne_ub, size_t *output_nb_ub) {
15
- int64_t op_block_num = GetBlockNum();
16
- op_block_idx = GetBlockIdx();
17
-
18
- for (int i = 0; i < 4; i++) {
19
- input_ne[i] = input_ne_ub[i];
20
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
21
-
22
- indices_ne[i] = indices_ne_ub[i];
23
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
24
-
25
- output_ne[i] = output_ne_ub[i];
26
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
27
- }
28
-
29
- // Indices has two dims. n_elements = all rows should get.
30
- // dr, all rows should this thread get.
31
- uint64_t n_elements =
32
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
33
- dr = n_elements / op_block_num;
34
-
35
- uint64_t tails = n_elements % op_block_num;
36
- if (op_block_idx < tails) {
37
- dr += 1;
38
- ir = dr * op_block_idx;
39
- } else {
40
- ir = dr * op_block_idx + tails;
41
- }
42
-
43
- input_gm.SetGlobalBuffer((__gm__ float *)input);
44
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
45
- output_gm.SetGlobalBuffer((__gm__ float *)output);
46
-
47
- uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
48
- local_buffer_elems = local_buffer_size / sizeof(float);
49
-
50
- // TODO, consider long row that can't put in UB.
51
- // All data should asign to 32. It's ok because all data is align to 32.
52
- pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
53
- pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
54
- }
55
-
56
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
57
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
58
- const size_t elem_per_block = 32 / sizeof(float);
59
- size_t tail = len % elem_per_block;
60
- len = len & ~(elem_per_block - 1);
61
- if(tail != 0) {
62
- len += elem_per_block;
63
- }
64
- DataCopy(input_local, input_gm[offset], len);
65
- input_queue.EnQue(input_local);
66
- }
67
-
68
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
69
- LocalTensor<float> output_local = output_queue.DeQue<float>();
70
- const size_t elem_per_block = 32 / sizeof(float);
71
- size_t tail = len % elem_per_block;
72
- len = len & ~(elem_per_block - 1);
73
- if (len > 0) {
74
- DataCopy(output_gm[offset], output_local, len);
75
- }
76
-
77
- if(tail != 0) {
78
- #ifdef ASCEND_310P
79
- for (size_t i = tail; i < elem_per_block; i++) {
80
- output_local[len + i].SetValue(0, 0);
81
- }
82
- SetAtomicAdd<float>();
83
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
84
- SetAtomicNone();
85
- #else
86
- DataCopyExtParams dataCopyParams;
87
- dataCopyParams.blockCount = 1;
88
- dataCopyParams.blockLen = tail * sizeof(float);
89
- DataCopyPad(output_gm[offset + len], output_local[len],
90
- dataCopyParams);
91
- #endif
92
- }
93
- output_queue.FreeTensor(output_local);
94
- }
95
-
96
- __aicore__ inline void calculate_row(int64_t idx) {
97
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
98
- const int64_t indices_ne1_idx =
99
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
100
- indices_ne[0];
101
- const int64_t indices_ne0_idx =
102
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
103
- indices_ne1_idx * indices_ne[0]);
104
-
105
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
106
- indices_ne1_idx * indices_stride[1] +
107
- indices_ne2_idx * indices_stride[2];
108
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
109
-
110
- const int64_t input_offset = selected_row_idx * input_stride[1] +
111
- indices_ne1_idx * input_stride[2] +
112
- indices_ne2_idx * input_stride[3];
113
-
114
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
115
- indices_ne1_idx * output_stride[2] +
116
- indices_ne2_idx * output_stride[3];
117
-
118
- copy_in(input_offset, input_ne[0]);
119
- LocalTensor<float> input_local = input_queue.DeQue<float>();
120
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
121
-
122
- DataCopy(output_local, input_local, local_buffer_elems);
123
- output_queue.EnQue(output_local);
124
- copy_out(output_offset, input_ne[0]);
125
-
126
- input_queue.FreeTensor(input_local);
127
- }
128
-
129
- __aicore__ inline void calculate() {
130
- for (int64_t i = ir; i < ir + dr; i++) {
131
- calculate_row(i);
132
- }
133
- }
134
-
135
- private:
136
- int64_t input_ne[4];
137
- size_t input_stride[4];
138
-
139
- int64_t indices_ne[4];
140
- size_t indices_stride[4];
141
-
142
- int64_t output_ne[4];
143
- size_t output_stride[4];
144
-
145
- size_t local_buffer_elems;
146
-
147
- int64_t ir;
148
- int64_t dr;
149
-
150
- TPipe pipe;
151
- GlobalTensor<float> input_gm;
152
- GlobalTensor<int32_t> indices_gm;
153
- GlobalTensor<float> output_gm;
154
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
155
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
156
- int64_t op_block_idx;
157
- };
158
-
159
- template <typename T>
160
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
161
- auto gm_ptr = (__gm__ uint8_t *)gm;
162
- auto ub_ptr = (uint8_t *)(ub);
163
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
164
- *ub_ptr = *gm_ptr;
165
- }
166
- }
167
-
168
- extern "C" __global__ __aicore__ void ascendc_get_row_f32(
169
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
170
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
171
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
172
- int64_t input_ne_ub[4];
173
- size_t input_nb_ub[4];
174
- int64_t indices_ne_ub[4];
175
- size_t indices_nb_ub[4];
176
- int64_t output_ne_ub[4];
177
- size_t output_nb_ub[4];
178
-
179
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
180
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
181
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
182
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
183
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
184
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
185
-
186
- GET_ROW_F32 op;
187
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
188
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
189
- op.calculate();
190
- }
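Both deleted get_row kernels above split the total row count across the launched blocks with the same remainder-balancing scheme: every block gets n/blocks rows, and the first n%blocks blocks take one extra, so block idx starts at a closed-form offset ir. A standalone C++ sketch of that partition, for illustration only (the function name is ours):

#include <cstdint>
#include <cstdio>

// Row partition used by GET_ROW_F16/GET_ROW_F32: block `idx` of `blocks`
// processes the half-open row range [ir, ir + dr).
void partition_rows(int64_t n_rows, int64_t blocks, int64_t idx,
                    int64_t *ir, int64_t *dr) {
    *dr = n_rows / blocks;
    int64_t tails = n_rows % blocks;
    if (idx < tails) {
        *dr += 1;
        *ir = *dr * idx;
    } else {
        *ir = *dr * idx + tails;
    }
}

int main() {
    // 10 rows over 4 blocks -> ranges [0,3) [3,6) [6,8) [8,10): no gaps.
    for (int64_t idx = 0; idx < 4; idx++) {
        int64_t ir, dr;
        partition_rows(10, 4, idx, &ir, &dr);
        printf("block %lld: rows [%lld, %lld)\n",
               (long long) idx, (long long) ir, (long long) (ir + dr));
    }
    return 0;
}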