@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -108,7 +108,7 @@ static void ggml_compute_forward_dup_f16(
108
108
  for (int i01 = ir0; i01 < ir1; i01++) {
109
109
  const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
110
110
  for (int i00 = 0; i00 < ne00; i00++) {
111
- dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
111
+ dst_ptr[id] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
112
112
  id++;
113
113
  }
114
114
  }
@@ -130,7 +130,7 @@ static void ggml_compute_forward_dup_f16(
130
130
  const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
131
131
 
132
132
  for (int i00 = 0; i00 < ne00; i00++) {
133
- src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
133
+ src0_f32[i00] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
134
134
  }
135
135
 
136
136
  quantize_row_q(src0_f32, dst_ptr + id, ne00);
@@ -156,7 +156,7 @@ static void ggml_compute_forward_dup_f16(
156
156
  for (int i00 = 0; i00 < ne00; i00++) {
157
157
  const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
158
158
 
159
- dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
159
+ dst_ptr[id] = GGML_CPU_FP16_TO_FP32(*src0_ptr);
160
160
  id++;
161
161
  }
162
162
  }
@@ -267,7 +267,7 @@ static void ggml_compute_forward_dup_f16(
267
267
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
268
268
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
269
269
 
270
- *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
270
+ *(float *) dst_ptr = GGML_CPU_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
271
271
 
272
272
  if (++i10 == ne0) {
273
273
  i10 = 0;
@@ -372,7 +372,7 @@ static void ggml_compute_forward_dup_bf16(
372
372
  for (int i01 = ir0; i01 < ir1; i01++) {
373
373
  const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
374
374
  for (int i00 = 0; i00 < ne00; i00++) {
375
- dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
375
+ dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
376
376
  id++;
377
377
  }
378
378
  }
@@ -473,7 +473,7 @@ static void ggml_compute_forward_dup_bf16(
473
473
  for (int i00 = 0; i00 < ne00; i00++) {
474
474
  const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
475
475
 
476
- dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
476
+ dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
477
477
  id++;
478
478
  }
479
479
  }
@@ -566,7 +566,7 @@ static void ggml_compute_forward_dup_bf16(
566
566
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
567
567
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
568
568
 
569
- *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
569
+ *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
570
570
 
571
571
  if (++i10 == ne0) {
572
572
  i10 = 0;
@@ -696,24 +696,8 @@ static void ggml_compute_forward_dup_f32(
696
696
  if (ggml_is_contiguous(dst)) {
697
697
  // TODO: simplify
698
698
  if (nb00 == sizeof(float)) {
699
- if (dst->type == GGML_TYPE_F32) {
700
- size_t id = 0;
701
- const size_t rs = ne00 * nb00;
702
- char * dst_ptr = (char *) dst->data;
703
-
704
- for (int i03 = 0; i03 < ne03; i03++) {
705
- for (int i02 = 0; i02 < ne02; i02++) {
706
- id += rs * ir0;
707
- for (int i01 = ir0; i01 < ir1; i01++) {
708
- const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
709
- memcpy(dst_ptr + id, src0_ptr, rs);
710
- id += rs;
711
- }
712
- id += rs * (ne01 - ir1);
713
- }
714
- }
715
- } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
716
- ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
699
+ if (ggml_get_type_traits_cpu(dst->type)->from_float) {
700
+ ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
717
701
 
718
702
  size_t id = 0;
719
703
  size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
@@ -724,7 +708,7 @@ static void ggml_compute_forward_dup_f32(
724
708
  id += rs * ir0;
725
709
  for (int i01 = ir0; i01 < ir1; i01++) {
726
710
  const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
727
- quantize_row_q(src0_ptr, dst_ptr + id, ne00);
711
+ from_float(src0_ptr, dst_ptr + id, ne00);
728
712
  id += rs;
729
713
  }
730
714
  id += rs * (ne01 - ir1);
@@ -765,7 +749,7 @@ static void ggml_compute_forward_dup_f32(
765
749
  for (int i00 = 0; i00 < ne00; i00++) {
766
750
  const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
767
751
 
768
- dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
752
+ dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr);
769
753
  id++;
770
754
  }
771
755
  }
@@ -878,7 +862,7 @@ static void ggml_compute_forward_dup_f32(
878
862
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
879
863
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
880
864
 
881
- *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
865
+ *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr);
882
866
 
883
867
  if (++i10 == ne0) {
884
868
  i10 = 0;
@@ -1419,7 +1403,7 @@ static void ggml_compute_forward_add1_f16_f32(
1419
1403
  ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
1420
1404
  ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
1421
1405
  for (int i = 0; i < ne0; i++) {
1422
- dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
1406
+ dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v);
1423
1407
  }
1424
1408
  }
1425
1409
  }
@@ -1435,7 +1419,7 @@ static void ggml_compute_forward_add1_f16_f16(
1435
1419
  GGML_ASSERT(ggml_is_scalar(src1));
1436
1420
 
1437
1421
  // scalar to add
1438
- const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
1422
+ const float v = GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
1439
1423
 
1440
1424
  const int ith = params->ith;
1441
1425
  const int nth = params->nth;
@@ -1467,7 +1451,7 @@ static void ggml_compute_forward_add1_f16_f16(
1467
1451
  ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
1468
1452
  ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
1469
1453
  for (int i = 0; i < ne0; i++) {
1470
- dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
1454
+ dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v);
1471
1455
  }
1472
1456
  }
1473
1457
  }
@@ -1889,7 +1873,7 @@ static void ggml_compute_forward_sum_f16(
1889
1873
  }
1890
1874
  }
1891
1875
  }
1892
- ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
1876
+ ((ggml_fp16_t *) dst->data)[0] = GGML_CPU_FP32_TO_FP16(sum);
1893
1877
  }
1894
1878
 
1895
1879
  static void ggml_compute_forward_sum_bf16(
@@ -2300,6 +2284,12 @@ void ggml_compute_forward_repeat(
2300
2284
  {
2301
2285
  ggml_compute_forward_repeat_f32(params, dst);
2302
2286
  } break;
2287
+ // TODO: templateify the implemenation and support for I64
2288
+ // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
2289
+ //case GGML_TYPE_I64:
2290
+ // {
2291
+ // ggml_compute_forward_repeat_i64(params, dst);
2292
+ // } break;
2303
2293
  default:
2304
2294
  {
2305
2295
  GGML_ABORT("fatal error");
@@ -2660,7 +2650,7 @@ static void ggml_compute_forward_gelu_f16(
2660
2650
  #ifndef NDEBUG
2661
2651
  for (int k = 0; k < nc; k++) {
2662
2652
  const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
2663
- const float v = GGML_FP16_TO_FP32(x);
2653
+ const float v = GGML_CPU_FP16_TO_FP32(x);
2664
2654
  GGML_UNUSED(v);
2665
2655
  assert(!isnan(v));
2666
2656
  assert(!isinf(v));
@@ -2763,7 +2753,7 @@ static void ggml_compute_forward_gelu_erf_f16(
2763
2753
  #ifndef NDEBUG
2764
2754
  for (int k = 0; k < nc; k++) {
2765
2755
  const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
2766
- const float v = GGML_FP16_TO_FP32(x);
2756
+ const float v = GGML_CPU_FP16_TO_FP32(x);
2767
2757
  GGML_UNUSED(v);
2768
2758
  assert(!isnan(v));
2769
2759
  assert(!isinf(v));
@@ -2866,7 +2856,7 @@ static void ggml_compute_forward_gelu_quick_f16(
2866
2856
  #ifndef NDEBUG
2867
2857
  for (int k = 0; k < nc; k++) {
2868
2858
  const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
2869
- const float v = GGML_FP16_TO_FP32(x);
2859
+ const float v = GGML_CPU_FP16_TO_FP32(x);
2870
2860
  GGML_UNUSED(v);
2871
2861
  assert(!isnan(v));
2872
2862
  assert(!isinf(v));
@@ -2969,7 +2959,7 @@ static void ggml_compute_forward_silu_f16(
2969
2959
  #ifndef NDEBUG
2970
2960
  for (int k = 0; k < nc; k++) {
2971
2961
  const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k];
2972
- const float v = GGML_FP16_TO_FP32(x);
2962
+ const float v = GGML_CPU_FP16_TO_FP32(x);
2973
2963
  GGML_UNUSED(v);
2974
2964
  assert(!isnan(v));
2975
2965
  assert(!isinf(v));
@@ -3163,7 +3153,7 @@ static void ggml_compute_forward_silu_back_f16(
3163
3153
  #ifndef NDEBUG
3164
3154
  for (int k = 0; k < nc; k++) {
3165
3155
  const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
3166
- const float v = GGML_FP16_TO_FP32(x);
3156
+ const float v = GGML_CPU_FP16_TO_FP32(x);
3167
3157
  GGML_UNUSED(v);
3168
3158
  assert(!isnan(v));
3169
3159
  assert(!isinf(v));
@@ -4470,6 +4460,74 @@ void ggml_compute_forward_get_rows(
4470
4460
  //}
4471
4461
  }
4472
4462
 
4463
+ static void ggml_compute_forward_set_rows_f32(
4464
+ const ggml_compute_params * params,
4465
+ ggml_tensor * dst) {
4466
+
4467
+ const ggml_tensor * src0 = dst->src[0];
4468
+ const ggml_tensor * src1 = dst->src[1];
4469
+
4470
+ GGML_TENSOR_BINARY_OP_LOCALS
4471
+
4472
+ const int64_t nc = ne00;
4473
+ const int64_t nr = ne01;
4474
+
4475
+ assert(ne0 == nc);
4476
+ assert(ne2 == ne02);
4477
+ assert(ne3 == ne03);
4478
+ assert(src0->type == GGML_TYPE_F32);
4479
+ assert(ne02 % ne11 == 0);
4480
+ assert(ne03 % ne12 == 0);
4481
+
4482
+ const int ith = params->ith;
4483
+ const int nth = params->nth;
4484
+
4485
+ // rows per thread
4486
+ const int64_t dr = (nr + nth - 1)/nth;
4487
+
4488
+ // row range for this thread
4489
+ const int64_t ir0 = dr*ith;
4490
+ const int64_t ir1 = std::min(ir0 + dr, nr);
4491
+
4492
+ ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
4493
+
4494
+ for (int64_t i03 = 0; i03 < ne03; ++i03) {
4495
+ for (int64_t i02 = 0; i02 < ne02; ++i02) {
4496
+ for (int64_t i = ir0; i < ir1; ++i) {
4497
+ const int64_t i12 = i03%ne12;
4498
+ const int64_t i11 = i02%ne11;
4499
+ const int64_t i10 = i;
4500
+
4501
+ const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
4502
+
4503
+ GGML_ASSERT(i1 >= 0 && i1 < ne1);
4504
+
4505
+ from_float(
4506
+ (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03),
4507
+ ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc);
4508
+ }
4509
+ }
4510
+ }
4511
+ }
4512
+
4513
+ void ggml_compute_forward_set_rows(
4514
+ const ggml_compute_params * params,
4515
+ ggml_tensor * dst) {
4516
+
4517
+ const ggml_tensor * src0 = dst->src[0];
4518
+
4519
+ switch (src0->type) {
4520
+ case GGML_TYPE_F32:
4521
+ {
4522
+ ggml_compute_forward_set_rows_f32(params, dst);
4523
+ } break;
4524
+ default:
4525
+ {
4526
+ GGML_ABORT("src0->type = %d (%s) not supported", src0->type, ggml_type_name(src0->type));
4527
+ }
4528
+ }
4529
+ }
4530
+
4473
4531
  // ggml_compute_forward_get_rows_back
4474
4532
 
4475
4533
  static void ggml_compute_forward_get_rows_back_f32_f16(
@@ -4500,7 +4558,7 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
4500
4558
 
4501
4559
  for (int j = 0; j < nc; ++j) {
4502
4560
  ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
4503
- ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
4561
+ ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v);
4504
4562
  }
4505
4563
  }
4506
4564
  }
@@ -4792,7 +4850,7 @@ static void ggml_compute_forward_soft_max_f32(
4792
4850
  if (mp_f32) {
4793
4851
  if (use_f16) {
4794
4852
  for (int i = 0; i < nc; ++i) {
4795
- wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
4853
+ wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]);
4796
4854
  }
4797
4855
  } else {
4798
4856
  for (int i = 0; i < nc; ++i) {
@@ -5018,8 +5076,8 @@ static void ggml_compute_forward_clamp_f16(
5018
5076
  ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
5019
5077
 
5020
5078
  for (int i = 0; i < nc; i++) {
5021
- float v = GGML_FP16_TO_FP32(src0_ptr[i]);
5022
- dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min));
5079
+ float v = GGML_CPU_FP16_TO_FP32(src0_ptr[i]);
5080
+ dst_ptr[i] = GGML_CPU_FP32_TO_FP16(MAX(MIN(v, max), min));
5023
5081
  }
5024
5082
  }
5025
5083
  }
@@ -5476,11 +5534,11 @@ static void ggml_compute_forward_rope_f16(
5476
5534
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
5477
5535
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
5478
5536
 
5479
- const float x0 = GGML_FP16_TO_FP32(src[0]);
5480
- const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
5537
+ const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
5538
+ const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
5481
5539
 
5482
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5483
- dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5540
+ dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5541
+ dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5484
5542
  }
5485
5543
  } else {
5486
5544
  for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
@@ -5492,11 +5550,11 @@ static void ggml_compute_forward_rope_f16(
5492
5550
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
5493
5551
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
5494
5552
 
5495
- const float x0 = GGML_FP16_TO_FP32(src[0]);
5496
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
5553
+ const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
5554
+ const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]);
5497
5555
 
5498
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5499
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5556
+ dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5557
+ dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5500
5558
  }
5501
5559
  }
5502
5560
  } else {
@@ -5507,11 +5565,11 @@ static void ggml_compute_forward_rope_f16(
5507
5565
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
5508
5566
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
5509
5567
 
5510
- const float x0 = GGML_FP16_TO_FP32(src[0]);
5511
- const float x1 = GGML_FP16_TO_FP32(src[1]);
5568
+ const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
5569
+ const float x1 = GGML_CPU_FP16_TO_FP32(src[1]);
5512
5570
 
5513
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5514
- dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5571
+ dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5572
+ dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5515
5573
  }
5516
5574
  }
5517
5575
 
@@ -5525,11 +5583,11 @@ static void ggml_compute_forward_rope_f16(
5525
5583
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
5526
5584
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
5527
5585
 
5528
- const float x0 = GGML_FP16_TO_FP32(src[0]);
5529
- const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
5586
+ const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
5587
+ const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
5530
5588
 
5531
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5532
- dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5589
+ dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5590
+ dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5533
5591
  }
5534
5592
  } else {
5535
5593
  for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
@@ -5640,7 +5698,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
5640
5698
  for (int64_t i11 = 0; i11 < ne11; i11++) {
5641
5699
  const float * const src = (float *)((char *) src1->data + i11*nb11);
5642
5700
  for (int64_t i10 = 0; i10 < ne10; i10++) {
5643
- dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
5701
+ dst_data[i10*ne11 + i11] = GGML_CPU_FP32_TO_FP16(src[i10]);
5644
5702
  }
5645
5703
  }
5646
5704
  }
@@ -5933,7 +5991,7 @@ static void ggml_compute_forward_im2col_f16(
5933
5991
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5934
5992
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
5935
5993
  } else {
5936
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
5994
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]);
5937
5995
  }
5938
5996
  }
5939
5997
  }
@@ -6109,7 +6167,7 @@ void ggml_compute_forward_conv_transpose_2d(
6109
6167
  const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
6110
6168
  ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
6111
6169
  for (int i10 = 0; i10 < ne10; i10++) {
6112
- dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
6170
+ dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
6113
6171
  }
6114
6172
  }
6115
6173
  }
@@ -6358,7 +6416,7 @@ static void ggml_compute_forward_pool_1d_sk_p0(
6358
6416
  case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
6359
6417
  }
6360
6418
  for (int ki = 0; ki < k; ++ki) {
6361
- const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
6419
+ const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
6362
6420
  switch (op) {
6363
6421
  case GGML_OP_POOL_AVG: drow[i] += srow_j; break;
6364
6422
  case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break;
@@ -6450,7 +6508,7 @@ void ggml_compute_forward_pool_2d(
6450
6508
  for (int kx = 0; kx < k0; ++kx) {
6451
6509
  int j = ix + kx;
6452
6510
  if (j < 0 || j >= src->ne[0]) continue;
6453
- const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
6511
+ const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
6454
6512
  switch (op) {
6455
6513
  case GGML_OP_POOL_AVG: *out += srow_j; break;
6456
6514
  case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break;
@@ -6538,7 +6596,7 @@ void ggml_compute_forward_pool_2d_back(
6538
6596
  }
6539
6597
 
6540
6598
  const float val = dst->type == GGML_TYPE_F32 ?
6541
- ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
6599
+ ((const float *) drowf)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
6542
6600
  if (val <= maxval) {
6543
6601
  continue;
6544
6602
  }
@@ -6558,7 +6616,7 @@ void ggml_compute_forward_pool_2d_back(
6558
6616
  if (dst->type == GGML_TYPE_F32) {
6559
6617
  ((float *) drow)[j] += grad0;
6560
6618
  } else {
6561
- ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
6619
+ ((ggml_fp16_t *) drow)[j] = GGML_CPU_FP32_TO_FP16(grad0 + GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
6562
6620
  }
6563
6621
  } else if (op == GGML_OP_POOL_AVG) {
6564
6622
  const float grad = grad0 / ka;
@@ -6577,7 +6635,7 @@ void ggml_compute_forward_pool_2d_back(
6577
6635
  if (dst->type == GGML_TYPE_F32) {
6578
6636
  ((float *) drow)[j] += grad;
6579
6637
  } else {
6580
- ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad);
6638
+ ((ggml_fp16_t *) drow)[j] += GGML_CPU_FP32_TO_FP16(grad);
6581
6639
  }
6582
6640
  }
6583
6641
  }
@@ -6793,6 +6851,73 @@ void ggml_compute_forward_pad_reflect_1d(
6793
6851
  }
6794
6852
  }
6795
6853
 
6854
+ // ggml_compute_forward_roll
6855
+
6856
+ static int64_t ggml_wrap_index(int64_t i, int64_t ne) {
6857
+ if (i < 0) {
6858
+ return i + ne;
6859
+ } else if (i >= ne) {
6860
+ return i - ne;
6861
+ }
6862
+ return i;
6863
+ }
6864
+
6865
+ static void ggml_compute_forward_roll_f32(
6866
+ const ggml_compute_params * params,
6867
+ ggml_tensor * dst) {
6868
+
6869
+ const ggml_tensor * src0 = dst->src[0];
6870
+ const float * src_data = (const float *) src0->data;
6871
+ float * dst_data = (float *) dst->data;
6872
+
6873
+ GGML_TENSOR_UNARY_OP_LOCALS
6874
+
6875
+ const int s0 = ggml_get_op_params_i32(dst, 0);
6876
+ const int s1 = ggml_get_op_params_i32(dst, 1);
6877
+ const int s2 = ggml_get_op_params_i32(dst, 2);
6878
+ const int s3 = ggml_get_op_params_i32(dst, 3);
6879
+
6880
+ const int64_t total = ne1 * ne2 * ne3;
6881
+ const int64_t per_thread = (total + params->nth) / params->nth;
6882
+ const int64_t start = params->ith * per_thread;
6883
+ const int64_t end = std::min(start + per_thread, total);
6884
+
6885
+ for (int64_t i = start; i < end; ++i) {
6886
+ const int64_t i1 = i % ne1;
6887
+ const int64_t i2 = (i / ne1) % ne2;
6888
+ const int64_t i3 = i / (ne2 * ne1);
6889
+ float * dst_row = dst_data + (i3*nb3 + i2*nb2 + i1*nb1) / sizeof(float);
6890
+
6891
+ const int64_t i01 = ggml_wrap_index(i1 - s1, ne01);
6892
+ const int64_t i02 = ggml_wrap_index(i2 - s2, ne02);
6893
+ const int64_t i03 = ggml_wrap_index(i3 - s3, ne03);
6894
+ const float * src_row = src_data + (i03*nb03 + i02*nb02 + i01*nb01) / sizeof(float);
6895
+
6896
+ const int64_t s = ggml_wrap_index(-s0, ne00);
6897
+ const int64_t n = ne00 - s;
6898
+ ggml_vec_cpy_f32(n, dst_row, src_row + s);
6899
+ ggml_vec_cpy_f32(s, dst_row + n, src_row);
6900
+ }
6901
+ }
6902
+
6903
+ void ggml_compute_forward_roll(
6904
+ const ggml_compute_params * params,
6905
+ ggml_tensor * dst) {
6906
+
6907
+ const ggml_tensor * src0 = dst->src[0];
6908
+
6909
+ switch (src0->type) {
6910
+ case GGML_TYPE_F32:
6911
+ {
6912
+ ggml_compute_forward_roll_f32(params, dst);
6913
+ } break;
6914
+ default:
6915
+ {
6916
+ GGML_ABORT("fatal error");
6917
+ }
6918
+ }
6919
+ }
6920
+
6796
6921
  // ggml_compute_forward_arange
6797
6922
 
6798
6923
  static void ggml_compute_forward_arange_f32(
@@ -7075,7 +7200,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
7075
7200
  // loop over n_kv and n_head_kv
7076
7201
  // ref: https://arxiv.org/pdf/2112.05682.pdf
7077
7202
  for (int64_t ic = 0; ic < nek1; ++ic) {
7078
- const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
7203
+ const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
7079
7204
  if (mv == -INFINITY) {
7080
7205
  continue;
7081
7206
  }
@@ -7143,7 +7268,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
7143
7268
 
7144
7269
  if (v->type == GGML_TYPE_F16) {
7145
7270
  for (int64_t d = 0; d < DV; ++d) {
7146
- VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
7271
+ VKQ32[d] = GGML_CPU_FP16_TO_FP32(VKQ16[d]);
7147
7272
  }
7148
7273
  }
7149
7274
 
@@ -53,6 +53,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str
53
53
  void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
54
54
  void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
55
55
  void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
56
+ void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
56
57
  void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
57
58
  void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
58
59
  void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -72,6 +73,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
72
73
  void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
73
74
  void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
74
75
  void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
76
+ void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
75
77
  void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
76
78
  void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
77
79
  void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);