@fugood/llama.node 1.0.0-beta.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53):
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +12 -0
  3. package/lib/index.js +10 -0
  4. package/lib/index.ts +17 -1
  5. package/package.json +14 -14
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +7 -3
  8. package/src/LlamaCompletionWorker.h +2 -0
  9. package/src/LlamaContext.cpp +49 -6
  10. package/src/LlamaContext.h +1 -0
  11. package/src/RerankWorker.h +26 -0
  12. package/src/common.hpp +1 -1
  13. package/src/llama.cpp/CMakeLists.txt +1 -1
  14. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  15. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  16. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  28. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  29. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  35. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  40. package/src/llama.cpp/include/llama.h +6 -3
  41. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  42. package/src/llama.cpp/src/llama-arch.h +17 -0
  43. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  44. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  45. package/src/llama.cpp/src/llama-context.cpp +0 -1
  46. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  47. package/src/llama.cpp/src/llama-graph.h +14 -2
  48. package/src/llama.cpp/src/llama-hparams.h +6 -0
  49. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  50. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  51. package/src/llama.cpp/src/llama-model.cpp +518 -1
  52. package/src/llama.cpp/src/llama-model.h +22 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +87 -5
@@ -108,7 +108,7 @@ static void ggml_compute_forward_dup_f16(
108
108
  for (int i01 = ir0; i01 < ir1; i01++) {
109
109
  const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
110
110
  for (int i00 = 0; i00 < ne00; i00++) {
111
- dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
111
+ dst_ptr[id] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
112
112
  id++;
113
113
  }
114
114
  }
@@ -130,7 +130,7 @@ static void ggml_compute_forward_dup_f16(
130
130
  const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
131
131
 
132
132
  for (int i00 = 0; i00 < ne00; i00++) {
133
- src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
133
+ src0_f32[i00] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
134
134
  }
135
135
 
136
136
  quantize_row_q(src0_f32, dst_ptr + id, ne00);
@@ -156,7 +156,7 @@ static void ggml_compute_forward_dup_f16(
156
156
  for (int i00 = 0; i00 < ne00; i00++) {
157
157
  const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
158
158
 
159
- dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
159
+ dst_ptr[id] = GGML_CPU_FP16_TO_FP32(*src0_ptr);
160
160
  id++;
161
161
  }
162
162
  }
@@ -267,7 +267,7 @@ static void ggml_compute_forward_dup_f16(
267
267
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
268
268
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
269
269
 
270
- *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
270
+ *(float *) dst_ptr = GGML_CPU_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
271
271
 
272
272
  if (++i10 == ne0) {
273
273
  i10 = 0;
@@ -372,7 +372,7 @@ static void ggml_compute_forward_dup_bf16(
372
372
  for (int i01 = ir0; i01 < ir1; i01++) {
373
373
  const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
374
374
  for (int i00 = 0; i00 < ne00; i00++) {
375
- dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
375
+ dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
376
376
  id++;
377
377
  }
378
378
  }
@@ -473,7 +473,7 @@ static void ggml_compute_forward_dup_bf16(
473
473
  for (int i00 = 0; i00 < ne00; i00++) {
474
474
  const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
475
475
 
476
- dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
476
+ dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
477
477
  id++;
478
478
  }
479
479
  }
@@ -566,7 +566,7 @@ static void ggml_compute_forward_dup_bf16(
566
566
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
567
567
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
568
568
 
569
- *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
569
+ *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
570
570
 
571
571
  if (++i10 == ne0) {
572
572
  i10 = 0;
@@ -765,7 +765,7 @@ static void ggml_compute_forward_dup_f32(
765
765
  for (int i00 = 0; i00 < ne00; i00++) {
766
766
  const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
767
767
 
768
- dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
768
+ dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr);
769
769
  id++;
770
770
  }
771
771
  }
@@ -878,7 +878,7 @@ static void ggml_compute_forward_dup_f32(
878
878
  const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
879
879
  char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
880
880
 
881
- *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
881
+ *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr);
882
882
 
883
883
  if (++i10 == ne0) {
884
884
  i10 = 0;
@@ -1419,7 +1419,7 @@ static void ggml_compute_forward_add1_f16_f32(
1419
1419
  ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
1420
1420
  ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
1421
1421
  for (int i = 0; i < ne0; i++) {
1422
- dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
1422
+ dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v);
1423
1423
  }
1424
1424
  }
1425
1425
  }
@@ -1435,7 +1435,7 @@ static void ggml_compute_forward_add1_f16_f16(
1435
1435
  GGML_ASSERT(ggml_is_scalar(src1));
1436
1436
 
1437
1437
  // scalar to add
1438
- const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
1438
+ const float v = GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
1439
1439
 
1440
1440
  const int ith = params->ith;
1441
1441
  const int nth = params->nth;
@@ -1467,7 +1467,7 @@ static void ggml_compute_forward_add1_f16_f16(
1467
1467
  ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
1468
1468
  ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
1469
1469
  for (int i = 0; i < ne0; i++) {
1470
- dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
1470
+ dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v);
1471
1471
  }
1472
1472
  }
1473
1473
  }
@@ -1889,7 +1889,7 @@ static void ggml_compute_forward_sum_f16(
1889
1889
  }
1890
1890
  }
1891
1891
  }
1892
- ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
1892
+ ((ggml_fp16_t *) dst->data)[0] = GGML_CPU_FP32_TO_FP16(sum);
1893
1893
  }
1894
1894
 
1895
1895
  static void ggml_compute_forward_sum_bf16(
@@ -2660,7 +2660,7 @@ static void ggml_compute_forward_gelu_f16(
2660
2660
  #ifndef NDEBUG
2661
2661
  for (int k = 0; k < nc; k++) {
2662
2662
  const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
2663
- const float v = GGML_FP16_TO_FP32(x);
2663
+ const float v = GGML_CPU_FP16_TO_FP32(x);
2664
2664
  GGML_UNUSED(v);
2665
2665
  assert(!isnan(v));
2666
2666
  assert(!isinf(v));
@@ -2763,7 +2763,7 @@ static void ggml_compute_forward_gelu_erf_f16(
2763
2763
  #ifndef NDEBUG
2764
2764
  for (int k = 0; k < nc; k++) {
2765
2765
  const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
2766
- const float v = GGML_FP16_TO_FP32(x);
2766
+ const float v = GGML_CPU_FP16_TO_FP32(x);
2767
2767
  GGML_UNUSED(v);
2768
2768
  assert(!isnan(v));
2769
2769
  assert(!isinf(v));
@@ -2866,7 +2866,7 @@ static void ggml_compute_forward_gelu_quick_f16(
2866
2866
  #ifndef NDEBUG
2867
2867
  for (int k = 0; k < nc; k++) {
2868
2868
  const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
2869
- const float v = GGML_FP16_TO_FP32(x);
2869
+ const float v = GGML_CPU_FP16_TO_FP32(x);
2870
2870
  GGML_UNUSED(v);
2871
2871
  assert(!isnan(v));
2872
2872
  assert(!isinf(v));
@@ -2969,7 +2969,7 @@ static void ggml_compute_forward_silu_f16(
2969
2969
  #ifndef NDEBUG
2970
2970
  for (int k = 0; k < nc; k++) {
2971
2971
  const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k];
2972
- const float v = GGML_FP16_TO_FP32(x);
2972
+ const float v = GGML_CPU_FP16_TO_FP32(x);
2973
2973
  GGML_UNUSED(v);
2974
2974
  assert(!isnan(v));
2975
2975
  assert(!isinf(v));
@@ -3163,7 +3163,7 @@ static void ggml_compute_forward_silu_back_f16(
3163
3163
  #ifndef NDEBUG
3164
3164
  for (int k = 0; k < nc; k++) {
3165
3165
  const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
3166
- const float v = GGML_FP16_TO_FP32(x);
3166
+ const float v = GGML_CPU_FP16_TO_FP32(x);
3167
3167
  GGML_UNUSED(v);
3168
3168
  assert(!isnan(v));
3169
3169
  assert(!isinf(v));
@@ -4500,7 +4500,7 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
4500
4500
 
4501
4501
  for (int j = 0; j < nc; ++j) {
4502
4502
  ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
4503
- ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
4503
+ ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v);
4504
4504
  }
4505
4505
  }
4506
4506
  }
@@ -4792,7 +4792,7 @@ static void ggml_compute_forward_soft_max_f32(
4792
4792
  if (mp_f32) {
4793
4793
  if (use_f16) {
4794
4794
  for (int i = 0; i < nc; ++i) {
4795
- wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
4795
+ wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]);
4796
4796
  }
4797
4797
  } else {
4798
4798
  for (int i = 0; i < nc; ++i) {
@@ -5018,8 +5018,8 @@ static void ggml_compute_forward_clamp_f16(
5018
5018
  ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
5019
5019
 
5020
5020
  for (int i = 0; i < nc; i++) {
5021
- float v = GGML_FP16_TO_FP32(src0_ptr[i]);
5022
- dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min));
5021
+ float v = GGML_CPU_FP16_TO_FP32(src0_ptr[i]);
5022
+ dst_ptr[i] = GGML_CPU_FP32_TO_FP16(MAX(MIN(v, max), min));
5023
5023
  }
5024
5024
  }
5025
5025
  }
@@ -5476,11 +5476,11 @@ static void ggml_compute_forward_rope_f16(
5476
5476
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
5477
5477
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
5478
5478
 
5479
- const float x0 = GGML_FP16_TO_FP32(src[0]);
5480
- const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
5479
+ const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
5480
+ const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
5481
5481
 
5482
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5483
- dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5482
+ dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5483
+ dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5484
5484
  }
5485
5485
  } else {
5486
5486
  for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
@@ -5492,11 +5492,11 @@ static void ggml_compute_forward_rope_f16(
5492
5492
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
5493
5493
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
5494
5494
 
5495
- const float x0 = GGML_FP16_TO_FP32(src[0]);
5496
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
5495
+ const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
5496
+ const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]);
5497
5497
 
5498
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5499
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5498
+ dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5499
+ dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5500
5500
  }
5501
5501
  }
5502
5502
  } else {
@@ -5507,11 +5507,11 @@ static void ggml_compute_forward_rope_f16(
5507
5507
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
5508
5508
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
5509
5509
 
5510
- const float x0 = GGML_FP16_TO_FP32(src[0]);
5511
- const float x1 = GGML_FP16_TO_FP32(src[1]);
5510
+ const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
5511
+ const float x1 = GGML_CPU_FP16_TO_FP32(src[1]);
5512
5512
 
5513
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5514
- dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5513
+ dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5514
+ dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5515
5515
  }
5516
5516
  }
5517
5517
 
@@ -5525,11 +5525,11 @@ static void ggml_compute_forward_rope_f16(
5525
5525
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
5526
5526
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
5527
5527
 
5528
- const float x0 = GGML_FP16_TO_FP32(src[0]);
5529
- const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
5528
+ const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
5529
+ const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
5530
5530
 
5531
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5532
- dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5531
+ dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
5532
+ dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
5533
5533
  }
5534
5534
  } else {
5535
5535
  for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
@@ -5640,7 +5640,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
5640
5640
  for (int64_t i11 = 0; i11 < ne11; i11++) {
5641
5641
  const float * const src = (float *)((char *) src1->data + i11*nb11);
5642
5642
  for (int64_t i10 = 0; i10 < ne10; i10++) {
5643
- dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
5643
+ dst_data[i10*ne11 + i11] = GGML_CPU_FP32_TO_FP16(src[i10]);
5644
5644
  }
5645
5645
  }
5646
5646
  }
@@ -5933,7 +5933,7 @@ static void ggml_compute_forward_im2col_f16(
5933
5933
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5934
5934
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
5935
5935
  } else {
5936
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
5936
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]);
5937
5937
  }
5938
5938
  }
5939
5939
  }
@@ -6109,7 +6109,7 @@ void ggml_compute_forward_conv_transpose_2d(
6109
6109
  const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
6110
6110
  ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
6111
6111
  for (int i10 = 0; i10 < ne10; i10++) {
6112
- dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
6112
+ dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
6113
6113
  }
6114
6114
  }
6115
6115
  }
@@ -6358,7 +6358,7 @@ static void ggml_compute_forward_pool_1d_sk_p0(
6358
6358
  case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
6359
6359
  }
6360
6360
  for (int ki = 0; ki < k; ++ki) {
6361
- const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
6361
+ const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
6362
6362
  switch (op) {
6363
6363
  case GGML_OP_POOL_AVG: drow[i] += srow_j; break;
6364
6364
  case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break;
@@ -6450,7 +6450,7 @@ void ggml_compute_forward_pool_2d(
6450
6450
  for (int kx = 0; kx < k0; ++kx) {
6451
6451
  int j = ix + kx;
6452
6452
  if (j < 0 || j >= src->ne[0]) continue;
6453
- const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
6453
+ const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
6454
6454
  switch (op) {
6455
6455
  case GGML_OP_POOL_AVG: *out += srow_j; break;
6456
6456
  case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break;
@@ -6538,7 +6538,7 @@ void ggml_compute_forward_pool_2d_back(
6538
6538
  }
6539
6539
 
6540
6540
  const float val = dst->type == GGML_TYPE_F32 ?
6541
- ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
6541
+ ((const float *) drowf)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
6542
6542
  if (val <= maxval) {
6543
6543
  continue;
6544
6544
  }
@@ -6558,7 +6558,7 @@ void ggml_compute_forward_pool_2d_back(
6558
6558
  if (dst->type == GGML_TYPE_F32) {
6559
6559
  ((float *) drow)[j] += grad0;
6560
6560
  } else {
6561
- ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
6561
+ ((ggml_fp16_t *) drow)[j] = GGML_CPU_FP32_TO_FP16(grad0 + GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
6562
6562
  }
6563
6563
  } else if (op == GGML_OP_POOL_AVG) {
6564
6564
  const float grad = grad0 / ka;
@@ -6577,7 +6577,7 @@ void ggml_compute_forward_pool_2d_back(
6577
6577
  if (dst->type == GGML_TYPE_F32) {
6578
6578
  ((float *) drow)[j] += grad;
6579
6579
  } else {
6580
- ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad);
6580
+ ((ggml_fp16_t *) drow)[j] += GGML_CPU_FP32_TO_FP16(grad);
6581
6581
  }
6582
6582
  }
6583
6583
  }
@@ -7142,7 +7142,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
7142
7142
  // loop over n_kv and n_head_kv
7143
7143
  // ref: https://arxiv.org/pdf/2112.05682.pdf
7144
7144
  for (int64_t ic = 0; ic < nek1; ++ic) {
7145
- const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
7145
+ const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
7146
7146
  if (mv == -INFINITY) {
7147
7147
  continue;
7148
7148
  }
@@ -7210,7 +7210,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
7210
7210
 
7211
7211
  if (v->type == GGML_TYPE_F16) {
7212
7212
  for (int64_t d = 0; d < DV; ++d) {
7213
- VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
7213
+ VKQ32[d] = GGML_CPU_FP16_TO_FP32(VKQ16[d]);
7214
7214
  }
7215
7215
  }
7216
7216
 
@@ -2,6 +2,7 @@
2
2
  #include "ggml-common.h"
3
3
 
4
4
  #include "ggml-cpu-impl.h"
5
+ #include "simd-mappings.h"
5
6
  #include "ggml-quants.h"
6
7
  #include "quants.h"
7
8
 
@@ -137,7 +138,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
137
138
  }
138
139
 
139
140
  int sumi = sumi0 + sumi1;
140
- sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
141
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
141
142
  }
142
143
 
143
144
  *s = sumf;
@@ -174,7 +175,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
174
175
  }
175
176
 
176
177
  int sumi = sumi0 + sumi1;
177
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
178
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
178
179
  }
179
180
 
180
181
  *s = sumf;
@@ -217,7 +218,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
217
218
  }
218
219
 
219
220
  int sumi = sumi0 + sumi1;
220
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
221
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
221
222
  }
222
223
 
223
224
  *s = sumf;
@@ -260,7 +261,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
260
261
  }
261
262
 
262
263
  int sumi = sumi0 + sumi1;
263
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
264
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
264
265
  }
265
266
 
266
267
  *s = sumf;
@@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
290
291
  sumi += x[ib].qs[j]*y[ib].qs[j];
291
292
  }
292
293
 
293
- sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
294
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
294
295
  }
295
296
 
296
297
  *s = sumf;
@@ -342,7 +343,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
342
343
  }
343
344
  }
344
345
 
345
- sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
346
+ sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
346
347
  }
347
348
 
348
349
  *s = sumf;
@@ -372,7 +373,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
372
373
  }
373
374
  }
374
375
 
375
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
376
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
376
377
 
377
378
  sumf += (float) sumi * d;
378
379
  }
@@ -405,8 +406,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
405
406
  summs += y[i].bsums[j] * (sc[j] >> 4);
406
407
  }
407
408
 
408
- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
409
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
409
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
410
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
410
411
 
411
412
  int isum = 0;
412
413
  int is = 0;
@@ -504,7 +505,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
504
505
  for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
505
506
  q8 += 8; a += 8;
506
507
  }
507
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
508
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
508
509
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
509
510
  }
510
511
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -577,9 +578,9 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
577
578
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
578
579
  q8 += 8; a += 8;
579
580
  }
580
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
581
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
581
582
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
582
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
583
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
583
584
  sumf -= dmin * sumi;
584
585
  }
585
586
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -657,9 +658,9 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
657
658
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
658
659
  q8 += 8; a += 8;
659
660
  }
660
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
661
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
661
662
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
662
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
663
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
663
664
  sumf -= dmin * sumi;
664
665
  }
665
666
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -714,7 +715,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
714
715
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
715
716
  q8 += 8; a += 8;
716
717
  }
717
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
718
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
718
719
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
719
720
  }
720
721
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -739,7 +740,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
739
740
 
740
741
  float sumf = 0.f;
741
742
  for (int i = 0; i < nb; ++i) {
742
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
743
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
743
744
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
744
745
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
745
746
  int32_t bsum = 0;
@@ -778,7 +779,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
778
779
 
779
780
  float sumf = 0.f;
780
781
  for (int i = 0; i < nb; ++i) {
781
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
782
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
782
783
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
783
784
  const uint8_t * GGML_RESTRICT sc = x[i].scales;
784
785
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -829,7 +830,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
829
830
  float sumf = 0;
830
831
  for (int i = 0; i < nb; i++) {
831
832
 
832
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
833
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
833
834
  const int8_t * q8 = y[i].qs;
834
835
  const uint8_t * qs = x[i].qs;
835
836
  const uint8_t * qh = x[i].qh;
@@ -882,7 +883,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
882
883
 
883
884
  float sumf = 0.f;
884
885
  for (int i = 0; i < nb; ++i) {
885
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
886
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
886
887
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
887
888
  const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
888
889
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -924,7 +925,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
924
925
 
925
926
  float sumf = 0.f;
926
927
  for (int i = 0; i < nb; ++i) {
927
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
928
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
928
929
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
929
930
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
930
931
  const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -1002,7 +1003,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
1002
1003
  qs += 4;
1003
1004
  }
1004
1005
 
1005
- sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
1006
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
1006
1007
  }
1007
1008
 
1008
1009
  *s = sumf;
@@ -1063,7 +1064,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
1063
1064
  qh += 2;
1064
1065
  }
1065
1066
 
1066
- sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
1067
+ sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
1067
1068
  }
1068
1069
 
1069
1070
  *s = sumf;
@@ -1087,7 +1088,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
1087
1088
  float sumf = 0;
1088
1089
 
1089
1090
  for (; ib < nb; ++ib) {
1090
- const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
1091
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
1091
1092
  int sumi1 = 0, sumi2 = 0;
1092
1093
  for (int j = 0; j < QK4_NL/2; ++j) {
1093
1094
  sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -1113,7 +1114,7 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
1113
1114
 
1114
1115
  float sumf = 0;
1115
1116
  for (int ibl = 0; ibl < nb; ++ibl) {
1116
- const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
1117
+ const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
1117
1118
  uint16_t h = x[ibl].scales_h;
1118
1119
  const uint8_t * qs = x[ibl].qs;
1119
1120
  const int8_t * q8 = y[ibl].qs;
@@ -6,6 +6,7 @@
6
6
  #include "ggml-impl.h"
7
7
  #include "ggml-cpu.h"
8
8
  #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
9
10
  #include "traits.h"
10
11
 
11
12
  #include "arch-fallback.h"
@@ -72,7 +73,7 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
72
73
  const float d = amax / ((1 << 7) - 1);
73
74
  id[row_iter] = d ? 1.0f / d : 0.0f;
74
75
 
75
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
76
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
76
77
  }
77
78
 
78
79
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -110,7 +111,7 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
110
111
  const float d = amax / ((1 << 7) - 1);
111
112
  id[row_iter] = d ? 1.0f / d : 0.0f;
112
113
 
113
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
114
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
114
115
  }
115
116
 
116
117
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -236,7 +237,7 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
236
237
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
237
238
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
238
239
  }
239
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
240
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
240
241
  }
241
242
  }
242
243
  }
@@ -280,7 +281,7 @@ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
280
281
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
281
282
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
282
283
  }
283
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
284
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
284
285
  }
285
286
  }
286
287
  }
@@ -325,7 +326,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
325
326
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
326
327
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
327
328
  }
328
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
329
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
329
330
  }
330
331
  }
331
332
  }
@@ -396,13 +397,13 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
396
397
  sumi2 = sumi2 * scales_1[j];
397
398
  sumi += sumi1 + sumi2;
398
399
  }
399
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
400
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
400
401
  }
401
402
  }
402
403
  for (int sb = 0; sb < 8; sb++) {
403
404
  uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
404
405
  for (int j = 0; j < ncols_interleaved; j++) {
405
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
406
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
406
407
  }
407
408
  }
408
409
  }
@@ -449,7 +450,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
449
450
  const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
450
451
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
451
452
  }
452
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
453
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
453
454
  }
454
455
  }
455
456
  }
@@ -500,7 +501,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
500
501
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
501
502
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
502
503
  }
503
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
504
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
504
505
  }
505
506
  }
506
507
  }
@@ -555,7 +556,7 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
555
556
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
556
557
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
557
558
  }
558
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
559
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
559
560
  }
560
561
  }
561
562
  }
@@ -609,7 +610,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
609
610
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
610
611
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
611
612
  }
612
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
613
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
613
614
  }
614
615
  }
615
616
  }
@@ -688,7 +689,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
688
689
  sumi2 = sumi2 * scales_1[j];
689
690
  sumi += sumi1 + sumi2;
690
691
  }
691
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
692
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
692
693
  }
693
694
  }
694
695
  }
@@ -697,7 +698,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
697
698
  for(int m = 0; m < 4; m++) {
698
699
  const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
699
700
  for(int j = 0; j < ncols_interleaved; j++) {
700
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
701
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
701
702
  }
702
703
  }
703
704
  }
@@ -753,7 +754,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
753
754
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
754
755
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
755
756
  }
756
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
757
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
757
758
  }
758
759
  }
759
760
  }