node-llama-cpp 2.8.7 → 2.8.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,5 +1,5 @@
1
1
  <div align="center">
2
- <img alt="node-llama-cpp Logo" src="https://media.githubusercontent.com/media/withcatai/node-llama-cpp/master/assets/logo.roundEdges.png" width="360px" />
2
+ <img alt="node-llama-cpp Logo" src="https://raw.githubusercontent.com/withcatai/node-llama-cpp/master/assets/logo.roundEdges.png" width="360px" />
3
3
  <h1>node-llama-cpp</h1>
4
4
  <p>Run AI models locally on your machine</p>
5
5
  <sub>Pre-built bindings are provided with a fallback to building from source with cmake</sub>
@@ -84,7 +84,7 @@ To contribute to `node-llama-cpp` read the [contribution guide](https://withcata
84
84
  <br />
85
85
 
86
86
  <div align="center" width="360">
87
- <img alt="Star please" src="https://media.githubusercontent.com/media/withcatai/node-llama-cpp/master/assets/star.please.roundEdges.png" width="360" margin="auto" />
87
+ <img alt="Star please" src="https://raw.githubusercontent.com/withcatai/node-llama-cpp/master/assets/star.please.roundEdges.png" width="360" margin="auto" />
88
88
  <br/>
89
89
  <p align="right">
90
90
  <i>If you like this repo, star it ✨</i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
@@ -1,3 +1,3 @@
1
1
  {
2
- "release": "b2177"
2
+ "release": "b2249"
3
3
  }
Binary file
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
392
392
  float lmax = -INFINITY;
393
393
 
394
394
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
395
- lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
395
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
396
396
  }
397
397
 
398
398
  // find the max value in the block
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
417
417
  // parallel sum
418
418
  float lsum = 0.0f;
419
419
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
420
- const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
420
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
421
421
  lsum += exp_psrc0;
422
422
  pdst[i00] = exp_psrc0;
423
423
  }
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
495
495
  float4 lmax4 = -INFINITY;
496
496
 
497
497
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
498
- lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
498
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
499
499
  }
500
500
 
501
501
  const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
521
521
  // parallel sum
522
522
  float4 lsum4 = 0.0f;
523
523
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
524
- const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
524
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
525
525
  lsum4 += exp_psrc4;
526
526
  pdst4[i00] = exp_psrc4;
527
527
  }
@@ -2531,6 +2531,12 @@ typedef struct {
2531
2531
  uint8_t scales[QK_K/16];
2532
2532
  } block_iq1_s;
2533
2533
 
2534
+ // Non-linear quants
2535
+ #define QK4_NL 32
2536
+ typedef struct {
2537
+ half d;
2538
+ uint8_t qs[QK4_NL/2];
2539
+ } block_iq4_nl;
2534
2540
 
2535
2541
  //====================================== dot products =========================
2536
2542
 
@@ -4027,7 +4033,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
4027
4033
  y4 += 32 * 32;
4028
4034
  }
4029
4035
  #else
4030
- // TODO
4036
+ (void) x;
4037
+ (void) y;
4038
+ (void) yl;
4039
+ (void) nb32;
4031
4040
  #endif
4032
4041
 
4033
4042
  for (int row = 0; row < N_DST; ++row) {
@@ -4170,7 +4179,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
4170
4179
  y4 += 32 * 32;
4171
4180
  }
4172
4181
  #else
4173
- // TODO
4182
+ (void) x;
4183
+ (void) y;
4184
+ (void) yl;
4185
+ (void) nb32;
4174
4186
  #endif
4175
4187
 
4176
4188
  for (int row = 0; row < N_DST; ++row) {
@@ -4306,7 +4318,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
4306
4318
  y4 += 32 * 32;
4307
4319
  }
4308
4320
  #else
4309
- // TODO
4321
+ (void) x;
4322
+ (void) y;
4323
+ (void) yl;
4324
+ (void) nb32;
4310
4325
  #endif
4311
4326
 
4312
4327
  for (int row = 0; row < N_DST; ++row) {
@@ -4375,7 +4390,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
4375
4390
  const uint i13 = im/ne12;
4376
4391
 
4377
4392
  const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
4378
-
4379
4393
  device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
4380
4394
  device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
4381
4395
 
@@ -4424,7 +4438,10 @@ void kernel_mul_mv_iq1_s_f32_impl(
4424
4438
  y4 += 16 * 32;
4425
4439
  }
4426
4440
  #else
4427
- // TODO
4441
+ (void) x;
4442
+ (void) y;
4443
+ (void) yl;
4444
+ (void) nb32;
4428
4445
  #endif
4429
4446
 
4430
4447
  for (int row = 0; row < N_DST; ++row) {
@@ -4435,6 +4452,103 @@ void kernel_mul_mv_iq1_s_f32_impl(
4435
4452
  }
4436
4453
  }
4437
4454
 
4455
+ constexpr constant static float kvalues_iq4nl_f[16] = {
4456
+ -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
4457
+ };
4458
+
4459
+ void kernel_mul_mv_iq4_nl_f32_impl(
4460
+ device const void * src0,
4461
+ device const float * src1,
4462
+ device float * dst,
4463
+ constant int64_t & ne00,
4464
+ constant int64_t & ne01,
4465
+ constant int64_t & ne02,
4466
+ constant int64_t & ne10,
4467
+ constant int64_t & ne12,
4468
+ constant int64_t & ne0,
4469
+ constant int64_t & ne1,
4470
+ constant uint & r2,
4471
+ constant uint & r3,
4472
+ threadgroup float * shared_values [[threadgroup(0)]],
4473
+ uint3 tgpig[[threadgroup_position_in_grid]],
4474
+ uint tiisg[[thread_index_in_simdgroup]],
4475
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4476
+
4477
+ const int nb = ne00/QK4_NL;
4478
+ const int r0 = tgpig.x;
4479
+ const int r1 = tgpig.y;
4480
+ const int im = tgpig.z;
4481
+ const int first_row = (r0 * 2 + sgitg) * 2;
4482
+ const int ib_row = first_row * nb;
4483
+
4484
+ const uint i12 = im%ne12;
4485
+ const uint i13 = im/ne12;
4486
+
4487
+ const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
4488
+ device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
4489
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
4490
+
4491
+ const int ix = tiisg/2; // 0...15
4492
+ const int it = tiisg%2; // 0 or 1
4493
+
4494
+ shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
4495
+ threadgroup_barrier(mem_flags::mem_threadgroup);
4496
+
4497
+ float4 yl[4];
4498
+ float sumf[2]={0.f}, all_sum;
4499
+
4500
+ device const float * yb = y + ix * QK4_NL + it * 8;
4501
+
4502
+ uint32_t aux32[2];
4503
+ thread const uint8_t * q8 = (thread const uint8_t *)aux32;
4504
+
4505
+ float4 qf1, qf2;
4506
+
4507
+ for (int ib = ix; ib < nb; ib += 16) {
4508
+
4509
+ device const float4 * y4 = (device const float4 *)yb;
4510
+ yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
4511
+
4512
+ for (int row = 0; row < 2; ++row) {
4513
+
4514
+ device const block_iq4_nl & xb = x[row*nb + ib];
4515
+ device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
4516
+
4517
+ float4 acc1 = {0.f}, acc2 = {0.f};
4518
+
4519
+ aux32[0] = q4[0] | (q4[1] << 16);
4520
+ aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
4521
+ aux32[0] &= 0x0f0f0f0f;
4522
+ qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
4523
+ qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
4524
+ acc1 += yl[0] * qf1;
4525
+ acc2 += yl[1] * qf2;
4526
+
4527
+ aux32[0] = q4[2] | (q4[3] << 16);
4528
+ aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
4529
+ aux32[0] &= 0x0f0f0f0f;
4530
+ qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
4531
+ qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
4532
+ acc1 += yl[2] * qf1;
4533
+ acc2 += yl[3] * qf2;
4534
+
4535
+ acc1 += acc2;
4536
+
4537
+ sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
4538
+
4539
+ }
4540
+
4541
+ yb += 16 * QK4_NL;
4542
+ }
4543
+
4544
+ for (int row = 0; row < 2; ++row) {
4545
+ all_sum = simd_sum(sumf[row]);
4546
+ if (tiisg == 0) {
4547
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
4548
+ }
4549
+ }
4550
+ }
4551
+
4438
4552
  [[host_name("kernel_mul_mv_iq1_s_f32")]]
4439
4553
  kernel void kernel_mul_mv_iq1_s_f32(
4440
4554
  device const void * src0,
@@ -4463,6 +4577,34 @@ kernel void kernel_mul_mv_iq1_s_f32(
4463
4577
  kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
4464
4578
  }
4465
4579
 
4580
+ [[host_name("kernel_mul_mv_iq4_nl_f32")]]
4581
+ kernel void kernel_mul_mv_iq4_nl_f32(
4582
+ device const void * src0,
4583
+ device const float * src1,
4584
+ device float * dst,
4585
+ constant int64_t & ne00,
4586
+ constant int64_t & ne01,
4587
+ constant int64_t & ne02,
4588
+ constant uint64_t & nb00,
4589
+ constant uint64_t & nb01,
4590
+ constant uint64_t & nb02,
4591
+ constant int64_t & ne10,
4592
+ constant int64_t & ne11,
4593
+ constant int64_t & ne12,
4594
+ constant uint64_t & nb10,
4595
+ constant uint64_t & nb11,
4596
+ constant uint64_t & nb12,
4597
+ constant int64_t & ne0,
4598
+ constant int64_t & ne1,
4599
+ constant uint & r2,
4600
+ constant uint & r3,
4601
+ threadgroup float * shared_values [[threadgroup(0)]],
4602
+ uint3 tgpig[[threadgroup_position_in_grid]],
4603
+ uint tiisg[[thread_index_in_simdgroup]],
4604
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4605
+
4606
+ kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
4607
+ }
4466
4608
 
4467
4609
  //============================= templates and their specializations =============================
4468
4610
 
@@ -4659,6 +4801,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
4659
4801
  const float dl = d * sc[0];
4660
4802
  const float ml = min * sc[1];
4661
4803
  #else
4804
+ (void) get_scale_min_k4_just2;
4805
+
4662
4806
  q = q + 16 * (il&1);
4663
4807
  device const uint8_t * s = xb->scales;
4664
4808
  device const half2 * dh = (device const half2 *)xb->d;
@@ -4824,6 +4968,21 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
4824
4968
  }
4825
4969
  }
4826
4970
 
4971
+ template <typename type4x4>
4972
+ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
4973
+ device const uint16_t * q4 = (device const uint16_t *)xb->qs;
4974
+ const float d = xb->d;
4975
+ uint32_t aux32;
4976
+ thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
4977
+ for (int i = 0; i < 4; ++i) {
4978
+ aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
4979
+ reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
4980
+ reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
4981
+ reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
4982
+ reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
4983
+ }
4984
+ }
4985
+
4827
4986
  template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
4828
4987
  kernel void kernel_get_rows(
4829
4988
  device const void * src0,
@@ -5367,6 +5526,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
5367
5526
  template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5368
5527
  template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5369
5528
  template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
5529
+ template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
5370
5530
 
5371
5531
  //
5372
5532
  // matrix-matrix multiplication
@@ -5407,6 +5567,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
5407
5567
  template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5408
5568
  template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5409
5569
  template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
5570
+ template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
5410
5571
 
5411
5572
  //
5412
5573
  // indirect matrix-matrix multiplication
@@ -5459,6 +5620,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
5459
5620
  template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5460
5621
  template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5461
5622
  template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
5623
+ template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
5462
5624
 
5463
5625
  //
5464
5626
  // matrix-vector multiplication
@@ -6489,3 +6651,68 @@ kernel void kernel_mul_mv_id_iq1_s_f32(
6489
6651
  tiisg,
6490
6652
  sgitg);
6491
6653
  }
6654
+
6655
+ [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
6656
+ kernel void kernel_mul_mv_id_iq4_nl_f32(
6657
+ device const char * ids,
6658
+ device const char * src1,
6659
+ device float * dst,
6660
+ constant uint64_t & nbi1,
6661
+ constant int64_t & ne00,
6662
+ constant int64_t & ne01,
6663
+ constant int64_t & ne02,
6664
+ constant uint64_t & nb00,
6665
+ constant uint64_t & nb01,
6666
+ constant uint64_t & nb02,
6667
+ constant int64_t & ne10,
6668
+ constant int64_t & ne11,
6669
+ constant int64_t & ne12,
6670
+ constant int64_t & ne13,
6671
+ constant uint64_t & nb10,
6672
+ constant uint64_t & nb11,
6673
+ constant uint64_t & nb12,
6674
+ constant int64_t & ne0,
6675
+ constant int64_t & ne1,
6676
+ constant uint64_t & nb1,
6677
+ constant uint & r2,
6678
+ constant uint & r3,
6679
+ constant int & idx,
6680
+ device const char * src00,
6681
+ device const char * src01,
6682
+ device const char * src02,
6683
+ device const char * src03,
6684
+ device const char * src04,
6685
+ device const char * src05,
6686
+ device const char * src06,
6687
+ device const char * src07,
6688
+ threadgroup float * shared_values [[threadgroup(0)]],
6689
+ uint3 tgpig[[threadgroup_position_in_grid]],
6690
+ uint tiitg[[thread_index_in_threadgroup]],
6691
+ uint tiisg[[thread_index_in_simdgroup]],
6692
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
6693
+ device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
6694
+
6695
+ const int64_t bid = tgpig.z/(ne12*ne13);
6696
+
6697
+ tgpig.z = tgpig.z%(ne12*ne13);
6698
+
6699
+ const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
6700
+
6701
+ kernel_mul_mv_iq4_nl_f32_impl(
6702
+ src0[id],
6703
+ (device const float *) (src1 + bid*nb11),
6704
+ dst + bid*ne0,
6705
+ ne00,
6706
+ ne01,
6707
+ ne02,
6708
+ ne10,
6709
+ ne12,
6710
+ ne0,
6711
+ ne1,
6712
+ r2,
6713
+ r3,
6714
+ shared_values,
6715
+ tgpig,
6716
+ tiisg,
6717
+ sgitg);
6718
+ }
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
392
392
  float lmax = -INFINITY;
393
393
 
394
394
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
395
- lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
395
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
396
396
  }
397
397
 
398
398
  // find the max value in the block
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
417
417
  // parallel sum
418
418
  float lsum = 0.0f;
419
419
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
420
- const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
420
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
421
421
  lsum += exp_psrc0;
422
422
  pdst[i00] = exp_psrc0;
423
423
  }
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
495
495
  float4 lmax4 = -INFINITY;
496
496
 
497
497
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
498
- lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
498
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
499
499
  }
500
500
 
501
501
  const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
521
521
  // parallel sum
522
522
  float4 lsum4 = 0.0f;
523
523
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
524
- const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
524
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
525
525
  lsum4 += exp_psrc4;
526
526
  pdst4[i00] = exp_psrc4;
527
527
  }
@@ -2531,6 +2531,12 @@ typedef struct {
2531
2531
  uint8_t scales[QK_K/16];
2532
2532
  } block_iq1_s;
2533
2533
 
2534
+ // Non-linear quants
2535
+ #define QK4_NL 32
2536
+ typedef struct {
2537
+ half d;
2538
+ uint8_t qs[QK4_NL/2];
2539
+ } block_iq4_nl;
2534
2540
 
2535
2541
  //====================================== dot products =========================
2536
2542
 
@@ -4027,7 +4033,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
4027
4033
  y4 += 32 * 32;
4028
4034
  }
4029
4035
  #else
4030
- // TODO
4036
+ (void) x;
4037
+ (void) y;
4038
+ (void) yl;
4039
+ (void) nb32;
4031
4040
  #endif
4032
4041
 
4033
4042
  for (int row = 0; row < N_DST; ++row) {
@@ -4170,7 +4179,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
4170
4179
  y4 += 32 * 32;
4171
4180
  }
4172
4181
  #else
4173
- // TODO
4182
+ (void) x;
4183
+ (void) y;
4184
+ (void) yl;
4185
+ (void) nb32;
4174
4186
  #endif
4175
4187
 
4176
4188
  for (int row = 0; row < N_DST; ++row) {
@@ -4306,7 +4318,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
4306
4318
  y4 += 32 * 32;
4307
4319
  }
4308
4320
  #else
4309
- // TODO
4321
+ (void) x;
4322
+ (void) y;
4323
+ (void) yl;
4324
+ (void) nb32;
4310
4325
  #endif
4311
4326
 
4312
4327
  for (int row = 0; row < N_DST; ++row) {
@@ -4375,7 +4390,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
4375
4390
  const uint i13 = im/ne12;
4376
4391
 
4377
4392
  const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
4378
-
4379
4393
  device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
4380
4394
  device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
4381
4395
 
@@ -4424,7 +4438,10 @@ void kernel_mul_mv_iq1_s_f32_impl(
4424
4438
  y4 += 16 * 32;
4425
4439
  }
4426
4440
  #else
4427
- // TODO
4441
+ (void) x;
4442
+ (void) y;
4443
+ (void) yl;
4444
+ (void) nb32;
4428
4445
  #endif
4429
4446
 
4430
4447
  for (int row = 0; row < N_DST; ++row) {
@@ -4435,6 +4452,103 @@ void kernel_mul_mv_iq1_s_f32_impl(
4435
4452
  }
4436
4453
  }
4437
4454
 
4455
+ constexpr constant static float kvalues_iq4nl_f[16] = {
4456
+ -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
4457
+ };
4458
+
4459
+ void kernel_mul_mv_iq4_nl_f32_impl(
4460
+ device const void * src0,
4461
+ device const float * src1,
4462
+ device float * dst,
4463
+ constant int64_t & ne00,
4464
+ constant int64_t & ne01,
4465
+ constant int64_t & ne02,
4466
+ constant int64_t & ne10,
4467
+ constant int64_t & ne12,
4468
+ constant int64_t & ne0,
4469
+ constant int64_t & ne1,
4470
+ constant uint & r2,
4471
+ constant uint & r3,
4472
+ threadgroup float * shared_values [[threadgroup(0)]],
4473
+ uint3 tgpig[[threadgroup_position_in_grid]],
4474
+ uint tiisg[[thread_index_in_simdgroup]],
4475
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4476
+
4477
+ const int nb = ne00/QK4_NL;
4478
+ const int r0 = tgpig.x;
4479
+ const int r1 = tgpig.y;
4480
+ const int im = tgpig.z;
4481
+ const int first_row = (r0 * 2 + sgitg) * 2;
4482
+ const int ib_row = first_row * nb;
4483
+
4484
+ const uint i12 = im%ne12;
4485
+ const uint i13 = im/ne12;
4486
+
4487
+ const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
4488
+ device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
4489
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
4490
+
4491
+ const int ix = tiisg/2; // 0...15
4492
+ const int it = tiisg%2; // 0 or 1
4493
+
4494
+ shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
4495
+ threadgroup_barrier(mem_flags::mem_threadgroup);
4496
+
4497
+ float4 yl[4];
4498
+ float sumf[2]={0.f}, all_sum;
4499
+
4500
+ device const float * yb = y + ix * QK4_NL + it * 8;
4501
+
4502
+ uint32_t aux32[2];
4503
+ thread const uint8_t * q8 = (thread const uint8_t *)aux32;
4504
+
4505
+ float4 qf1, qf2;
4506
+
4507
+ for (int ib = ix; ib < nb; ib += 16) {
4508
+
4509
+ device const float4 * y4 = (device const float4 *)yb;
4510
+ yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
4511
+
4512
+ for (int row = 0; row < 2; ++row) {
4513
+
4514
+ device const block_iq4_nl & xb = x[row*nb + ib];
4515
+ device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
4516
+
4517
+ float4 acc1 = {0.f}, acc2 = {0.f};
4518
+
4519
+ aux32[0] = q4[0] | (q4[1] << 16);
4520
+ aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
4521
+ aux32[0] &= 0x0f0f0f0f;
4522
+ qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
4523
+ qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
4524
+ acc1 += yl[0] * qf1;
4525
+ acc2 += yl[1] * qf2;
4526
+
4527
+ aux32[0] = q4[2] | (q4[3] << 16);
4528
+ aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
4529
+ aux32[0] &= 0x0f0f0f0f;
4530
+ qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
4531
+ qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
4532
+ acc1 += yl[2] * qf1;
4533
+ acc2 += yl[3] * qf2;
4534
+
4535
+ acc1 += acc2;
4536
+
4537
+ sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
4538
+
4539
+ }
4540
+
4541
+ yb += 16 * QK4_NL;
4542
+ }
4543
+
4544
+ for (int row = 0; row < 2; ++row) {
4545
+ all_sum = simd_sum(sumf[row]);
4546
+ if (tiisg == 0) {
4547
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
4548
+ }
4549
+ }
4550
+ }
4551
+
4438
4552
  [[host_name("kernel_mul_mv_iq1_s_f32")]]
4439
4553
  kernel void kernel_mul_mv_iq1_s_f32(
4440
4554
  device const void * src0,
@@ -4463,6 +4577,34 @@ kernel void kernel_mul_mv_iq1_s_f32(
4463
4577
  kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
4464
4578
  }
4465
4579
 
4580
+ [[host_name("kernel_mul_mv_iq4_nl_f32")]]
4581
+ kernel void kernel_mul_mv_iq4_nl_f32(
4582
+ device const void * src0,
4583
+ device const float * src1,
4584
+ device float * dst,
4585
+ constant int64_t & ne00,
4586
+ constant int64_t & ne01,
4587
+ constant int64_t & ne02,
4588
+ constant uint64_t & nb00,
4589
+ constant uint64_t & nb01,
4590
+ constant uint64_t & nb02,
4591
+ constant int64_t & ne10,
4592
+ constant int64_t & ne11,
4593
+ constant int64_t & ne12,
4594
+ constant uint64_t & nb10,
4595
+ constant uint64_t & nb11,
4596
+ constant uint64_t & nb12,
4597
+ constant int64_t & ne0,
4598
+ constant int64_t & ne1,
4599
+ constant uint & r2,
4600
+ constant uint & r3,
4601
+ threadgroup float * shared_values [[threadgroup(0)]],
4602
+ uint3 tgpig[[threadgroup_position_in_grid]],
4603
+ uint tiisg[[thread_index_in_simdgroup]],
4604
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4605
+
4606
+ kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
4607
+ }
4466
4608
 
4467
4609
  //============================= templates and their specializations =============================
4468
4610
 
@@ -4659,6 +4801,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
4659
4801
  const float dl = d * sc[0];
4660
4802
  const float ml = min * sc[1];
4661
4803
  #else
4804
+ (void) get_scale_min_k4_just2;
4805
+
4662
4806
  q = q + 16 * (il&1);
4663
4807
  device const uint8_t * s = xb->scales;
4664
4808
  device const half2 * dh = (device const half2 *)xb->d;
@@ -4824,6 +4968,21 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
4824
4968
  }
4825
4969
  }
4826
4970
 
4971
+ template <typename type4x4>
4972
+ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
4973
+ device const uint16_t * q4 = (device const uint16_t *)xb->qs;
4974
+ const float d = xb->d;
4975
+ uint32_t aux32;
4976
+ thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
4977
+ for (int i = 0; i < 4; ++i) {
4978
+ aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
4979
+ reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
4980
+ reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
4981
+ reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
4982
+ reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
4983
+ }
4984
+ }
4985
+
4827
4986
  template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
4828
4987
  kernel void kernel_get_rows(
4829
4988
  device const void * src0,
@@ -5367,6 +5526,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
5367
5526
  template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5368
5527
  template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5369
5528
  template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
5529
+ template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
5370
5530
 
5371
5531
  //
5372
5532
  // matrix-matrix multiplication
@@ -5407,6 +5567,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
5407
5567
  template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5408
5568
  template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5409
5569
  template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
5570
+ template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
5410
5571
 
5411
5572
  //
5412
5573
  // indirect matrix-matrix multiplication
@@ -5459,6 +5620,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
5459
5620
  template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5460
5621
  template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
5461
5622
  template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
5623
+ template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
5462
5624
 
5463
5625
  //
5464
5626
  // matrix-vector multiplication
@@ -6489,3 +6651,68 @@ kernel void kernel_mul_mv_id_iq1_s_f32(
6489
6651
  tiisg,
6490
6652
  sgitg);
6491
6653
  }
6654
+
6655
+ [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
6656
+ kernel void kernel_mul_mv_id_iq4_nl_f32(
6657
+ device const char * ids,
6658
+ device const char * src1,
6659
+ device float * dst,
6660
+ constant uint64_t & nbi1,
6661
+ constant int64_t & ne00,
6662
+ constant int64_t & ne01,
6663
+ constant int64_t & ne02,
6664
+ constant uint64_t & nb00,
6665
+ constant uint64_t & nb01,
6666
+ constant uint64_t & nb02,
6667
+ constant int64_t & ne10,
6668
+ constant int64_t & ne11,
6669
+ constant int64_t & ne12,
6670
+ constant int64_t & ne13,
6671
+ constant uint64_t & nb10,
6672
+ constant uint64_t & nb11,
6673
+ constant uint64_t & nb12,
6674
+ constant int64_t & ne0,
6675
+ constant int64_t & ne1,
6676
+ constant uint64_t & nb1,
6677
+ constant uint & r2,
6678
+ constant uint & r3,
6679
+ constant int & idx,
6680
+ device const char * src00,
6681
+ device const char * src01,
6682
+ device const char * src02,
6683
+ device const char * src03,
6684
+ device const char * src04,
6685
+ device const char * src05,
6686
+ device const char * src06,
6687
+ device const char * src07,
6688
+ threadgroup float * shared_values [[threadgroup(0)]],
6689
+ uint3 tgpig[[threadgroup_position_in_grid]],
6690
+ uint tiitg[[thread_index_in_threadgroup]],
6691
+ uint tiisg[[thread_index_in_simdgroup]],
6692
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
6693
+ device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
6694
+
6695
+ const int64_t bid = tgpig.z/(ne12*ne13);
6696
+
6697
+ tgpig.z = tgpig.z%(ne12*ne13);
6698
+
6699
+ const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
6700
+
6701
+ kernel_mul_mv_iq4_nl_f32_impl(
6702
+ src0[id],
6703
+ (device const float *) (src1 + bid*nb11),
6704
+ dst + bid*ne0,
6705
+ ne00,
6706
+ ne01,
6707
+ ne02,
6708
+ ne10,
6709
+ ne12,
6710
+ ne0,
6711
+ ne1,
6712
+ r2,
6713
+ r3,
6714
+ shared_values,
6715
+ tgpig,
6716
+ tiisg,
6717
+ sgitg);
6718
+ }
Binary file
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "node-llama-cpp",
3
- "version": "2.8.7",
3
+ "version": "2.8.8",
4
4
  "description": "Run AI models locally on your machine with node.js bindings for llama.cpp. Force a JSON schema on the model output on the generation level",
5
5
  "main": "dist/index.js",
6
6
  "type": "module",