node-llama-cpp 2.8.7 → 2.8.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/llama/binariesGithubRelease.json +1 -1
- package/llama/gitRelease.bundle +0 -0
- package/llamaBins/linux-arm64/llama-addon.node +0 -0
- package/llamaBins/linux-armv7l/llama-addon.node +0 -0
- package/llamaBins/linux-x64/llama-addon.node +0 -0
- package/llamaBins/mac-arm64/ggml-metal.metal +236 -9
- package/llamaBins/mac-arm64/llama-addon.node +0 -0
- package/llamaBins/mac-x64/ggml-metal.metal +236 -9
- package/llamaBins/mac-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64/llama-addon.exp +0 -0
- package/llamaBins/win-x64/llama-addon.lib +0 -0
- package/llamaBins/win-x64/llama-addon.node +0 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
<div align="center">
|
|
2
|
-
<img alt="node-llama-cpp Logo" src="https://
|
|
2
|
+
<img alt="node-llama-cpp Logo" src="https://raw.githubusercontent.com/withcatai/node-llama-cpp/master/assets/logo.roundEdges.png" width="360px" />
|
|
3
3
|
<h1>node-llama-cpp</h1>
|
|
4
4
|
<p>Run AI models locally on your machine</p>
|
|
5
5
|
<sub>Pre-built bindings are provided with a fallback to building from source with cmake</sub>
|
|
@@ -84,7 +84,7 @@ To contribute to `node-llama-cpp` read the [contribution guide](https://withcata
|
|
|
84
84
|
<br />
|
|
85
85
|
|
|
86
86
|
<div align="center" width="360">
|
|
87
|
-
<img alt="Star please" src="https://
|
|
87
|
+
<img alt="Star please" src="https://raw.githubusercontent.com/withcatai/node-llama-cpp/master/assets/star.please.roundEdges.png" width="360" margin="auto" />
|
|
88
88
|
<br/>
|
|
89
89
|
<p align="right">
|
|
90
90
|
<i>If you like this repo, star it ✨</i>
|
package/llama/gitRelease.bundle
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
|
|
|
392
392
|
float lmax = -INFINITY;
|
|
393
393
|
|
|
394
394
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
395
|
-
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
|
395
|
+
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
396
396
|
}
|
|
397
397
|
|
|
398
398
|
// find the max value in the block
|
|
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
|
|
|
417
417
|
// parallel sum
|
|
418
418
|
float lsum = 0.0f;
|
|
419
419
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
420
|
-
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
|
420
|
+
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
421
421
|
lsum += exp_psrc0;
|
|
422
422
|
pdst[i00] = exp_psrc0;
|
|
423
423
|
}
|
|
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
|
|
|
495
495
|
float4 lmax4 = -INFINITY;
|
|
496
496
|
|
|
497
497
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
498
|
-
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
|
498
|
+
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
499
499
|
}
|
|
500
500
|
|
|
501
501
|
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
|
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
|
|
|
521
521
|
// parallel sum
|
|
522
522
|
float4 lsum4 = 0.0f;
|
|
523
523
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
524
|
-
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
|
524
|
+
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
525
525
|
lsum4 += exp_psrc4;
|
|
526
526
|
pdst4[i00] = exp_psrc4;
|
|
527
527
|
}
|
|
@@ -2531,6 +2531,12 @@ typedef struct {
|
|
|
2531
2531
|
uint8_t scales[QK_K/16];
|
|
2532
2532
|
} block_iq1_s;
|
|
2533
2533
|
|
|
2534
|
+
// Non-linear quants
|
|
2535
|
+
#define QK4_NL 32
|
|
2536
|
+
typedef struct {
|
|
2537
|
+
half d;
|
|
2538
|
+
uint8_t qs[QK4_NL/2];
|
|
2539
|
+
} block_iq4_nl;
|
|
2534
2540
|
|
|
2535
2541
|
//====================================== dot products =========================
|
|
2536
2542
|
|
|
@@ -4027,7 +4033,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|
|
4027
4033
|
y4 += 32 * 32;
|
|
4028
4034
|
}
|
|
4029
4035
|
#else
|
|
4030
|
-
|
|
4036
|
+
(void) x;
|
|
4037
|
+
(void) y;
|
|
4038
|
+
(void) yl;
|
|
4039
|
+
(void) nb32;
|
|
4031
4040
|
#endif
|
|
4032
4041
|
|
|
4033
4042
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4170,7 +4179,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|
|
4170
4179
|
y4 += 32 * 32;
|
|
4171
4180
|
}
|
|
4172
4181
|
#else
|
|
4173
|
-
|
|
4182
|
+
(void) x;
|
|
4183
|
+
(void) y;
|
|
4184
|
+
(void) yl;
|
|
4185
|
+
(void) nb32;
|
|
4174
4186
|
#endif
|
|
4175
4187
|
|
|
4176
4188
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4306,7 +4318,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|
|
4306
4318
|
y4 += 32 * 32;
|
|
4307
4319
|
}
|
|
4308
4320
|
#else
|
|
4309
|
-
|
|
4321
|
+
(void) x;
|
|
4322
|
+
(void) y;
|
|
4323
|
+
(void) yl;
|
|
4324
|
+
(void) nb32;
|
|
4310
4325
|
#endif
|
|
4311
4326
|
|
|
4312
4327
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4375,7 +4390,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4375
4390
|
const uint i13 = im/ne12;
|
|
4376
4391
|
|
|
4377
4392
|
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4378
|
-
|
|
4379
4393
|
device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
|
|
4380
4394
|
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4381
4395
|
|
|
@@ -4424,7 +4438,10 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4424
4438
|
y4 += 16 * 32;
|
|
4425
4439
|
}
|
|
4426
4440
|
#else
|
|
4427
|
-
|
|
4441
|
+
(void) x;
|
|
4442
|
+
(void) y;
|
|
4443
|
+
(void) yl;
|
|
4444
|
+
(void) nb32;
|
|
4428
4445
|
#endif
|
|
4429
4446
|
|
|
4430
4447
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4435,6 +4452,103 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4435
4452
|
}
|
|
4436
4453
|
}
|
|
4437
4454
|
|
|
4455
|
+
constexpr constant static float kvalues_iq4nl_f[16] = {
|
|
4456
|
+
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
|
|
4457
|
+
};
|
|
4458
|
+
|
|
4459
|
+
void kernel_mul_mv_iq4_nl_f32_impl(
|
|
4460
|
+
device const void * src0,
|
|
4461
|
+
device const float * src1,
|
|
4462
|
+
device float * dst,
|
|
4463
|
+
constant int64_t & ne00,
|
|
4464
|
+
constant int64_t & ne01,
|
|
4465
|
+
constant int64_t & ne02,
|
|
4466
|
+
constant int64_t & ne10,
|
|
4467
|
+
constant int64_t & ne12,
|
|
4468
|
+
constant int64_t & ne0,
|
|
4469
|
+
constant int64_t & ne1,
|
|
4470
|
+
constant uint & r2,
|
|
4471
|
+
constant uint & r3,
|
|
4472
|
+
threadgroup float * shared_values [[threadgroup(0)]],
|
|
4473
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4474
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4475
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4476
|
+
|
|
4477
|
+
const int nb = ne00/QK4_NL;
|
|
4478
|
+
const int r0 = tgpig.x;
|
|
4479
|
+
const int r1 = tgpig.y;
|
|
4480
|
+
const int im = tgpig.z;
|
|
4481
|
+
const int first_row = (r0 * 2 + sgitg) * 2;
|
|
4482
|
+
const int ib_row = first_row * nb;
|
|
4483
|
+
|
|
4484
|
+
const uint i12 = im%ne12;
|
|
4485
|
+
const uint i13 = im/ne12;
|
|
4486
|
+
|
|
4487
|
+
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4488
|
+
device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
|
|
4489
|
+
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4490
|
+
|
|
4491
|
+
const int ix = tiisg/2; // 0...15
|
|
4492
|
+
const int it = tiisg%2; // 0 or 1
|
|
4493
|
+
|
|
4494
|
+
shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
|
|
4495
|
+
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4496
|
+
|
|
4497
|
+
float4 yl[4];
|
|
4498
|
+
float sumf[2]={0.f}, all_sum;
|
|
4499
|
+
|
|
4500
|
+
device const float * yb = y + ix * QK4_NL + it * 8;
|
|
4501
|
+
|
|
4502
|
+
uint32_t aux32[2];
|
|
4503
|
+
thread const uint8_t * q8 = (thread const uint8_t *)aux32;
|
|
4504
|
+
|
|
4505
|
+
float4 qf1, qf2;
|
|
4506
|
+
|
|
4507
|
+
for (int ib = ix; ib < nb; ib += 16) {
|
|
4508
|
+
|
|
4509
|
+
device const float4 * y4 = (device const float4 *)yb;
|
|
4510
|
+
yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
|
|
4511
|
+
|
|
4512
|
+
for (int row = 0; row < 2; ++row) {
|
|
4513
|
+
|
|
4514
|
+
device const block_iq4_nl & xb = x[row*nb + ib];
|
|
4515
|
+
device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
|
|
4516
|
+
|
|
4517
|
+
float4 acc1 = {0.f}, acc2 = {0.f};
|
|
4518
|
+
|
|
4519
|
+
aux32[0] = q4[0] | (q4[1] << 16);
|
|
4520
|
+
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
|
4521
|
+
aux32[0] &= 0x0f0f0f0f;
|
|
4522
|
+
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
|
4523
|
+
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
|
4524
|
+
acc1 += yl[0] * qf1;
|
|
4525
|
+
acc2 += yl[1] * qf2;
|
|
4526
|
+
|
|
4527
|
+
aux32[0] = q4[2] | (q4[3] << 16);
|
|
4528
|
+
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
|
4529
|
+
aux32[0] &= 0x0f0f0f0f;
|
|
4530
|
+
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
|
4531
|
+
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
|
4532
|
+
acc1 += yl[2] * qf1;
|
|
4533
|
+
acc2 += yl[3] * qf2;
|
|
4534
|
+
|
|
4535
|
+
acc1 += acc2;
|
|
4536
|
+
|
|
4537
|
+
sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
|
|
4538
|
+
|
|
4539
|
+
}
|
|
4540
|
+
|
|
4541
|
+
yb += 16 * QK4_NL;
|
|
4542
|
+
}
|
|
4543
|
+
|
|
4544
|
+
for (int row = 0; row < 2; ++row) {
|
|
4545
|
+
all_sum = simd_sum(sumf[row]);
|
|
4546
|
+
if (tiisg == 0) {
|
|
4547
|
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
|
4548
|
+
}
|
|
4549
|
+
}
|
|
4550
|
+
}
|
|
4551
|
+
|
|
4438
4552
|
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
|
4439
4553
|
kernel void kernel_mul_mv_iq1_s_f32(
|
|
4440
4554
|
device const void * src0,
|
|
@@ -4463,6 +4577,34 @@ kernel void kernel_mul_mv_iq1_s_f32(
|
|
|
4463
4577
|
kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
|
|
4464
4578
|
}
|
|
4465
4579
|
|
|
4580
|
+
[[host_name("kernel_mul_mv_iq4_nl_f32")]]
|
|
4581
|
+
kernel void kernel_mul_mv_iq4_nl_f32(
|
|
4582
|
+
device const void * src0,
|
|
4583
|
+
device const float * src1,
|
|
4584
|
+
device float * dst,
|
|
4585
|
+
constant int64_t & ne00,
|
|
4586
|
+
constant int64_t & ne01,
|
|
4587
|
+
constant int64_t & ne02,
|
|
4588
|
+
constant uint64_t & nb00,
|
|
4589
|
+
constant uint64_t & nb01,
|
|
4590
|
+
constant uint64_t & nb02,
|
|
4591
|
+
constant int64_t & ne10,
|
|
4592
|
+
constant int64_t & ne11,
|
|
4593
|
+
constant int64_t & ne12,
|
|
4594
|
+
constant uint64_t & nb10,
|
|
4595
|
+
constant uint64_t & nb11,
|
|
4596
|
+
constant uint64_t & nb12,
|
|
4597
|
+
constant int64_t & ne0,
|
|
4598
|
+
constant int64_t & ne1,
|
|
4599
|
+
constant uint & r2,
|
|
4600
|
+
constant uint & r3,
|
|
4601
|
+
threadgroup float * shared_values [[threadgroup(0)]],
|
|
4602
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4603
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4604
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4605
|
+
|
|
4606
|
+
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4607
|
+
}
|
|
4466
4608
|
|
|
4467
4609
|
//============================= templates and their specializations =============================
|
|
4468
4610
|
|
|
@@ -4659,6 +4801,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
|
|
4659
4801
|
const float dl = d * sc[0];
|
|
4660
4802
|
const float ml = min * sc[1];
|
|
4661
4803
|
#else
|
|
4804
|
+
(void) get_scale_min_k4_just2;
|
|
4805
|
+
|
|
4662
4806
|
q = q + 16 * (il&1);
|
|
4663
4807
|
device const uint8_t * s = xb->scales;
|
|
4664
4808
|
device const half2 * dh = (device const half2 *)xb->d;
|
|
@@ -4824,6 +4968,21 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
|
|
|
4824
4968
|
}
|
|
4825
4969
|
}
|
|
4826
4970
|
|
|
4971
|
+
template <typename type4x4>
|
|
4972
|
+
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
|
|
4973
|
+
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
|
4974
|
+
const float d = xb->d;
|
|
4975
|
+
uint32_t aux32;
|
|
4976
|
+
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
|
4977
|
+
for (int i = 0; i < 4; ++i) {
|
|
4978
|
+
aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
|
|
4979
|
+
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
|
4980
|
+
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
|
4981
|
+
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
|
4982
|
+
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
|
4983
|
+
}
|
|
4984
|
+
}
|
|
4985
|
+
|
|
4827
4986
|
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
|
4828
4987
|
kernel void kernel_get_rows(
|
|
4829
4988
|
device const void * src0,
|
|
@@ -5367,6 +5526,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
|
|
|
5367
5526
|
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5368
5527
|
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5369
5528
|
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5529
|
+
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5370
5530
|
|
|
5371
5531
|
//
|
|
5372
5532
|
// matrix-matrix multiplication
|
|
@@ -5407,6 +5567,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
|
|
|
5407
5567
|
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5408
5568
|
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5409
5569
|
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5570
|
+
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5410
5571
|
|
|
5411
5572
|
//
|
|
5412
5573
|
// indirect matrix-matrix multiplication
|
|
@@ -5459,6 +5620,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
|
|
|
5459
5620
|
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5460
5621
|
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5461
5622
|
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5623
|
+
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5462
5624
|
|
|
5463
5625
|
//
|
|
5464
5626
|
// matrix-vector multiplication
|
|
@@ -6489,3 +6651,68 @@ kernel void kernel_mul_mv_id_iq1_s_f32(
|
|
|
6489
6651
|
tiisg,
|
|
6490
6652
|
sgitg);
|
|
6491
6653
|
}
|
|
6654
|
+
|
|
6655
|
+
[[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
|
|
6656
|
+
kernel void kernel_mul_mv_id_iq4_nl_f32(
|
|
6657
|
+
device const char * ids,
|
|
6658
|
+
device const char * src1,
|
|
6659
|
+
device float * dst,
|
|
6660
|
+
constant uint64_t & nbi1,
|
|
6661
|
+
constant int64_t & ne00,
|
|
6662
|
+
constant int64_t & ne01,
|
|
6663
|
+
constant int64_t & ne02,
|
|
6664
|
+
constant uint64_t & nb00,
|
|
6665
|
+
constant uint64_t & nb01,
|
|
6666
|
+
constant uint64_t & nb02,
|
|
6667
|
+
constant int64_t & ne10,
|
|
6668
|
+
constant int64_t & ne11,
|
|
6669
|
+
constant int64_t & ne12,
|
|
6670
|
+
constant int64_t & ne13,
|
|
6671
|
+
constant uint64_t & nb10,
|
|
6672
|
+
constant uint64_t & nb11,
|
|
6673
|
+
constant uint64_t & nb12,
|
|
6674
|
+
constant int64_t & ne0,
|
|
6675
|
+
constant int64_t & ne1,
|
|
6676
|
+
constant uint64_t & nb1,
|
|
6677
|
+
constant uint & r2,
|
|
6678
|
+
constant uint & r3,
|
|
6679
|
+
constant int & idx,
|
|
6680
|
+
device const char * src00,
|
|
6681
|
+
device const char * src01,
|
|
6682
|
+
device const char * src02,
|
|
6683
|
+
device const char * src03,
|
|
6684
|
+
device const char * src04,
|
|
6685
|
+
device const char * src05,
|
|
6686
|
+
device const char * src06,
|
|
6687
|
+
device const char * src07,
|
|
6688
|
+
threadgroup float * shared_values [[threadgroup(0)]],
|
|
6689
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
6690
|
+
uint tiitg[[thread_index_in_threadgroup]],
|
|
6691
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
6692
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
6693
|
+
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
|
6694
|
+
|
|
6695
|
+
const int64_t bid = tgpig.z/(ne12*ne13);
|
|
6696
|
+
|
|
6697
|
+
tgpig.z = tgpig.z%(ne12*ne13);
|
|
6698
|
+
|
|
6699
|
+
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
|
6700
|
+
|
|
6701
|
+
kernel_mul_mv_iq4_nl_f32_impl(
|
|
6702
|
+
src0[id],
|
|
6703
|
+
(device const float *) (src1 + bid*nb11),
|
|
6704
|
+
dst + bid*ne0,
|
|
6705
|
+
ne00,
|
|
6706
|
+
ne01,
|
|
6707
|
+
ne02,
|
|
6708
|
+
ne10,
|
|
6709
|
+
ne12,
|
|
6710
|
+
ne0,
|
|
6711
|
+
ne1,
|
|
6712
|
+
r2,
|
|
6713
|
+
r3,
|
|
6714
|
+
shared_values,
|
|
6715
|
+
tgpig,
|
|
6716
|
+
tiisg,
|
|
6717
|
+
sgitg);
|
|
6718
|
+
}
|
|
Binary file
|
|
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
|
|
|
392
392
|
float lmax = -INFINITY;
|
|
393
393
|
|
|
394
394
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
395
|
-
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
|
395
|
+
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
396
396
|
}
|
|
397
397
|
|
|
398
398
|
// find the max value in the block
|
|
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
|
|
|
417
417
|
// parallel sum
|
|
418
418
|
float lsum = 0.0f;
|
|
419
419
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
420
|
-
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
|
420
|
+
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
421
421
|
lsum += exp_psrc0;
|
|
422
422
|
pdst[i00] = exp_psrc0;
|
|
423
423
|
}
|
|
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
|
|
|
495
495
|
float4 lmax4 = -INFINITY;
|
|
496
496
|
|
|
497
497
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
498
|
-
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
|
498
|
+
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
499
499
|
}
|
|
500
500
|
|
|
501
501
|
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
|
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
|
|
|
521
521
|
// parallel sum
|
|
522
522
|
float4 lsum4 = 0.0f;
|
|
523
523
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
524
|
-
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
|
524
|
+
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
525
525
|
lsum4 += exp_psrc4;
|
|
526
526
|
pdst4[i00] = exp_psrc4;
|
|
527
527
|
}
|
|
@@ -2531,6 +2531,12 @@ typedef struct {
|
|
|
2531
2531
|
uint8_t scales[QK_K/16];
|
|
2532
2532
|
} block_iq1_s;
|
|
2533
2533
|
|
|
2534
|
+
// Non-linear quants
|
|
2535
|
+
#define QK4_NL 32
|
|
2536
|
+
typedef struct {
|
|
2537
|
+
half d;
|
|
2538
|
+
uint8_t qs[QK4_NL/2];
|
|
2539
|
+
} block_iq4_nl;
|
|
2534
2540
|
|
|
2535
2541
|
//====================================== dot products =========================
|
|
2536
2542
|
|
|
@@ -4027,7 +4033,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|
|
4027
4033
|
y4 += 32 * 32;
|
|
4028
4034
|
}
|
|
4029
4035
|
#else
|
|
4030
|
-
|
|
4036
|
+
(void) x;
|
|
4037
|
+
(void) y;
|
|
4038
|
+
(void) yl;
|
|
4039
|
+
(void) nb32;
|
|
4031
4040
|
#endif
|
|
4032
4041
|
|
|
4033
4042
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4170,7 +4179,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|
|
4170
4179
|
y4 += 32 * 32;
|
|
4171
4180
|
}
|
|
4172
4181
|
#else
|
|
4173
|
-
|
|
4182
|
+
(void) x;
|
|
4183
|
+
(void) y;
|
|
4184
|
+
(void) yl;
|
|
4185
|
+
(void) nb32;
|
|
4174
4186
|
#endif
|
|
4175
4187
|
|
|
4176
4188
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4306,7 +4318,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|
|
4306
4318
|
y4 += 32 * 32;
|
|
4307
4319
|
}
|
|
4308
4320
|
#else
|
|
4309
|
-
|
|
4321
|
+
(void) x;
|
|
4322
|
+
(void) y;
|
|
4323
|
+
(void) yl;
|
|
4324
|
+
(void) nb32;
|
|
4310
4325
|
#endif
|
|
4311
4326
|
|
|
4312
4327
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4375,7 +4390,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4375
4390
|
const uint i13 = im/ne12;
|
|
4376
4391
|
|
|
4377
4392
|
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4378
|
-
|
|
4379
4393
|
device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
|
|
4380
4394
|
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4381
4395
|
|
|
@@ -4424,7 +4438,10 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4424
4438
|
y4 += 16 * 32;
|
|
4425
4439
|
}
|
|
4426
4440
|
#else
|
|
4427
|
-
|
|
4441
|
+
(void) x;
|
|
4442
|
+
(void) y;
|
|
4443
|
+
(void) yl;
|
|
4444
|
+
(void) nb32;
|
|
4428
4445
|
#endif
|
|
4429
4446
|
|
|
4430
4447
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4435,6 +4452,103 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4435
4452
|
}
|
|
4436
4453
|
}
|
|
4437
4454
|
|
|
4455
|
+
constexpr constant static float kvalues_iq4nl_f[16] = {
|
|
4456
|
+
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
|
|
4457
|
+
};
|
|
4458
|
+
|
|
4459
|
+
void kernel_mul_mv_iq4_nl_f32_impl(
|
|
4460
|
+
device const void * src0,
|
|
4461
|
+
device const float * src1,
|
|
4462
|
+
device float * dst,
|
|
4463
|
+
constant int64_t & ne00,
|
|
4464
|
+
constant int64_t & ne01,
|
|
4465
|
+
constant int64_t & ne02,
|
|
4466
|
+
constant int64_t & ne10,
|
|
4467
|
+
constant int64_t & ne12,
|
|
4468
|
+
constant int64_t & ne0,
|
|
4469
|
+
constant int64_t & ne1,
|
|
4470
|
+
constant uint & r2,
|
|
4471
|
+
constant uint & r3,
|
|
4472
|
+
threadgroup float * shared_values [[threadgroup(0)]],
|
|
4473
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4474
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4475
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4476
|
+
|
|
4477
|
+
const int nb = ne00/QK4_NL;
|
|
4478
|
+
const int r0 = tgpig.x;
|
|
4479
|
+
const int r1 = tgpig.y;
|
|
4480
|
+
const int im = tgpig.z;
|
|
4481
|
+
const int first_row = (r0 * 2 + sgitg) * 2;
|
|
4482
|
+
const int ib_row = first_row * nb;
|
|
4483
|
+
|
|
4484
|
+
const uint i12 = im%ne12;
|
|
4485
|
+
const uint i13 = im/ne12;
|
|
4486
|
+
|
|
4487
|
+
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4488
|
+
device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
|
|
4489
|
+
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4490
|
+
|
|
4491
|
+
const int ix = tiisg/2; // 0...15
|
|
4492
|
+
const int it = tiisg%2; // 0 or 1
|
|
4493
|
+
|
|
4494
|
+
shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
|
|
4495
|
+
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4496
|
+
|
|
4497
|
+
float4 yl[4];
|
|
4498
|
+
float sumf[2]={0.f}, all_sum;
|
|
4499
|
+
|
|
4500
|
+
device const float * yb = y + ix * QK4_NL + it * 8;
|
|
4501
|
+
|
|
4502
|
+
uint32_t aux32[2];
|
|
4503
|
+
thread const uint8_t * q8 = (thread const uint8_t *)aux32;
|
|
4504
|
+
|
|
4505
|
+
float4 qf1, qf2;
|
|
4506
|
+
|
|
4507
|
+
for (int ib = ix; ib < nb; ib += 16) {
|
|
4508
|
+
|
|
4509
|
+
device const float4 * y4 = (device const float4 *)yb;
|
|
4510
|
+
yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
|
|
4511
|
+
|
|
4512
|
+
for (int row = 0; row < 2; ++row) {
|
|
4513
|
+
|
|
4514
|
+
device const block_iq4_nl & xb = x[row*nb + ib];
|
|
4515
|
+
device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
|
|
4516
|
+
|
|
4517
|
+
float4 acc1 = {0.f}, acc2 = {0.f};
|
|
4518
|
+
|
|
4519
|
+
aux32[0] = q4[0] | (q4[1] << 16);
|
|
4520
|
+
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
|
4521
|
+
aux32[0] &= 0x0f0f0f0f;
|
|
4522
|
+
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
|
4523
|
+
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
|
4524
|
+
acc1 += yl[0] * qf1;
|
|
4525
|
+
acc2 += yl[1] * qf2;
|
|
4526
|
+
|
|
4527
|
+
aux32[0] = q4[2] | (q4[3] << 16);
|
|
4528
|
+
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
|
4529
|
+
aux32[0] &= 0x0f0f0f0f;
|
|
4530
|
+
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
|
4531
|
+
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
|
4532
|
+
acc1 += yl[2] * qf1;
|
|
4533
|
+
acc2 += yl[3] * qf2;
|
|
4534
|
+
|
|
4535
|
+
acc1 += acc2;
|
|
4536
|
+
|
|
4537
|
+
sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
|
|
4538
|
+
|
|
4539
|
+
}
|
|
4540
|
+
|
|
4541
|
+
yb += 16 * QK4_NL;
|
|
4542
|
+
}
|
|
4543
|
+
|
|
4544
|
+
for (int row = 0; row < 2; ++row) {
|
|
4545
|
+
all_sum = simd_sum(sumf[row]);
|
|
4546
|
+
if (tiisg == 0) {
|
|
4547
|
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
|
4548
|
+
}
|
|
4549
|
+
}
|
|
4550
|
+
}
|
|
4551
|
+
|
|
4438
4552
|
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
|
4439
4553
|
kernel void kernel_mul_mv_iq1_s_f32(
|
|
4440
4554
|
device const void * src0,
|
|
@@ -4463,6 +4577,34 @@ kernel void kernel_mul_mv_iq1_s_f32(
|
|
|
4463
4577
|
kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
|
|
4464
4578
|
}
|
|
4465
4579
|
|
|
4580
|
+
[[host_name("kernel_mul_mv_iq4_nl_f32")]]
|
|
4581
|
+
kernel void kernel_mul_mv_iq4_nl_f32(
|
|
4582
|
+
device const void * src0,
|
|
4583
|
+
device const float * src1,
|
|
4584
|
+
device float * dst,
|
|
4585
|
+
constant int64_t & ne00,
|
|
4586
|
+
constant int64_t & ne01,
|
|
4587
|
+
constant int64_t & ne02,
|
|
4588
|
+
constant uint64_t & nb00,
|
|
4589
|
+
constant uint64_t & nb01,
|
|
4590
|
+
constant uint64_t & nb02,
|
|
4591
|
+
constant int64_t & ne10,
|
|
4592
|
+
constant int64_t & ne11,
|
|
4593
|
+
constant int64_t & ne12,
|
|
4594
|
+
constant uint64_t & nb10,
|
|
4595
|
+
constant uint64_t & nb11,
|
|
4596
|
+
constant uint64_t & nb12,
|
|
4597
|
+
constant int64_t & ne0,
|
|
4598
|
+
constant int64_t & ne1,
|
|
4599
|
+
constant uint & r2,
|
|
4600
|
+
constant uint & r3,
|
|
4601
|
+
threadgroup float * shared_values [[threadgroup(0)]],
|
|
4602
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4603
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4604
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4605
|
+
|
|
4606
|
+
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4607
|
+
}
|
|
4466
4608
|
|
|
4467
4609
|
//============================= templates and their specializations =============================
|
|
4468
4610
|
|
|
@@ -4659,6 +4801,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
|
|
4659
4801
|
const float dl = d * sc[0];
|
|
4660
4802
|
const float ml = min * sc[1];
|
|
4661
4803
|
#else
|
|
4804
|
+
(void) get_scale_min_k4_just2;
|
|
4805
|
+
|
|
4662
4806
|
q = q + 16 * (il&1);
|
|
4663
4807
|
device const uint8_t * s = xb->scales;
|
|
4664
4808
|
device const half2 * dh = (device const half2 *)xb->d;
|
|
@@ -4824,6 +4968,21 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
|
|
|
4824
4968
|
}
|
|
4825
4969
|
}
|
|
4826
4970
|
|
|
4971
|
+
template <typename type4x4>
|
|
4972
|
+
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
|
|
4973
|
+
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
|
4974
|
+
const float d = xb->d;
|
|
4975
|
+
uint32_t aux32;
|
|
4976
|
+
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
|
4977
|
+
for (int i = 0; i < 4; ++i) {
|
|
4978
|
+
aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
|
|
4979
|
+
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
|
4980
|
+
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
|
4981
|
+
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
|
4982
|
+
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
|
4983
|
+
}
|
|
4984
|
+
}
|
|
4985
|
+
|
|
4827
4986
|
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
|
4828
4987
|
kernel void kernel_get_rows(
|
|
4829
4988
|
device const void * src0,
|
|
@@ -5367,6 +5526,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
|
|
|
5367
5526
|
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5368
5527
|
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5369
5528
|
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5529
|
+
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5370
5530
|
|
|
5371
5531
|
//
|
|
5372
5532
|
// matrix-matrix multiplication
|
|
@@ -5407,6 +5567,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
|
|
|
5407
5567
|
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5408
5568
|
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5409
5569
|
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5570
|
+
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5410
5571
|
|
|
5411
5572
|
//
|
|
5412
5573
|
// indirect matrix-matrix multiplication
|
|
@@ -5459,6 +5620,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
|
|
|
5459
5620
|
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5460
5621
|
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5461
5622
|
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5623
|
+
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5462
5624
|
|
|
5463
5625
|
//
|
|
5464
5626
|
// matrix-vector multiplication
|
|
@@ -6489,3 +6651,68 @@ kernel void kernel_mul_mv_id_iq1_s_f32(
|
|
|
6489
6651
|
tiisg,
|
|
6490
6652
|
sgitg);
|
|
6491
6653
|
}
|
|
6654
|
+
|
|
6655
|
+
[[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
|
|
6656
|
+
kernel void kernel_mul_mv_id_iq4_nl_f32(
|
|
6657
|
+
device const char * ids,
|
|
6658
|
+
device const char * src1,
|
|
6659
|
+
device float * dst,
|
|
6660
|
+
constant uint64_t & nbi1,
|
|
6661
|
+
constant int64_t & ne00,
|
|
6662
|
+
constant int64_t & ne01,
|
|
6663
|
+
constant int64_t & ne02,
|
|
6664
|
+
constant uint64_t & nb00,
|
|
6665
|
+
constant uint64_t & nb01,
|
|
6666
|
+
constant uint64_t & nb02,
|
|
6667
|
+
constant int64_t & ne10,
|
|
6668
|
+
constant int64_t & ne11,
|
|
6669
|
+
constant int64_t & ne12,
|
|
6670
|
+
constant int64_t & ne13,
|
|
6671
|
+
constant uint64_t & nb10,
|
|
6672
|
+
constant uint64_t & nb11,
|
|
6673
|
+
constant uint64_t & nb12,
|
|
6674
|
+
constant int64_t & ne0,
|
|
6675
|
+
constant int64_t & ne1,
|
|
6676
|
+
constant uint64_t & nb1,
|
|
6677
|
+
constant uint & r2,
|
|
6678
|
+
constant uint & r3,
|
|
6679
|
+
constant int & idx,
|
|
6680
|
+
device const char * src00,
|
|
6681
|
+
device const char * src01,
|
|
6682
|
+
device const char * src02,
|
|
6683
|
+
device const char * src03,
|
|
6684
|
+
device const char * src04,
|
|
6685
|
+
device const char * src05,
|
|
6686
|
+
device const char * src06,
|
|
6687
|
+
device const char * src07,
|
|
6688
|
+
threadgroup float * shared_values [[threadgroup(0)]],
|
|
6689
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
6690
|
+
uint tiitg[[thread_index_in_threadgroup]],
|
|
6691
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
6692
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
6693
|
+
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
|
6694
|
+
|
|
6695
|
+
const int64_t bid = tgpig.z/(ne12*ne13);
|
|
6696
|
+
|
|
6697
|
+
tgpig.z = tgpig.z%(ne12*ne13);
|
|
6698
|
+
|
|
6699
|
+
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
|
6700
|
+
|
|
6701
|
+
kernel_mul_mv_iq4_nl_f32_impl(
|
|
6702
|
+
src0[id],
|
|
6703
|
+
(device const float *) (src1 + bid*nb11),
|
|
6704
|
+
dst + bid*ne0,
|
|
6705
|
+
ne00,
|
|
6706
|
+
ne01,
|
|
6707
|
+
ne02,
|
|
6708
|
+
ne10,
|
|
6709
|
+
ne12,
|
|
6710
|
+
ne0,
|
|
6711
|
+
ne1,
|
|
6712
|
+
r2,
|
|
6713
|
+
r3,
|
|
6714
|
+
shared_values,
|
|
6715
|
+
tgpig,
|
|
6716
|
+
tiisg,
|
|
6717
|
+
sgitg);
|
|
6718
|
+
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "node-llama-cpp",
|
|
3
|
-
"version": "2.8.7",
|
|
3
|
+
"version": "2.8.8",
|
|
4
4
|
"description": "Run AI models locally on your machine with node.js bindings for llama.cpp. Force a JSON schema on the model output on the generation level",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"type": "module",
|