node-llama-cpp 3.0.0-beta.10 → 3.0.0-beta.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/bindings/AddonTypes.d.ts +3 -0
- package/dist/bindings/Llama.d.ts +1 -0
- package/dist/bindings/Llama.js +7 -1
- package/dist/bindings/Llama.js.map +1 -1
- package/dist/bindings/getLlama.d.ts +24 -1
- package/dist/bindings/getLlama.js +10 -4
- package/dist/bindings/getLlama.js.map +1 -1
- package/dist/bindings/types.d.ts +1 -0
- package/dist/bindings/types.js.map +1 -1
- package/dist/bindings/utils/compileLLamaCpp.js +2 -0
- package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js +2 -0
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js.map +1 -1
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.d.ts +26 -0
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js +43 -0
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js.map +1 -0
- package/dist/bindings/utils/resolveCustomCmakeOptions.js +2 -0
- package/dist/bindings/utils/resolveCustomCmakeOptions.js.map +1 -1
- package/dist/cli/cli.js +4 -0
- package/dist/cli/cli.js.map +1 -1
- package/dist/cli/commands/BuildCommand.d.ts +2 -1
- package/dist/cli/commands/BuildCommand.js +11 -9
- package/dist/cli/commands/BuildCommand.js.map +1 -1
- package/dist/cli/commands/ChatCommand.d.ts +2 -2
- package/dist/cli/commands/ChatCommand.js +3 -39
- package/dist/cli/commands/ChatCommand.js.map +1 -1
- package/dist/cli/commands/CompleteCommand.d.ts +25 -0
- package/dist/cli/commands/CompleteCommand.js +278 -0
- package/dist/cli/commands/CompleteCommand.js.map +1 -0
- package/dist/cli/commands/DebugCommand.js +16 -13
- package/dist/cli/commands/DebugCommand.js.map +1 -1
- package/dist/cli/commands/DownloadCommand.d.ts +2 -1
- package/dist/cli/commands/DownloadCommand.js +11 -9
- package/dist/cli/commands/DownloadCommand.js.map +1 -1
- package/dist/cli/commands/InfillCommand.d.ts +27 -0
- package/dist/cli/commands/InfillCommand.js +316 -0
- package/dist/cli/commands/InfillCommand.js.map +1 -0
- package/dist/cli/utils/logEnabledComputeLayers.d.ts +8 -0
- package/dist/cli/utils/logEnabledComputeLayers.js +11 -0
- package/dist/cli/utils/logEnabledComputeLayers.js.map +1 -0
- package/dist/config.d.ts +1 -0
- package/dist/config.js +5 -2
- package/dist/config.js.map +1 -1
- package/dist/consts.d.ts +1 -0
- package/dist/consts.js +2 -0
- package/dist/consts.js.map +1 -0
- package/dist/evaluator/LlamaChat/LlamaChat.d.ts +2 -33
- package/dist/evaluator/LlamaChat/LlamaChat.js +7 -28
- package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
- package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +1 -1
- package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
- package/dist/evaluator/LlamaCompletion.d.ts +148 -0
- package/dist/evaluator/LlamaCompletion.js +402 -0
- package/dist/evaluator/LlamaCompletion.js.map +1 -0
- package/dist/evaluator/LlamaContext/LlamaContext.js +6 -2
- package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
- package/dist/evaluator/LlamaModel.d.ts +10 -1
- package/dist/evaluator/LlamaModel.js +33 -3
- package/dist/evaluator/LlamaModel.js.map +1 -1
- package/dist/index.d.ts +6 -4
- package/dist/index.js +4 -2
- package/dist/index.js.map +1 -1
- package/dist/types.d.ts +31 -0
- package/dist/utils/UnsupportedError.d.ts +2 -0
- package/dist/utils/UnsupportedError.js +7 -0
- package/dist/utils/UnsupportedError.js.map +1 -0
- package/dist/utils/gbnfJson/terminals/GbnfArray.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfBoolean.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfBoolean.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfBooleanValue.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfGrammar.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNull.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNull.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNumber.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNumber.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNumberValue.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfObjectMap.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfOr.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfString.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfString.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfStringValue.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfVerbatimText.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfWhitespace.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfWhitespace.js.map +1 -1
- package/dist/utils/getBuildDefaults.d.ts +1 -0
- package/dist/utils/getBuildDefaults.js +3 -2
- package/dist/utils/getBuildDefaults.js.map +1 -1
- package/dist/utils/getQueuedTokensBeforeStopTrigger.d.ts +6 -0
- package/dist/utils/getQueuedTokensBeforeStopTrigger.js +22 -0
- package/dist/utils/getQueuedTokensBeforeStopTrigger.js.map +1 -0
- package/llama/CMakeLists.txt +20 -0
- package/llama/addon.cpp +97 -12
- package/llama/binariesGithubRelease.json +1 -1
- package/llama/gitRelease.bundle +0 -0
- package/llama/gpuInfo/cuda-gpu-info.cu +5 -5
- package/llama/gpuInfo/cuda-gpu-info.h +2 -2
- package/llama/gpuInfo/vulkan-gpu-info.cpp +65 -0
- package/llama/gpuInfo/vulkan-gpu-info.h +7 -0
- package/llama/llama.cpp.info.json +1 -1
- package/llamaBins/linux-arm64/.buildMetadata.json +1 -1
- package/llamaBins/linux-arm64/llama-addon.node +0 -0
- package/llamaBins/linux-armv7l/.buildMetadata.json +1 -1
- package/llamaBins/linux-armv7l/llama-addon.node +0 -0
- package/llamaBins/linux-x64/.buildMetadata.json +1 -1
- package/llamaBins/linux-x64/llama-addon.node +0 -0
- package/llamaBins/linux-x64-cuda/.buildMetadata.json +1 -1
- package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/linux-x64-vulkan/.buildMetadata.json +1 -0
- package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
- package/llamaBins/mac-arm64-metal/.buildMetadata.json +1 -1
- package/llamaBins/mac-arm64-metal/ggml-metal.metal +1035 -132
- package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
- package/llamaBins/mac-x64/.buildMetadata.json +1 -1
- package/llamaBins/mac-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64/.buildMetadata.json +1 -1
- package/llamaBins/win-x64/llama-addon.exp +0 -0
- package/llamaBins/win-x64/llama-addon.lib +0 -0
- package/llamaBins/win-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64-cuda/.buildMetadata.json +1 -1
- package/llamaBins/win-x64-cuda/llama-addon.exp +0 -0
- package/llamaBins/win-x64-cuda/llama-addon.lib +0 -0
- package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/win-x64-vulkan/.buildMetadata.json +1 -0
- package/llamaBins/win-x64-vulkan/llama-addon.exp +0 -0
- package/llamaBins/win-x64-vulkan/llama-addon.lib +0 -0
- package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
- package/package.json +3 -2
- package/dist/AbortError.d.ts +0 -2
- package/dist/AbortError.js +0 -7
- package/dist/AbortError.js.map +0 -1
|
@@ -351,12 +351,17 @@ kernel void kernel_sum_rows(
|
|
|
351
351
|
kernel void kernel_soft_max(
|
|
352
352
|
device const float * src0,
|
|
353
353
|
device const float * src1,
|
|
354
|
+
device const float * src2,
|
|
354
355
|
device float * dst,
|
|
355
356
|
constant int64_t & ne00,
|
|
356
357
|
constant int64_t & ne01,
|
|
357
358
|
constant int64_t & ne02,
|
|
358
359
|
constant float & scale,
|
|
359
|
-
|
|
360
|
+
constant float & max_bias,
|
|
361
|
+
constant float & m0,
|
|
362
|
+
constant float & m1,
|
|
363
|
+
constant uint32_t & n_head_log2,
|
|
364
|
+
threadgroup float * buf [[threadgroup(0)]],
|
|
360
365
|
uint tgpig[[threadgroup_position_in_grid]],
|
|
361
366
|
uint tpitg[[thread_position_in_threadgroup]],
|
|
362
367
|
uint sgitg[[simdgroup_index_in_threadgroup]],
|
|
@@ -368,13 +373,26 @@ kernel void kernel_soft_max(
|
|
|
368
373
|
|
|
369
374
|
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
|
370
375
|
device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr;
|
|
376
|
+
device const float * ppos = src2 != src0 ? src2 : nullptr;
|
|
371
377
|
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
|
372
378
|
|
|
379
|
+
float slope = 0.0f;
|
|
380
|
+
|
|
381
|
+
// ALiBi
|
|
382
|
+
if (max_bias > 0.0f) {
|
|
383
|
+
const int64_t h = i02;
|
|
384
|
+
|
|
385
|
+
const float base = h < n_head_log2 ? m0 : m1;
|
|
386
|
+
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
|
387
|
+
|
|
388
|
+
slope = pow(base, exp);
|
|
389
|
+
}
|
|
390
|
+
|
|
373
391
|
// parallel max
|
|
374
392
|
float lmax = -INFINITY;
|
|
375
393
|
|
|
376
394
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
377
|
-
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
|
|
395
|
+
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
378
396
|
}
|
|
379
397
|
|
|
380
398
|
// find the max value in the block
|
|
@@ -399,7 +417,7 @@ kernel void kernel_soft_max(
|
|
|
399
417
|
// parallel sum
|
|
400
418
|
float lsum = 0.0f;
|
|
401
419
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
402
|
-
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
|
|
420
|
+
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
403
421
|
lsum += exp_psrc0;
|
|
404
422
|
pdst[i00] = exp_psrc0;
|
|
405
423
|
}
|
|
@@ -437,12 +455,17 @@ kernel void kernel_soft_max(
|
|
|
437
455
|
kernel void kernel_soft_max_4(
|
|
438
456
|
device const float * src0,
|
|
439
457
|
device const float * src1,
|
|
458
|
+
device const float * src2,
|
|
440
459
|
device float * dst,
|
|
441
460
|
constant int64_t & ne00,
|
|
442
461
|
constant int64_t & ne01,
|
|
443
462
|
constant int64_t & ne02,
|
|
444
463
|
constant float & scale,
|
|
445
|
-
|
|
464
|
+
constant float & max_bias,
|
|
465
|
+
constant float & m0,
|
|
466
|
+
constant float & m1,
|
|
467
|
+
constant uint32_t & n_head_log2,
|
|
468
|
+
threadgroup float * buf [[threadgroup(0)]],
|
|
446
469
|
uint tgpig[[threadgroup_position_in_grid]],
|
|
447
470
|
uint tpitg[[thread_position_in_threadgroup]],
|
|
448
471
|
uint sgitg[[simdgroup_index_in_threadgroup]],
|
|
@@ -454,13 +477,25 @@ kernel void kernel_soft_max_4(
|
|
|
454
477
|
|
|
455
478
|
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
|
456
479
|
device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
|
|
480
|
+
device const float4 * ppos = src2 != src0 ? (device const float4 *)(src2) : nullptr;
|
|
457
481
|
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
|
458
482
|
|
|
483
|
+
float slope = 0.0f;
|
|
484
|
+
|
|
485
|
+
if (max_bias > 0.0f) {
|
|
486
|
+
const int64_t h = i02;
|
|
487
|
+
|
|
488
|
+
const float base = h < n_head_log2 ? m0 : m1;
|
|
489
|
+
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
|
490
|
+
|
|
491
|
+
slope = pow(base, exp);
|
|
492
|
+
}
|
|
493
|
+
|
|
459
494
|
// parallel max
|
|
460
495
|
float4 lmax4 = -INFINITY;
|
|
461
496
|
|
|
462
497
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
463
|
-
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
|
|
498
|
+
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
464
499
|
}
|
|
465
500
|
|
|
466
501
|
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
|
@@ -486,7 +521,7 @@ kernel void kernel_soft_max_4(
|
|
|
486
521
|
// parallel sum
|
|
487
522
|
float4 lsum4 = 0.0f;
|
|
488
523
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
489
|
-
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
|
|
524
|
+
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
490
525
|
lsum4 += exp_psrc4;
|
|
491
526
|
pdst4[i00] = exp_psrc4;
|
|
492
527
|
}
|
|
@@ -2490,6 +2525,33 @@ typedef struct {
|
|
|
2490
2525
|
} block_iq3_xxs;
|
|
2491
2526
|
// 98 bytes / block for QK_K = 256, so 3.0625 bpw
|
|
2492
2527
|
|
|
2528
|
+
// 3.4375 bpw
|
|
2529
|
+
#if QK_K == 64
|
|
2530
|
+
#define IQ3S_N_SCALE 2
|
|
2531
|
+
#else
|
|
2532
|
+
#define IQ3S_N_SCALE QK_K/64
|
|
2533
|
+
#endif
|
|
2534
|
+
typedef struct {
|
|
2535
|
+
half d;
|
|
2536
|
+
uint8_t qs[QK_K/4];
|
|
2537
|
+
uint8_t qh[QK_K/32];
|
|
2538
|
+
uint8_t signs[QK_K/8];
|
|
2539
|
+
uint8_t scales[IQ3S_N_SCALE];
|
|
2540
|
+
} block_iq3_s;
|
|
2541
|
+
|
|
2542
|
+
typedef struct {
|
|
2543
|
+
half d;
|
|
2544
|
+
uint8_t qs[QK_K/8];
|
|
2545
|
+
uint8_t scales[QK_K/16];
|
|
2546
|
+
} block_iq1_s;
|
|
2547
|
+
|
|
2548
|
+
// Non-linear quants
|
|
2549
|
+
#define QK4_NL 32
|
|
2550
|
+
typedef struct {
|
|
2551
|
+
half d;
|
|
2552
|
+
uint8_t qs[QK4_NL/2];
|
|
2553
|
+
} block_iq4_nl;
|
|
2554
|
+
|
|
2493
2555
|
//====================================== dot products =========================
|
|
2494
2556
|
|
|
2495
2557
|
void kernel_mul_mv_q2_K_f32_impl(
|
|
@@ -3747,6 +3809,204 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
|
|
|
3747
3809
|
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
|
3748
3810
|
};
|
|
3749
3811
|
|
|
3812
|
+
constexpr constant static uint32_t iq3xs_grid[512] = {
|
|
3813
|
+
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
|
3814
|
+
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
|
3815
|
+
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
|
3816
|
+
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
|
3817
|
+
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
|
3818
|
+
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
|
3819
|
+
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
|
3820
|
+
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
|
3821
|
+
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
|
3822
|
+
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
|
3823
|
+
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
|
3824
|
+
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
|
3825
|
+
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
|
3826
|
+
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
|
3827
|
+
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
|
3828
|
+
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
|
3829
|
+
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
|
3830
|
+
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
|
3831
|
+
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
|
3832
|
+
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
|
3833
|
+
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
|
3834
|
+
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
|
3835
|
+
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
|
3836
|
+
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
|
3837
|
+
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
|
3838
|
+
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
|
3839
|
+
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
|
3840
|
+
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
|
3841
|
+
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
|
3842
|
+
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
|
3843
|
+
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
|
3844
|
+
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
|
3845
|
+
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
|
3846
|
+
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
|
3847
|
+
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
|
3848
|
+
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
|
3849
|
+
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
|
3850
|
+
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
|
3851
|
+
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
|
3852
|
+
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
|
3853
|
+
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
|
3854
|
+
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
|
3855
|
+
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
|
3856
|
+
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
|
3857
|
+
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
|
3858
|
+
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
|
3859
|
+
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
|
3860
|
+
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
|
3861
|
+
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
|
3862
|
+
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
|
3863
|
+
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
|
3864
|
+
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
|
3865
|
+
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
|
3866
|
+
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
|
3867
|
+
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
|
3868
|
+
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
|
3869
|
+
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
|
3870
|
+
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
|
3871
|
+
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
|
3872
|
+
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
|
3873
|
+
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
|
3874
|
+
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
|
3875
|
+
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
|
3876
|
+
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
|
3877
|
+
};
|
|
3878
|
+
|
|
3879
|
+
#define NGRID_IQ1S 512
|
|
3880
|
+
constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = {
|
|
3881
|
+
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
|
3882
|
+
0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
|
|
3883
|
+
0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
|
|
3884
|
+
0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
|
|
3885
|
+
0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
|
|
3886
|
+
0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
|
|
3887
|
+
0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
|
|
3888
|
+
0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
|
|
3889
|
+
0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
|
|
3890
|
+
0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
|
|
3891
|
+
0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
|
|
3892
|
+
0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
|
|
3893
|
+
0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
|
|
3894
|
+
0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
|
|
3895
|
+
0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
|
|
3896
|
+
0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
|
|
3897
|
+
0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
|
|
3898
|
+
0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
|
|
3899
|
+
0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
|
|
3900
|
+
0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
|
|
3901
|
+
0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
|
|
3902
|
+
0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
|
|
3903
|
+
0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
|
|
3904
|
+
0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
|
|
3905
|
+
0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
|
|
3906
|
+
0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
|
|
3907
|
+
0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
|
|
3908
|
+
0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
|
|
3909
|
+
0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
|
|
3910
|
+
0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
|
|
3911
|
+
0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
|
|
3912
|
+
0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
|
|
3913
|
+
0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
|
|
3914
|
+
0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
|
|
3915
|
+
0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
|
|
3916
|
+
0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
|
|
3917
|
+
0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
|
|
3918
|
+
0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
|
|
3919
|
+
0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
|
|
3920
|
+
0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
|
|
3921
|
+
0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
|
|
3922
|
+
0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
|
|
3923
|
+
0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
|
|
3924
|
+
0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
|
|
3925
|
+
0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
|
|
3926
|
+
0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
|
|
3927
|
+
0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
|
|
3928
|
+
0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
|
|
3929
|
+
0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
|
|
3930
|
+
0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
|
|
3931
|
+
0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
|
|
3932
|
+
0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
|
|
3933
|
+
0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
|
|
3934
|
+
0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
|
|
3935
|
+
0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
|
|
3936
|
+
0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
|
|
3937
|
+
0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
|
|
3938
|
+
0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
|
|
3939
|
+
0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
|
|
3940
|
+
0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
|
|
3941
|
+
0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
|
|
3942
|
+
0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
|
|
3943
|
+
0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
|
|
3944
|
+
0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
|
|
3945
|
+
0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
|
|
3946
|
+
0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
|
|
3947
|
+
0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
|
|
3948
|
+
0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
|
|
3949
|
+
0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
|
|
3950
|
+
0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
|
|
3951
|
+
0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
|
|
3952
|
+
0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
|
|
3953
|
+
0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
|
|
3954
|
+
0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
|
|
3955
|
+
0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
|
|
3956
|
+
0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
|
|
3957
|
+
0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
|
|
3958
|
+
0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
|
|
3959
|
+
0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
|
|
3960
|
+
0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
|
|
3961
|
+
0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
|
|
3962
|
+
0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
|
|
3963
|
+
0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
|
|
3964
|
+
0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
|
|
3965
|
+
0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
|
|
3966
|
+
0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
|
|
3967
|
+
0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
|
|
3968
|
+
0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
|
|
3969
|
+
0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
|
|
3970
|
+
0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
|
|
3971
|
+
0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
|
|
3972
|
+
0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
|
|
3973
|
+
0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
|
|
3974
|
+
0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
|
|
3975
|
+
0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
|
|
3976
|
+
0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
|
|
3977
|
+
0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
|
|
3978
|
+
0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
|
|
3979
|
+
0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
|
|
3980
|
+
0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
|
|
3981
|
+
0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
|
|
3982
|
+
0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
|
|
3983
|
+
0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
|
|
3984
|
+
0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
|
|
3985
|
+
0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
|
|
3986
|
+
0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
|
|
3987
|
+
0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
|
|
3988
|
+
0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
|
|
3989
|
+
0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
|
|
3990
|
+
0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
|
|
3991
|
+
0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
|
|
3992
|
+
0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
|
|
3993
|
+
0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
|
|
3994
|
+
0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
|
|
3995
|
+
0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
|
|
3996
|
+
0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
|
|
3997
|
+
0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
|
|
3998
|
+
0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
|
|
3999
|
+
0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
|
|
4000
|
+
0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
|
|
4001
|
+
0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
|
|
4002
|
+
0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
|
|
4003
|
+
0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
|
|
4004
|
+
0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
|
|
4005
|
+
0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
|
|
4006
|
+
0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
|
|
4007
|
+
0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
|
|
4008
|
+
0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
|
|
4009
|
+
};
|
|
3750
4010
|
|
|
3751
4011
|
constexpr constant static uint8_t ksigns_iq2xs[128] = {
|
|
3752
4012
|
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
|
@@ -3854,7 +4114,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|
|
3854
4114
|
y4 += 32 * 32;
|
|
3855
4115
|
}
|
|
3856
4116
|
#else
|
|
3857
|
-
|
|
4117
|
+
(void) x;
|
|
4118
|
+
(void) y;
|
|
4119
|
+
(void) yl;
|
|
4120
|
+
(void) nb32;
|
|
3858
4121
|
#endif
|
|
3859
4122
|
|
|
3860
4123
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -3997,7 +4260,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|
|
3997
4260
|
y4 += 32 * 32;
|
|
3998
4261
|
}
|
|
3999
4262
|
#else
|
|
4000
|
-
|
|
4263
|
+
(void) x;
|
|
4264
|
+
(void) y;
|
|
4265
|
+
(void) yl;
|
|
4266
|
+
(void) nb32;
|
|
4001
4267
|
#endif
|
|
4002
4268
|
|
|
4003
4269
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4133,7 +4399,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|
|
4133
4399
|
y4 += 32 * 32;
|
|
4134
4400
|
}
|
|
4135
4401
|
#else
|
|
4136
|
-
|
|
4402
|
+
(void) x;
|
|
4403
|
+
(void) y;
|
|
4404
|
+
(void) yl;
|
|
4405
|
+
(void) nb32;
|
|
4137
4406
|
#endif
|
|
4138
4407
|
|
|
4139
4408
|
for (int row = 0; row < N_DST; ++row) {
|
|
@@ -4173,158 +4442,532 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
|
|
|
4173
4442
|
kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4174
4443
|
}
|
|
4175
4444
|
|
|
4445
|
+
void kernel_mul_mv_iq3_s_f32_impl(
|
|
4446
|
+
device const void * src0,
|
|
4447
|
+
device const float * src1,
|
|
4448
|
+
device float * dst,
|
|
4449
|
+
constant int64_t & ne00,
|
|
4450
|
+
constant int64_t & ne01,
|
|
4451
|
+
constant int64_t & ne02,
|
|
4452
|
+
constant int64_t & ne10,
|
|
4453
|
+
constant int64_t & ne12,
|
|
4454
|
+
constant int64_t & ne0,
|
|
4455
|
+
constant int64_t & ne1,
|
|
4456
|
+
constant uint & r2,
|
|
4457
|
+
constant uint & r3,
|
|
4458
|
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
|
4459
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4460
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4461
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4176
4462
|
|
|
4177
|
-
|
|
4178
|
-
|
|
4179
|
-
|
|
4180
|
-
|
|
4181
|
-
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
|
4182
|
-
float4x4 temp = *(((device float4x4 *)src));
|
|
4183
|
-
for (int i = 0; i < 16; i++){
|
|
4184
|
-
reg[i/4][i%4] = temp[i/4][i%4];
|
|
4185
|
-
}
|
|
4186
|
-
}
|
|
4463
|
+
const int nb = ne00/QK_K;
|
|
4464
|
+
const int r0 = tgpig.x;
|
|
4465
|
+
const int r1 = tgpig.y;
|
|
4466
|
+
const int im = tgpig.z;
|
|
4187
4467
|
|
|
4188
|
-
|
|
4189
|
-
|
|
4190
|
-
half4x4 temp = *(((device half4x4 *)src));
|
|
4191
|
-
for (int i = 0; i < 16; i++){
|
|
4192
|
-
reg[i/4][i%4] = temp[i/4][i%4];
|
|
4193
|
-
}
|
|
4194
|
-
}
|
|
4468
|
+
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
|
4469
|
+
const int ib_row = first_row * nb;
|
|
4195
4470
|
|
|
4196
|
-
|
|
4197
|
-
|
|
4198
|
-
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
|
4199
|
-
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
|
4200
|
-
const float d2 = d1 / 256.f;
|
|
4201
|
-
const float md = -8.h * xb->d;
|
|
4202
|
-
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
|
4203
|
-
const ushort mask1 = mask0 << 8;
|
|
4471
|
+
const uint i12 = im%ne12;
|
|
4472
|
+
const uint i13 = im/ne12;
|
|
4204
4473
|
|
|
4205
|
-
|
|
4206
|
-
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
|
|
4207
|
-
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
|
|
4208
|
-
}
|
|
4209
|
-
}
|
|
4474
|
+
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4210
4475
|
|
|
4211
|
-
|
|
4212
|
-
|
|
4213
|
-
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
|
4214
|
-
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
|
4215
|
-
const float d2 = d1 / 256.f;
|
|
4216
|
-
const float m = xb->m;
|
|
4217
|
-
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
|
4218
|
-
const ushort mask1 = mask0 << 8;
|
|
4476
|
+
device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0;
|
|
4477
|
+
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4219
4478
|
|
|
4220
|
-
|
|
4221
|
-
|
|
4222
|
-
reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
|
|
4223
|
-
}
|
|
4224
|
-
}
|
|
4479
|
+
float yl[32];
|
|
4480
|
+
float sumf[N_DST]={0.f}, all_sum;
|
|
4225
4481
|
|
|
4226
|
-
|
|
4227
|
-
void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
|
|
4228
|
-
device const uint16_t * qs = ((device const uint16_t *)xb + 3);
|
|
4229
|
-
const float d = xb->d;
|
|
4230
|
-
const float md = -16.h * xb->d;
|
|
4231
|
-
const ushort mask = il ? 0x00F0 : 0x000F;
|
|
4482
|
+
const int nb32 = nb * (QK_K / 32);
|
|
4232
4483
|
|
|
4233
|
-
|
|
4484
|
+
threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
|
|
4485
|
+
{
|
|
4486
|
+
int nval = 8;
|
|
4487
|
+
int pos = (32*sgitg + tiisg)*nval;
|
|
4488
|
+
for (int i = 0; i < nval; ++i) values[pos + i] = iq3xs_grid[pos + i];
|
|
4489
|
+
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4490
|
+
}
|
|
4234
4491
|
|
|
4235
|
-
const int
|
|
4492
|
+
const int ix = tiisg;
|
|
4236
4493
|
|
|
4237
|
-
const
|
|
4238
|
-
const int gh_bk = il ? 0 : 4;
|
|
4494
|
+
device const float * y4 = y + 32 * ix;
|
|
4239
4495
|
|
|
4240
|
-
for (int
|
|
4241
|
-
// extract the 5-th bits for x0 and x1
|
|
4242
|
-
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
|
4243
|
-
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
|
4496
|
+
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
|
4244
4497
|
|
|
4245
|
-
|
|
4246
|
-
|
|
4247
|
-
|
|
4498
|
+
for (int i = 0; i < 32; ++i) {
|
|
4499
|
+
yl[i] = y4[i];
|
|
4500
|
+
}
|
|
4248
4501
|
|
|
4249
|
-
|
|
4250
|
-
|
|
4251
|
-
}
|
|
4252
|
-
}
|
|
4502
|
+
const int ibl = ib32 / (QK_K / 32);
|
|
4503
|
+
const int ib = ib32 % (QK_K / 32);
|
|
4253
4504
|
|
|
4254
|
-
|
|
4255
|
-
|
|
4256
|
-
|
|
4257
|
-
|
|
4258
|
-
|
|
4259
|
-
|
|
4505
|
+
device const block_iq3_s * xr = x + ibl;
|
|
4506
|
+
device const uint8_t * qs = xr->qs + 8 * ib;
|
|
4507
|
+
device const uint8_t * qh = xr->qh + ib;
|
|
4508
|
+
device const uint8_t * sc = xr->scales + (ib/2);
|
|
4509
|
+
device const uint8_t * signs = xr->signs + 4 * ib;
|
|
4510
|
+
device const half * dh = &xr->d;
|
|
4260
4511
|
|
|
4261
|
-
|
|
4512
|
+
for (int row = 0; row < N_DST; row++) {
|
|
4262
4513
|
|
|
4263
|
-
|
|
4514
|
+
const float db = dh[0];
|
|
4515
|
+
const float d = db * (0.5f + ((sc[0] >> 4*(ib%2)) & 0xf));
|
|
4264
4516
|
|
|
4265
|
-
|
|
4266
|
-
|
|
4517
|
+
float2 sum = {0};
|
|
4518
|
+
for (int l = 0; l < 4; ++l) {
|
|
4519
|
+
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
|
4520
|
+
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
|
4521
|
+
for (int j = 0; j < 4; ++j) {
|
|
4522
|
+
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
|
|
4523
|
+
sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
|
|
4524
|
+
}
|
|
4525
|
+
}
|
|
4526
|
+
sumf[row] += d * (sum[0] + sum[1]);
|
|
4267
4527
|
|
|
4268
|
-
|
|
4269
|
-
|
|
4270
|
-
|
|
4271
|
-
|
|
4528
|
+
dh += nb*sizeof(block_iq3_s)/2;
|
|
4529
|
+
qs += nb*sizeof(block_iq3_s);
|
|
4530
|
+
qh += nb*sizeof(block_iq3_s);
|
|
4531
|
+
sc += nb*sizeof(block_iq3_s);
|
|
4532
|
+
signs += nb*sizeof(block_iq3_s);
|
|
4533
|
+
}
|
|
4272
4534
|
|
|
4273
|
-
|
|
4274
|
-
|
|
4275
|
-
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
|
4535
|
+
y4 += 32 * 32;
|
|
4536
|
+
}
|
|
4276
4537
|
|
|
4277
|
-
|
|
4278
|
-
|
|
4538
|
+
for (int row = 0; row < N_DST; ++row) {
|
|
4539
|
+
all_sum = simd_sum(sumf[row]);
|
|
4540
|
+
if (tiisg == 0) {
|
|
4541
|
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
|
|
4542
|
+
}
|
|
4279
4543
|
}
|
|
4280
4544
|
}
|
|
4281
4545
|
|
|
4282
|
-
|
|
4283
|
-
void
|
|
4284
|
-
|
|
4285
|
-
|
|
4546
|
+
[[host_name("kernel_mul_mv_iq3_s_f32")]]
|
|
4547
|
+
kernel void kernel_mul_mv_iq3_s_f32(
|
|
4548
|
+
device const void * src0,
|
|
4549
|
+
device const float * src1,
|
|
4550
|
+
device float * dst,
|
|
4551
|
+
constant int64_t & ne00,
|
|
4552
|
+
constant int64_t & ne01,
|
|
4553
|
+
constant int64_t & ne02,
|
|
4554
|
+
constant uint64_t & nb00,
|
|
4555
|
+
constant uint64_t & nb01,
|
|
4556
|
+
constant uint64_t & nb02,
|
|
4557
|
+
constant int64_t & ne10,
|
|
4558
|
+
constant int64_t & ne11,
|
|
4559
|
+
constant int64_t & ne12,
|
|
4560
|
+
constant uint64_t & nb10,
|
|
4561
|
+
constant uint64_t & nb11,
|
|
4562
|
+
constant uint64_t & nb12,
|
|
4563
|
+
constant int64_t & ne0,
|
|
4564
|
+
constant int64_t & ne1,
|
|
4565
|
+
constant uint & r2,
|
|
4566
|
+
constant uint & r3,
|
|
4567
|
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
|
4568
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4569
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4570
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4286
4571
|
|
|
4287
|
-
|
|
4288
|
-
reg[i/4][i%4] = (qs[i + 16*il] * d);
|
|
4289
|
-
}
|
|
4572
|
+
kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4290
4573
|
}
|
|
4291
4574
|
|
|
4292
|
-
|
|
4293
|
-
|
|
4294
|
-
|
|
4295
|
-
|
|
4296
|
-
|
|
4297
|
-
|
|
4298
|
-
|
|
4575
|
+
void kernel_mul_mv_iq1_s_f32_impl(
|
|
4576
|
+
device const void * src0,
|
|
4577
|
+
device const float * src1,
|
|
4578
|
+
device float * dst,
|
|
4579
|
+
constant int64_t & ne00,
|
|
4580
|
+
constant int64_t & ne01,
|
|
4581
|
+
constant int64_t & ne02,
|
|
4582
|
+
constant int64_t & ne10,
|
|
4583
|
+
constant int64_t & ne12,
|
|
4584
|
+
constant int64_t & ne0,
|
|
4585
|
+
constant int64_t & ne1,
|
|
4586
|
+
constant uint & r2,
|
|
4587
|
+
constant uint & r3,
|
|
4588
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4589
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4590
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4299
4591
|
|
|
4300
|
-
|
|
4301
|
-
|
|
4302
|
-
|
|
4303
|
-
|
|
4304
|
-
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
|
4305
|
-
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
|
4306
|
-
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
|
4307
|
-
for (int i = 0; i < 16; ++i) {
|
|
4308
|
-
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
|
4309
|
-
}
|
|
4310
|
-
}
|
|
4592
|
+
const int nb = ne00/QK_K;
|
|
4593
|
+
const int r0 = tgpig.x;
|
|
4594
|
+
const int r1 = tgpig.y;
|
|
4595
|
+
const int im = tgpig.z;
|
|
4311
4596
|
|
|
4312
|
-
|
|
4313
|
-
|
|
4314
|
-
const half d_all = xb->d;
|
|
4315
|
-
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
|
4316
|
-
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
|
4317
|
-
device const int8_t * scales = (device const int8_t *)xb->scales;
|
|
4597
|
+
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
|
4598
|
+
const int ib_row = first_row * nb;
|
|
4318
4599
|
|
|
4319
|
-
|
|
4320
|
-
|
|
4321
|
-
|
|
4322
|
-
|
|
4323
|
-
|
|
4324
|
-
|
|
4325
|
-
|
|
4326
|
-
|
|
4327
|
-
|
|
4600
|
+
const uint i12 = im%ne12;
|
|
4601
|
+
const uint i13 = im/ne12;
|
|
4602
|
+
|
|
4603
|
+
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4604
|
+
device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
|
|
4605
|
+
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4606
|
+
|
|
4607
|
+
float yl[16];
|
|
4608
|
+
float sumf[N_DST]={0.f}, all_sum;
|
|
4609
|
+
|
|
4610
|
+
const int nb32 = nb * (QK_K / 32);
|
|
4611
|
+
|
|
4612
|
+
#if QK_K == 256
|
|
4613
|
+
const int ix = tiisg/2;
|
|
4614
|
+
const int il = tiisg%2;
|
|
4615
|
+
|
|
4616
|
+
device const float * y4 = y + 32 * ix + 16 * il;
|
|
4617
|
+
|
|
4618
|
+
for (int ib32 = ix; ib32 < nb32; ib32 += 16) {
|
|
4619
|
+
|
|
4620
|
+
for (int i = 0; i < 16; ++i) {
|
|
4621
|
+
yl[i] = y4[i];
|
|
4622
|
+
}
|
|
4623
|
+
|
|
4624
|
+
const int ibl = ib32 / (QK_K / 32);
|
|
4625
|
+
const int ib = ib32 % (QK_K / 32);
|
|
4626
|
+
|
|
4627
|
+
device const block_iq1_s * xr = x + ibl;
|
|
4628
|
+
device const uint8_t * qs = xr->qs + 4 * ib + 2 * il;
|
|
4629
|
+
device const uint8_t * sc = xr->scales + 2 * ib + il;
|
|
4630
|
+
device const half * dh = &xr->d;
|
|
4631
|
+
|
|
4632
|
+
for (int row = 0; row < N_DST; row++) {
|
|
4633
|
+
|
|
4634
|
+
constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
|
|
4635
|
+
constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
|
|
4636
|
+
|
|
4637
|
+
float2 sum = {0};
|
|
4638
|
+
for (int j = 0; j < 8; ++j) {
|
|
4639
|
+
sum[0] += yl[j+ 0] * grid1[j];
|
|
4640
|
+
sum[1] += yl[j+ 8] * grid2[j];
|
|
4641
|
+
}
|
|
4642
|
+
sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1));
|
|
4643
|
+
|
|
4644
|
+
dh += nb*sizeof(block_iq1_s)/2;
|
|
4645
|
+
qs += nb*sizeof(block_iq1_s);
|
|
4646
|
+
sc += nb*sizeof(block_iq1_s);
|
|
4647
|
+
}
|
|
4648
|
+
|
|
4649
|
+
y4 += 16 * 32;
|
|
4650
|
+
}
|
|
4651
|
+
#else
|
|
4652
|
+
(void) x;
|
|
4653
|
+
(void) y;
|
|
4654
|
+
(void) yl;
|
|
4655
|
+
(void) nb32;
|
|
4656
|
+
#endif
|
|
4657
|
+
|
|
4658
|
+
for (int row = 0; row < N_DST; ++row) {
|
|
4659
|
+
all_sum = simd_sum(sumf[row]);
|
|
4660
|
+
if (tiisg == 0) {
|
|
4661
|
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
|
4662
|
+
}
|
|
4663
|
+
}
|
|
4664
|
+
}
|
|
4665
|
+
|
|
4666
|
+
constexpr constant static float kvalues_iq4nl_f[16] = {
|
|
4667
|
+
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
|
|
4668
|
+
};
|
|
4669
|
+
|
|
4670
|
+
void kernel_mul_mv_iq4_nl_f32_impl(
|
|
4671
|
+
device const void * src0,
|
|
4672
|
+
device const float * src1,
|
|
4673
|
+
device float * dst,
|
|
4674
|
+
constant int64_t & ne00,
|
|
4675
|
+
constant int64_t & ne01,
|
|
4676
|
+
constant int64_t & ne02,
|
|
4677
|
+
constant int64_t & ne10,
|
|
4678
|
+
constant int64_t & ne12,
|
|
4679
|
+
constant int64_t & ne0,
|
|
4680
|
+
constant int64_t & ne1,
|
|
4681
|
+
constant uint & r2,
|
|
4682
|
+
constant uint & r3,
|
|
4683
|
+
threadgroup float * shared_values [[threadgroup(0)]],
|
|
4684
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4685
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4686
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4687
|
+
|
|
4688
|
+
const int nb = ne00/QK4_NL;
|
|
4689
|
+
const int r0 = tgpig.x;
|
|
4690
|
+
const int r1 = tgpig.y;
|
|
4691
|
+
const int im = tgpig.z;
|
|
4692
|
+
const int first_row = (r0 * 2 + sgitg) * 2;
|
|
4693
|
+
const int ib_row = first_row * nb;
|
|
4694
|
+
|
|
4695
|
+
const uint i12 = im%ne12;
|
|
4696
|
+
const uint i13 = im/ne12;
|
|
4697
|
+
|
|
4698
|
+
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4699
|
+
device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
|
|
4700
|
+
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4701
|
+
|
|
4702
|
+
const int ix = tiisg/2; // 0...15
|
|
4703
|
+
const int it = tiisg%2; // 0 or 1
|
|
4704
|
+
|
|
4705
|
+
shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
|
|
4706
|
+
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4707
|
+
|
|
4708
|
+
float4 yl[4];
|
|
4709
|
+
float sumf[2]={0.f}, all_sum;
|
|
4710
|
+
|
|
4711
|
+
device const float * yb = y + ix * QK4_NL + it * 8;
|
|
4712
|
+
|
|
4713
|
+
uint32_t aux32[2];
|
|
4714
|
+
thread const uint8_t * q8 = (thread const uint8_t *)aux32;
|
|
4715
|
+
|
|
4716
|
+
float4 qf1, qf2;
|
|
4717
|
+
|
|
4718
|
+
for (int ib = ix; ib < nb; ib += 16) {
|
|
4719
|
+
|
|
4720
|
+
device const float4 * y4 = (device const float4 *)yb;
|
|
4721
|
+
yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
|
|
4722
|
+
|
|
4723
|
+
for (int row = 0; row < 2; ++row) {
|
|
4724
|
+
|
|
4725
|
+
device const block_iq4_nl & xb = x[row*nb + ib];
|
|
4726
|
+
device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
|
|
4727
|
+
|
|
4728
|
+
float4 acc1 = {0.f}, acc2 = {0.f};
|
|
4729
|
+
|
|
4730
|
+
aux32[0] = q4[0] | (q4[1] << 16);
|
|
4731
|
+
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
|
4732
|
+
aux32[0] &= 0x0f0f0f0f;
|
|
4733
|
+
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
|
4734
|
+
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
|
4735
|
+
acc1 += yl[0] * qf1;
|
|
4736
|
+
acc2 += yl[1] * qf2;
|
|
4737
|
+
|
|
4738
|
+
aux32[0] = q4[2] | (q4[3] << 16);
|
|
4739
|
+
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
|
4740
|
+
aux32[0] &= 0x0f0f0f0f;
|
|
4741
|
+
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
|
4742
|
+
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
|
4743
|
+
acc1 += yl[2] * qf1;
|
|
4744
|
+
acc2 += yl[3] * qf2;
|
|
4745
|
+
|
|
4746
|
+
acc1 += acc2;
|
|
4747
|
+
|
|
4748
|
+
sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
|
|
4749
|
+
|
|
4750
|
+
}
|
|
4751
|
+
|
|
4752
|
+
yb += 16 * QK4_NL;
|
|
4753
|
+
}
|
|
4754
|
+
|
|
4755
|
+
for (int row = 0; row < 2; ++row) {
|
|
4756
|
+
all_sum = simd_sum(sumf[row]);
|
|
4757
|
+
if (tiisg == 0) {
|
|
4758
|
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
|
4759
|
+
}
|
|
4760
|
+
}
|
|
4761
|
+
}
|
|
4762
|
+
|
|
4763
|
+
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
|
4764
|
+
kernel void kernel_mul_mv_iq1_s_f32(
|
|
4765
|
+
device const void * src0,
|
|
4766
|
+
device const float * src1,
|
|
4767
|
+
device float * dst,
|
|
4768
|
+
constant int64_t & ne00,
|
|
4769
|
+
constant int64_t & ne01,
|
|
4770
|
+
constant int64_t & ne02,
|
|
4771
|
+
constant uint64_t & nb00,
|
|
4772
|
+
constant uint64_t & nb01,
|
|
4773
|
+
constant uint64_t & nb02,
|
|
4774
|
+
constant int64_t & ne10,
|
|
4775
|
+
constant int64_t & ne11,
|
|
4776
|
+
constant int64_t & ne12,
|
|
4777
|
+
constant uint64_t & nb10,
|
|
4778
|
+
constant uint64_t & nb11,
|
|
4779
|
+
constant uint64_t & nb12,
|
|
4780
|
+
constant int64_t & ne0,
|
|
4781
|
+
constant int64_t & ne1,
|
|
4782
|
+
constant uint & r2,
|
|
4783
|
+
constant uint & r3,
|
|
4784
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4785
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4786
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4787
|
+
|
|
4788
|
+
kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
|
|
4789
|
+
}
|
|
4790
|
+
|
|
4791
|
+
[[host_name("kernel_mul_mv_iq4_nl_f32")]]
|
|
4792
|
+
kernel void kernel_mul_mv_iq4_nl_f32(
|
|
4793
|
+
device const void * src0,
|
|
4794
|
+
device const float * src1,
|
|
4795
|
+
device float * dst,
|
|
4796
|
+
constant int64_t & ne00,
|
|
4797
|
+
constant int64_t & ne01,
|
|
4798
|
+
constant int64_t & ne02,
|
|
4799
|
+
constant uint64_t & nb00,
|
|
4800
|
+
constant uint64_t & nb01,
|
|
4801
|
+
constant uint64_t & nb02,
|
|
4802
|
+
constant int64_t & ne10,
|
|
4803
|
+
constant int64_t & ne11,
|
|
4804
|
+
constant int64_t & ne12,
|
|
4805
|
+
constant uint64_t & nb10,
|
|
4806
|
+
constant uint64_t & nb11,
|
|
4807
|
+
constant uint64_t & nb12,
|
|
4808
|
+
constant int64_t & ne0,
|
|
4809
|
+
constant int64_t & ne1,
|
|
4810
|
+
constant uint & r2,
|
|
4811
|
+
constant uint & r3,
|
|
4812
|
+
threadgroup float * shared_values [[threadgroup(0)]],
|
|
4813
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4814
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4815
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4816
|
+
|
|
4817
|
+
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4818
|
+
}
|
|
4819
|
+
|
|
4820
|
+
//============================= templates and their specializations =============================
|
|
4821
|
+
|
|
4822
|
+
// NOTE: this is not dequantizing - we are simply fitting the template
|
|
4823
|
+
template <typename type4x4>
|
|
4824
|
+
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
|
4825
|
+
float4x4 temp = *(((device float4x4 *)src));
|
|
4826
|
+
for (int i = 0; i < 16; i++){
|
|
4827
|
+
reg[i/4][i%4] = temp[i/4][i%4];
|
|
4828
|
+
}
|
|
4829
|
+
}
|
|
4830
|
+
|
|
4831
|
+
template <typename type4x4>
|
|
4832
|
+
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
|
|
4833
|
+
half4x4 temp = *(((device half4x4 *)src));
|
|
4834
|
+
for (int i = 0; i < 16; i++){
|
|
4835
|
+
reg[i/4][i%4] = temp[i/4][i%4];
|
|
4836
|
+
}
|
|
4837
|
+
}
|
|
4838
|
+
|
|
4839
|
+
template <typename type4x4>
|
|
4840
|
+
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
|
|
4841
|
+
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
|
4842
|
+
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
|
4843
|
+
const float d2 = d1 / 256.f;
|
|
4844
|
+
const float md = -8.h * xb->d;
|
|
4845
|
+
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
|
4846
|
+
const ushort mask1 = mask0 << 8;
|
|
4847
|
+
|
|
4848
|
+
for (int i=0;i<8;i++) {
|
|
4849
|
+
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
|
|
4850
|
+
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
|
|
4851
|
+
}
|
|
4852
|
+
}
|
|
4853
|
+
|
|
4854
|
+
template <typename type4x4>
|
|
4855
|
+
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
|
|
4856
|
+
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
|
4857
|
+
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
|
4858
|
+
const float d2 = d1 / 256.f;
|
|
4859
|
+
const float m = xb->m;
|
|
4860
|
+
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
|
4861
|
+
const ushort mask1 = mask0 << 8;
|
|
4862
|
+
|
|
4863
|
+
for (int i=0;i<8;i++) {
|
|
4864
|
+
reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
|
|
4865
|
+
reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
|
|
4866
|
+
}
|
|
4867
|
+
}
|
|
4868
|
+
|
|
4869
|
+
template <typename type4x4>
|
|
4870
|
+
void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
|
|
4871
|
+
device const uint16_t * qs = ((device const uint16_t *)xb + 3);
|
|
4872
|
+
const float d = xb->d;
|
|
4873
|
+
const float md = -16.h * xb->d;
|
|
4874
|
+
const ushort mask = il ? 0x00F0 : 0x000F;
|
|
4875
|
+
|
|
4876
|
+
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
|
4877
|
+
|
|
4878
|
+
const int x_mv = il ? 4 : 0;
|
|
4879
|
+
|
|
4880
|
+
const int gh_mv = il ? 12 : 0;
|
|
4881
|
+
const int gh_bk = il ? 0 : 4;
|
|
4882
|
+
|
|
4883
|
+
for (int i = 0; i < 8; i++) {
|
|
4884
|
+
// extract the 5-th bits for x0 and x1
|
|
4885
|
+
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
|
4886
|
+
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
|
4887
|
+
|
|
4888
|
+
// combine the 4-bits from qs with the 5th bit
|
|
4889
|
+
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
|
4890
|
+
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
|
4891
|
+
|
|
4892
|
+
reg[i/2][2*(i%2)+0] = d * x0 + md;
|
|
4893
|
+
reg[i/2][2*(i%2)+1] = d * x1 + md;
|
|
4894
|
+
}
|
|
4895
|
+
}
|
|
4896
|
+
|
|
4897
|
+
template <typename type4x4>
|
|
4898
|
+
void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
|
|
4899
|
+
device const uint16_t * qs = ((device const uint16_t *)xb + 4);
|
|
4900
|
+
const float d = xb->d;
|
|
4901
|
+
const float m = xb->m;
|
|
4902
|
+
const ushort mask = il ? 0x00F0 : 0x000F;
|
|
4903
|
+
|
|
4904
|
+
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
|
4905
|
+
|
|
4906
|
+
const int x_mv = il ? 4 : 0;
|
|
4907
|
+
|
|
4908
|
+
const int gh_mv = il ? 12 : 0;
|
|
4909
|
+
const int gh_bk = il ? 0 : 4;
|
|
4910
|
+
|
|
4911
|
+
for (int i = 0; i < 8; i++) {
|
|
4912
|
+
// extract the 5-th bits for x0 and x1
|
|
4913
|
+
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
|
4914
|
+
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
|
4915
|
+
|
|
4916
|
+
// combine the 4-bits from qs with the 5th bit
|
|
4917
|
+
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
|
4918
|
+
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
|
4919
|
+
|
|
4920
|
+
reg[i/2][2*(i%2)+0] = d * x0 + m;
|
|
4921
|
+
reg[i/2][2*(i%2)+1] = d * x1 + m;
|
|
4922
|
+
}
|
|
4923
|
+
}
|
|
4924
|
+
|
|
4925
|
+
template <typename type4x4>
|
|
4926
|
+
void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
|
|
4927
|
+
device const int8_t * qs = ((device const int8_t *)xb->qs);
|
|
4928
|
+
const half d = xb->d;
|
|
4929
|
+
|
|
4930
|
+
for (int i = 0; i < 16; i++) {
|
|
4931
|
+
reg[i/4][i%4] = (qs[i + 16*il] * d);
|
|
4932
|
+
}
|
|
4933
|
+
}
|
|
4934
|
+
|
|
4935
|
+
template <typename type4x4>
|
|
4936
|
+
void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
|
|
4937
|
+
const float d = xb->d;
|
|
4938
|
+
const float min = xb->dmin;
|
|
4939
|
+
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
|
4940
|
+
float dl, ml;
|
|
4941
|
+
uint8_t sc = xb->scales[il];
|
|
4942
|
+
|
|
4943
|
+
#if QK_K == 256
|
|
4944
|
+
q = q + 32*(il/8) + 16*(il&1);
|
|
4945
|
+
il = (il/2)%4;
|
|
4946
|
+
#endif
|
|
4947
|
+
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
|
4948
|
+
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
|
4949
|
+
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
|
4950
|
+
for (int i = 0; i < 16; ++i) {
|
|
4951
|
+
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
|
4952
|
+
}
|
|
4953
|
+
}
|
|
4954
|
+
|
|
4955
|
+
template <typename type4x4>
|
|
4956
|
+
void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
|
|
4957
|
+
const half d_all = xb->d;
|
|
4958
|
+
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
|
4959
|
+
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
|
4960
|
+
device const int8_t * scales = (device const int8_t *)xb->scales;
|
|
4961
|
+
|
|
4962
|
+
#if QK_K == 256
|
|
4963
|
+
q = q + 32 * (il/8) + 16 * (il&1);
|
|
4964
|
+
h = h + 16 * (il&1);
|
|
4965
|
+
uint8_t m = 1 << (il/2);
|
|
4966
|
+
uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
|
|
4967
|
+
((il/4)>0 ? 12 : 3);
|
|
4968
|
+
uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
|
|
4969
|
+
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
|
|
4970
|
+
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
|
|
4328
4971
|
: (scale_2&kmask2) | ((scale_1&kmask1) << 4);
|
|
4329
4972
|
float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
|
|
4330
4973
|
const float ml = 4.f * dl;
|
|
@@ -4369,6 +5012,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
|
|
4369
5012
|
const float dl = d * sc[0];
|
|
4370
5013
|
const float ml = min * sc[1];
|
|
4371
5014
|
#else
|
|
5015
|
+
(void) get_scale_min_k4_just2;
|
|
5016
|
+
|
|
4372
5017
|
q = q + 16 * (il&1);
|
|
4373
5018
|
device const uint8_t * s = xb->scales;
|
|
4374
5019
|
device const half2 * dh = (device const half2 *)xb->d;
|
|
@@ -4518,6 +5163,62 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x
|
|
|
4518
5163
|
}
|
|
4519
5164
|
}
|
|
4520
5165
|
|
|
5166
|
+
template <typename type4x4>
|
|
5167
|
+
void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
|
|
5168
|
+
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
|
5169
|
+
const float d = xb->d;
|
|
5170
|
+
const int ib32 = il/2;
|
|
5171
|
+
il = il%2;
|
|
5172
|
+
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
|
5173
|
+
device const uint8_t * qs = xb->qs + 8*ib32;
|
|
5174
|
+
device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
|
|
5175
|
+
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
|
5176
|
+
const float dl = d * (0.5f + ((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * 0.5f;
|
|
5177
|
+
constant uint8_t * grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+0] | ((qh << 8) & 256)));
|
|
5178
|
+
constant uint8_t * grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+1] | ((qh << 7) & 256)));
|
|
5179
|
+
for (int i = 0; i < 4; ++i) {
|
|
5180
|
+
reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
|
|
5181
|
+
reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
|
|
5182
|
+
}
|
|
5183
|
+
grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+2] | ((qh << 6) & 256)));
|
|
5184
|
+
grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+3] | ((qh << 5) & 256)));
|
|
5185
|
+
for (int i = 0; i < 4; ++i) {
|
|
5186
|
+
reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
|
|
5187
|
+
reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
|
|
5188
|
+
}
|
|
5189
|
+
}
|
|
5190
|
+
|
|
5191
|
+
template <typename type4x4>
|
|
5192
|
+
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
|
5193
|
+
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
|
5194
|
+
const float d = xb->d;
|
|
5195
|
+
device const uint8_t * qs = xb->qs + 2*il;
|
|
5196
|
+
device const uint8_t * sc = xb->scales + il;
|
|
5197
|
+
const float dl1 = d * (2*(sc[0] & 7) + 1);
|
|
5198
|
+
const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1);
|
|
5199
|
+
constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
|
|
5200
|
+
constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
|
|
5201
|
+
for (int i = 0; i < 8; ++i) {
|
|
5202
|
+
reg[i/4+0][i%4] = dl1 * grid1[i];
|
|
5203
|
+
reg[i/4+2][i%4] = dl2 * grid2[i];
|
|
5204
|
+
}
|
|
5205
|
+
}
|
|
5206
|
+
|
|
5207
|
+
template <typename type4x4>
|
|
5208
|
+
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
|
|
5209
|
+
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
|
5210
|
+
const float d = xb->d;
|
|
5211
|
+
uint32_t aux32;
|
|
5212
|
+
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
|
5213
|
+
for (int i = 0; i < 4; ++i) {
|
|
5214
|
+
aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
|
|
5215
|
+
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
|
5216
|
+
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
|
5217
|
+
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
|
5218
|
+
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
|
5219
|
+
}
|
|
5220
|
+
}
|
|
5221
|
+
|
|
4521
5222
|
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
|
4522
5223
|
kernel void kernel_get_rows(
|
|
4523
5224
|
device const void * src0,
|
|
@@ -5060,6 +5761,9 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows
|
|
|
5060
5761
|
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
|
5061
5762
|
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5062
5763
|
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5764
|
+
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
5765
|
+
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5766
|
+
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5063
5767
|
|
|
5064
5768
|
//
|
|
5065
5769
|
// matrix-matrix multiplication
|
|
@@ -5099,6 +5803,9 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
|
|
|
5099
5803
|
// Explicit instantiations of kernel_mul_mm, each exported under its own host_name.
// Arguments are a block type, a count (QK_NL, or 2 for iq4_nl) and a dequantize function.
// NOTE(review): presumably the same <block_q, nl, dequantize_func> layout as kernel_get_rows - confirm.
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
|
5100
5804
|
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5101
5805
|
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5806
|
+
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
5807
|
+
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5808
|
+
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5102
5809
|
|
|
5103
5810
|
//
|
|
5104
5811
|
// indirect matrix-matrix multiplication
|
|
@@ -5150,6 +5857,9 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu
|
|
|
5150
5857
|
// Explicit instantiations of kernel_mul_mm_id, each exported under its own host_name.
// Arguments are a block type, a count (QK_NL, or 2 for iq4_nl) and a dequantize function.
// NOTE(review): presumably the same <block_q, nl, dequantize_func> layout as kernel_get_rows - confirm.
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
|
5151
5858
|
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5152
5859
|
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5860
|
+
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
5861
|
+
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5862
|
+
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
5153
5863
|
|
|
5154
5864
|
//
|
|
5155
5865
|
// matrix-vector multiplication
|
|
@@ -6117,3 +6827,196 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32(
|
|
|
6117
6827
|
tiisg,
|
|
6118
6828
|
sgitg);
|
|
6119
6829
|
}
|
|
6830
|
+
|
|
6831
|
+
// Host-visible entry point "kernel_mul_mv_id_iq3_s_f32".
//
// Chooses one of the eight src0x buffers using a 32-bit id read from `ids`,
// then forwards to kernel_mul_mv_iq3_s_f32_impl.
//
//   - tgpig.z is split by ne12*ne13: the quotient becomes `bid`, the remainder
//     is written back into tgpig.z before forwarding.
//   - `bid` offsets `ids` by bid*nbi1 bytes, `src1` by bid*nb11 bytes and `dst`
//     by bid*ne0 floats.
//
// NOTE(review): the id is used unchecked as an index into the 8-entry table;
// presumably the host guarantees 0 <= id < 8 - confirm.
// NOTE(review): several parameters (nb00/nb01/nb02, ne11, nb10, nb12, nb1, tiitg)
// are not referenced here; presumably kept so the argument list matches the
// host-side binding order - confirm before removing any.
[[host_name("kernel_mul_mv_id_iq3_s_f32")]]
kernel void kernel_mul_mv_id_iq3_s_f32(
        device const char * ids,
        device const char * src1,
        device      float * dst,
        constant uint64_t & nbi1,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant uint64_t & nb1,
        constant     uint & r2,
        constant     uint & r3,
        constant      int & idx,
        device const char * src00,
        device const char * src01,
        device const char * src02,
        device const char * src03,
        device const char * src04,
        device const char * src05,
        device const char * src06,
        device const char * src07,
        threadgroup int8_t * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiitg[[thread_index_in_threadgroup]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    // Table of candidate src0 buffers, indexed by the id looked up below.
    device const char * srcs[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    // Split the z grid coordinate: quotient -> bid, remainder stays in tgpig.z.
    const int64_t n12x13 = ne12*ne13;
    const int64_t bid    = tgpig.z/n12x13;
    tgpig.z = tgpig.z%n12x13;

    // Read the selector for this bid from the ids buffer.
    device int32_t * id_list = (device int32_t *) (ids + bid*nbi1);
    const int32_t id = id_list[idx];

    device const char  * src0_sel = srcs[id];
    device const float * src1_sel = (device const float *) (src1 + bid*nb11);
    device       float * dst_sel  = dst + bid*ne0;

    kernel_mul_mv_iq3_s_f32_impl(
        src0_sel, src1_sel, dst_sel,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
}
|
|
6895
|
+
|
|
6896
|
+
// Host-visible entry point "kernel_mul_mv_id_iq1_s_f32".
//
// Chooses one of the eight src0x buffers using a 32-bit id read from `ids`,
// then forwards to kernel_mul_mv_iq1_s_f32_impl. This wrapper takes no
// threadgroup buffer and passes none on.
//
//   - tgpig.z is split by ne12*ne13: the quotient becomes `bid`, the remainder
//     is written back into tgpig.z before forwarding.
//   - `bid` offsets `ids` by bid*nbi1 bytes, `src1` by bid*nb11 bytes and `dst`
//     by bid*ne0 floats.
//
// NOTE(review): the id is used unchecked as an index into the 8-entry table;
// presumably the host guarantees 0 <= id < 8 - confirm.
// NOTE(review): several parameters (nb00/nb01/nb02, ne11, nb10, nb12, nb1, tiitg)
// are not referenced here; presumably kept so the argument list matches the
// host-side binding order - confirm before removing any.
[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
kernel void kernel_mul_mv_id_iq1_s_f32(
        device const char * ids,
        device const char * src1,
        device      float * dst,
        constant uint64_t & nbi1,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant uint64_t & nb1,
        constant     uint & r2,
        constant     uint & r3,
        constant      int & idx,
        device const char * src00,
        device const char * src01,
        device const char * src02,
        device const char * src03,
        device const char * src04,
        device const char * src05,
        device const char * src06,
        device const char * src07,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiitg[[thread_index_in_threadgroup]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    // Table of candidate src0 buffers, indexed by the id looked up below.
    device const char * srcs[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    // Split the z grid coordinate: quotient -> bid, remainder stays in tgpig.z.
    const int64_t n12x13 = ne12*ne13;
    const int64_t bid    = tgpig.z/n12x13;
    tgpig.z = tgpig.z%n12x13;

    // Read the selector for this bid from the ids buffer.
    device int32_t * id_list = (device int32_t *) (ids + bid*nbi1);
    const int32_t id = id_list[idx];

    device const char  * src0_sel = srcs[id];
    device const float * src1_sel = (device const float *) (src1 + bid*nb11);
    device       float * dst_sel  = dst + bid*ne0;

    kernel_mul_mv_iq1_s_f32_impl(
        src0_sel, src1_sel, dst_sel,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        tgpig, tiisg, sgitg);
}
|
|
6958
|
+
|
|
6959
|
+
// Host-visible entry point "kernel_mul_mv_id_iq4_nl_f32".
//
// Chooses one of the eight src0x buffers using a 32-bit id read from `ids`,
// then forwards to kernel_mul_mv_iq4_nl_f32_impl together with the float
// threadgroup buffer `shared_values`.
//
//   - tgpig.z is split by ne12*ne13: the quotient becomes `bid`, the remainder
//     is written back into tgpig.z before forwarding.
//   - `bid` offsets `ids` by bid*nbi1 bytes, `src1` by bid*nb11 bytes and `dst`
//     by bid*ne0 floats.
//
// NOTE(review): the id is used unchecked as an index into the 8-entry table;
// presumably the host guarantees 0 <= id < 8 - confirm.
// NOTE(review): several parameters (nb00/nb01/nb02, ne11, nb10, nb12, nb1, tiitg)
// are not referenced here; presumably kept so the argument list matches the
// host-side binding order - confirm before removing any.
[[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
kernel void kernel_mul_mv_id_iq4_nl_f32(
        device const char * ids,
        device const char * src1,
        device      float * dst,
        constant uint64_t & nbi1,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant uint64_t & nb1,
        constant     uint & r2,
        constant     uint & r3,
        constant      int & idx,
        device const char * src00,
        device const char * src01,
        device const char * src02,
        device const char * src03,
        device const char * src04,
        device const char * src05,
        device const char * src06,
        device const char * src07,
        threadgroup float * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiitg[[thread_index_in_threadgroup]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    // Table of candidate src0 buffers, indexed by the id looked up below.
    device const char * srcs[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    // Split the z grid coordinate: quotient -> bid, remainder stays in tgpig.z.
    const int64_t n12x13 = ne12*ne13;
    const int64_t bid    = tgpig.z/n12x13;
    tgpig.z = tgpig.z%n12x13;

    // Read the selector for this bid from the ids buffer.
    device int32_t * id_list = (device int32_t *) (ids + bid*nbi1);
    const int32_t id = id_list[idx];

    device const char  * src0_sel = srcs[id];
    device const float * src1_sel = (device const float *) (src1 + bid*nb11);
    device       float * dst_sel  = dst + bid*ne0;

    kernel_mul_mv_iq4_nl_f32_impl(
        src0_sel, src1_sel, dst_sel,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
}
|