node-llama-cpp 3.0.0-beta.11 → 3.0.0-beta.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/ChatWrapper.d.ts +1 -0
- package/dist/ChatWrapper.js +2 -1
- package/dist/ChatWrapper.js.map +1 -1
- package/dist/TemplateChatWrapper.d.ts +67 -0
- package/dist/TemplateChatWrapper.js +239 -0
- package/dist/TemplateChatWrapper.js.map +1 -0
- package/dist/bindings/AddonTypes.d.ts +2 -0
- package/dist/bindings/Llama.d.ts +1 -2
- package/dist/bindings/Llama.js +10 -14
- package/dist/bindings/Llama.js.map +1 -1
- package/dist/bindings/consts.d.ts +2 -0
- package/dist/bindings/consts.js +11 -0
- package/dist/bindings/consts.js.map +1 -0
- package/dist/bindings/getLlama.d.ts +14 -12
- package/dist/bindings/getLlama.js +210 -75
- package/dist/bindings/getLlama.js.map +1 -1
- package/dist/bindings/types.d.ts +8 -4
- package/dist/bindings/types.js +18 -0
- package/dist/bindings/types.js.map +1 -1
- package/dist/bindings/utils/asyncEvery.d.ts +5 -0
- package/dist/bindings/utils/asyncEvery.js +15 -0
- package/dist/bindings/utils/asyncEvery.js.map +1 -0
- package/dist/bindings/utils/asyncSome.d.ts +5 -0
- package/dist/bindings/utils/asyncSome.js +27 -0
- package/dist/bindings/utils/asyncSome.js.map +1 -0
- package/dist/bindings/utils/cloneLlamaCppRepo.js +13 -3
- package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
- package/dist/bindings/utils/compileLLamaCpp.js +31 -3
- package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
- package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +11 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.js +158 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -0
- package/dist/bindings/utils/detectGlibc.d.ts +4 -0
- package/dist/bindings/utils/detectGlibc.js +36 -0
- package/dist/bindings/utils/detectGlibc.js.map +1 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.d.ts +9 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.js +29 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.js.map +1 -0
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js +12 -4
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js.map +1 -1
- package/dist/bindings/utils/getGpuTypesToUseForOption.d.ts +11 -0
- package/dist/bindings/utils/getGpuTypesToUseForOption.js +30 -0
- package/dist/bindings/utils/getGpuTypesToUseForOption.js.map +1 -0
- package/dist/bindings/utils/getLinuxDistroInfo.d.ts +9 -0
- package/dist/bindings/utils/getLinuxDistroInfo.js +46 -0
- package/dist/bindings/utils/getLinuxDistroInfo.js.map +1 -0
- package/dist/bindings/utils/getPlatformInfo.d.ts +5 -0
- package/dist/bindings/utils/getPlatformInfo.js +28 -0
- package/dist/bindings/utils/getPlatformInfo.js.map +1 -0
- package/dist/bindings/utils/hasFileInPath.d.ts +2 -0
- package/dist/bindings/utils/hasFileInPath.js +34 -0
- package/dist/bindings/utils/hasFileInPath.js.map +1 -0
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.d.ts +1 -1
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.js +3 -9
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.js.map +1 -1
- package/dist/bindings/utils/logDistroInstallInstruction.d.ts +13 -0
- package/dist/bindings/utils/logDistroInstallInstruction.js +38 -0
- package/dist/bindings/utils/logDistroInstallInstruction.js.map +1 -0
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.d.ts +9 -2
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js +10 -4
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js.map +1 -1
- package/dist/bindings/utils/resolveCustomCmakeOptions.js +2 -0
- package/dist/bindings/utils/resolveCustomCmakeOptions.js.map +1 -1
- package/dist/bindings/utils/testBindingBinary.d.ts +1 -0
- package/dist/bindings/utils/testBindingBinary.js +98 -0
- package/dist/bindings/utils/testBindingBinary.js.map +1 -0
- package/dist/chatWrappers/ChatMLChatWrapper.js +1 -1
- package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
- package/dist/chatWrappers/GemmaChatWrapper.d.ts +18 -0
- package/dist/chatWrappers/GemmaChatWrapper.js +86 -0
- package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -0
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js +3 -0
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js.map +1 -1
- package/dist/cli/cli.js +2 -0
- package/dist/cli/cli.js.map +1 -1
- package/dist/cli/commands/BuildCommand.d.ts +5 -4
- package/dist/cli/commands/BuildCommand.js +78 -58
- package/dist/cli/commands/BuildCommand.js.map +1 -1
- package/dist/cli/commands/DebugCommand.js +12 -15
- package/dist/cli/commands/DebugCommand.js.map +1 -1
- package/dist/cli/commands/DownloadCommand.d.ts +5 -4
- package/dist/cli/commands/DownloadCommand.js +97 -54
- package/dist/cli/commands/DownloadCommand.js.map +1 -1
- package/dist/cli/commands/InspectCommand.d.ts +7 -0
- package/dist/cli/commands/InspectCommand.js +113 -0
- package/dist/cli/commands/InspectCommand.js.map +1 -0
- package/dist/cli/utils/logUsedGpuTypeOption.d.ts +2 -0
- package/dist/cli/utils/logUsedGpuTypeOption.js +9 -0
- package/dist/cli/utils/logUsedGpuTypeOption.js.map +1 -0
- package/dist/config.d.ts +3 -2
- package/dist/config.js +12 -10
- package/dist/config.js.map +1 -1
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts +3 -0
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js +3 -0
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map +1 -1
- package/dist/gguf/GGUFInsights.d.ts +28 -0
- package/dist/gguf/GGUFInsights.js +58 -0
- package/dist/gguf/GGUFInsights.js.map +1 -0
- package/dist/gguf/GGUFMetadata.d.ts +19 -0
- package/dist/gguf/GGUFMetadata.js +38 -0
- package/dist/gguf/GGUFMetadata.js.map +1 -0
- package/dist/gguf/errors/InvalidGGUFMagicError.d.ts +3 -0
- package/dist/gguf/errors/InvalidGGUFMagicError.js +6 -0
- package/dist/gguf/errors/InvalidGGUFMagicError.js.map +1 -0
- package/dist/gguf/errors/MetadataNotParsedYetError.d.ts +3 -0
- package/dist/gguf/errors/MetadataNotParsedYetError.js +6 -0
- package/dist/gguf/errors/MetadataNotParsedYetError.js.map +1 -0
- package/dist/gguf/errors/MissingNodeLlamaError.d.ts +3 -0
- package/dist/gguf/errors/MissingNodeLlamaError.js +6 -0
- package/dist/gguf/errors/MissingNodeLlamaError.js.map +1 -0
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.d.ts +5 -0
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js +12 -0
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js.map +1 -0
- package/dist/gguf/errors/UnsupportedMetadataTypeError.d.ts +4 -0
- package/dist/gguf/errors/UnsupportedMetadataTypeError.js +8 -0
- package/dist/gguf/errors/UnsupportedMetadataTypeError.js.map +1 -0
- package/dist/gguf/ggufParser/GGUFParser.d.ts +18 -0
- package/dist/gguf/ggufParser/GGUFParser.js +123 -0
- package/dist/gguf/ggufParser/GGUFParser.js.map +1 -0
- package/dist/gguf/ggufParser/GGUFTypes.d.ts +257 -0
- package/dist/gguf/ggufParser/GGUFTypes.js +2 -0
- package/dist/gguf/ggufParser/GGUFTypes.js.map +1 -0
- package/dist/gguf/ggufParser/checkArchitecture.d.ts +14 -0
- package/dist/gguf/ggufParser/checkArchitecture.js +74 -0
- package/dist/gguf/ggufParser/checkArchitecture.js.map +1 -0
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.d.ts +38 -0
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.js +83 -0
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.js.map +1 -0
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.d.ts +14 -0
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.js +35 -0
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.js.map +1 -0
- package/dist/gguf/ggufParser/stream/GGUFReadStream.d.ts +15 -0
- package/dist/gguf/ggufParser/stream/GGUFReadStream.js +40 -0
- package/dist/gguf/ggufParser/stream/GGUFReadStream.js.map +1 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.js +3 -1
- package/dist/index.js.map +1 -1
- package/dist/utils/LlamaText.js +2 -2
- package/dist/utils/LlamaText.js.map +1 -1
- package/dist/utils/cmake.js +23 -10
- package/dist/utils/cmake.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfArray.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfBoolean.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfBoolean.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfBooleanValue.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfGrammar.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNull.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNull.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNumber.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNumber.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfNumberValue.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfObjectMap.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfOr.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfString.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfString.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfStringValue.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfVerbatimText.js.map +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfWhitespace.d.ts +1 -1
- package/dist/utils/gbnfJson/terminals/GbnfWhitespace.js.map +1 -1
- package/dist/utils/getBuildDefaults.d.ts +1 -2
- package/dist/utils/getBuildDefaults.js +2 -3
- package/dist/utils/getBuildDefaults.js.map +1 -1
- package/dist/utils/getConsoleLogPrefix.d.ts +1 -1
- package/dist/utils/getConsoleLogPrefix.js +2 -2
- package/dist/utils/getConsoleLogPrefix.js.map +1 -1
- package/dist/utils/mergeUnionTypes.d.ts +6 -0
- package/dist/utils/mergeUnionTypes.js +2 -0
- package/dist/utils/mergeUnionTypes.js.map +1 -0
- package/dist/utils/parseTextTemplate.d.ts +66 -0
- package/dist/utils/parseTextTemplate.js +116 -0
- package/dist/utils/parseTextTemplate.js.map +1 -0
- package/llama/CMakeLists.txt +30 -4
- package/llama/addon.cpp +62 -7
- package/llama/binariesGithubRelease.json +1 -1
- package/llama/gitRelease.bundle +0 -0
- package/llama/gpuInfo/cuda-gpu-info.cu +5 -5
- package/llama/gpuInfo/cuda-gpu-info.h +2 -2
- package/llama/gpuInfo/vulkan-gpu-info.cpp +65 -0
- package/llama/gpuInfo/vulkan-gpu-info.h +7 -0
- package/llama/llama.cpp.info.json +1 -1
- package/llamaBins/linux-arm64/.buildMetadata.json +1 -1
- package/llamaBins/linux-arm64/llama-addon.node +0 -0
- package/llamaBins/linux-armv7l/.buildMetadata.json +1 -1
- package/llamaBins/linux-armv7l/llama-addon.node +0 -0
- package/llamaBins/linux-x64/.buildMetadata.json +1 -1
- package/llamaBins/linux-x64/llama-addon.node +0 -0
- package/llamaBins/linux-x64-cuda/.buildMetadata.json +1 -1
- package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/linux-x64-vulkan/.buildMetadata.json +1 -0
- package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
- package/llamaBins/mac-arm64-metal/.buildMetadata.json +1 -1
- package/llamaBins/mac-arm64-metal/ggml-metal.metal +1382 -142
- package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
- package/llamaBins/mac-x64/.buildMetadata.json +1 -1
- package/llamaBins/mac-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64/.buildMetadata.json +1 -1
- package/llamaBins/win-x64/llama-addon.exp +0 -0
- package/llamaBins/win-x64/llama-addon.lib +0 -0
- package/llamaBins/win-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64-cuda/.buildMetadata.json +1 -1
- package/llamaBins/win-x64-cuda/llama-addon.exp +0 -0
- package/llamaBins/win-x64-cuda/llama-addon.lib +0 -0
- package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/win-x64-vulkan/.buildMetadata.json +1 -0
- package/llamaBins/win-x64-vulkan/llama-addon.exp +0 -0
- package/llamaBins/win-x64-vulkan/llama-addon.lib +0 -0
- package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
- package/package.json +7 -4
|
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
|
|
|
392
392
|
float lmax = -INFINITY;
|
|
393
393
|
|
|
394
394
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
395
|
-
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
|
395
|
+
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
396
396
|
}
|
|
397
397
|
|
|
398
398
|
// find the max value in the block
|
|
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
|
|
|
417
417
|
// parallel sum
|
|
418
418
|
float lsum = 0.0f;
|
|
419
419
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
420
|
-
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
|
420
|
+
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
421
421
|
lsum += exp_psrc0;
|
|
422
422
|
pdst[i00] = exp_psrc0;
|
|
423
423
|
}
|
|
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
|
|
|
495
495
|
float4 lmax4 = -INFINITY;
|
|
496
496
|
|
|
497
497
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
498
|
-
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
|
498
|
+
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
499
499
|
}
|
|
500
500
|
|
|
501
501
|
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
|
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
|
|
|
521
521
|
// parallel sum
|
|
522
522
|
float4 lsum4 = 0.0f;
|
|
523
523
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
524
|
-
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
|
524
|
+
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
525
525
|
lsum4 += exp_psrc4;
|
|
526
526
|
pdst4[i00] = exp_psrc4;
|
|
527
527
|
}
|
|
@@ -2519,18 +2519,57 @@ typedef struct {
|
|
|
2519
2519
|
} block_iq2_xs;
|
|
2520
2520
|
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
|
2521
2521
|
|
|
2522
|
+
// 2.5625 bpw quants
|
|
2523
|
+
typedef struct {
|
|
2524
|
+
half d;
|
|
2525
|
+
uint8_t qs[QK_K/4];
|
|
2526
|
+
uint8_t qh[QK_K/32];
|
|
2527
|
+
uint8_t scales[QK_K/32];
|
|
2528
|
+
} block_iq2_s;
|
|
2529
|
+
|
|
2522
2530
|
typedef struct {
|
|
2523
2531
|
half d;
|
|
2524
2532
|
uint8_t qs[3*QK_K/8];
|
|
2525
2533
|
} block_iq3_xxs;
|
|
2526
2534
|
// 98 bytes / block for QK_K = 256, so 3.0625 bpw
|
|
2527
2535
|
|
|
2536
|
+
// 3.4375 bpw
|
|
2537
|
+
#if QK_K == 64
|
|
2538
|
+
#define IQ3S_N_SCALE 2
|
|
2539
|
+
#else
|
|
2540
|
+
#define IQ3S_N_SCALE QK_K/64
|
|
2541
|
+
#endif
|
|
2542
|
+
typedef struct {
|
|
2543
|
+
half d;
|
|
2544
|
+
uint8_t qs[QK_K/4];
|
|
2545
|
+
uint8_t qh[QK_K/32];
|
|
2546
|
+
uint8_t signs[QK_K/8];
|
|
2547
|
+
uint8_t scales[IQ3S_N_SCALE];
|
|
2548
|
+
} block_iq3_s;
|
|
2549
|
+
|
|
2528
2550
|
typedef struct {
|
|
2529
2551
|
half d;
|
|
2530
2552
|
uint8_t qs[QK_K/8];
|
|
2531
2553
|
uint8_t scales[QK_K/16];
|
|
2532
2554
|
} block_iq1_s;
|
|
2533
2555
|
|
|
2556
|
+
// Non-linear quants
|
|
2557
|
+
#define QK4_NL 32
|
|
2558
|
+
typedef struct {
|
|
2559
|
+
half d;
|
|
2560
|
+
uint8_t qs[QK4_NL/2];
|
|
2561
|
+
} block_iq4_nl;
|
|
2562
|
+
|
|
2563
|
+
#if QK_K == 64
|
|
2564
|
+
#define block_iq4_xs block_iq4_nl
|
|
2565
|
+
#else
|
|
2566
|
+
typedef struct {
|
|
2567
|
+
half d;
|
|
2568
|
+
uint16_t scales_h;
|
|
2569
|
+
uint8_t scales_l[QK_K/64];
|
|
2570
|
+
uint8_t qs[QK_K/2];
|
|
2571
|
+
} block_iq4_xs;
|
|
2572
|
+
#endif
|
|
2534
2573
|
|
|
2535
2574
|
//====================================== dot products =========================
|
|
2536
2575
|
|
|
@@ -3754,6 +3793,265 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
|
|
|
3754
3793
|
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
|
3755
3794
|
};
|
|
3756
3795
|
|
|
3796
|
+
constexpr constant static uint64_t iq2s_grid[1024] = {
|
|
3797
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
|
3798
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
|
3799
|
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
|
3800
|
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
|
3801
|
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
|
3802
|
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
|
3803
|
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
|
3804
|
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
|
3805
|
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
|
3806
|
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
|
3807
|
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
|
3808
|
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
|
3809
|
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
|
3810
|
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
|
3811
|
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
|
3812
|
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
|
3813
|
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
|
3814
|
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
|
3815
|
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
|
3816
|
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
|
3817
|
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
|
3818
|
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
|
3819
|
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
|
3820
|
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
|
3821
|
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
|
3822
|
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
|
3823
|
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
|
3824
|
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
|
3825
|
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
|
3826
|
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
|
3827
|
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
|
3828
|
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
|
3829
|
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
|
3830
|
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
|
3831
|
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
|
3832
|
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
|
3833
|
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
|
3834
|
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
|
3835
|
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
|
3836
|
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
|
3837
|
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
|
3838
|
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
|
3839
|
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
|
3840
|
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
|
3841
|
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
|
3842
|
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
|
3843
|
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
|
3844
|
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
|
3845
|
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
|
3846
|
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
|
3847
|
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
|
3848
|
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
|
3849
|
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
|
3850
|
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
|
3851
|
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
|
3852
|
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
|
3853
|
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
|
3854
|
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
|
3855
|
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
|
3856
|
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
|
3857
|
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
|
3858
|
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
|
3859
|
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
|
3860
|
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
|
3861
|
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
|
3862
|
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
|
3863
|
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
|
3864
|
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
|
3865
|
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
|
3866
|
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
|
3867
|
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
|
3868
|
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
|
3869
|
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
|
3870
|
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
|
3871
|
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
|
3872
|
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
|
3873
|
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
|
3874
|
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
|
3875
|
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
|
3876
|
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
|
3877
|
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
|
3878
|
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
|
3879
|
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
|
3880
|
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
|
3881
|
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
|
3882
|
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
|
3883
|
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
|
3884
|
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
|
3885
|
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
|
3886
|
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
|
3887
|
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
|
3888
|
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
|
3889
|
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
|
3890
|
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
|
3891
|
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
|
3892
|
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
|
3893
|
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
|
3894
|
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
|
3895
|
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
|
3896
|
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
|
3897
|
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
|
3898
|
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
|
3899
|
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
|
3900
|
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
|
3901
|
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
|
3902
|
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
|
3903
|
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
|
3904
|
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
|
3905
|
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
|
3906
|
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
|
3907
|
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
|
3908
|
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
|
3909
|
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
|
3910
|
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
|
3911
|
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
|
3912
|
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
|
3913
|
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
|
3914
|
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
|
3915
|
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
|
3916
|
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
|
3917
|
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
|
3918
|
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
|
3919
|
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
|
3920
|
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
|
3921
|
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
|
3922
|
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
|
3923
|
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
|
3924
|
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
|
3925
|
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
|
3926
|
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
|
3927
|
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
|
3928
|
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
|
3929
|
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
|
3930
|
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
|
3931
|
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
|
3932
|
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
|
3933
|
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
|
3934
|
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
|
3935
|
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
|
3936
|
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
|
3937
|
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
|
3938
|
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
|
3939
|
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
|
3940
|
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
|
3941
|
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
|
3942
|
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
|
3943
|
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
|
3944
|
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
|
3945
|
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
|
3946
|
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
|
3947
|
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
|
3948
|
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
|
3949
|
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
|
3950
|
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
|
3951
|
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
|
3952
|
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
|
3953
|
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
|
3954
|
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
|
3955
|
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
|
3956
|
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
|
3957
|
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
|
3958
|
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
|
3959
|
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
|
3960
|
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
|
3961
|
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
|
3962
|
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
|
3963
|
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
|
3964
|
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
|
3965
|
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
|
3966
|
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
|
3967
|
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
|
3968
|
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
|
3969
|
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
|
3970
|
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
|
3971
|
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
|
3972
|
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
|
3973
|
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
|
3974
|
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
|
3975
|
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
|
3976
|
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
|
3977
|
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
|
3978
|
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
|
3979
|
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
|
3980
|
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
|
3981
|
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
|
3982
|
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
|
3983
|
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
|
3984
|
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
|
3985
|
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
|
3986
|
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
|
3987
|
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
|
3988
|
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
|
3989
|
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
|
3990
|
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
|
3991
|
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
|
3992
|
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
|
3993
|
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
|
3994
|
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
|
3995
|
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
|
3996
|
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
|
3997
|
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
|
3998
|
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
|
3999
|
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
|
4000
|
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
|
4001
|
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
|
4002
|
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
|
4003
|
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
|
4004
|
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
|
4005
|
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
|
4006
|
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
|
4007
|
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
|
4008
|
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
|
4009
|
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
|
4010
|
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
|
4011
|
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
|
4012
|
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
|
4013
|
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
|
4014
|
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
|
4015
|
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
|
4016
|
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
|
4017
|
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
|
4018
|
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
|
4019
|
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
|
4020
|
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
|
4021
|
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
|
4022
|
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
|
4023
|
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
|
4024
|
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
|
4025
|
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
|
4026
|
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
|
4027
|
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
|
4028
|
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
|
4029
|
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
|
4030
|
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
|
4031
|
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
|
4032
|
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
|
4033
|
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
|
4034
|
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
|
4035
|
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
|
4036
|
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
|
4037
|
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
|
4038
|
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
|
4039
|
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
|
4040
|
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
|
4041
|
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
|
4042
|
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
|
4043
|
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
|
4044
|
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
|
4045
|
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
|
4046
|
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
|
4047
|
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
|
4048
|
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
|
4049
|
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
|
4050
|
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
|
4051
|
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
|
4052
|
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
|
4053
|
+
};
|
|
4054
|
+
|
|
3757
4055
|
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
|
3758
4056
|
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
|
3759
4057
|
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
@@ -3789,6 +4087,73 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
|
|
|
3789
4087
|
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
|
3790
4088
|
};
|
|
3791
4089
|
|
|
4090
|
+
constexpr constant static uint32_t iq3s_grid[512] = {
|
|
4091
|
+
0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
|
|
4092
|
+
0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
|
|
4093
|
+
0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
|
|
4094
|
+
0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
|
|
4095
|
+
0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
|
|
4096
|
+
0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
|
|
4097
|
+
0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
|
|
4098
|
+
0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
|
|
4099
|
+
0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
|
|
4100
|
+
0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
|
|
4101
|
+
0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
|
|
4102
|
+
0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
|
|
4103
|
+
0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
|
|
4104
|
+
0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
|
|
4105
|
+
0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
|
|
4106
|
+
0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
|
|
4107
|
+
0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
|
|
4108
|
+
0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
|
|
4109
|
+
0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
|
|
4110
|
+
0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
|
|
4111
|
+
0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
|
|
4112
|
+
0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
|
|
4113
|
+
0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
|
|
4114
|
+
0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
|
|
4115
|
+
0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
|
|
4116
|
+
0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
|
|
4117
|
+
0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
|
|
4118
|
+
0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
|
|
4119
|
+
0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
|
|
4120
|
+
0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
|
|
4121
|
+
0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
|
|
4122
|
+
0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
|
|
4123
|
+
0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
|
|
4124
|
+
0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
|
|
4125
|
+
0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
|
|
4126
|
+
0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
|
|
4127
|
+
0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
|
|
4128
|
+
0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
|
|
4129
|
+
0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
|
|
4130
|
+
0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
|
|
4131
|
+
0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
|
|
4132
|
+
0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
|
|
4133
|
+
0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
|
|
4134
|
+
0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
|
|
4135
|
+
0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
|
|
4136
|
+
0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
|
|
4137
|
+
0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
|
|
4138
|
+
0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
|
|
4139
|
+
0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
|
|
4140
|
+
0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
|
|
4141
|
+
0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
|
|
4142
|
+
0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
|
|
4143
|
+
0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
|
|
4144
|
+
0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
|
|
4145
|
+
0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
|
|
4146
|
+
0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
|
|
4147
|
+
0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
|
|
4148
|
+
0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
|
|
4149
|
+
0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
|
|
4150
|
+
0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
|
|
4151
|
+
0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
|
|
4152
|
+
0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
|
|
4153
|
+
0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
|
|
4154
|
+
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
|
4155
|
+
};
|
|
4156
|
+
|
|
3792
4157
|
#define NGRID_IQ1S 512
|
|
3793
4158
|
constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = {
|
|
3794
4159
|
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
|
@@ -3985,7 +4350,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|
|
3985
4350
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
3986
4351
|
}
|
|
3987
4352
|
|
|
3988
|
-
#if QK_K == 256
|
|
3989
4353
|
const int ix = tiisg;
|
|
3990
4354
|
|
|
3991
4355
|
device const float * y4 = y + 32 * ix;
|
|
@@ -4026,9 +4390,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|
|
4026
4390
|
|
|
4027
4391
|
y4 += 32 * 32;
|
|
4028
4392
|
}
|
|
4029
|
-
#else
|
|
4030
|
-
// TODO
|
|
4031
|
-
#endif
|
|
4032
4393
|
|
|
4033
4394
|
for (int row = 0; row < N_DST; ++row) {
|
|
4034
4395
|
all_sum = simd_sum(sumf[row]);
|
|
@@ -4118,7 +4479,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|
|
4118
4479
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4119
4480
|
}
|
|
4120
4481
|
|
|
4121
|
-
#if QK_K == 256
|
|
4122
4482
|
const int ix = tiisg;
|
|
4123
4483
|
|
|
4124
4484
|
device const float * y4 = y + 32 * ix;
|
|
@@ -4169,9 +4529,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|
|
4169
4529
|
|
|
4170
4530
|
y4 += 32 * 32;
|
|
4171
4531
|
}
|
|
4172
|
-
#else
|
|
4173
|
-
// TODO
|
|
4174
|
-
#endif
|
|
4175
4532
|
|
|
4176
4533
|
for (int row = 0; row < N_DST; ++row) {
|
|
4177
4534
|
all_sum = simd_sum(sumf[row]);
|
|
@@ -4261,7 +4618,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|
|
4261
4618
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4262
4619
|
}
|
|
4263
4620
|
|
|
4264
|
-
#if QK_K == 256
|
|
4265
4621
|
const int ix = tiisg;
|
|
4266
4622
|
|
|
4267
4623
|
device const float * y4 = y + 32 * ix;
|
|
@@ -4305,9 +4661,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|
|
4305
4661
|
|
|
4306
4662
|
y4 += 32 * 32;
|
|
4307
4663
|
}
|
|
4308
|
-
#else
|
|
4309
|
-
// TODO
|
|
4310
|
-
#endif
|
|
4311
4664
|
|
|
4312
4665
|
for (int row = 0; row < N_DST; ++row) {
|
|
4313
4666
|
all_sum = simd_sum(sumf[row]);
|
|
@@ -4346,7 +4699,7 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
|
|
|
4346
4699
|
kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4347
4700
|
}
|
|
4348
4701
|
|
|
4349
|
-
void
|
|
4702
|
+
void kernel_mul_mv_iq3_s_f32_impl(
|
|
4350
4703
|
device const void * src0,
|
|
4351
4704
|
device const float * src1,
|
|
4352
4705
|
device float * dst,
|
|
@@ -4359,6 +4712,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4359
4712
|
constant int64_t & ne1,
|
|
4360
4713
|
constant uint & r2,
|
|
4361
4714
|
constant uint & r3,
|
|
4715
|
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
|
4362
4716
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4363
4717
|
uint tiisg[[thread_index_in_simdgroup]],
|
|
4364
4718
|
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
@@ -4376,56 +4730,69 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4376
4730
|
|
|
4377
4731
|
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4378
4732
|
|
|
4379
|
-
device const
|
|
4733
|
+
device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0;
|
|
4380
4734
|
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4381
4735
|
|
|
4382
|
-
float yl[
|
|
4736
|
+
float yl[32];
|
|
4383
4737
|
float sumf[N_DST]={0.f}, all_sum;
|
|
4384
4738
|
|
|
4385
4739
|
const int nb32 = nb * (QK_K / 32);
|
|
4386
4740
|
|
|
4387
|
-
|
|
4388
|
-
|
|
4389
|
-
|
|
4741
|
+
threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
|
|
4742
|
+
{
|
|
4743
|
+
int nval = 8;
|
|
4744
|
+
int pos = (32*sgitg + tiisg)*nval;
|
|
4745
|
+
for (int i = 0; i < nval; ++i) values[pos + i] = iq3s_grid[pos + i];
|
|
4746
|
+
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4747
|
+
}
|
|
4390
4748
|
|
|
4391
|
-
|
|
4749
|
+
const int ix = tiisg;
|
|
4392
4750
|
|
|
4393
|
-
|
|
4751
|
+
device const float * y4 = y + 32 * ix;
|
|
4394
4752
|
|
|
4395
|
-
|
|
4753
|
+
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
|
4754
|
+
|
|
4755
|
+
for (int i = 0; i < 32; ++i) {
|
|
4396
4756
|
yl[i] = y4[i];
|
|
4397
4757
|
}
|
|
4398
4758
|
|
|
4399
4759
|
const int ibl = ib32 / (QK_K / 32);
|
|
4400
4760
|
const int ib = ib32 % (QK_K / 32);
|
|
4401
4761
|
|
|
4402
|
-
device const
|
|
4403
|
-
device const uint8_t * qs = xr->qs +
|
|
4404
|
-
device const uint8_t *
|
|
4405
|
-
device const
|
|
4762
|
+
device const block_iq3_s * xr = x + ibl;
|
|
4763
|
+
device const uint8_t * qs = xr->qs + 8 * ib;
|
|
4764
|
+
device const uint8_t * qh = xr->qh + ib;
|
|
4765
|
+
device const uint8_t * sc = xr->scales + (ib/2);
|
|
4766
|
+
device const uint8_t * signs = xr->signs + 4 * ib;
|
|
4767
|
+
device const half * dh = &xr->d;
|
|
4406
4768
|
|
|
4407
4769
|
for (int row = 0; row < N_DST; row++) {
|
|
4408
4770
|
|
|
4409
|
-
|
|
4410
|
-
|
|
4771
|
+
const float db = dh[0];
|
|
4772
|
+
const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf));
|
|
4411
4773
|
|
|
4412
4774
|
float2 sum = {0};
|
|
4413
|
-
for (int
|
|
4414
|
-
|
|
4415
|
-
|
|
4775
|
+
for (int l = 0; l < 4; ++l) {
|
|
4776
|
+
const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? values + 256 : values;
|
|
4777
|
+
const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? values + 256 : values;
|
|
4778
|
+
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]);
|
|
4779
|
+
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]);
|
|
4780
|
+
for (int j = 0; j < 4; ++j) {
|
|
4781
|
+
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
|
|
4782
|
+
sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
|
|
4783
|
+
}
|
|
4416
4784
|
}
|
|
4417
|
-
sumf[row] +=
|
|
4785
|
+
sumf[row] += d * (sum[0] + sum[1]);
|
|
4418
4786
|
|
|
4419
|
-
dh
|
|
4420
|
-
qs
|
|
4421
|
-
|
|
4787
|
+
dh += nb*sizeof(block_iq3_s)/2;
|
|
4788
|
+
qs += nb*sizeof(block_iq3_s);
|
|
4789
|
+
qh += nb*sizeof(block_iq3_s);
|
|
4790
|
+
sc += nb*sizeof(block_iq3_s);
|
|
4791
|
+
signs += nb*sizeof(block_iq3_s);
|
|
4422
4792
|
}
|
|
4423
4793
|
|
|
4424
|
-
y4 +=
|
|
4794
|
+
y4 += 32 * 32;
|
|
4425
4795
|
}
|
|
4426
|
-
#else
|
|
4427
|
-
// TODO
|
|
4428
|
-
#endif
|
|
4429
4796
|
|
|
4430
4797
|
for (int row = 0; row < N_DST; ++row) {
|
|
4431
4798
|
all_sum = simd_sum(sumf[row]);
|
|
@@ -4435,8 +4802,8 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4435
4802
|
}
|
|
4436
4803
|
}
|
|
4437
4804
|
|
|
4438
|
-
[[host_name("
|
|
4439
|
-
kernel void
|
|
4805
|
+
[[host_name("kernel_mul_mv_iq3_s_f32")]]
|
|
4806
|
+
kernel void kernel_mul_mv_iq3_s_f32(
|
|
4440
4807
|
device const void * src0,
|
|
4441
4808
|
device const float * src1,
|
|
4442
4809
|
device float * dst,
|
|
@@ -4456,136 +4823,636 @@ kernel void kernel_mul_mv_iq1_s_f32(
|
|
|
4456
4823
|
constant int64_t & ne1,
|
|
4457
4824
|
constant uint & r2,
|
|
4458
4825
|
constant uint & r3,
|
|
4826
|
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
|
4459
4827
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4460
4828
|
uint tiisg[[thread_index_in_simdgroup]],
|
|
4461
4829
|
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4462
4830
|
|
|
4463
|
-
|
|
4831
|
+
kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4464
4832
|
}
|
|
4465
4833
|
|
|
4834
|
+
void kernel_mul_mv_iq2_s_f32_impl(
|
|
4835
|
+
device const void * src0,
|
|
4836
|
+
device const float * src1,
|
|
4837
|
+
device float * dst,
|
|
4838
|
+
constant int64_t & ne00,
|
|
4839
|
+
constant int64_t & ne01,
|
|
4840
|
+
constant int64_t & ne02,
|
|
4841
|
+
constant int64_t & ne10,
|
|
4842
|
+
constant int64_t & ne12,
|
|
4843
|
+
constant int64_t & ne0,
|
|
4844
|
+
constant int64_t & ne1,
|
|
4845
|
+
constant uint & r2,
|
|
4846
|
+
constant uint & r3,
|
|
4847
|
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
|
4848
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4849
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4850
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4466
4851
|
|
|
4467
|
-
|
|
4468
|
-
|
|
4469
|
-
|
|
4470
|
-
|
|
4471
|
-
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
|
4472
|
-
float4x4 temp = *(((device float4x4 *)src));
|
|
4473
|
-
for (int i = 0; i < 16; i++){
|
|
4474
|
-
reg[i/4][i%4] = temp[i/4][i%4];
|
|
4475
|
-
}
|
|
4476
|
-
}
|
|
4852
|
+
const int nb = ne00/QK_K;
|
|
4853
|
+
const int r0 = tgpig.x;
|
|
4854
|
+
const int r1 = tgpig.y;
|
|
4855
|
+
const int im = tgpig.z;
|
|
4477
4856
|
|
|
4478
|
-
|
|
4479
|
-
|
|
4480
|
-
half4x4 temp = *(((device half4x4 *)src));
|
|
4481
|
-
for (int i = 0; i < 16; i++){
|
|
4482
|
-
reg[i/4][i%4] = temp[i/4][i%4];
|
|
4483
|
-
}
|
|
4484
|
-
}
|
|
4857
|
+
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
|
4858
|
+
const int ib_row = first_row * nb;
|
|
4485
4859
|
|
|
4486
|
-
|
|
4487
|
-
|
|
4488
|
-
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
|
4489
|
-
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
|
4490
|
-
const float d2 = d1 / 256.f;
|
|
4491
|
-
const float md = -8.h * xb->d;
|
|
4492
|
-
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
|
4493
|
-
const ushort mask1 = mask0 << 8;
|
|
4860
|
+
const uint i12 = im%ne12;
|
|
4861
|
+
const uint i13 = im/ne12;
|
|
4494
4862
|
|
|
4495
|
-
|
|
4496
|
-
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
|
|
4497
|
-
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
|
|
4498
|
-
}
|
|
4499
|
-
}
|
|
4863
|
+
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
4500
4864
|
|
|
4501
|
-
|
|
4502
|
-
|
|
4503
|
-
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
|
4504
|
-
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
|
4505
|
-
const float d2 = d1 / 256.f;
|
|
4506
|
-
const float m = xb->m;
|
|
4507
|
-
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
|
4508
|
-
const ushort mask1 = mask0 << 8;
|
|
4865
|
+
device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
|
|
4866
|
+
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
4509
4867
|
|
|
4510
|
-
|
|
4511
|
-
|
|
4512
|
-
reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
|
|
4513
|
-
}
|
|
4514
|
-
}
|
|
4868
|
+
float yl[32];
|
|
4869
|
+
float sumf[N_DST]={0.f}, all_sum;
|
|
4515
4870
|
|
|
4516
|
-
|
|
4517
|
-
void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
|
|
4518
|
-
device const uint16_t * qs = ((device const uint16_t *)xb + 3);
|
|
4519
|
-
const float d = xb->d;
|
|
4520
|
-
const float md = -16.h * xb->d;
|
|
4521
|
-
const ushort mask = il ? 0x00F0 : 0x000F;
|
|
4871
|
+
const int nb32 = nb * (QK_K / 32);
|
|
4522
4872
|
|
|
4523
|
-
|
|
4873
|
+
//threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
|
|
4874
|
+
//{
|
|
4875
|
+
// int nval = 32;
|
|
4876
|
+
// int pos = (32*sgitg + tiisg)*nval;
|
|
4877
|
+
// for (int i = 0; i < nval; ++i) values[pos + i] = iq2s_grid[pos + i];
|
|
4878
|
+
// threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4879
|
+
//}
|
|
4524
4880
|
|
|
4525
|
-
const int
|
|
4881
|
+
const int ix = tiisg;
|
|
4526
4882
|
|
|
4527
|
-
const
|
|
4528
|
-
const int gh_bk = il ? 0 : 4;
|
|
4883
|
+
device const float * y4 = y + 32 * ix;
|
|
4529
4884
|
|
|
4530
|
-
for (int
|
|
4531
|
-
// extract the 5-th bits for x0 and x1
|
|
4532
|
-
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
|
4533
|
-
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
|
4885
|
+
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
|
4534
4886
|
|
|
4535
|
-
|
|
4536
|
-
|
|
4537
|
-
|
|
4887
|
+
for (int i = 0; i < 32; ++i) {
|
|
4888
|
+
yl[i] = y4[i];
|
|
4889
|
+
}
|
|
4538
4890
|
|
|
4539
|
-
|
|
4540
|
-
|
|
4541
|
-
}
|
|
4542
|
-
}
|
|
4891
|
+
const int ibl = ib32 / (QK_K / 32);
|
|
4892
|
+
const int ib = ib32 % (QK_K / 32);
|
|
4543
4893
|
|
|
4544
|
-
|
|
4545
|
-
|
|
4546
|
-
|
|
4547
|
-
|
|
4548
|
-
|
|
4549
|
-
|
|
4894
|
+
device const block_iq2_s * xr = x + ibl;
|
|
4895
|
+
device const uint8_t * qs = xr->qs + 4 * ib;
|
|
4896
|
+
device const uint8_t * qh = xr->qh + ib;
|
|
4897
|
+
device const uint8_t * sc = xr->scales + ib;
|
|
4898
|
+
device const uint8_t * signs = qs + QK_K/8;
|
|
4899
|
+
device const half * dh = &xr->d;
|
|
4550
4900
|
|
|
4551
|
-
|
|
4901
|
+
for (int row = 0; row < N_DST; row++) {
|
|
4552
4902
|
|
|
4553
|
-
|
|
4903
|
+
const float db = dh[0];
|
|
4904
|
+
const float d1 = db * (0.5f + (sc[0] & 0xf));
|
|
4905
|
+
const float d2 = db * (0.5f + (sc[0] >> 4));
|
|
4554
4906
|
|
|
4555
|
-
|
|
4556
|
-
|
|
4907
|
+
float2 sum = {0};
|
|
4908
|
+
for (int l = 0; l < 2; ++l) {
|
|
4909
|
+
//const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
|
|
4910
|
+
//const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
|
|
4911
|
+
constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
|
|
4912
|
+
constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
|
|
4913
|
+
for (int j = 0; j < 8; ++j) {
|
|
4914
|
+
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
|
|
4915
|
+
sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
|
|
4916
|
+
}
|
|
4917
|
+
}
|
|
4918
|
+
sumf[row] += d1 * sum[0] + d2 * sum[1];
|
|
4557
4919
|
|
|
4558
|
-
|
|
4559
|
-
|
|
4560
|
-
|
|
4561
|
-
|
|
4920
|
+
dh += nb*sizeof(block_iq2_s)/2;
|
|
4921
|
+
qs += nb*sizeof(block_iq2_s);
|
|
4922
|
+
qh += nb*sizeof(block_iq2_s);
|
|
4923
|
+
sc += nb*sizeof(block_iq2_s);
|
|
4924
|
+
signs += nb*sizeof(block_iq2_s);
|
|
4925
|
+
}
|
|
4562
4926
|
|
|
4563
|
-
|
|
4564
|
-
|
|
4565
|
-
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
|
4927
|
+
y4 += 32 * 32;
|
|
4928
|
+
}
|
|
4566
4929
|
|
|
4567
|
-
|
|
4568
|
-
|
|
4930
|
+
for (int row = 0; row < N_DST; ++row) {
|
|
4931
|
+
all_sum = simd_sum(sumf[row]);
|
|
4932
|
+
if (tiisg == 0) {
|
|
4933
|
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
|
|
4934
|
+
}
|
|
4569
4935
|
}
|
|
4570
4936
|
}
|
|
4571
4937
|
|
|
4572
|
-
|
|
4573
|
-
void
|
|
4574
|
-
|
|
4575
|
-
|
|
4938
|
+
[[host_name("kernel_mul_mv_iq2_s_f32")]]
|
|
4939
|
+
kernel void kernel_mul_mv_iq2_s_f32(
|
|
4940
|
+
device const void * src0,
|
|
4941
|
+
device const float * src1,
|
|
4942
|
+
device float * dst,
|
|
4943
|
+
constant int64_t & ne00,
|
|
4944
|
+
constant int64_t & ne01,
|
|
4945
|
+
constant int64_t & ne02,
|
|
4946
|
+
constant uint64_t & nb00,
|
|
4947
|
+
constant uint64_t & nb01,
|
|
4948
|
+
constant uint64_t & nb02,
|
|
4949
|
+
constant int64_t & ne10,
|
|
4950
|
+
constant int64_t & ne11,
|
|
4951
|
+
constant int64_t & ne12,
|
|
4952
|
+
constant uint64_t & nb10,
|
|
4953
|
+
constant uint64_t & nb11,
|
|
4954
|
+
constant uint64_t & nb12,
|
|
4955
|
+
constant int64_t & ne0,
|
|
4956
|
+
constant int64_t & ne1,
|
|
4957
|
+
constant uint & r2,
|
|
4958
|
+
constant uint & r3,
|
|
4959
|
+
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
|
4960
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
4961
|
+
uint tiisg[[thread_index_in_simdgroup]],
|
|
4962
|
+
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
4576
4963
|
|
|
4577
|
-
|
|
4578
|
-
reg[i/4][i%4] = (qs[i + 16*il] * d);
|
|
4579
|
-
}
|
|
4964
|
+
kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4580
4965
|
}
|
|
4581
4966
|
|
|
4582
|
-
|
|
4583
|
-
|
|
4584
|
-
|
|
4585
|
-
|
|
4586
|
-
|
|
4587
|
-
|
|
4588
|
-
|
|
4967
|
+
// Matrix-vector product of IQ1_S-quantized src0 rows with f32 src1.
//
// A simdgroup accumulates N_DST consecutive src0 rows starting at
// first_row = (tgpig.x*N_SIMDGROUP + sgitg)*N_DST. tgpig.y offsets src1 by ne10
// floats per step; tgpig.z is split into i12 = z % ne12 and i13 = z / ne12, which
// are divided by r2 / r3 to locate the src0 slice. After a simd_sum reduction,
// lane 0 stores one float per row into dst.
void kernel_mul_mv_iq1_s_f32_impl(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne10,
        constant   int64_t & ne12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    const int nb = ne00/QK_K; // consecutive rows of src0 are nb blocks apart
    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
    const int ib_row    = first_row * nb;

    const uint i12 = im%ne12;
    const uint i13 = im/ne12;

    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;

    float yl[16];
    float sumf[N_DST]={0.f};

    // the row is walked in groups of 32 values
    const int nb32 = nb * (QK_K / 32);

    // two lanes share one group of 32: ix picks the group, il the 16-value half
    const int ix = tiisg/2;
    const int il = tiisg%2;

    device const float * y4 = y + 32 * ix + 16 * il;

    for (int ib32 = ix; ib32 < nb32; ib32 += 16) {

        // cache this lane's 16 activations; they are reused for every row
        for (int i = 0; i < 16; ++i) {
            yl[i] = y4[i];
        }

        const int ibl = ib32 / (QK_K / 32); // block index within the row
        const int ib  = ib32 % (QK_K / 32); // group of 32 within that block

        device const block_iq1_s * xr = x + ibl;

        device const uint8_t * qs = xr->qs + 4 * ib + 2 * il;
        device const uint8_t * sc = xr->scales + 2 * ib + il;
        device const half    * dh = &xr->d;

        for (int row = 0; row < N_DST; row++) {

            // grid index = 8 bits from qs plus a 9th bit taken from the scale byte
            constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
            constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));

            float2 sum = {0};
            for (int j = 0; j < 8; ++j) {
                sum[0] += yl[j+ 0] * grid1[j];
                sum[1] += yl[j+ 8] * grid2[j];
            }

            // each half of the scale byte carries a 3-bit scale s, applied as 2*s + 1
            sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1));

            // step all three cursors forward by nb blocks, i.e. to the next row
            dh += nb*sizeof(block_iq1_s)/2;
            qs += nb*sizeof(block_iq1_s);
            sc += nb*sizeof(block_iq1_s);
        }

        // 16 groups of 32 values are consumed per iteration
        y4 += 16 * 32;
    }

    device float * out = dst + r1*ne0 + im*ne0*ne1 + first_row;

    for (int row = 0; row < N_DST; ++row) {
        // every lane takes part in the reduction; only lane 0 stores the result
        const float all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            out[row] = all_sum;
        }
    }
}
|
|
5050
|
+
|
|
5051
|
+
// 16-entry float lookup table; the IQ4_NL / IQ4_XS code in this file indexes it
// with 4-bit quant values.
constexpr constant static float kvalues_iq4nl_f[16] = {
    -127.f, -104.f, -83.f, -65.f,
     -49.f,  -35.f, -22.f, -10.f,
       1.f,   13.f,  25.f,  38.f,
      53.f,   69.f,  89.f, 113.f
};
|
|
5054
|
+
|
|
5055
|
+
// Matrix-vector product of IQ4_NL-quantized src0 rows with f32 src1.
//
// A simdgroup accumulates 2 consecutive src0 rows starting at
// first_row = (tgpig.x*2 + sgitg)*2. The 16-entry kvalues_iq4nl_f table is first
// copied into shared_values, then every 4-bit quant is translated through it.
// After a simd_sum reduction, lane 0 stores one float per row into dst.
void kernel_mul_mv_iq4_nl_f32_impl(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne10,
        constant   int64_t & ne12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup float  * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    const int nb = ne00/QK4_NL; // consecutive rows of src0 are nb blocks apart
    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    const int first_row = (r0 * 2 + sgitg) * 2;
    const int ib_row    = first_row * nb;

    const uint i12 = im%ne12;
    const uint i13 = im/ne12;

    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;

    const int ix = tiisg/2;  // 0...15: block handled by this lane
    const int it = tiisg%2;  // 0 or 1: which 8-byte half of the block's quants

    // every lane writes slot tiisg with table entry tiisg%16
    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
    threadgroup_barrier(mem_flags::mem_threadgroup);

    float4 yl[4];
    float  sumf[2]={0.f};

    device const float * yb = y + ix * QK4_NL + it * 8;

    // scratch words; q8 reads them back byte by byte (one nibble value per byte)
    uint32_t aux32[2];
    thread const uint8_t * q8 = (thread const uint8_t *)aux32;

    float4 qf1, qf2;

    for (int ib = ix; ib < nb; ib += 16) {

        // y4[0], y4[1] pair with the low nibbles; y4[4], y4[5] (16 floats further) with the high ones
        device const float4 * y4 = (device const float4 *)yb;
        yl[0] = y4[0];
        yl[1] = y4[4];
        yl[2] = y4[1];
        yl[3] = y4[5];

        for (int row = 0; row < 2; ++row) {

            device const block_iq4_nl & xb = x[row*nb + ib];
            device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);

            float4 acc1 = {0.f}, acc2 = {0.f};

            // two passes, each decoding 4 quant bytes (8 nibbles)
            for (int h = 0; h < 2; ++h) {
                aux32[0]  = q4[2*h] | (q4[2*h + 1] << 16);
                aux32[1]  = (aux32[0] >> 4) & 0x0f0f0f0f; // high nibbles
                aux32[0] &= 0x0f0f0f0f;                   // low nibbles
                qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
                qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
                acc1 += yl[2*h]     * qf1;
                acc2 += yl[2*h + 1] * qf2;
            }

            acc1 += acc2;

            sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
        }

        // 16 blocks are consumed per iteration
        yb += 16 * QK4_NL;
    }

    device float * out = dst + r1*ne0 + im*ne0*ne1 + first_row;

    for (int row = 0; row < 2; ++row) {
        // every lane takes part in the reduction; only lane 0 stores the result
        const float all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            out[row] = all_sum;
        }
    }
}
|
|
5147
|
+
|
|
5148
|
+
#if QK_K != 64
// Matrix-vector product of IQ4_XS-quantized src0 rows with f32 src1
// (only compiled when QK_K != 64).
//
// A simdgroup accumulates 2 consecutive src0 rows starting at
// first_row = (tgpig.x*2 + sgitg)*2. Quants are translated through the
// kvalues_iq4nl_f table staged in shared_values; every group of 32 values has a
// 6-bit scale assembled from scales_l / scales_h and offset by -32.
// After a simd_sum reduction, lane 0 stores one float per row into dst.
void kernel_mul_mv_iq4_xs_f32_impl(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne10,
        constant   int64_t & ne12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup float  * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    const int nb = ne00/QK_K; // consecutive rows of src0 are nb blocks apart
    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    const int first_row = (r0 * 2 + sgitg) * 2;
    const int ib_row    = first_row * nb;

    const uint i12 = im%ne12;
    const uint i13 = im/ne12;

    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    device const block_iq4_xs * x = (device const block_iq4_xs *) src0 + ib_row + offset0;
    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;

    const int ix = tiisg/16;  // 0 or 1: which of two interleaved blocks
    const int it = tiisg%16;  // 0...15
    const int ib = it/2;      // group of 32 inside the block
    const int il = it%2;      // 8-byte half of that group's quants

    // every lane writes slot tiisg with table entry tiisg%16
    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
    threadgroup_barrier(mem_flags::mem_threadgroup);

    float4 yl[4];
    float  sumf[2]={0.f};

    device const float * yb = y + ix * QK_K + ib * 32 + il * 8;

    // scratch words; q8 reads them back byte by byte (one nibble value per byte)
    uint32_t aux32[2];
    thread const uint8_t * q8 = (thread const uint8_t *)aux32;

    float4 qf1, qf2;

    for (int ibl = ix; ibl < nb; ibl += 2) {

        // y4[0], y4[1] pair with the low nibbles; y4[4], y4[5] (16 floats further) with the high ones
        device const float4 * y4 = (device const float4 *)yb;
        yl[0] = y4[0];
        yl[1] = y4[4];
        yl[2] = y4[1];
        yl[3] = y4[5];

        for (int row = 0; row < 2; ++row) {

            device const block_iq4_xs & xb = x[row*nb + ibl];
            device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);

            float4 acc1 = {0.f}, acc2 = {0.f};

            // two passes, each decoding one 32-bit word (8 nibbles)
            for (int h = 0; h < 2; ++h) {
                const uint32_t packed = q4[h];
                aux32[0] = packed & 0x0f0f0f0f;        // low nibbles
                aux32[1] = (packed >> 4) & 0x0f0f0f0f; // high nibbles
                qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
                qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
                acc1 += yl[2*h]     * qf1;
                acc2 += yl[2*h + 1] * qf2;
            }

            acc1 += acc2;

            // 6-bit scale: low 4 bits from scales_l, high 2 bits from scales_h, offset by -32
            const int ls = (((xb.scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((xb.scales_h >> 2*ib) & 3) << 4)) - 32;
            sumf[row] += (float)xb.d * ls * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
        }

        // two blocks are consumed per iteration
        yb += 2 * QK_K;
    }

    device float * out = dst + r1*ne0 + im*ne0*ne1 + first_row;

    for (int row = 0; row < 2; ++row) {
        // every lane takes part in the reduction; only lane 0 stores the result
        const float all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            out[row] = all_sum;
        }
    }
}
#endif
|
|
5243
|
+
|
|
5244
|
+
// Kernel entry point for the IQ1_S x f32 matrix-vector product.
// Forwards to kernel_mul_mv_iq1_s_f32_impl; nb00..nb02, ne11 and nb10..nb12 are
// part of the signature but are not passed on.
[[host_name("kernel_mul_mv_iq1_s_f32")]]
kernel void kernel_mul_mv_iq1_s_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    kernel_mul_mv_iq1_s_f32_impl(
        src0, src1, dst,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        tgpig, tiisg, sgitg);
}
|
|
5271
|
+
|
|
5272
|
+
// Kernel entry point for the IQ4_NL x f32 matrix-vector product.
// Forwards to kernel_mul_mv_iq4_nl_f32_impl together with the threadgroup
// scratch buffer; nb00..nb02, ne11 and nb10..nb12 are part of the signature but
// are not passed on.
[[host_name("kernel_mul_mv_iq4_nl_f32")]]
kernel void kernel_mul_mv_iq4_nl_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup float  * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    kernel_mul_mv_iq4_nl_f32_impl(
        src0, src1, dst,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
}
|
|
5300
|
+
|
|
5301
|
+
// Kernel entry point for the IQ4_XS x f32 matrix-vector product.
// With QK_K == 64 it forwards to the IQ4_NL implementation, otherwise to the
// IQ4_XS one; nb00..nb02, ne11 and nb10..nb12 are part of the signature but are
// not passed on.
[[host_name("kernel_mul_mv_iq4_xs_f32")]]
kernel void kernel_mul_mv_iq4_xs_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup float  * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

#if QK_K == 64
    kernel_mul_mv_iq4_nl_f32_impl(
        src0, src1, dst,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
#else
    kernel_mul_mv_iq4_xs_f32_impl(
        src0, src1, dst,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
#endif
}
|
|
5333
|
+
|
|
5334
|
+
//============================= templates and their specializations =============================
|
|
5335
|
+
|
|
5336
|
+
// NOTE: nothing is dequantized here - the function only exists so that f32 data
//       fits the dequantize-function template slot.
// Copies the 4x4 float tile at src element by element into reg, converting each
// element to reg's element type on assignment. il is unused.
template <typename type4x4>
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
    // read through the const-qualified pointer directly; no cast is needed
    const float4x4 temp = *src;
    for (int i = 0; i < 16; i++){
        reg[i/4][i%4] = temp[i/4][i%4];
    }
}
|
|
5344
|
+
|
|
5345
|
+
// Copies the 4x4 half tile at src element by element into reg, converting each
// element to reg's element type on assignment. Like dequantize_f32, this only
// adapts plain data to the dequantize-function template slot. il is unused.
template <typename type4x4>
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
    // read through the const-qualified pointer directly; no cast is needed
    const half4x4 temp = *src;
    for (int i = 0; i < 16; i++){
        reg[i/4][i%4] = temp[i/4][i%4];
    }
}
|
|
5352
|
+
|
|
5353
|
+
// Expands 16 values of a Q4_0 block into reg.
// il == 0 decodes the low nibble of every quant byte, il != 0 the high nibble.
template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
    // view the block as 16-bit words and skip the first one
    device const uint16_t * words = ((device const uint16_t *)xb + 1);

    // The nibbles are masked but never shifted down, so the scale absorbs the
    // shift instead: /16 for a high nibble, /256 for the upper byte of a word.
    const float  scale_lo = il ? (xb->d / 16.h) : xb->d;
    const float  scale_hi = scale_lo / 256.f;
    const float  bias     = -8.h * xb->d;
    const ushort mask_lo  = il ? 0x00F0 : 0x000F;
    const ushort mask_hi  = mask_lo << 8;

    for (int r = 0; r < 4; ++r) {
        for (int p = 0; p < 2; ++p) {
            const uint16_t w = words[2*r + p];
            reg[r][2*p + 0] = scale_lo * (w & mask_lo) + bias;
            reg[r][2*p + 1] = scale_hi * (w & mask_hi) + bias;
        }
    }
}
|
|
5367
|
+
|
|
5368
|
+
// Expands 16 values of a Q4_1 block into reg.
// il == 0 decodes the low nibble of every quant byte, il != 0 the high nibble;
// the block's m is added to every scaled value.
template <typename type4x4>
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
    // view the block as 16-bit words and skip the first two
    device const uint16_t * words = ((device const uint16_t *)xb + 2);

    // The nibbles are masked but never shifted down, so the scale absorbs the
    // shift instead: /16 for a high nibble, /256 for the upper byte of a word.
    const float  scale_lo = il ? (xb->d / 16.h) : xb->d;
    const float  scale_hi = scale_lo / 256.f;
    const float  m        = xb->m;
    const ushort mask_lo  = il ? 0x00F0 : 0x000F;
    const ushort mask_hi  = mask_lo << 8;

    for (int r = 0; r < 4; ++r) {
        for (int p = 0; p < 2; ++p) {
            const uint16_t w = words[2*r + p];
            reg[r][2*p + 0] = ((w & mask_lo) * scale_lo) + m;
            reg[r][2*p + 1] = ((w & mask_hi) * scale_hi) + m;
        }
    }
}
|
|
5382
|
+
|
|
5383
|
+
// Expands 16 values of a Q5_0 block into reg.
// Each value is a 4-bit nibble from qs combined with a 5th bit taken from the
// 32-bit qh word; il == 0 decodes the low nibbles, il != 0 the high nibbles.
template <typename type4x4>
void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
    // view the block as 16-bit words and skip the first three
    device const uint16_t * words = ((device const uint16_t *)xb + 3);

    const float  d    = xb->d;
    const float  md   = -16.h * xb->d;
    const ushort mask = il ? 0x00F0 : 0x000F;

    const uint32_t qh = *((device const uint32_t *)xb->qh);

    // shift that brings the selected nibble down to bits 0..3
    const int x_mv  = il ?  4 : 0;

    // shifts that move the matching qh bit to position 4 (0x10)
    const int gh_mv = il ? 12 : 0;
    const int gh_bk = il ?  0 : 4;

    for (int r = 0; r < 4; ++r) {
        for (int p = 0; p < 2; ++p) {
            const int i = 2*r + p;

            // 5th bits of the two values stored in word i
            const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
            const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;

            // 4 bits from the word plus the 5th bit
            const int32_t x0 = ((((words[i]     ) & mask) >> x_mv) | xh_0);
            const int32_t x1 = ((((words[i] >> 8) & mask) >> x_mv) | xh_1);

            reg[r][2*p + 0] = d * x0 + md;
            reg[r][2*p + 1] = d * x1 + md;
        }
    }
}
|
|
5410
|
+
|
|
5411
|
+
// Expands 16 values of a Q5_1 block into reg.
// Each value is a 4-bit nibble from qs combined with a 5th bit taken from the
// 32-bit qh word, scaled by d and offset by m; il == 0 decodes the low nibbles,
// il != 0 the high nibbles.
template <typename type4x4>
void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
    // view the block as 16-bit words and skip the first four
    device const uint16_t * words = ((device const uint16_t *)xb + 4);

    const float  d    = xb->d;
    const float  m    = xb->m;
    const ushort mask = il ? 0x00F0 : 0x000F;

    const uint32_t qh = *((device const uint32_t *)xb->qh);

    // shift that brings the selected nibble down to bits 0..3
    const int x_mv  = il ?  4 : 0;

    // shifts that move the matching qh bit to position 4 (0x10)
    const int gh_mv = il ? 12 : 0;
    const int gh_bk = il ?  0 : 4;

    for (int r = 0; r < 4; ++r) {
        for (int p = 0; p < 2; ++p) {
            const int i = 2*r + p;

            // 5th bits of the two values stored in word i
            const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
            const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;

            // 4 bits from the word plus the 5th bit
            const int32_t x0 = ((((words[i]     ) & mask) >> x_mv) | xh_0);
            const int32_t x1 = ((((words[i] >> 8) & mask) >> x_mv) | xh_1);

            reg[r][2*p + 0] = d * x0 + m;
            reg[r][2*p + 1] = d * x1 + m;
        }
    }
}
|
|
5438
|
+
|
|
5439
|
+
// Expands 16 values of a Q8_0 block into reg: signed 8-bit quants times the
// block's half-precision scale d. il selects which run of 16 quants is decoded.
template <typename type4x4>
void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
    // first quant of the run selected by il
    device const int8_t * q = ((device const int8_t *)xb->qs) + 16*il;
    const half d = xb->d;

    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            reg[r][c] = (q[4*r + c] * d);
        }
    }
}
|
|
5448
|
+
|
|
5449
|
+
template <typename type4x4>
|
|
5450
|
+
void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
|
|
5451
|
+
const float d = xb->d;
|
|
5452
|
+
const float min = xb->dmin;
|
|
5453
|
+
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
|
5454
|
+
float dl, ml;
|
|
5455
|
+
uint8_t sc = xb->scales[il];
|
|
4589
5456
|
|
|
4590
5457
|
#if QK_K == 256
|
|
4591
5458
|
q = q + 32*(il/8) + 16*(il&1);
|
|
@@ -4659,6 +5526,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
|
|
4659
5526
|
const float dl = d * sc[0];
|
|
4660
5527
|
const float ml = min * sc[1];
|
|
4661
5528
|
#else
|
|
5529
|
+
(void) get_scale_min_k4_just2;
|
|
5530
|
+
|
|
4662
5531
|
q = q + 16 * (il&1);
|
|
4663
5532
|
device const uint8_t * s = xb->scales;
|
|
4664
5533
|
device const half2 * dh = (device const half2 *)xb->d;
|
|
@@ -4808,6 +5677,50 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x
|
|
|
4808
5677
|
}
|
|
4809
5678
|
}
|
|
4810
5679
|
|
|
5680
|
+
// Expands 16 values of an IQ3_S block into reg.
// il is 0...15 for QK_K = 256: il/2 is the group of 32 values, il%2 picks the
// first or second 16 values of that group. Every reg row is one 4-value
// iq3s_grid entry whose 9-bit index is a byte from qs plus one bit from qh;
// signs are applied per value from the signs bytes.
template <typename type4x4>
void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
    const float d    = xb->d;
    const int   ib32 = il/2;
    const int   sub  = il%2; // 0: first 16 values of the group, 1: second 16

    device const uint8_t * qs    = xb->qs + 8*ib32;
    device const uint8_t * signs = xb->signs + 4*ib32 + 2*sub;

    // the 4 high index bits belonging to this half of the group
    const uint8_t qh = xb->qh[ib32] >> 4*sub;

    // 4-bit scale s of the group, applied as 1 + 2*s
    const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));

    // each pass fills two reg rows and consumes one sign byte
    for (int k = 0; k < 2; ++k) {
        constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*sub + 2*k + 0] | ((qh << (8 - 2*k)) & 256)));
        constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*sub + 2*k + 1] | ((qh << (7 - 2*k)) & 256)));
        for (int i = 0; i < 4; ++i) {
            reg[2*k + 0][i] = dl * grid1[i] * select(1, -1, signs[k] & kmask_iq2xs[i+0]);
            reg[2*k + 1][i] = dl * grid2[i] * select(1, -1, signs[k] & kmask_iq2xs[i+4]);
        }
    }
}
|
|
5704
|
+
|
|
5705
|
+
// Expands 16 values of an IQ2_S block into reg.
// il is 0...15 for QK_K = 256: il/2 is the group of 32 values, il%2 picks the
// first or second 16 values of that group. Two 8-value iq2s_grid entries are
// looked up with 10-bit indices (a byte from qs plus two bits from qh); the sign
// bytes are read QK_K/8 bytes past the corresponding qs bytes.
template <typename type4x4>
void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
    const float d    = xb->d;
    const int   ib32 = il/2;
    const int   sub  = il%2; // 0: first 16 values of the group, 1: second 16

    device const uint8_t * qs    = xb->qs + 4*ib32 + 2*sub;
    device const uint8_t * signs = qs + QK_K/8;

    // the 4 high index bits belonging to this half of the group
    const uint8_t qh = xb->qh[ib32] >> 4*sub;

    // 4-bit scale s of this half, applied as (0.5 + s) * 0.25
    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*sub) & 0xf)) * 0.25f;

    constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
    constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));

    // grid1 fills reg rows 0..1, grid2 rows 2..3
    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 4; ++c) {
            const int i = 4*r + c;
            reg[r + 0][c] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
            reg[r + 2][c] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
        }
    }
}
|
|
5723
|
+
|
|
4811
5724
|
template <typename type4x4>
|
|
4812
5725
|
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
|
4813
5726
|
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
|
@@ -4824,6 +5737,45 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
|
|
|
4824
5737
|
}
|
|
4825
5738
|
}
|
|
4826
5739
|
|
|
5740
|
+
// Expands 16 values of an IQ4_NL block into reg.
// il == 0 decodes the low nibble of every quant byte, il == 1 the high nibble;
// each nibble is translated through kvalues_iq4nl_f and scaled by d.
template <typename type4x4>
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
    device const uint16_t * q4 = (device const uint16_t *)xb->qs;
    const float d = xb->d;

    // scratch word; q8 reads it back byte by byte (one nibble value per byte)
    uint32_t aux32;
    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;

    for (int row = 0; row < 4; ++row) {
        // join two 16-bit reads into 4 quant bytes, then keep the selected nibbles
        aux32 = ((q4[2*row] | (q4[2*row+1] << 16)) >> 4*il) & 0x0f0f0f0f;
        for (int col = 0; col < 4; ++col) {
            reg[row][col] = d * kvalues_iq4nl_f[q8[col]];
        }
    }
}
|
|
5754
|
+
|
|
5755
|
+
// Expands 16 values of an IQ4_XS block into reg.
// With QK_K == 64 the work is delegated to dequantize_iq4_nl. Otherwise il is
// 0...15: il/2 is the group of 32 values and il%2 selects the low (0) or high (1)
// nibble of every quant byte in that group.
template <typename type4x4>
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
#if QK_K == 64
    dequantize_iq4_nl(xb, il, reg);
#else
    const int ib32 = il/2;
    const int sub  = il%2;

    device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;

    // 6-bit scale: low 4 bits from scales_l, high 2 bits from scales_h; offset by -32 below
    const int   ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
    const float d  = (float)xb->d * (ls - 32);

    // scratch word; q8 reads it back byte by byte (one nibble value per byte)
    uint32_t aux32;
    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;

    for (int row = 0; row < 4; ++row) {
        aux32 = (q4[row] >> 4*sub) & 0x0f0f0f0f;
        for (int col = 0; col < 4; ++col) {
            reg[row][col] = d * kvalues_iq4nl_f[q8[col]];
        }
    }
#endif
}
|
|
5778
|
+
|
|
4827
5779
|
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
|
4828
5780
|
kernel void kernel_get_rows(
|
|
4829
5781
|
device const void * src0,
|
|
@@ -5366,7 +6318,15 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows
|
|
|
5366
6318
|
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
|
5367
6319
|
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5368
6320
|
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
6321
|
+
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
6322
|
+
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
|
5369
6323
|
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
6324
|
+
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
6325
|
+
#if QK_K == 64
|
|
6326
|
+
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, 2, dequantize_iq4_xs>;
|
|
6327
|
+
#else
|
|
6328
|
+
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
|
6329
|
+
#endif
|
|
5370
6330
|
|
|
5371
6331
|
//
|
|
5372
6332
|
// matrix-matrix multiplication
|
|
@@ -5406,7 +6366,15 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
|
|
|
5406
6366
|
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
|
5407
6367
|
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5408
6368
|
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
6369
|
+
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
6370
|
+
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
|
5409
6371
|
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
6372
|
+
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
6373
|
+
// IQ4_XS matrix-matrix kernel. The block type is spelled block_iq4_xs in both
// branches, matching the kernel_get_rows and kernel_mul_mm_id instantiations of
// the same quant type; only the nl template argument depends on QK_K.
#if QK_K == 64
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, 2, dequantize_iq4_xs>;
#else
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
#endif
|
|
5410
6378
|
|
|
5411
6379
|
//
|
|
5412
6380
|
// indirect matrix-matrix multiplication
|
|
@@ -5458,7 +6426,15 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu
|
|
|
5458
6426
|
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
|
5459
6427
|
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5460
6428
|
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
6429
|
+
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
6430
|
+
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
|
5461
6431
|
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
6432
|
+
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
6433
|
+
#if QK_K == 64
|
|
6434
|
+
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, 2, dequantize_iq4_xs>;
|
|
6435
|
+
#else
|
|
6436
|
+
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
|
6437
|
+
#endif
|
|
5462
6438
|
|
|
5463
6439
|
//
|
|
5464
6440
|
// matrix-vector multiplication
|
|
@@ -6427,6 +7403,136 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32(
|
|
|
6427
7403
|
sgitg);
|
|
6428
7404
|
}
|
|
6429
7405
|
|
|
7406
|
+
// Indirect IQ3_S x f32 matrix-vector kernel.
// tgpig.z is split into bid = z / (ne12*ne13) and the remainder, which replaces
// tgpig.z. An int32 id is then read from ids (row bid, stride nbi1 bytes, column
// idx) and selects one of the eight src0 pointers handed to
// kernel_mul_mv_iq3_s_f32_impl. src1 is advanced by bid*nb11 bytes and dst by
// bid*ne0 floats. tiitg, nb00..nb02, ne11, nb10, nb12 and nb1 are not used.
// NOTE(review): id indexes the 8-entry pointer array without a range check.
[[host_name("kernel_mul_mv_id_iq3_s_f32")]]
kernel void kernel_mul_mv_id_iq3_s_f32(
        device const    char * ids,
        device const    char * src1,
        device         float * dst,
        constant    uint64_t & nbi1,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
        constant     int64_t & ne02,
        constant    uint64_t & nb00,
        constant    uint64_t & nb01,
        constant    uint64_t & nb02,
        constant     int64_t & ne10,
        constant     int64_t & ne11,
        constant     int64_t & ne12,
        constant     int64_t & ne13,
        constant    uint64_t & nb10,
        constant    uint64_t & nb11,
        constant    uint64_t & nb12,
        constant     int64_t & ne0,
        constant     int64_t & ne1,
        constant    uint64_t & nb1,
        constant        uint & r2,
        constant        uint & r3,
        constant         int & idx,
        device const    char * src00,
        device const    char * src01,
        device const    char * src02,
        device const    char * src03,
        device const    char * src04,
        device const    char * src05,
        device const    char * src06,
        device const    char * src07,
        threadgroup int8_t   * shared_values [[threadgroup(0)]],
        uint3                  tgpig[[threadgroup_position_in_grid]],
        uint                   tiitg[[thread_index_in_threadgroup]],
        uint                   tiisg[[thread_index_in_simdgroup]],
        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
    device const char * src0[8] = {
        src00, src01, src02, src03,
        src04, src05, src06, src07
    };

    // split z into (bid, remainder); the remainder is what the implementation sees
    const int64_t bid = tgpig.z/(ne12*ne13);
    tgpig.z = tgpig.z%(ne12*ne13);

    device int32_t * row_ids = (device int32_t *) (ids + bid*nbi1);
    const int32_t id = row_ids[idx];

    kernel_mul_mv_iq3_s_f32_impl(
        src0[id],
        (device const float *) (src1 + bid*nb11),
        dst + bid*ne0,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
}
|
|
7470
|
+
|
|
7471
|
+
// Indirect ("_id") entry point that forwards to kernel_mul_mv_iq2_s_f32_impl.
//
// One of the eight src0 pointers (src00..src07) is chosen at run time: the
// int32 stored at position `idx` of the `ids` row belonging to the current
// slice is used as the index. src1 and dst are advanced to the same slice
// before the call.
//
// nb00, nb01, nb02, ne11, nb10, nb12, nb1 and tiitg are part of the kernel
// signature but are not referenced in this body.
[[host_name("kernel_mul_mv_id_iq2_s_f32")]]
kernel void kernel_mul_mv_id_iq2_s_f32(
        device const    char * ids,
        device const    char * src1,
        device         float * dst,
        constant    uint64_t & nbi1,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
        constant     int64_t & ne02,
        constant    uint64_t & nb00,
        constant    uint64_t & nb01,
        constant    uint64_t & nb02,
        constant     int64_t & ne10,
        constant     int64_t & ne11,
        constant     int64_t & ne12,
        constant     int64_t & ne13,
        constant    uint64_t & nb10,
        constant    uint64_t & nb11,
        constant    uint64_t & nb12,
        constant     int64_t & ne0,
        constant     int64_t & ne1,
        constant    uint64_t & nb1,
        constant        uint & r2,
        constant        uint & r3,
        constant         int & idx,
        device const    char * src00,
        device const    char * src01,
        device const    char * src02,
        device const    char * src03,
        device const    char * src04,
        device const    char * src05,
        device const    char * src06,
        device const    char * src07,
        threadgroup   int8_t * shared_values [[threadgroup(0)]],
        uint3                  tgpig[[threadgroup_position_in_grid]],
        uint                   tiitg[[thread_index_in_threadgroup]],
        uint                   tiisg[[thread_index_in_simdgroup]],
        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
    // collect the candidate src0 pointers so one can be picked by index
    device const char * src0_cand[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    // split tgpig.z: the quotient selects the slice, the remainder replaces
    // tgpig.z before it is handed to the implementation
    const int64_t z_span = ne12*ne13;
    const int64_t slice  = tgpig.z/z_span;

    tgpig.z = tgpig.z%z_span;

    // rows of `ids` are nbi1 bytes apart
    device const int32_t * slice_ids = (device const int32_t *) (ids + slice*nbi1);
    const int32_t sel = slice_ids[idx];

    // src1 is advanced in bytes (nb11 per slice), dst in floats (ne0 per slice)
    device const float * src1_slice = (device const float *) (src1 + slice*nb11);
    device       float * dst_slice  = dst + slice*ne0;

    kernel_mul_mv_iq2_s_f32_impl(
        src0_cand[sel], src1_slice, dst_slice,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
}
|
|
7535
|
+
|
|
6430
7536
|
[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
|
|
6431
7537
|
kernel void kernel_mul_mv_id_iq1_s_f32(
|
|
6432
7538
|
device const char * ids,
|
|
@@ -6489,3 +7595,137 @@ kernel void kernel_mul_mv_id_iq1_s_f32(
|
|
|
6489
7595
|
tiisg,
|
|
6490
7596
|
sgitg);
|
|
6491
7597
|
}
|
|
7598
|
+
|
|
7599
|
+
// Indirect ("_id") entry point that forwards to kernel_mul_mv_iq4_nl_f32_impl.
//
// One of the eight src0 pointers (src00..src07) is chosen at run time: the
// int32 stored at position `idx` of the `ids` row belonging to the current
// slice is used as the index. src1 and dst are advanced to the same slice
// before the call. The threadgroup scratch buffer is passed as float here.
//
// nb00, nb01, nb02, ne11, nb10, nb12, nb1 and tiitg are part of the kernel
// signature but are not referenced in this body.
[[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
kernel void kernel_mul_mv_id_iq4_nl_f32(
        device const    char * ids,
        device const    char * src1,
        device         float * dst,
        constant    uint64_t & nbi1,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
        constant     int64_t & ne02,
        constant    uint64_t & nb00,
        constant    uint64_t & nb01,
        constant    uint64_t & nb02,
        constant     int64_t & ne10,
        constant     int64_t & ne11,
        constant     int64_t & ne12,
        constant     int64_t & ne13,
        constant    uint64_t & nb10,
        constant    uint64_t & nb11,
        constant    uint64_t & nb12,
        constant     int64_t & ne0,
        constant     int64_t & ne1,
        constant    uint64_t & nb1,
        constant        uint & r2,
        constant        uint & r3,
        constant         int & idx,
        device const    char * src00,
        device const    char * src01,
        device const    char * src02,
        device const    char * src03,
        device const    char * src04,
        device const    char * src05,
        device const    char * src06,
        device const    char * src07,
        threadgroup    float * shared_values [[threadgroup(0)]],
        uint3                  tgpig[[threadgroup_position_in_grid]],
        uint                   tiitg[[thread_index_in_threadgroup]],
        uint                   tiisg[[thread_index_in_simdgroup]],
        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
    // collect the candidate src0 pointers so one can be picked by index
    device const char * src0_cand[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    // split tgpig.z: the quotient selects the slice, the remainder replaces
    // tgpig.z before it is handed to the implementation
    const int64_t z_span = ne12*ne13;
    const int64_t slice  = tgpig.z/z_span;

    tgpig.z = tgpig.z%z_span;

    // rows of `ids` are nbi1 bytes apart
    device const int32_t * slice_ids = (device const int32_t *) (ids + slice*nbi1);
    const int32_t sel = slice_ids[idx];

    // src1 is advanced in bytes (nb11 per slice), dst in floats (ne0 per slice)
    device const float * src1_slice = (device const float *) (src1 + slice*nb11);
    device       float * dst_slice  = dst + slice*ne0;

    kernel_mul_mv_iq4_nl_f32_impl(
        src0_cand[sel], src1_slice, dst_slice,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
}
|
|
7663
|
+
|
|
7664
|
+
// Indirect ("_id") entry point for the iq4_xs matrix-vector kernel.
//
// One of the eight src0 pointers (src00..src07) is chosen at run time: the
// int32 stored at position `idx` of the `ids` row belonging to the current
// slice is used as the index. src1 and dst are advanced to the same slice
// before the call.
//
// The implementation that receives the call is picked at compile time:
// kernel_mul_mv_iq4_nl_f32_impl when QK_K == 64, otherwise
// kernel_mul_mv_iq4_xs_f32_impl. Both are called with the same arguments.
//
// nb00, nb01, nb02, ne11, nb10, nb12, nb1 and tiitg are part of the kernel
// signature but are not referenced in this body.
[[host_name("kernel_mul_mv_id_iq4_xs_f32")]]
kernel void kernel_mul_mv_id_iq4_xs_f32(
        device const    char * ids,
        device const    char * src1,
        device         float * dst,
        constant    uint64_t & nbi1,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
        constant     int64_t & ne02,
        constant    uint64_t & nb00,
        constant    uint64_t & nb01,
        constant    uint64_t & nb02,
        constant     int64_t & ne10,
        constant     int64_t & ne11,
        constant     int64_t & ne12,
        constant     int64_t & ne13,
        constant    uint64_t & nb10,
        constant    uint64_t & nb11,
        constant    uint64_t & nb12,
        constant     int64_t & ne0,
        constant     int64_t & ne1,
        constant    uint64_t & nb1,
        constant        uint & r2,
        constant        uint & r3,
        constant         int & idx,
        device const    char * src00,
        device const    char * src01,
        device const    char * src02,
        device const    char * src03,
        device const    char * src04,
        device const    char * src05,
        device const    char * src06,
        device const    char * src07,
        threadgroup    float * shared_values [[threadgroup(0)]],
        uint3                  tgpig[[threadgroup_position_in_grid]],
        uint                   tiitg[[thread_index_in_threadgroup]],
        uint                   tiisg[[thread_index_in_simdgroup]],
        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
    // collect the candidate src0 pointers so one can be picked by index
    device const char * src0_cand[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    // split tgpig.z: the quotient selects the slice, the remainder replaces
    // tgpig.z before it is handed to the implementation
    const int64_t z_span = ne12*ne13;
    const int64_t slice  = tgpig.z/z_span;

    tgpig.z = tgpig.z%z_span;

    // rows of `ids` are nbi1 bytes apart
    device const int32_t * slice_ids = (device const int32_t *) (ids + slice*nbi1);
    const int32_t sel = slice_ids[idx];

    // src1 is advanced in bytes (nb11 per slice), dst in floats (ne0 per slice)
    device const float * src1_slice = (device const float *) (src1 + slice*nb11);
    device       float * dst_slice  = dst + slice*ne0;

    // only the callee name depends on QK_K; the argument list is shared
#if QK_K == 64
    kernel_mul_mv_iq4_nl_f32_impl(
#else
    kernel_mul_mv_iq4_xs_f32_impl(
#endif
        src0_cand[sel], src1_slice, dst_slice,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
}
|