node-llama-cpp 3.0.0-beta.12 → 3.0.0-beta.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ChatWrapper.d.ts +1 -0
- package/dist/ChatWrapper.js +2 -1
- package/dist/ChatWrapper.js.map +1 -1
- package/dist/TemplateChatWrapper.d.ts +67 -0
- package/dist/TemplateChatWrapper.js +239 -0
- package/dist/TemplateChatWrapper.js.map +1 -0
- package/dist/bindings/AddonTypes.d.ts +2 -0
- package/dist/bindings/Llama.d.ts +1 -3
- package/dist/bindings/Llama.js +10 -20
- package/dist/bindings/Llama.js.map +1 -1
- package/dist/bindings/consts.d.ts +2 -0
- package/dist/bindings/consts.js +11 -0
- package/dist/bindings/consts.js.map +1 -0
- package/dist/bindings/getLlama.d.ts +14 -18
- package/dist/bindings/getLlama.js +210 -78
- package/dist/bindings/getLlama.js.map +1 -1
- package/dist/bindings/types.d.ts +8 -5
- package/dist/bindings/types.js +18 -0
- package/dist/bindings/types.js.map +1 -1
- package/dist/bindings/utils/asyncEvery.d.ts +5 -0
- package/dist/bindings/utils/asyncEvery.js +15 -0
- package/dist/bindings/utils/asyncEvery.js.map +1 -0
- package/dist/bindings/utils/asyncSome.d.ts +5 -0
- package/dist/bindings/utils/asyncSome.js +27 -0
- package/dist/bindings/utils/asyncSome.js.map +1 -0
- package/dist/bindings/utils/cloneLlamaCppRepo.js +13 -3
- package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
- package/dist/bindings/utils/compileLLamaCpp.js +30 -4
- package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
- package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +11 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.js +158 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -0
- package/dist/bindings/utils/detectGlibc.d.ts +4 -0
- package/dist/bindings/utils/detectGlibc.js +36 -0
- package/dist/bindings/utils/detectGlibc.js.map +1 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.d.ts +9 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.js +29 -0
- package/dist/bindings/utils/getBestComputeLayersAvailable.js.map +1 -0
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js +12 -6
- package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js.map +1 -1
- package/dist/bindings/utils/getGpuTypesToUseForOption.d.ts +11 -0
- package/dist/bindings/utils/getGpuTypesToUseForOption.js +30 -0
- package/dist/bindings/utils/getGpuTypesToUseForOption.js.map +1 -0
- package/dist/bindings/utils/getLinuxDistroInfo.d.ts +9 -0
- package/dist/bindings/utils/getLinuxDistroInfo.js +46 -0
- package/dist/bindings/utils/getLinuxDistroInfo.js.map +1 -0
- package/dist/bindings/utils/getPlatformInfo.d.ts +5 -0
- package/dist/bindings/utils/getPlatformInfo.js +28 -0
- package/dist/bindings/utils/getPlatformInfo.js.map +1 -0
- package/dist/bindings/utils/hasFileInPath.d.ts +2 -0
- package/dist/bindings/utils/hasFileInPath.js +34 -0
- package/dist/bindings/utils/hasFileInPath.js.map +1 -0
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.d.ts +1 -1
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.js +3 -9
- package/dist/bindings/utils/logBinaryUsageExampleToConsole.js.map +1 -1
- package/dist/bindings/utils/logDistroInstallInstruction.d.ts +13 -0
- package/dist/bindings/utils/logDistroInstallInstruction.js +38 -0
- package/dist/bindings/utils/logDistroInstallInstruction.js.map +1 -0
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.d.ts +9 -2
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js +10 -4
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js.map +1 -1
- package/dist/bindings/utils/testBindingBinary.d.ts +1 -0
- package/dist/bindings/utils/testBindingBinary.js +98 -0
- package/dist/bindings/utils/testBindingBinary.js.map +1 -0
- package/dist/chatWrappers/ChatMLChatWrapper.js +1 -1
- package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
- package/dist/chatWrappers/GemmaChatWrapper.d.ts +18 -0
- package/dist/chatWrappers/GemmaChatWrapper.js +86 -0
- package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -0
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js +3 -0
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js.map +1 -1
- package/dist/cli/cli.js +2 -0
- package/dist/cli/cli.js.map +1 -1
- package/dist/cli/commands/BuildCommand.d.ts +5 -5
- package/dist/cli/commands/BuildCommand.js +78 -60
- package/dist/cli/commands/BuildCommand.js.map +1 -1
- package/dist/cli/commands/DebugCommand.js +3 -9
- package/dist/cli/commands/DebugCommand.js.map +1 -1
- package/dist/cli/commands/DownloadCommand.d.ts +5 -5
- package/dist/cli/commands/DownloadCommand.js +97 -56
- package/dist/cli/commands/DownloadCommand.js.map +1 -1
- package/dist/cli/commands/InspectCommand.d.ts +7 -0
- package/dist/cli/commands/InspectCommand.js +113 -0
- package/dist/cli/commands/InspectCommand.js.map +1 -0
- package/dist/cli/utils/logUsedGpuTypeOption.d.ts +2 -0
- package/dist/cli/utils/logUsedGpuTypeOption.js +9 -0
- package/dist/cli/utils/logUsedGpuTypeOption.js.map +1 -0
- package/dist/config.d.ts +3 -3
- package/dist/config.js +10 -11
- package/dist/config.js.map +1 -1
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts +3 -0
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js +3 -0
- package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map +1 -1
- package/dist/gguf/GGUFInsights.d.ts +28 -0
- package/dist/gguf/GGUFInsights.js +58 -0
- package/dist/gguf/GGUFInsights.js.map +1 -0
- package/dist/gguf/GGUFMetadata.d.ts +19 -0
- package/dist/gguf/GGUFMetadata.js +38 -0
- package/dist/gguf/GGUFMetadata.js.map +1 -0
- package/dist/gguf/errors/InvalidGGUFMagicError.d.ts +3 -0
- package/dist/gguf/errors/InvalidGGUFMagicError.js +6 -0
- package/dist/gguf/errors/InvalidGGUFMagicError.js.map +1 -0
- package/dist/gguf/errors/MetadataNotParsedYetError.d.ts +3 -0
- package/dist/gguf/errors/MetadataNotParsedYetError.js +6 -0
- package/dist/gguf/errors/MetadataNotParsedYetError.js.map +1 -0
- package/dist/gguf/errors/MissingNodeLlamaError.d.ts +3 -0
- package/dist/gguf/errors/MissingNodeLlamaError.js +6 -0
- package/dist/gguf/errors/MissingNodeLlamaError.js.map +1 -0
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.d.ts +5 -0
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js +12 -0
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js.map +1 -0
- package/dist/gguf/errors/UnsupportedMetadataTypeError.d.ts +4 -0
- package/dist/gguf/errors/UnsupportedMetadataTypeError.js +8 -0
- package/dist/gguf/errors/UnsupportedMetadataTypeError.js.map +1 -0
- package/dist/gguf/ggufParser/GGUFParser.d.ts +18 -0
- package/dist/gguf/ggufParser/GGUFParser.js +123 -0
- package/dist/gguf/ggufParser/GGUFParser.js.map +1 -0
- package/dist/gguf/ggufParser/GGUFTypes.d.ts +257 -0
- package/dist/gguf/ggufParser/GGUFTypes.js +2 -0
- package/dist/gguf/ggufParser/GGUFTypes.js.map +1 -0
- package/dist/gguf/ggufParser/checkArchitecture.d.ts +14 -0
- package/dist/gguf/ggufParser/checkArchitecture.js +74 -0
- package/dist/gguf/ggufParser/checkArchitecture.js.map +1 -0
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.d.ts +38 -0
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.js +83 -0
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.js.map +1 -0
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.d.ts +14 -0
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.js +35 -0
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.js.map +1 -0
- package/dist/gguf/ggufParser/stream/GGUFReadStream.d.ts +15 -0
- package/dist/gguf/ggufParser/stream/GGUFReadStream.js +40 -0
- package/dist/gguf/ggufParser/stream/GGUFReadStream.js.map +1 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.js +3 -1
- package/dist/index.js.map +1 -1
- package/dist/utils/LlamaText.js +2 -2
- package/dist/utils/LlamaText.js.map +1 -1
- package/dist/utils/cmake.js +23 -10
- package/dist/utils/cmake.js.map +1 -1
- package/dist/utils/getBuildDefaults.d.ts +1 -3
- package/dist/utils/getBuildDefaults.js +2 -4
- package/dist/utils/getBuildDefaults.js.map +1 -1
- package/dist/utils/getConsoleLogPrefix.d.ts +1 -1
- package/dist/utils/getConsoleLogPrefix.js +2 -2
- package/dist/utils/getConsoleLogPrefix.js.map +1 -1
- package/dist/utils/mergeUnionTypes.d.ts +6 -0
- package/dist/utils/mergeUnionTypes.js +2 -0
- package/dist/utils/mergeUnionTypes.js.map +1 -0
- package/dist/utils/parseTextTemplate.d.ts +66 -0
- package/dist/utils/parseTextTemplate.js +116 -0
- package/dist/utils/parseTextTemplate.js.map +1 -0
- package/llama/CMakeLists.txt +11 -5
- package/llama/addon.cpp +31 -7
- package/llama/binariesGithubRelease.json +1 -1
- package/llama/gitRelease.bundle +0 -0
- package/llama/llama.cpp.info.json +1 -1
- package/llamaBins/linux-arm64/.buildMetadata.json +1 -1
- package/llamaBins/linux-arm64/llama-addon.node +0 -0
- package/llamaBins/linux-armv7l/.buildMetadata.json +1 -1
- package/llamaBins/linux-armv7l/llama-addon.node +0 -0
- package/llamaBins/linux-x64/.buildMetadata.json +1 -1
- package/llamaBins/linux-x64/llama-addon.node +0 -0
- package/llamaBins/linux-x64-cuda/.buildMetadata.json +1 -1
- package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/linux-x64-vulkan/.buildMetadata.json +1 -1
- package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
- package/llamaBins/mac-arm64-metal/.buildMetadata.json +1 -1
- package/llamaBins/mac-arm64-metal/ggml-metal.metal +815 -106
- package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
- package/llamaBins/mac-x64/.buildMetadata.json +1 -1
- package/llamaBins/mac-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64/.buildMetadata.json +1 -1
- package/llamaBins/win-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64-cuda/.buildMetadata.json +1 -1
- package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/win-x64-vulkan/.buildMetadata.json +1 -1
- package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
- package/package.json +6 -4
- package/dist/cli/utils/logEnabledComputeLayers.d.ts +0 -8
- package/dist/cli/utils/logEnabledComputeLayers.js +0 -11
- package/dist/cli/utils/logEnabledComputeLayers.js.map +0 -1
|
@@ -2519,6 +2519,14 @@ typedef struct {
|
|
|
2519
2519
|
} block_iq2_xs;
|
|
2520
2520
|
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
|
2521
2521
|
|
|
2522
|
+
// 2.5625 bpw quants
|
|
2523
|
+
typedef struct {
|
|
2524
|
+
half d;
|
|
2525
|
+
uint8_t qs[QK_K/4];
|
|
2526
|
+
uint8_t qh[QK_K/32];
|
|
2527
|
+
uint8_t scales[QK_K/32];
|
|
2528
|
+
} block_iq2_s;
|
|
2529
|
+
|
|
2522
2530
|
typedef struct {
|
|
2523
2531
|
half d;
|
|
2524
2532
|
uint8_t qs[3*QK_K/8];
|
|
@@ -2552,6 +2560,17 @@ typedef struct {
|
|
|
2552
2560
|
uint8_t qs[QK4_NL/2];
|
|
2553
2561
|
} block_iq4_nl;
|
|
2554
2562
|
|
|
2563
|
+
#if QK_K == 64
|
|
2564
|
+
#define block_iq4_xs block_iq4_nl
|
|
2565
|
+
#else
|
|
2566
|
+
typedef struct {
|
|
2567
|
+
half d;
|
|
2568
|
+
uint16_t scales_h;
|
|
2569
|
+
uint8_t scales_l[QK_K/64];
|
|
2570
|
+
uint8_t qs[QK_K/2];
|
|
2571
|
+
} block_iq4_xs;
|
|
2572
|
+
#endif
|
|
2573
|
+
|
|
2555
2574
|
//====================================== dot products =========================
|
|
2556
2575
|
|
|
2557
2576
|
void kernel_mul_mv_q2_K_f32_impl(
|
|
@@ -3774,6 +3793,265 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
|
|
|
3774
3793
|
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
|
3775
3794
|
};
|
|
3776
3795
|
|
|
3796
|
+
constexpr constant static uint64_t iq2s_grid[1024] = {
|
|
3797
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
|
3798
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
|
3799
|
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
|
3800
|
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
|
3801
|
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
|
3802
|
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
|
3803
|
+
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
|
3804
|
+
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
|
3805
|
+
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
|
3806
|
+
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
|
3807
|
+
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
|
3808
|
+
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
|
3809
|
+
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
|
3810
|
+
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
|
3811
|
+
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
|
3812
|
+
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
|
3813
|
+
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
|
3814
|
+
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
|
3815
|
+
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
|
3816
|
+
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
|
3817
|
+
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
|
3818
|
+
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
|
3819
|
+
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
|
3820
|
+
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
|
3821
|
+
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
|
3822
|
+
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
|
3823
|
+
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
|
3824
|
+
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
|
3825
|
+
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
|
3826
|
+
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
|
3827
|
+
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
|
3828
|
+
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
|
3829
|
+
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
|
3830
|
+
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
|
3831
|
+
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
|
3832
|
+
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
|
3833
|
+
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
|
3834
|
+
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
|
3835
|
+
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
|
3836
|
+
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
|
3837
|
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
|
3838
|
+
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
|
3839
|
+
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
|
3840
|
+
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
|
3841
|
+
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
|
3842
|
+
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
|
3843
|
+
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
|
3844
|
+
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
|
3845
|
+
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
|
3846
|
+
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
|
3847
|
+
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
|
3848
|
+
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
|
3849
|
+
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
|
3850
|
+
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
|
3851
|
+
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
|
3852
|
+
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
|
3853
|
+
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
|
3854
|
+
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
|
3855
|
+
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
|
3856
|
+
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
|
3857
|
+
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
|
3858
|
+
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
|
3859
|
+
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
|
3860
|
+
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
|
3861
|
+
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
|
3862
|
+
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
|
3863
|
+
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
|
3864
|
+
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
|
3865
|
+
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
|
3866
|
+
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
|
3867
|
+
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
|
3868
|
+
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
|
3869
|
+
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
|
3870
|
+
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
|
3871
|
+
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
|
3872
|
+
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
|
3873
|
+
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
|
3874
|
+
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
|
3875
|
+
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
|
3876
|
+
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
|
3877
|
+
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
|
3878
|
+
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
|
3879
|
+
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
|
3880
|
+
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
|
3881
|
+
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
|
3882
|
+
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
|
3883
|
+
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
|
3884
|
+
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
|
3885
|
+
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
|
3886
|
+
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
|
3887
|
+
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
|
3888
|
+
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
|
3889
|
+
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
|
3890
|
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
|
3891
|
+
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
|
3892
|
+
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
|
3893
|
+
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
|
3894
|
+
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
|
3895
|
+
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
|
3896
|
+
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
|
3897
|
+
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
|
3898
|
+
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
|
3899
|
+
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
|
3900
|
+
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
|
3901
|
+
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
|
3902
|
+
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
|
3903
|
+
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
|
3904
|
+
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
|
3905
|
+
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
|
3906
|
+
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
|
3907
|
+
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
|
3908
|
+
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
|
3909
|
+
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
|
3910
|
+
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
|
3911
|
+
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
|
3912
|
+
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
|
3913
|
+
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
|
3914
|
+
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
|
3915
|
+
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
|
3916
|
+
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
|
3917
|
+
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
|
3918
|
+
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
|
3919
|
+
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
|
3920
|
+
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
|
3921
|
+
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
|
3922
|
+
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
|
3923
|
+
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
|
3924
|
+
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
|
3925
|
+
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
|
3926
|
+
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
|
3927
|
+
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
|
3928
|
+
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
|
3929
|
+
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
|
3930
|
+
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
|
3931
|
+
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
|
3932
|
+
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
|
3933
|
+
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
|
3934
|
+
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
|
3935
|
+
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
|
3936
|
+
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
|
3937
|
+
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
|
3938
|
+
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
|
3939
|
+
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
|
3940
|
+
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
|
3941
|
+
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
|
3942
|
+
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
|
3943
|
+
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
|
3944
|
+
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
|
3945
|
+
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
|
3946
|
+
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
|
3947
|
+
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
|
3948
|
+
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
|
3949
|
+
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
|
3950
|
+
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
|
3951
|
+
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
|
3952
|
+
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
|
3953
|
+
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
|
3954
|
+
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
|
3955
|
+
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
|
3956
|
+
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
|
3957
|
+
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
|
3958
|
+
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
|
3959
|
+
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
|
3960
|
+
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
|
3961
|
+
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
|
3962
|
+
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
|
3963
|
+
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
|
3964
|
+
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
|
3965
|
+
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
|
3966
|
+
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
|
3967
|
+
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
|
3968
|
+
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
|
3969
|
+
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
|
3970
|
+
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
|
3971
|
+
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
|
3972
|
+
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
|
3973
|
+
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
|
3974
|
+
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
|
3975
|
+
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
|
3976
|
+
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
|
3977
|
+
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
|
3978
|
+
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
|
3979
|
+
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
|
3980
|
+
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
|
3981
|
+
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
|
3982
|
+
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
|
3983
|
+
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
|
3984
|
+
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
|
3985
|
+
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
|
3986
|
+
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
|
3987
|
+
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
|
3988
|
+
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
|
3989
|
+
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
|
3990
|
+
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
|
3991
|
+
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
|
3992
|
+
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
|
3993
|
+
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
|
3994
|
+
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
|
3995
|
+
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
|
3996
|
+
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
|
3997
|
+
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
|
3998
|
+
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
|
3999
|
+
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
|
4000
|
+
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
|
4001
|
+
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
|
4002
|
+
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
|
4003
|
+
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
|
4004
|
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
|
4005
|
+
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
|
4006
|
+
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
|
4007
|
+
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
|
4008
|
+
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
|
4009
|
+
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
|
4010
|
+
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
|
4011
|
+
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
|
4012
|
+
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
|
4013
|
+
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
|
4014
|
+
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
|
4015
|
+
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
|
4016
|
+
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
|
4017
|
+
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
|
4018
|
+
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
|
4019
|
+
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
|
4020
|
+
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
|
4021
|
+
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
|
4022
|
+
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
|
4023
|
+
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
|
4024
|
+
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
|
4025
|
+
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
|
4026
|
+
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
|
4027
|
+
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
|
4028
|
+
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
|
4029
|
+
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
|
4030
|
+
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
|
4031
|
+
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
|
4032
|
+
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
|
4033
|
+
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
|
4034
|
+
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
|
4035
|
+
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
|
4036
|
+
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
|
4037
|
+
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
|
4038
|
+
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
|
4039
|
+
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
|
4040
|
+
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
|
4041
|
+
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
|
4042
|
+
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
|
4043
|
+
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
|
4044
|
+
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
|
4045
|
+
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
|
4046
|
+
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
|
4047
|
+
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
|
4048
|
+
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
|
4049
|
+
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
|
4050
|
+
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
|
4051
|
+
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
|
4052
|
+
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
|
4053
|
+
};
|
|
4054
|
+
|
|
3777
4055
|
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
|
3778
4056
|
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
|
3779
4057
|
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
|
@@ -3809,71 +4087,71 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
|
|
|
3809
4087
|
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
|
3810
4088
|
};
|
|
3811
4089
|
|
|
3812
|
-
constexpr constant static uint32_t
|
|
3813
|
-
|
|
3814
|
-
|
|
3815
|
-
|
|
3816
|
-
|
|
3817
|
-
|
|
3818
|
-
|
|
3819
|
-
|
|
3820
|
-
|
|
3821
|
-
|
|
3822
|
-
|
|
3823
|
-
|
|
3824
|
-
|
|
3825
|
-
|
|
3826
|
-
|
|
3827
|
-
|
|
3828
|
-
|
|
3829
|
-
|
|
3830
|
-
|
|
3831
|
-
|
|
3832
|
-
|
|
3833
|
-
|
|
3834
|
-
|
|
3835
|
-
|
|
3836
|
-
|
|
3837
|
-
|
|
3838
|
-
|
|
3839
|
-
|
|
3840
|
-
|
|
3841
|
-
|
|
3842
|
-
|
|
3843
|
-
|
|
3844
|
-
|
|
3845
|
-
|
|
3846
|
-
|
|
3847
|
-
|
|
3848
|
-
|
|
3849
|
-
|
|
3850
|
-
|
|
3851
|
-
|
|
3852
|
-
|
|
3853
|
-
|
|
3854
|
-
|
|
3855
|
-
|
|
3856
|
-
|
|
3857
|
-
|
|
3858
|
-
|
|
3859
|
-
|
|
3860
|
-
|
|
3861
|
-
|
|
3862
|
-
|
|
3863
|
-
|
|
3864
|
-
|
|
3865
|
-
|
|
3866
|
-
|
|
3867
|
-
|
|
3868
|
-
|
|
3869
|
-
|
|
3870
|
-
|
|
3871
|
-
|
|
3872
|
-
|
|
3873
|
-
|
|
3874
|
-
|
|
3875
|
-
|
|
3876
|
-
|
|
4090
|
+
constexpr constant static uint32_t iq3s_grid[512] = {
|
|
4091
|
+
0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
|
|
4092
|
+
0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
|
|
4093
|
+
0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
|
|
4094
|
+
0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
|
|
4095
|
+
0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
|
|
4096
|
+
0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
|
|
4097
|
+
0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
|
|
4098
|
+
0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
|
|
4099
|
+
0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
|
|
4100
|
+
0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
|
|
4101
|
+
0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
|
|
4102
|
+
0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
|
|
4103
|
+
0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
|
|
4104
|
+
0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
|
|
4105
|
+
0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
|
|
4106
|
+
0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
|
|
4107
|
+
0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
|
|
4108
|
+
0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
|
|
4109
|
+
0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
|
|
4110
|
+
0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
|
|
4111
|
+
0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
|
|
4112
|
+
0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
|
|
4113
|
+
0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
|
|
4114
|
+
0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
|
|
4115
|
+
0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
|
|
4116
|
+
0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
|
|
4117
|
+
0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
|
|
4118
|
+
0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
|
|
4119
|
+
0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
|
|
4120
|
+
0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
|
|
4121
|
+
0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
|
|
4122
|
+
0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
|
|
4123
|
+
0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
|
|
4124
|
+
0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
|
|
4125
|
+
0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
|
|
4126
|
+
0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
|
|
4127
|
+
0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
|
|
4128
|
+
0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
|
|
4129
|
+
0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
|
|
4130
|
+
0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
|
|
4131
|
+
0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
|
|
4132
|
+
0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
|
|
4133
|
+
0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
|
|
4134
|
+
0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
|
|
4135
|
+
0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
|
|
4136
|
+
0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
|
|
4137
|
+
0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
|
|
4138
|
+
0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
|
|
4139
|
+
0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
|
|
4140
|
+
0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
|
|
4141
|
+
0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
|
|
4142
|
+
0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
|
|
4143
|
+
0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
|
|
4144
|
+
0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
|
|
4145
|
+
0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
|
|
4146
|
+
0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
|
|
4147
|
+
0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
|
|
4148
|
+
0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
|
|
4149
|
+
0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
|
|
4150
|
+
0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
|
|
4151
|
+
0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
|
|
4152
|
+
0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
|
|
4153
|
+
0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
|
|
4154
|
+
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
|
3877
4155
|
};
|
|
3878
4156
|
|
|
3879
4157
|
#define NGRID_IQ1S 512
|
|
@@ -4072,7 +4350,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|
|
4072
4350
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4073
4351
|
}
|
|
4074
4352
|
|
|
4075
|
-
#if QK_K == 256
|
|
4076
4353
|
const int ix = tiisg;
|
|
4077
4354
|
|
|
4078
4355
|
device const float * y4 = y + 32 * ix;
|
|
@@ -4113,12 +4390,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
|
|
4113
4390
|
|
|
4114
4391
|
y4 += 32 * 32;
|
|
4115
4392
|
}
|
|
4116
|
-
#else
|
|
4117
|
-
(void) x;
|
|
4118
|
-
(void) y;
|
|
4119
|
-
(void) yl;
|
|
4120
|
-
(void) nb32;
|
|
4121
|
-
#endif
|
|
4122
4393
|
|
|
4123
4394
|
for (int row = 0; row < N_DST; ++row) {
|
|
4124
4395
|
all_sum = simd_sum(sumf[row]);
|
|
@@ -4208,7 +4479,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|
|
4208
4479
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4209
4480
|
}
|
|
4210
4481
|
|
|
4211
|
-
#if QK_K == 256
|
|
4212
4482
|
const int ix = tiisg;
|
|
4213
4483
|
|
|
4214
4484
|
device const float * y4 = y + 32 * ix;
|
|
@@ -4259,12 +4529,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
|
|
4259
4529
|
|
|
4260
4530
|
y4 += 32 * 32;
|
|
4261
4531
|
}
|
|
4262
|
-
#else
|
|
4263
|
-
(void) x;
|
|
4264
|
-
(void) y;
|
|
4265
|
-
(void) yl;
|
|
4266
|
-
(void) nb32;
|
|
4267
|
-
#endif
|
|
4268
4532
|
|
|
4269
4533
|
for (int row = 0; row < N_DST; ++row) {
|
|
4270
4534
|
all_sum = simd_sum(sumf[row]);
|
|
@@ -4354,7 +4618,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|
|
4354
4618
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4355
4619
|
}
|
|
4356
4620
|
|
|
4357
|
-
#if QK_K == 256
|
|
4358
4621
|
const int ix = tiisg;
|
|
4359
4622
|
|
|
4360
4623
|
device const float * y4 = y + 32 * ix;
|
|
@@ -4398,12 +4661,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
|
|
4398
4661
|
|
|
4399
4662
|
y4 += 32 * 32;
|
|
4400
4663
|
}
|
|
4401
|
-
#else
|
|
4402
|
-
(void) x;
|
|
4403
|
-
(void) y;
|
|
4404
|
-
(void) yl;
|
|
4405
|
-
(void) nb32;
|
|
4406
|
-
#endif
|
|
4407
4664
|
|
|
4408
4665
|
for (int row = 0; row < N_DST; ++row) {
|
|
4409
4666
|
all_sum = simd_sum(sumf[row]);
|
|
@@ -4485,7 +4742,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
|
|
|
4485
4742
|
{
|
|
4486
4743
|
int nval = 8;
|
|
4487
4744
|
int pos = (32*sgitg + tiisg)*nval;
|
|
4488
|
-
for (int i = 0; i < nval; ++i) values[pos + i] =
|
|
4745
|
+
for (int i = 0; i < nval; ++i) values[pos + i] = iq3s_grid[pos + i];
|
|
4489
4746
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
4490
4747
|
}
|
|
4491
4748
|
|
|
@@ -4512,12 +4769,14 @@ void kernel_mul_mv_iq3_s_f32_impl(
|
|
|
4512
4769
|
for (int row = 0; row < N_DST; row++) {
|
|
4513
4770
|
|
|
4514
4771
|
const float db = dh[0];
|
|
4515
|
-
const float d = db * (
|
|
4772
|
+
const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf));
|
|
4516
4773
|
|
|
4517
4774
|
float2 sum = {0};
|
|
4518
4775
|
for (int l = 0; l < 4; ++l) {
|
|
4519
|
-
const threadgroup
|
|
4520
|
-
const threadgroup
|
|
4776
|
+
const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? values + 256 : values;
|
|
4777
|
+
const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? values + 256 : values;
|
|
4778
|
+
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]);
|
|
4779
|
+
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]);
|
|
4521
4780
|
for (int j = 0; j < 4; ++j) {
|
|
4522
4781
|
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
|
|
4523
4782
|
sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
|
|
@@ -4538,7 +4797,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
|
|
|
4538
4797
|
for (int row = 0; row < N_DST; ++row) {
|
|
4539
4798
|
all_sum = simd_sum(sumf[row]);
|
|
4540
4799
|
if (tiisg == 0) {
|
|
4541
|
-
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum
|
|
4800
|
+
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
|
4542
4801
|
}
|
|
4543
4802
|
}
|
|
4544
4803
|
}
|
|
@@ -4572,6 +4831,139 @@ kernel void kernel_mul_mv_iq3_s_f32(
|
|
|
4572
4831
|
kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4573
4832
|
}
|
|
4574
4833
|
|
|
4834
|
+
// Matrix-vector product of IQ2_S-quantized rows (src0) with an f32 vector (src1).
//
// Work split, as written below:
//   - each simdgroup produces N_DST consecutive output rows starting at first_row;
//   - thread t of the simdgroup handles the 32-value sub-blocks t, t+32, t+64, ...
//     of every one of those rows and keeps one partial sum per row;
//   - the partial sums are combined with simd_sum and thread 0 stores the result,
//     multiplied by 0.25, into dst.
//
// shared_values is not referenced by this implementation: grid entries are read
// directly from the constant iq2s_grid table.
void kernel_mul_mv_iq2_s_f32_impl(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne10,
        constant   int64_t & ne12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup int8_t * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    // threadgroup coordinates
    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    // QK_K-sized blocks per row, and the same row measured in 32-value sub-blocks
    const int nb   = ne00/QK_K;
    const int nb32 = nb * (QK_K / 32);

    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
    const int ib_row    = first_row * nb;

    // im is split into (i12, i13); r2 / r3 divide those indices when locating the src0 slice
    const uint i12 = im%ne12;
    const uint i13 = im/ne12;
    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;

    float yl[32];               // private copy of the 32 src1 values of the current sub-block
    float sumf[N_DST]={0.f};    // one running partial sum per output row

    const int lane = tiisg;

    device const float * y4 = y + 32 * lane;

    for (int ib32 = lane; ib32 < nb32; ib32 += 32) {

        for (int i = 0; i < 32; ++i) {
            yl[i] = y4[i];
        }

        // containing block and position of the sub-block inside it
        const int ibl = ib32 / (QK_K / 32);
        const int ib  = ib32 % (QK_K / 32);

        device const block_iq2_s * xr = x + ibl;

        device const half    * dh    = &xr->d;
        device const uint8_t * sc    = xr->scales + ib;
        device const uint8_t * qh    = xr->qh + ib;
        device const uint8_t * qs    = xr->qs + 4 * ib;
        device const uint8_t * signs = qs + QK_K/8;   // sign bytes sit QK_K/8 bytes past the matching qs bytes

        for (int row = 0; row < N_DST; row++) {

            const float db = dh[0];
            // low nibble of the scale byte scales values 0..15, high nibble values 16..31
            const float d1 = db * (0.5f + (sc[0] & 0xf));
            const float d2 = db * (0.5f + (sc[0] >>  4));

            float2 acc = {0};
            for (int l = 0; l < 2; ++l) {
                // grid index = qs byte | two bits taken from qh, placed at bits 8..9
                constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
                constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    acc[0] += yl[8*l + j +  0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
                    acc[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
                }
            }
            sumf[row] += d1 * acc[0] + d2 * acc[1];

            // step every cursor to the same position in the next row (nb blocks further)
            dh    += nb*sizeof(block_iq2_s)/2;
            sc    += nb*sizeof(block_iq2_s);
            qh    += nb*sizeof(block_iq2_s);
            qs    += nb*sizeof(block_iq2_s);
            signs += nb*sizeof(block_iq2_s);
        }

        y4 += 32 * 32;
    }

    for (int row = 0; row < N_DST; ++row) {
        const float row_total = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = row_total * 0.25f;
        }
    }
}
|
|
4937
|
+
|
|
4938
|
+
// Kernel entry point for the IQ2_S matrix-vector product.
// It only forwards to kernel_mul_mv_iq2_s_f32_impl; the nb00..nb02, ne11 and
// nb10..nb12 arguments are part of the kernel signature but are not passed on.
[[host_name("kernel_mul_mv_iq2_s_f32")]]
kernel void kernel_mul_mv_iq2_s_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup int8_t * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    kernel_mul_mv_iq2_s_f32_impl(
        src0, src1, dst,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
}
|
|
4966
|
+
|
|
4575
4967
|
void kernel_mul_mv_iq1_s_f32_impl(
|
|
4576
4968
|
device const void * src0,
|
|
4577
4969
|
device const float * src1,
|
|
@@ -4609,7 +5001,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4609
5001
|
|
|
4610
5002
|
const int nb32 = nb * (QK_K / 32);
|
|
4611
5003
|
|
|
4612
|
-
#if QK_K == 256
|
|
4613
5004
|
const int ix = tiisg/2;
|
|
4614
5005
|
const int il = tiisg%2;
|
|
4615
5006
|
|
|
@@ -4648,12 +5039,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
|
|
4648
5039
|
|
|
4649
5040
|
y4 += 16 * 32;
|
|
4650
5041
|
}
|
|
4651
|
-
#else
|
|
4652
|
-
(void) x;
|
|
4653
|
-
(void) y;
|
|
4654
|
-
(void) yl;
|
|
4655
|
-
(void) nb32;
|
|
4656
|
-
#endif
|
|
4657
5042
|
|
|
4658
5043
|
for (int row = 0; row < N_DST; ++row) {
|
|
4659
5044
|
all_sum = simd_sum(sumf[row]);
|
|
@@ -4760,6 +5145,102 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
|
|
4760
5145
|
}
|
|
4761
5146
|
}
|
|
4762
5147
|
|
|
5148
|
+
#if QK_K != 64
// Matrix-vector product of IQ4_XS-quantized rows (src0) with an f32 vector (src1).
// Only compiled when QK_K != 64.
//
// Work split, as written below:
//   - each simdgroup produces 2 consecutive output rows starting at first_row;
//   - the 32 threads of a simdgroup form two groups of 16 (ix = 0 or 1); group ix
//     processes blocks ix, ix+2, ix+4, ... of the row;
//   - inside a group, thread `it` covers 16 values of its block: 32-value
//     sub-block ib = it/2, half il = it%2;
//   - shared_values[0..31] is filled with kvalues_iq4nl_f (index taken mod 16) and
//     used as the 4-bit -> float lookup table after a threadgroup barrier.
void kernel_mul_mv_iq4_xs_f32_impl(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne10,
        constant   int64_t & ne12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup float  * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    // threadgroup coordinates
    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    const int nb        = ne00/QK_K;              // blocks per row
    const int first_row = (r0 * 2 + sgitg) * 2;
    const int ib_row    = first_row * nb;

    // im is split into (i12, i13); r2 / r3 divide those indices when locating the src0 slice
    const uint i12 = im%ne12;
    const uint i13 = im/ne12;
    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    device const block_iq4_xs * x = (device const block_iq4_xs *) src0 + ib_row + offset0;
    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;

    const int ix = tiisg/16;  // 0 or 1
    const int it = tiisg%16;  // 0...15
    const int ib = it/2;      // 32-value sub-block inside the block
    const int il = it%2;      // which half of that sub-block

    // publish the lookup table, then wait until it is visible to the threadgroup
    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
    threadgroup_barrier(mem_flags::mem_threadgroup);

    float4 yl[4];
    float sumf[2]={0.f};

    device const float * yb = y + ix * QK_K + ib * 32 + il * 8;

    // aux32 holds unpacked nibbles; q8 views the same storage one byte (one index) at a time
    uint32_t aux32[2];
    thread const uint8_t * q8 = (thread const uint8_t *)aux32;

    for (int ibl = ix; ibl < nb; ibl += 2) {

        device const float4 * y4 = (device const float4 *)yb;
        yl[0] = y4[0];
        yl[1] = y4[4];
        yl[2] = y4[1];
        yl[3] = y4[5];

        for (int row = 0; row < 2; ++row) {

            device const block_iq4_xs & xb = x[row*nb + ibl];
            device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);

            float4 acc_lo = {0.f};
            float4 acc_hi = {0.f};

            // each 32-bit word packs 8 indices: low nibbles go to aux32[0], high nibbles to aux32[1]
            for (int k = 0; k < 2; ++k) {
                aux32[0] = q4[k] & 0x0f0f0f0f;
                aux32[1] = (q4[k] >> 4) & 0x0f0f0f0f;

                const float4 qf_lo = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
                const float4 qf_hi = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};

                acc_lo += yl[2*k + 0] * qf_lo;
                acc_hi += yl[2*k + 1] * qf_hi;
            }

            acc_lo += acc_hi;

            // 6-bit sub-block scale: 4 low bits from scales_l, 2 high bits from scales_h, minus 32
            const int ls = (((xb.scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((xb.scales_h >> 2*ib) & 3) << 4)) - 32;
            sumf[row] += (float)xb.d * ls * (acc_lo[0] + acc_lo[1] + acc_lo[2] + acc_lo[3]);
        }

        yb += 2 * QK_K;
    }

    for (int row = 0; row < 2; ++row) {
        const float row_total = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = row_total;
        }
    }
}
#endif
|
|
5243
|
+
|
|
4763
5244
|
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
|
4764
5245
|
kernel void kernel_mul_mv_iq1_s_f32(
|
|
4765
5246
|
device const void * src0,
|
|
@@ -4817,6 +5298,39 @@ kernel void kernel_mul_mv_iq4_nl_f32(
|
|
|
4817
5298
|
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
4818
5299
|
}
|
|
4819
5300
|
|
|
5301
|
+
// Kernel entry point for the IQ4_XS matrix-vector product.
// With QK_K == 64 the work is forwarded to the IQ4_NL implementation, otherwise
// to kernel_mul_mv_iq4_xs_f32_impl. The nb00..nb02, ne11 and nb10..nb12
// arguments are part of the kernel signature but are not passed on.
[[host_name("kernel_mul_mv_iq4_xs_f32")]]
kernel void kernel_mul_mv_iq4_xs_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup float  * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

#if QK_K == 64
    kernel_mul_mv_iq4_nl_f32_impl(
        src0, src1, dst,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
#else
    kernel_mul_mv_iq4_xs_f32_impl(
        src0, src1, dst,
        ne00, ne01, ne02,
        ne10, ne12,
        ne0, ne1,
        r2, r3,
        shared_values,
        tgpig, tiisg, sgitg);
#endif
}
|
|
5333
|
+
|
|
4820
5334
|
//============================= templates and their specializations =============================
|
|
4821
5335
|
|
|
4822
5336
|
// NOTE: this is not dequantizing - we are simply fitting the template
|
|
@@ -5173,21 +5687,40 @@ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 &
|
|
|
5173
5687
|
device const uint8_t * qs = xb->qs + 8*ib32;
|
|
5174
5688
|
device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
|
|
5175
5689
|
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
|
5176
|
-
const float dl = d * (
|
|
5177
|
-
constant uint8_t * grid1 = (constant uint8_t *)(
|
|
5178
|
-
constant uint8_t * grid2 = (constant uint8_t *)(
|
|
5690
|
+
const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
|
|
5691
|
+
constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
|
|
5692
|
+
constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
|
|
5179
5693
|
for (int i = 0; i < 4; ++i) {
|
|
5180
5694
|
reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
|
|
5181
5695
|
reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
|
|
5182
5696
|
}
|
|
5183
|
-
grid1 = (constant uint8_t *)(
|
|
5184
|
-
grid2 = (constant uint8_t *)(
|
|
5697
|
+
grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
|
|
5698
|
+
grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
|
|
5185
5699
|
for (int i = 0; i < 4; ++i) {
|
|
5186
5700
|
reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
|
|
5187
5701
|
reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
|
|
5188
5702
|
}
|
|
5189
5703
|
}
|
|
5190
5704
|
|
|
5705
|
+
// Dequantizes 16 values of an IQ2_S block into the 4x4 register tile `reg`.
//
// il selects the 16-value chunk: il/2 is the 32-value sub-block and il%2 the
// half of that sub-block (0 = first 16 quants, 1 = second 16). For QK_K = 256,
// il ranges over 0...15.
// Rows 0-1 of reg receive the 8 values decoded from grid entry 1, rows 2-3 the
// 8 values decoded from grid entry 2.
template <typename type4x4>
void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
    const int   ib32 = il/2;   // 32-value sub-block index
    const short half = il%2;   // 0 or 1: which 16 quants of that sub-block

    device const uint8_t * qs    = xb->qs + 4*ib32 + 2*half;
    device const uint8_t * signs = qs + QK_K/8;   // sign bytes sit QK_K/8 bytes past the matching qs bytes

    // this half's 4 bits of qh: two extra index bits for each of the two grid lookups
    const uint8_t qh = xb->qh[ib32] >> 4*half;

    // block scale * (0.5 + 4-bit sub-scale) * 0.25
    const float d  = xb->d;
    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*half) & 0xf)) * 0.25f;

    // grid index = qs byte | (two qh bits placed at bits 8..9)
    constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
    constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));

    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 4; ++c) {
            const int i = 4*r + c;
            reg[r+0][c] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
            reg[r+2][c] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
        }
    }
}
|
|
5723
|
+
|
|
5191
5724
|
template <typename type4x4>
|
|
5192
5725
|
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
|
5193
5726
|
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
|
@@ -5219,6 +5752,30 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
|
|
|
5219
5752
|
}
|
|
5220
5753
|
}
|
|
5221
5754
|
|
|
5755
|
+
// Dequantizes 16 values of an IQ4_XS block into the 4x4 register tile `reg`.
//
// With QK_K == 64 the call is forwarded unchanged to dequantize_iq4_nl.
// Otherwise il selects the 16-value chunk: il/2 is the 32-value sub-block and
// il%2 picks the low (0) or high (1) nibble of each packed byte. For
// QK_K = 256, il ranges over 0...15.
template <typename type4x4>
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
#if QK_K == 64
    dequantize_iq4_nl(xb, il, reg);
#else
    const int   ib32   = il/2;   // 32-value sub-block index
    const short nibble = il%2;   // 0 = first 16 quants, 1 = second 16

    // four 32-bit words hold the 32 packed 4-bit indices of this sub-block
    device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;

    // 6-bit sub-block scale: 4 low bits from scales_l, 2 high bits from scales_h
    const int ls_lo = (xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf;
    const int ls_hi = (xb->scales_h >> 2*ib32) & 3;
    const int ls    = ls_lo | (ls_hi << 4);

    const float d = (float)xb->d * (ls - 32);

    // aux32 holds four unpacked indices; q8 reads them back one byte at a time
    uint32_t aux32;
    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;

    for (int i = 0; i < 4; ++i) {
        aux32 = (q4[i] >> 4*nibble) & 0x0f0f0f0f;
        for (int k = 0; k < 4; ++k) {
            reg[i][k] = d * kvalues_iq4nl_f[q8[k]];
        }
    }
#endif
}
|
|
5778
|
+
|
|
5222
5779
|
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
|
5223
5780
|
kernel void kernel_get_rows(
|
|
5224
5781
|
device const void * src0,
|
|
@@ -5762,8 +6319,14 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
|
|
|
5762
6319
|
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5763
6320
|
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5764
6321
|
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
6322
|
+
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
|
5765
6323
|
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5766
|
-
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2,
|
|
6324
|
+
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
6325
|
+
#if QK_K == 64
|
|
6326
|
+
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, 2, dequantize_iq4_xs>;
|
|
6327
|
+
#else
|
|
6328
|
+
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
|
6329
|
+
#endif
|
|
5767
6330
|
|
|
5768
6331
|
//
|
|
5769
6332
|
// matrix-matrix multiplication
|
|
@@ -5804,8 +6367,14 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
|
|
|
5804
6367
|
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5805
6368
|
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5806
6369
|
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
6370
|
+
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
|
5807
6371
|
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5808
|
-
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2,
|
|
6372
|
+
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
6373
|
+
// Explicit instantiation of kernel_mul_mm for the iq4_xs format, exported to the
// host as "kernel_mul_mm_iq4_xs_f32". Only the second template argument depends
// on the build: 2 when QK_K == 64, QK_NL otherwise.
// Both branches name block_iq4_xs as the block type so that it matches the
// dequantize_iq4_xs routine it is paired with, as the kernel_get_rows and
// kernel_mul_mm_id instantiations for iq4_xs already do.
#if QK_K == 64
|
|
6374
|
+
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, 2, dequantize_iq4_xs>;
|
|
6375
|
+
#else
|
|
6376
|
+
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
|
6377
|
+
#endif
|
|
5809
6378
|
|
|
5810
6379
|
//
|
|
5811
6380
|
// indirect matrix-matrix multiplication
|
|
@@ -5858,8 +6427,14 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
|
|
|
5858
6427
|
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
|
5859
6428
|
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
|
5860
6429
|
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
|
6430
|
+
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
|
5861
6431
|
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
|
5862
|
-
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2,
|
|
6432
|
+
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
|
6433
|
+
// Explicit instantiation of kernel_mul_mm_id for block_iq4_xs / dequantize_iq4_xs,
// exported to the host as "kernel_mul_mm_id_iq4_xs_f32". Only the second template
// argument depends on the build: 2 when QK_K == 64, QK_NL otherwise.
#if QK_K == 64
|
|
6434
|
+
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, 2, dequantize_iq4_xs>;
|
|
6435
|
+
#else
|
|
6436
|
+
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
|
6437
|
+
#endif
|
|
5863
6438
|
|
|
5864
6439
|
//
|
|
5865
6440
|
// matrix-vector multiplication
|
|
@@ -6893,6 +7468,71 @@ kernel void kernel_mul_mv_id_iq3_s_f32(
|
|
|
6893
7468
|
sgitg);
|
|
6894
7469
|
}
|
|
6895
7470
|
|
|
7471
|
+
// Wrapper kernel exported to the host as "kernel_mul_mv_id_iq2_s_f32".
// It reads an int32 id from `ids`, uses it to choose one of the eight src0
// buffers (src00..src07), and forwards the work to kernel_mul_mv_iq2_s_f32_impl.
// nb00, nb01, nb02, ne11, nb10, nb12, nb1 and tiitg are part of the signature
// but are not used in this body.
[[host_name("kernel_mul_mv_id_iq2_s_f32")]]
kernel void kernel_mul_mv_id_iq2_s_f32(
        device const   char * ids,
        device const   char * src1,
        device        float * dst,
        constant   uint64_t & nbi1,
        constant    int64_t & ne00,
        constant    int64_t & ne01,
        constant    int64_t & ne02,
        constant   uint64_t & nb00,
        constant   uint64_t & nb01,
        constant   uint64_t & nb02,
        constant    int64_t & ne10,
        constant    int64_t & ne11,
        constant    int64_t & ne12,
        constant    int64_t & ne13,
        constant   uint64_t & nb10,
        constant   uint64_t & nb11,
        constant   uint64_t & nb12,
        constant    int64_t & ne0,
        constant    int64_t & ne1,
        constant   uint64_t & nb1,
        constant       uint & r2,
        constant       uint & r3,
        constant        int & idx,
        device const   char * src00,
        device const   char * src01,
        device const   char * src02,
        device const   char * src03,
        device const   char * src04,
        device const   char * src05,
        device const   char * src06,
        device const   char * src07,
        threadgroup  int8_t * shared_values [[threadgroup(0)]],
        uint3                 tgpig[[threadgroup_position_in_grid]],
        uint                  tiitg[[thread_index_in_threadgroup]],
        uint                  tiisg[[thread_index_in_simdgroup]],
        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
    // Collect the candidate buffers so the selected one can be indexed by id.
    device const char * src0_table[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    // Split tgpig.z by ne12*ne13: the quotient is bid, the remainder is written
    // back into tgpig.z because tgpig itself is forwarded to the impl below.
    const int64_t plane = ne12*ne13;
    const int64_t bid   = tgpig.z/plane;

    tgpig.z = tgpig.z%plane;

    // The id for this invocation lives at element `idx` of the ids row at byte
    // offset bid*nbi1.
    device int32_t * row_ids = (device int32_t *) (ids + bid*nbi1);
    const int32_t id = row_ids[idx];

    kernel_mul_mv_iq2_s_f32_impl(
        src0_table[id],
        (device const float *) (src1 + bid*nb11),
        dst + bid*ne0,
        ne00,
        ne01,
        ne02,
        ne10,
        ne12,
        ne0,
        ne1,
        r2,
        r3,
        shared_values,
        tgpig,
        tiisg,
        sgitg);
}
|
|
7535
|
+
|
|
6896
7536
|
[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
|
|
6897
7537
|
kernel void kernel_mul_mv_id_iq1_s_f32(
|
|
6898
7538
|
device const char * ids,
|
|
@@ -7020,3 +7660,72 @@ kernel void kernel_mul_mv_id_iq4_nl_f32(
|
|
|
7020
7660
|
tiisg,
|
|
7021
7661
|
sgitg);
|
|
7022
7662
|
}
|
|
7663
|
+
|
|
7664
|
+
// Wrapper kernel exported to the host as "kernel_mul_mv_id_iq4_xs_f32".
// It reads an int32 id from `ids`, uses it to choose one of the eight src0
// buffers (src00..src07), and forwards the work to an impl function:
// kernel_mul_mv_iq4_nl_f32_impl when QK_K == 64, kernel_mul_mv_iq4_xs_f32_impl
// otherwise. nb00, nb01, nb02, ne11, nb10, nb12, nb1 and tiitg are part of the
// signature but are not used in this body.
[[host_name("kernel_mul_mv_id_iq4_xs_f32")]]
kernel void kernel_mul_mv_id_iq4_xs_f32(
        device const   char * ids,
        device const   char * src1,
        device        float * dst,
        constant   uint64_t & nbi1,
        constant    int64_t & ne00,
        constant    int64_t & ne01,
        constant    int64_t & ne02,
        constant   uint64_t & nb00,
        constant   uint64_t & nb01,
        constant   uint64_t & nb02,
        constant    int64_t & ne10,
        constant    int64_t & ne11,
        constant    int64_t & ne12,
        constant    int64_t & ne13,
        constant   uint64_t & nb10,
        constant   uint64_t & nb11,
        constant   uint64_t & nb12,
        constant    int64_t & ne0,
        constant    int64_t & ne1,
        constant   uint64_t & nb1,
        constant       uint & r2,
        constant       uint & r3,
        constant        int & idx,
        device const   char * src00,
        device const   char * src01,
        device const   char * src02,
        device const   char * src03,
        device const   char * src04,
        device const   char * src05,
        device const   char * src06,
        device const   char * src07,
        threadgroup   float * shared_values [[threadgroup(0)]],
        uint3                 tgpig[[threadgroup_position_in_grid]],
        uint                  tiitg[[thread_index_in_threadgroup]],
        uint                  tiisg[[thread_index_in_simdgroup]],
        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
    // Collect the candidate buffers so the selected one can be indexed by id.
    device const char * src0_table[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    // Split tgpig.z by ne12*ne13: the quotient is bid, the remainder is written
    // back into tgpig.z because tgpig itself is forwarded to the impl below.
    const int64_t plane = ne12*ne13;
    const int64_t bid   = tgpig.z/plane;

    tgpig.z = tgpig.z%plane;

    // The id for this invocation lives at element `idx` of the ids row at byte
    // offset bid*nbi1.
    device int32_t * row_ids = (device int32_t *) (ids + bid*nbi1);
    const int32_t id = row_ids[idx];

    // Only the callee name differs between the two build configurations; the
    // argument list is shared.
#if QK_K == 64
    kernel_mul_mv_iq4_nl_f32_impl(
#else
    kernel_mul_mv_iq4_xs_f32_impl(
#endif
        src0_table[id],
        (device const float *) (src1 + bid*nb11),
        dst + bid*ne0,
        ne00,
        ne01,
        ne02,
        ne10,
        ne12,
        ne0,
        ne1,
        r2,
        r3,
        shared_values,
        tgpig,
        tiisg,
        sgitg);
}
|