node-llama-cpp 3.0.0-beta.11 → 3.0.0-beta.13

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (209)
  1. package/README.md +4 -4
  2. package/dist/ChatWrapper.d.ts +1 -0
  3. package/dist/ChatWrapper.js +2 -1
  4. package/dist/ChatWrapper.js.map +1 -1
  5. package/dist/TemplateChatWrapper.d.ts +67 -0
  6. package/dist/TemplateChatWrapper.js +239 -0
  7. package/dist/TemplateChatWrapper.js.map +1 -0
  8. package/dist/bindings/AddonTypes.d.ts +2 -0
  9. package/dist/bindings/Llama.d.ts +1 -2
  10. package/dist/bindings/Llama.js +10 -14
  11. package/dist/bindings/Llama.js.map +1 -1
  12. package/dist/bindings/consts.d.ts +2 -0
  13. package/dist/bindings/consts.js +11 -0
  14. package/dist/bindings/consts.js.map +1 -0
  15. package/dist/bindings/getLlama.d.ts +14 -12
  16. package/dist/bindings/getLlama.js +210 -75
  17. package/dist/bindings/getLlama.js.map +1 -1
  18. package/dist/bindings/types.d.ts +8 -4
  19. package/dist/bindings/types.js +18 -0
  20. package/dist/bindings/types.js.map +1 -1
  21. package/dist/bindings/utils/asyncEvery.d.ts +5 -0
  22. package/dist/bindings/utils/asyncEvery.js +15 -0
  23. package/dist/bindings/utils/asyncEvery.js.map +1 -0
  24. package/dist/bindings/utils/asyncSome.d.ts +5 -0
  25. package/dist/bindings/utils/asyncSome.js +27 -0
  26. package/dist/bindings/utils/asyncSome.js.map +1 -0
  27. package/dist/bindings/utils/cloneLlamaCppRepo.js +13 -3
  28. package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
  29. package/dist/bindings/utils/compileLLamaCpp.js +31 -3
  30. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  31. package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +11 -0
  32. package/dist/bindings/utils/detectAvailableComputeLayers.js +158 -0
  33. package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -0
  34. package/dist/bindings/utils/detectGlibc.d.ts +4 -0
  35. package/dist/bindings/utils/detectGlibc.js +36 -0
  36. package/dist/bindings/utils/detectGlibc.js.map +1 -0
  37. package/dist/bindings/utils/getBestComputeLayersAvailable.d.ts +9 -0
  38. package/dist/bindings/utils/getBestComputeLayersAvailable.js +29 -0
  39. package/dist/bindings/utils/getBestComputeLayersAvailable.js.map +1 -0
  40. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js +12 -4
  41. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js.map +1 -1
  42. package/dist/bindings/utils/getGpuTypesToUseForOption.d.ts +11 -0
  43. package/dist/bindings/utils/getGpuTypesToUseForOption.js +30 -0
  44. package/dist/bindings/utils/getGpuTypesToUseForOption.js.map +1 -0
  45. package/dist/bindings/utils/getLinuxDistroInfo.d.ts +9 -0
  46. package/dist/bindings/utils/getLinuxDistroInfo.js +46 -0
  47. package/dist/bindings/utils/getLinuxDistroInfo.js.map +1 -0
  48. package/dist/bindings/utils/getPlatformInfo.d.ts +5 -0
  49. package/dist/bindings/utils/getPlatformInfo.js +28 -0
  50. package/dist/bindings/utils/getPlatformInfo.js.map +1 -0
  51. package/dist/bindings/utils/hasFileInPath.d.ts +2 -0
  52. package/dist/bindings/utils/hasFileInPath.js +34 -0
  53. package/dist/bindings/utils/hasFileInPath.js.map +1 -0
  54. package/dist/bindings/utils/logBinaryUsageExampleToConsole.d.ts +1 -1
  55. package/dist/bindings/utils/logBinaryUsageExampleToConsole.js +3 -9
  56. package/dist/bindings/utils/logBinaryUsageExampleToConsole.js.map +1 -1
  57. package/dist/bindings/utils/logDistroInstallInstruction.d.ts +13 -0
  58. package/dist/bindings/utils/logDistroInstallInstruction.js +38 -0
  59. package/dist/bindings/utils/logDistroInstallInstruction.js.map +1 -0
  60. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.d.ts +9 -2
  61. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js +10 -4
  62. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js.map +1 -1
  63. package/dist/bindings/utils/resolveCustomCmakeOptions.js +2 -0
  64. package/dist/bindings/utils/resolveCustomCmakeOptions.js.map +1 -1
  65. package/dist/bindings/utils/testBindingBinary.d.ts +1 -0
  66. package/dist/bindings/utils/testBindingBinary.js +98 -0
  67. package/dist/bindings/utils/testBindingBinary.js.map +1 -0
  68. package/dist/chatWrappers/ChatMLChatWrapper.js +1 -1
  69. package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
  70. package/dist/chatWrappers/GemmaChatWrapper.d.ts +18 -0
  71. package/dist/chatWrappers/GemmaChatWrapper.js +86 -0
  72. package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -0
  73. package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js +3 -0
  74. package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js.map +1 -1
  75. package/dist/cli/cli.js +2 -0
  76. package/dist/cli/cli.js.map +1 -1
  77. package/dist/cli/commands/BuildCommand.d.ts +5 -4
  78. package/dist/cli/commands/BuildCommand.js +78 -58
  79. package/dist/cli/commands/BuildCommand.js.map +1 -1
  80. package/dist/cli/commands/DebugCommand.js +12 -15
  81. package/dist/cli/commands/DebugCommand.js.map +1 -1
  82. package/dist/cli/commands/DownloadCommand.d.ts +5 -4
  83. package/dist/cli/commands/DownloadCommand.js +97 -54
  84. package/dist/cli/commands/DownloadCommand.js.map +1 -1
  85. package/dist/cli/commands/InspectCommand.d.ts +7 -0
  86. package/dist/cli/commands/InspectCommand.js +113 -0
  87. package/dist/cli/commands/InspectCommand.js.map +1 -0
  88. package/dist/cli/utils/logUsedGpuTypeOption.d.ts +2 -0
  89. package/dist/cli/utils/logUsedGpuTypeOption.js +9 -0
  90. package/dist/cli/utils/logUsedGpuTypeOption.js.map +1 -0
  91. package/dist/config.d.ts +3 -2
  92. package/dist/config.js +12 -10
  93. package/dist/config.js.map +1 -1
  94. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts +3 -0
  95. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js +3 -0
  96. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map +1 -1
  97. package/dist/gguf/GGUFInsights.d.ts +28 -0
  98. package/dist/gguf/GGUFInsights.js +58 -0
  99. package/dist/gguf/GGUFInsights.js.map +1 -0
  100. package/dist/gguf/GGUFMetadata.d.ts +19 -0
  101. package/dist/gguf/GGUFMetadata.js +38 -0
  102. package/dist/gguf/GGUFMetadata.js.map +1 -0
  103. package/dist/gguf/errors/InvalidGGUFMagicError.d.ts +3 -0
  104. package/dist/gguf/errors/InvalidGGUFMagicError.js +6 -0
  105. package/dist/gguf/errors/InvalidGGUFMagicError.js.map +1 -0
  106. package/dist/gguf/errors/MetadataNotParsedYetError.d.ts +3 -0
  107. package/dist/gguf/errors/MetadataNotParsedYetError.js +6 -0
  108. package/dist/gguf/errors/MetadataNotParsedYetError.js.map +1 -0
  109. package/dist/gguf/errors/MissingNodeLlamaError.d.ts +3 -0
  110. package/dist/gguf/errors/MissingNodeLlamaError.js +6 -0
  111. package/dist/gguf/errors/MissingNodeLlamaError.js.map +1 -0
  112. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.d.ts +5 -0
  113. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js +12 -0
  114. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js.map +1 -0
  115. package/dist/gguf/errors/UnsupportedMetadataTypeError.d.ts +4 -0
  116. package/dist/gguf/errors/UnsupportedMetadataTypeError.js +8 -0
  117. package/dist/gguf/errors/UnsupportedMetadataTypeError.js.map +1 -0
  118. package/dist/gguf/ggufParser/GGUFParser.d.ts +18 -0
  119. package/dist/gguf/ggufParser/GGUFParser.js +123 -0
  120. package/dist/gguf/ggufParser/GGUFParser.js.map +1 -0
  121. package/dist/gguf/ggufParser/GGUFTypes.d.ts +257 -0
  122. package/dist/gguf/ggufParser/GGUFTypes.js +2 -0
  123. package/dist/gguf/ggufParser/GGUFTypes.js.map +1 -0
  124. package/dist/gguf/ggufParser/checkArchitecture.d.ts +14 -0
  125. package/dist/gguf/ggufParser/checkArchitecture.js +74 -0
  126. package/dist/gguf/ggufParser/checkArchitecture.js.map +1 -0
  127. package/dist/gguf/ggufParser/stream/GGUFBaseStream.d.ts +38 -0
  128. package/dist/gguf/ggufParser/stream/GGUFBaseStream.js +83 -0
  129. package/dist/gguf/ggufParser/stream/GGUFBaseStream.js.map +1 -0
  130. package/dist/gguf/ggufParser/stream/GGUFFetchStream.d.ts +14 -0
  131. package/dist/gguf/ggufParser/stream/GGUFFetchStream.js +35 -0
  132. package/dist/gguf/ggufParser/stream/GGUFFetchStream.js.map +1 -0
  133. package/dist/gguf/ggufParser/stream/GGUFReadStream.d.ts +15 -0
  134. package/dist/gguf/ggufParser/stream/GGUFReadStream.js +40 -0
  135. package/dist/gguf/ggufParser/stream/GGUFReadStream.js.map +1 -0
  136. package/dist/index.d.ts +3 -1
  137. package/dist/index.js +3 -1
  138. package/dist/index.js.map +1 -1
  139. package/dist/utils/LlamaText.js +2 -2
  140. package/dist/utils/LlamaText.js.map +1 -1
  141. package/dist/utils/cmake.js +23 -10
  142. package/dist/utils/cmake.js.map +1 -1
  143. package/dist/utils/gbnfJson/terminals/GbnfArray.js.map +1 -1
  144. package/dist/utils/gbnfJson/terminals/GbnfBoolean.d.ts +1 -1
  145. package/dist/utils/gbnfJson/terminals/GbnfBoolean.js.map +1 -1
  146. package/dist/utils/gbnfJson/terminals/GbnfBooleanValue.js.map +1 -1
  147. package/dist/utils/gbnfJson/terminals/GbnfGrammar.js.map +1 -1
  148. package/dist/utils/gbnfJson/terminals/GbnfNull.d.ts +1 -1
  149. package/dist/utils/gbnfJson/terminals/GbnfNull.js.map +1 -1
  150. package/dist/utils/gbnfJson/terminals/GbnfNumber.d.ts +1 -1
  151. package/dist/utils/gbnfJson/terminals/GbnfNumber.js.map +1 -1
  152. package/dist/utils/gbnfJson/terminals/GbnfNumberValue.js.map +1 -1
  153. package/dist/utils/gbnfJson/terminals/GbnfObjectMap.js.map +1 -1
  154. package/dist/utils/gbnfJson/terminals/GbnfOr.js.map +1 -1
  155. package/dist/utils/gbnfJson/terminals/GbnfString.d.ts +1 -1
  156. package/dist/utils/gbnfJson/terminals/GbnfString.js.map +1 -1
  157. package/dist/utils/gbnfJson/terminals/GbnfStringValue.js.map +1 -1
  158. package/dist/utils/gbnfJson/terminals/GbnfVerbatimText.js.map +1 -1
  159. package/dist/utils/gbnfJson/terminals/GbnfWhitespace.d.ts +1 -1
  160. package/dist/utils/gbnfJson/terminals/GbnfWhitespace.js.map +1 -1
  161. package/dist/utils/getBuildDefaults.d.ts +1 -2
  162. package/dist/utils/getBuildDefaults.js +2 -3
  163. package/dist/utils/getBuildDefaults.js.map +1 -1
  164. package/dist/utils/getConsoleLogPrefix.d.ts +1 -1
  165. package/dist/utils/getConsoleLogPrefix.js +2 -2
  166. package/dist/utils/getConsoleLogPrefix.js.map +1 -1
  167. package/dist/utils/mergeUnionTypes.d.ts +6 -0
  168. package/dist/utils/mergeUnionTypes.js +2 -0
  169. package/dist/utils/mergeUnionTypes.js.map +1 -0
  170. package/dist/utils/parseTextTemplate.d.ts +66 -0
  171. package/dist/utils/parseTextTemplate.js +116 -0
  172. package/dist/utils/parseTextTemplate.js.map +1 -0
  173. package/llama/CMakeLists.txt +30 -4
  174. package/llama/addon.cpp +62 -7
  175. package/llama/binariesGithubRelease.json +1 -1
  176. package/llama/gitRelease.bundle +0 -0
  177. package/llama/gpuInfo/cuda-gpu-info.cu +5 -5
  178. package/llama/gpuInfo/cuda-gpu-info.h +2 -2
  179. package/llama/gpuInfo/vulkan-gpu-info.cpp +65 -0
  180. package/llama/gpuInfo/vulkan-gpu-info.h +7 -0
  181. package/llama/llama.cpp.info.json +1 -1
  182. package/llamaBins/linux-arm64/.buildMetadata.json +1 -1
  183. package/llamaBins/linux-arm64/llama-addon.node +0 -0
  184. package/llamaBins/linux-armv7l/.buildMetadata.json +1 -1
  185. package/llamaBins/linux-armv7l/llama-addon.node +0 -0
  186. package/llamaBins/linux-x64/.buildMetadata.json +1 -1
  187. package/llamaBins/linux-x64/llama-addon.node +0 -0
  188. package/llamaBins/linux-x64-cuda/.buildMetadata.json +1 -1
  189. package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
  190. package/llamaBins/linux-x64-vulkan/.buildMetadata.json +1 -0
  191. package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
  192. package/llamaBins/mac-arm64-metal/.buildMetadata.json +1 -1
  193. package/llamaBins/mac-arm64-metal/ggml-metal.metal +1382 -142
  194. package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
  195. package/llamaBins/mac-x64/.buildMetadata.json +1 -1
  196. package/llamaBins/mac-x64/llama-addon.node +0 -0
  197. package/llamaBins/win-x64/.buildMetadata.json +1 -1
  198. package/llamaBins/win-x64/llama-addon.exp +0 -0
  199. package/llamaBins/win-x64/llama-addon.lib +0 -0
  200. package/llamaBins/win-x64/llama-addon.node +0 -0
  201. package/llamaBins/win-x64-cuda/.buildMetadata.json +1 -1
  202. package/llamaBins/win-x64-cuda/llama-addon.exp +0 -0
  203. package/llamaBins/win-x64-cuda/llama-addon.lib +0 -0
  204. package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
  205. package/llamaBins/win-x64-vulkan/.buildMetadata.json +1 -0
  206. package/llamaBins/win-x64-vulkan/llama-addon.exp +0 -0
  207. package/llamaBins/win-x64-vulkan/llama-addon.lib +0 -0
  208. package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
  209. package/package.json +7 -4
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
392
392
  float lmax = -INFINITY;
393
393
 
394
394
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
395
- lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
395
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
396
396
  }
397
397
 
398
398
  // find the max value in the block
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
417
417
  // parallel sum
418
418
  float lsum = 0.0f;
419
419
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
420
- const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
420
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
421
421
  lsum += exp_psrc0;
422
422
  pdst[i00] = exp_psrc0;
423
423
  }
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
495
495
  float4 lmax4 = -INFINITY;
496
496
 
497
497
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
498
- lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
498
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
499
499
  }
500
500
 
501
501
  const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
521
521
  // parallel sum
522
522
  float4 lsum4 = 0.0f;
523
523
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
524
- const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
524
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
525
525
  lsum4 += exp_psrc4;
526
526
  pdst4[i00] = exp_psrc4;
527
527
  }
@@ -2519,18 +2519,57 @@ typedef struct {
2519
2519
  } block_iq2_xs;
2520
2520
  // 74 bytes / block for QK_K = 256, so 2.3125 bpw
2521
2521
 
2522
+ // 2.5625 bpw quants
2523
+ typedef struct {
2524
+ half d;
2525
+ uint8_t qs[QK_K/4];
2526
+ uint8_t qh[QK_K/32];
2527
+ uint8_t scales[QK_K/32];
2528
+ } block_iq2_s;
2529
+
2522
2530
  typedef struct {
2523
2531
  half d;
2524
2532
  uint8_t qs[3*QK_K/8];
2525
2533
  } block_iq3_xxs;
2526
2534
  // 98 bytes / block for QK_K = 256, so 3.0625 bpw
2527
2535
 
2536
+ // 3.4375 bpw
2537
+ #if QK_K == 64
2538
+ #define IQ3S_N_SCALE 2
2539
+ #else
2540
+ #define IQ3S_N_SCALE QK_K/64
2541
+ #endif
2542
+ typedef struct {
2543
+ half d;
2544
+ uint8_t qs[QK_K/4];
2545
+ uint8_t qh[QK_K/32];
2546
+ uint8_t signs[QK_K/8];
2547
+ uint8_t scales[IQ3S_N_SCALE];
2548
+ } block_iq3_s;
2549
+
2528
2550
  typedef struct {
2529
2551
  half d;
2530
2552
  uint8_t qs[QK_K/8];
2531
2553
  uint8_t scales[QK_K/16];
2532
2554
  } block_iq1_s;
2533
2555
 
2556
+ // Non-linear quants
2557
+ #define QK4_NL 32
2558
+ typedef struct {
2559
+ half d;
2560
+ uint8_t qs[QK4_NL/2];
2561
+ } block_iq4_nl;
2562
+
2563
+ #if QK_K == 64
2564
+ #define block_iq4_xs block_iq4_nl
2565
+ #else
2566
+ typedef struct {
2567
+ half d;
2568
+ uint16_t scales_h;
2569
+ uint8_t scales_l[QK_K/64];
2570
+ uint8_t qs[QK_K/2];
2571
+ } block_iq4_xs;
2572
+ #endif
2534
2573
 
2535
2574
  //====================================== dot products =========================
2536
2575
 
@@ -3754,6 +3793,265 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
3754
3793
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
3755
3794
  };
3756
3795
 
3796
+ constexpr constant static uint64_t iq2s_grid[1024] = {
3797
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
3798
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
3799
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
3800
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
3801
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
3802
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
3803
+ 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
3804
+ 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
3805
+ 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
3806
+ 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
3807
+ 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
3808
+ 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
3809
+ 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
3810
+ 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
3811
+ 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
3812
+ 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
3813
+ 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
3814
+ 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
3815
+ 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
3816
+ 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
3817
+ 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
3818
+ 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
3819
+ 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
3820
+ 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
3821
+ 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
3822
+ 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
3823
+ 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
3824
+ 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
3825
+ 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
3826
+ 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
3827
+ 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
3828
+ 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
3829
+ 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
3830
+ 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
3831
+ 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
3832
+ 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
3833
+ 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
3834
+ 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
3835
+ 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
3836
+ 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
3837
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
3838
+ 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
3839
+ 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
3840
+ 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
3841
+ 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
3842
+ 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
3843
+ 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
3844
+ 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
3845
+ 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
3846
+ 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
3847
+ 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
3848
+ 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
3849
+ 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
3850
+ 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
3851
+ 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
3852
+ 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
3853
+ 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
3854
+ 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
3855
+ 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
3856
+ 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
3857
+ 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
3858
+ 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
3859
+ 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
3860
+ 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
3861
+ 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
3862
+ 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
3863
+ 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
3864
+ 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
3865
+ 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
3866
+ 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
3867
+ 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
3868
+ 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
3869
+ 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
3870
+ 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
3871
+ 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
3872
+ 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
3873
+ 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
3874
+ 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
3875
+ 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
3876
+ 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
3877
+ 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
3878
+ 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
3879
+ 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
3880
+ 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
3881
+ 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
3882
+ 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
3883
+ 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
3884
+ 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
3885
+ 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
3886
+ 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
3887
+ 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
3888
+ 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
3889
+ 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
3890
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
3891
+ 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
3892
+ 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
3893
+ 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
3894
+ 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
3895
+ 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
3896
+ 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
3897
+ 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
3898
+ 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
3899
+ 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
3900
+ 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
3901
+ 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
3902
+ 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
3903
+ 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
3904
+ 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
3905
+ 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
3906
+ 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
3907
+ 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
3908
+ 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
3909
+ 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
3910
+ 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
3911
+ 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
3912
+ 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
3913
+ 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
3914
+ 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
3915
+ 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
3916
+ 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
3917
+ 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
3918
+ 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
3919
+ 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
3920
+ 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
3921
+ 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
3922
+ 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
3923
+ 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
3924
+ 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
3925
+ 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
3926
+ 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
3927
+ 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
3928
+ 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
3929
+ 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
3930
+ 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
3931
+ 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
3932
+ 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
3933
+ 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
3934
+ 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
3935
+ 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
3936
+ 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
3937
+ 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
3938
+ 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
3939
+ 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
3940
+ 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
3941
+ 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
3942
+ 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
3943
+ 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
3944
+ 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
3945
+ 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
3946
+ 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
3947
+ 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
3948
+ 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
3949
+ 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
3950
+ 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
3951
+ 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
3952
+ 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
3953
+ 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
3954
+ 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
3955
+ 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
3956
+ 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
3957
+ 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
3958
+ 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
3959
+ 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
3960
+ 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
3961
+ 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
3962
+ 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
3963
+ 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
3964
+ 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
3965
+ 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
3966
+ 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
3967
+ 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
3968
+ 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
3969
+ 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
3970
+ 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
3971
+ 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
3972
+ 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
3973
+ 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
3974
+ 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
3975
+ 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
3976
+ 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
3977
+ 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
3978
+ 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
3979
+ 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
3980
+ 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
3981
+ 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
3982
+ 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
3983
+ 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
3984
+ 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
3985
+ 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
3986
+ 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
3987
+ 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
3988
+ 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
3989
+ 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
3990
+ 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
3991
+ 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
3992
+ 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
3993
+ 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
3994
+ 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
3995
+ 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
3996
+ 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
3997
+ 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
3998
+ 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
3999
+ 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
4000
+ 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
4001
+ 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
4002
+ 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
4003
+ 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
4004
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
4005
+ 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
4006
+ 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
4007
+ 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
4008
+ 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
4009
+ 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
4010
+ 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
4011
+ 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
4012
+ 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
4013
+ 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
4014
+ 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
4015
+ 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
4016
+ 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
4017
+ 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
4018
+ 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
4019
+ 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
4020
+ 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
4021
+ 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
4022
+ 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
4023
+ 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
4024
+ 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
4025
+ 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
4026
+ 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
4027
+ 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
4028
+ 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
4029
+ 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
4030
+ 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
4031
+ 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
4032
+ 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
4033
+ 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
4034
+ 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
4035
+ 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
4036
+ 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
4037
+ 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
4038
+ 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
4039
+ 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
4040
+ 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
4041
+ 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
4042
+ 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
4043
+ 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
4044
+ 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
4045
+ 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
4046
+ 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
4047
+ 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
4048
+ 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
4049
+ 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
4050
+ 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
4051
+ 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
4052
+ 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
4053
+ };
4054
+
3757
4055
  constexpr constant static uint32_t iq3xxs_grid[256] = {
3758
4056
  0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
3759
4057
  0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
@@ -3789,6 +4087,73 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
3789
4087
  0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
3790
4088
  };
3791
4089
 
4090
+ constexpr constant static uint32_t iq3s_grid[512] = {
4091
+ 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
4092
+ 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
4093
+ 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
4094
+ 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
4095
+ 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
4096
+ 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
4097
+ 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
4098
+ 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
4099
+ 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
4100
+ 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
4101
+ 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
4102
+ 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
4103
+ 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
4104
+ 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
4105
+ 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
4106
+ 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
4107
+ 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
4108
+ 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
4109
+ 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
4110
+ 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
4111
+ 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
4112
+ 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
4113
+ 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
4114
+ 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
4115
+ 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
4116
+ 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
4117
+ 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
4118
+ 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
4119
+ 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
4120
+ 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
4121
+ 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
4122
+ 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
4123
+ 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
4124
+ 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
4125
+ 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
4126
+ 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
4127
+ 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
4128
+ 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
4129
+ 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
4130
+ 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
4131
+ 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
4132
+ 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
4133
+ 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
4134
+ 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
4135
+ 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
4136
+ 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
4137
+ 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
4138
+ 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
4139
+ 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
4140
+ 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
4141
+ 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
4142
+ 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
4143
+ 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
4144
+ 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
4145
+ 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
4146
+ 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
4147
+ 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
4148
+ 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
4149
+ 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
4150
+ 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
4151
+ 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
4152
+ 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
4153
+ 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
4154
+ 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
4155
+ };
4156
+
3792
4157
  #define NGRID_IQ1S 512
3793
4158
  constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = {
3794
4159
  0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
@@ -3985,7 +4350,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
3985
4350
  threadgroup_barrier(mem_flags::mem_threadgroup);
3986
4351
  }
3987
4352
 
3988
- #if QK_K == 256
3989
4353
  const int ix = tiisg;
3990
4354
 
3991
4355
  device const float * y4 = y + 32 * ix;
@@ -4026,9 +4390,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
4026
4390
 
4027
4391
  y4 += 32 * 32;
4028
4392
  }
4029
- #else
4030
- // TODO
4031
- #endif
4032
4393
 
4033
4394
  for (int row = 0; row < N_DST; ++row) {
4034
4395
  all_sum = simd_sum(sumf[row]);
@@ -4118,7 +4479,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
4118
4479
  threadgroup_barrier(mem_flags::mem_threadgroup);
4119
4480
  }
4120
4481
 
4121
- #if QK_K == 256
4122
4482
  const int ix = tiisg;
4123
4483
 
4124
4484
  device const float * y4 = y + 32 * ix;
@@ -4169,9 +4529,6 @@ void kernel_mul_mv_iq2_xs_f32_impl(
4169
4529
 
4170
4530
  y4 += 32 * 32;
4171
4531
  }
4172
- #else
4173
- // TODO
4174
- #endif
4175
4532
 
4176
4533
  for (int row = 0; row < N_DST; ++row) {
4177
4534
  all_sum = simd_sum(sumf[row]);
@@ -4261,7 +4618,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
4261
4618
  threadgroup_barrier(mem_flags::mem_threadgroup);
4262
4619
  }
4263
4620
 
4264
- #if QK_K == 256
4265
4621
  const int ix = tiisg;
4266
4622
 
4267
4623
  device const float * y4 = y + 32 * ix;
@@ -4305,9 +4661,6 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
4305
4661
 
4306
4662
  y4 += 32 * 32;
4307
4663
  }
4308
- #else
4309
- // TODO
4310
- #endif
4311
4664
 
4312
4665
  for (int row = 0; row < N_DST; ++row) {
4313
4666
  all_sum = simd_sum(sumf[row]);
@@ -4346,7 +4699,7 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
4346
4699
  kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
4347
4700
  }
4348
4701
 
4349
- void kernel_mul_mv_iq1_s_f32_impl(
4702
+ void kernel_mul_mv_iq3_s_f32_impl(
4350
4703
  device const void * src0,
4351
4704
  device const float * src1,
4352
4705
  device float * dst,
@@ -4359,6 +4712,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
4359
4712
  constant int64_t & ne1,
4360
4713
  constant uint & r2,
4361
4714
  constant uint & r3,
4715
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
4362
4716
  uint3 tgpig[[threadgroup_position_in_grid]],
4363
4717
  uint tiisg[[thread_index_in_simdgroup]],
4364
4718
  uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -4376,56 +4730,69 @@ void kernel_mul_mv_iq1_s_f32_impl(
4376
4730
 
4377
4731
  const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
4378
4732
 
4379
- device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
4733
+ device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0;
4380
4734
  device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
4381
4735
 
4382
- float yl[16];
4736
+ float yl[32];
4383
4737
  float sumf[N_DST]={0.f}, all_sum;
4384
4738
 
4385
4739
  const int nb32 = nb * (QK_K / 32);
4386
4740
 
4387
- #if QK_K == 256
4388
- const int ix = tiisg/2;
4389
- const int il = tiisg%2;
4741
+ threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
4742
+ {
4743
+ int nval = 8;
4744
+ int pos = (32*sgitg + tiisg)*nval;
4745
+ for (int i = 0; i < nval; ++i) values[pos + i] = iq3s_grid[pos + i];
4746
+ threadgroup_barrier(mem_flags::mem_threadgroup);
4747
+ }
4390
4748
 
4391
- device const float * y4 = y + 32 * ix + 16 * il;
4749
+ const int ix = tiisg;
4392
4750
 
4393
- for (int ib32 = ix; ib32 < nb32; ib32 += 16) {
4751
+ device const float * y4 = y + 32 * ix;
4394
4752
 
4395
- for (int i = 0; i < 16; ++i) {
4753
+ for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
4754
+
4755
+ for (int i = 0; i < 32; ++i) {
4396
4756
  yl[i] = y4[i];
4397
4757
  }
4398
4758
 
4399
4759
  const int ibl = ib32 / (QK_K / 32);
4400
4760
  const int ib = ib32 % (QK_K / 32);
4401
4761
 
4402
- device const block_iq1_s * xr = x + ibl;
4403
- device const uint8_t * qs = xr->qs + 4 * ib + 2 * il;
4404
- device const uint8_t * sc = xr->scales + 2 * ib + il;
4405
- device const half * dh = &xr->d;
4762
+ device const block_iq3_s * xr = x + ibl;
4763
+ device const uint8_t * qs = xr->qs + 8 * ib;
4764
+ device const uint8_t * qh = xr->qh + ib;
4765
+ device const uint8_t * sc = xr->scales + (ib/2);
4766
+ device const uint8_t * signs = xr->signs + 4 * ib;
4767
+ device const half * dh = &xr->d;
4406
4768
 
4407
4769
  for (int row = 0; row < N_DST; row++) {
4408
4770
 
4409
- constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
4410
- constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
4771
+ const float db = dh[0];
4772
+ const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf));
4411
4773
 
4412
4774
  float2 sum = {0};
4413
- for (int j = 0; j < 8; ++j) {
4414
- sum[0] += yl[j+ 0] * grid1[j];
4415
- sum[1] += yl[j+ 8] * grid2[j];
4775
+ for (int l = 0; l < 4; ++l) {
4776
+ const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? values + 256 : values;
4777
+ const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? values + 256 : values;
4778
+ const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]);
4779
+ const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]);
4780
+ for (int j = 0; j < 4; ++j) {
4781
+ sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
4782
+ sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
4783
+ }
4416
4784
  }
4417
- sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1));
4785
+ sumf[row] += d * (sum[0] + sum[1]);
4418
4786
 
4419
- dh += nb*sizeof(block_iq1_s)/2;
4420
- qs += nb*sizeof(block_iq1_s);
4421
- sc += nb*sizeof(block_iq1_s);
4787
+ dh += nb*sizeof(block_iq3_s)/2;
4788
+ qs += nb*sizeof(block_iq3_s);
4789
+ qh += nb*sizeof(block_iq3_s);
4790
+ sc += nb*sizeof(block_iq3_s);
4791
+ signs += nb*sizeof(block_iq3_s);
4422
4792
  }
4423
4793
 
4424
- y4 += 16 * 32;
4794
+ y4 += 32 * 32;
4425
4795
  }
4426
- #else
4427
- // TODO
4428
- #endif
4429
4796
 
4430
4797
  for (int row = 0; row < N_DST; ++row) {
4431
4798
  all_sum = simd_sum(sumf[row]);
@@ -4435,8 +4802,8 @@ void kernel_mul_mv_iq1_s_f32_impl(
4435
4802
  }
4436
4803
  }
4437
4804
 
4438
- [[host_name("kernel_mul_mv_iq1_s_f32")]]
4439
- kernel void kernel_mul_mv_iq1_s_f32(
4805
+ [[host_name("kernel_mul_mv_iq3_s_f32")]]
4806
+ kernel void kernel_mul_mv_iq3_s_f32(
4440
4807
  device const void * src0,
4441
4808
  device const float * src1,
4442
4809
  device float * dst,
@@ -4456,136 +4823,636 @@ kernel void kernel_mul_mv_iq1_s_f32(
4456
4823
  constant int64_t & ne1,
4457
4824
  constant uint & r2,
4458
4825
  constant uint & r3,
4826
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
4459
4827
  uint3 tgpig[[threadgroup_position_in_grid]],
4460
4828
  uint tiisg[[thread_index_in_simdgroup]],
4461
4829
  uint sgitg[[simdgroup_index_in_threadgroup]]) {
4462
4830
 
4463
- kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
4831
+ kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
4464
4832
  }
4465
4833
 
4834
+ void kernel_mul_mv_iq2_s_f32_impl(
4835
+ device const void * src0,
4836
+ device const float * src1,
4837
+ device float * dst,
4838
+ constant int64_t & ne00,
4839
+ constant int64_t & ne01,
4840
+ constant int64_t & ne02,
4841
+ constant int64_t & ne10,
4842
+ constant int64_t & ne12,
4843
+ constant int64_t & ne0,
4844
+ constant int64_t & ne1,
4845
+ constant uint & r2,
4846
+ constant uint & r3,
4847
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
4848
+ uint3 tgpig[[threadgroup_position_in_grid]],
4849
+ uint tiisg[[thread_index_in_simdgroup]],
4850
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4466
4851
 
4467
- //============================= templates and their specializations =============================
4468
-
4469
- // NOTE: this is not dequantizing - we are simply fitting the template
4470
- template <typename type4x4>
4471
- void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
4472
- float4x4 temp = *(((device float4x4 *)src));
4473
- for (int i = 0; i < 16; i++){
4474
- reg[i/4][i%4] = temp[i/4][i%4];
4475
- }
4476
- }
4852
+ const int nb = ne00/QK_K;
4853
+ const int r0 = tgpig.x;
4854
+ const int r1 = tgpig.y;
4855
+ const int im = tgpig.z;
4477
4856
 
4478
- template <typename type4x4>
4479
- void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
4480
- half4x4 temp = *(((device half4x4 *)src));
4481
- for (int i = 0; i < 16; i++){
4482
- reg[i/4][i%4] = temp[i/4][i%4];
4483
- }
4484
- }
4857
+ const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
4858
+ const int ib_row = first_row * nb;
4485
4859
 
4486
- template <typename type4x4>
4487
- void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
4488
- device const uint16_t * qs = ((device const uint16_t *)xb + 1);
4489
- const float d1 = il ? (xb->d / 16.h) : xb->d;
4490
- const float d2 = d1 / 256.f;
4491
- const float md = -8.h * xb->d;
4492
- const ushort mask0 = il ? 0x00F0 : 0x000F;
4493
- const ushort mask1 = mask0 << 8;
4860
+ const uint i12 = im%ne12;
4861
+ const uint i13 = im/ne12;
4494
4862
 
4495
- for (int i=0;i<8;i++) {
4496
- reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
4497
- reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
4498
- }
4499
- }
4863
+ const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
4500
4864
 
4501
- template <typename type4x4>
4502
- void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
4503
- device const uint16_t * qs = ((device const uint16_t *)xb + 2);
4504
- const float d1 = il ? (xb->d / 16.h) : xb->d;
4505
- const float d2 = d1 / 256.f;
4506
- const float m = xb->m;
4507
- const ushort mask0 = il ? 0x00F0 : 0x000F;
4508
- const ushort mask1 = mask0 << 8;
4865
+ device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
4866
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
4509
4867
 
4510
- for (int i=0;i<8;i++) {
4511
- reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
4512
- reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
4513
- }
4514
- }
4868
+ float yl[32];
4869
+ float sumf[N_DST]={0.f}, all_sum;
4515
4870
 
4516
- template <typename type4x4>
4517
- void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
4518
- device const uint16_t * qs = ((device const uint16_t *)xb + 3);
4519
- const float d = xb->d;
4520
- const float md = -16.h * xb->d;
4521
- const ushort mask = il ? 0x00F0 : 0x000F;
4871
+ const int nb32 = nb * (QK_K / 32);
4522
4872
 
4523
- const uint32_t qh = *((device const uint32_t *)xb->qh);
4873
+ //threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
4874
+ //{
4875
+ // int nval = 32;
4876
+ // int pos = (32*sgitg + tiisg)*nval;
4877
+ // for (int i = 0; i < nval; ++i) values[pos + i] = iq2s_grid[pos + i];
4878
+ // threadgroup_barrier(mem_flags::mem_threadgroup);
4879
+ //}
4524
4880
 
4525
- const int x_mv = il ? 4 : 0;
4881
+ const int ix = tiisg;
4526
4882
 
4527
- const int gh_mv = il ? 12 : 0;
4528
- const int gh_bk = il ? 0 : 4;
4883
+ device const float * y4 = y + 32 * ix;
4529
4884
 
4530
- for (int i = 0; i < 8; i++) {
4531
- // extract the 5-th bits for x0 and x1
4532
- const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
4533
- const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
4885
+ for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
4534
4886
 
4535
- // combine the 4-bits from qs with the 5th bit
4536
- const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
4537
- const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
4887
+ for (int i = 0; i < 32; ++i) {
4888
+ yl[i] = y4[i];
4889
+ }
4538
4890
 
4539
- reg[i/2][2*(i%2)+0] = d * x0 + md;
4540
- reg[i/2][2*(i%2)+1] = d * x1 + md;
4541
- }
4542
- }
4891
+ const int ibl = ib32 / (QK_K / 32);
4892
+ const int ib = ib32 % (QK_K / 32);
4543
4893
 
4544
- template <typename type4x4>
4545
- void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
4546
- device const uint16_t * qs = ((device const uint16_t *)xb + 4);
4547
- const float d = xb->d;
4548
- const float m = xb->m;
4549
- const ushort mask = il ? 0x00F0 : 0x000F;
4894
+ device const block_iq2_s * xr = x + ibl;
4895
+ device const uint8_t * qs = xr->qs + 4 * ib;
4896
+ device const uint8_t * qh = xr->qh + ib;
4897
+ device const uint8_t * sc = xr->scales + ib;
4898
+ device const uint8_t * signs = qs + QK_K/8;
4899
+ device const half * dh = &xr->d;
4550
4900
 
4551
- const uint32_t qh = *((device const uint32_t *)xb->qh);
4901
+ for (int row = 0; row < N_DST; row++) {
4552
4902
 
4553
- const int x_mv = il ? 4 : 0;
4903
+ const float db = dh[0];
4904
+ const float d1 = db * (0.5f + (sc[0] & 0xf));
4905
+ const float d2 = db * (0.5f + (sc[0] >> 4));
4554
4906
 
4555
- const int gh_mv = il ? 12 : 0;
4556
- const int gh_bk = il ? 0 : 4;
4907
+ float2 sum = {0};
4908
+ for (int l = 0; l < 2; ++l) {
4909
+ //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
4910
+ //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
4911
+ constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
4912
+ constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
4913
+ for (int j = 0; j < 8; ++j) {
4914
+ sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
4915
+ sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
4916
+ }
4917
+ }
4918
+ sumf[row] += d1 * sum[0] + d2 * sum[1];
4557
4919
 
4558
- for (int i = 0; i < 8; i++) {
4559
- // extract the 5-th bits for x0 and x1
4560
- const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
4561
- const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
4920
+ dh += nb*sizeof(block_iq2_s)/2;
4921
+ qs += nb*sizeof(block_iq2_s);
4922
+ qh += nb*sizeof(block_iq2_s);
4923
+ sc += nb*sizeof(block_iq2_s);
4924
+ signs += nb*sizeof(block_iq2_s);
4925
+ }
4562
4926
 
4563
- // combine the 4-bits from qs with the 5th bit
4564
- const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
4565
- const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
4927
+ y4 += 32 * 32;
4928
+ }
4566
4929
 
4567
- reg[i/2][2*(i%2)+0] = d * x0 + m;
4568
- reg[i/2][2*(i%2)+1] = d * x1 + m;
4930
+ for (int row = 0; row < N_DST; ++row) {
4931
+ all_sum = simd_sum(sumf[row]);
4932
+ if (tiisg == 0) {
4933
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
4934
+ }
4569
4935
  }
4570
4936
  }
4571
4937
 
4572
- template <typename type4x4>
4573
- void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
4574
- device const int8_t * qs = ((device const int8_t *)xb->qs);
4575
- const half d = xb->d;
4938
+ [[host_name("kernel_mul_mv_iq2_s_f32")]]
4939
+ kernel void kernel_mul_mv_iq2_s_f32(
4940
+ device const void * src0,
4941
+ device const float * src1,
4942
+ device float * dst,
4943
+ constant int64_t & ne00,
4944
+ constant int64_t & ne01,
4945
+ constant int64_t & ne02,
4946
+ constant uint64_t & nb00,
4947
+ constant uint64_t & nb01,
4948
+ constant uint64_t & nb02,
4949
+ constant int64_t & ne10,
4950
+ constant int64_t & ne11,
4951
+ constant int64_t & ne12,
4952
+ constant uint64_t & nb10,
4953
+ constant uint64_t & nb11,
4954
+ constant uint64_t & nb12,
4955
+ constant int64_t & ne0,
4956
+ constant int64_t & ne1,
4957
+ constant uint & r2,
4958
+ constant uint & r3,
4959
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
4960
+ uint3 tgpig[[threadgroup_position_in_grid]],
4961
+ uint tiisg[[thread_index_in_simdgroup]],
4962
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4576
4963
 
4577
- for (int i = 0; i < 16; i++) {
4578
- reg[i/4][i%4] = (qs[i + 16*il] * d);
4579
- }
4964
+ kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
4580
4965
  }
4581
4966
 
4582
- template <typename type4x4>
4583
- void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
4584
- const float d = xb->d;
4585
- const float min = xb->dmin;
4586
- device const uint8_t * q = (device const uint8_t *)xb->qs;
4587
- float dl, ml;
4588
- uint8_t sc = xb->scales[il];
4967
// Matrix-vector helper for IQ1_S data: each simdgroup accumulates the dot products of
// N_DST consecutive rows of block_iq1_s data (src0) with one float vector taken from
// src1, reduces the per-lane partial sums with simd_sum, and lane 0 stores the N_DST
// results into dst.
+ void kernel_mul_mv_iq1_s_f32_impl(
4968
+ device const void * src0,
4969
+ device const float * src1,
4970
+ device float * dst,
4971
+ constant int64_t & ne00,
4972
+ constant int64_t & ne01,
4973
+ constant int64_t & ne02,
4974
+ constant int64_t & ne10,
4975
+ constant int64_t & ne12,
4976
+ constant int64_t & ne0,
4977
+ constant int64_t & ne1,
4978
+ constant uint & r2,
4979
+ constant uint & r3,
4980
+ uint3 tgpig[[threadgroup_position_in_grid]],
4981
+ uint tiisg[[thread_index_in_simdgroup]],
4982
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
4983
+
4984
+ const int nb = ne00/QK_K; // number of QK_K-sized blocks in one row of src0
4985
+ const int r0 = tgpig.x;
4986
+ const int r1 = tgpig.y;
4987
+ const int im = tgpig.z;
4988
+
4989
+ const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; // first of the N_DST rows handled by this simdgroup
4990
+ const int ib_row = first_row * nb; // block offset of that first row
4991
+
4992
+ const uint i12 = im%ne12; // split im into an (i12, i13) index pair
4993
+ const uint i13 = im/ne12;
4994
+
4995
+ const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // r2/r3 divide the indices before selecting the src0 slice (presumably broadcast ratios - confirm with host code)
4996
+ device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
4997
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
4998
+
4999
+ float yl[16];
5000
+ float sumf[N_DST]={0.f}, all_sum;
5001
+
5002
+ const int nb32 = nb * (QK_K / 32); // number of 32-value groups in one row
5003
+
5004
+ const int ix = tiisg/2; // index of the 32-value group this lane starts on
5005
+ const int il = tiisg%2; // which 16-value half of that group this lane handles
5006
+
5007
+ device const float * y4 = y + 32 * ix + 16 * il;
5008
+
5009
+ for (int ib32 = ix; ib32 < nb32; ib32 += 16) { // stride of 16 groups matches 32 lanes / 2 lanes per group (assumes 32-wide simdgroups - TODO confirm)
5010
+
5011
+ for (int i = 0; i < 16; ++i) {
5012
+ yl[i] = y4[i]; // copy this lane's 16 src1 values once; they are reused for every row below
5013
+ }
5014
+
5015
+ const int ibl = ib32 / (QK_K / 32); // block index within the row
5016
+ const int ib = ib32 % (QK_K / 32); // 32-value group index within that block
5017
+
5018
+ device const block_iq1_s * xr = x + ibl;
5019
+ device const uint8_t * qs = xr->qs + 4 * ib + 2 * il;
5020
+ device const uint8_t * sc = xr->scales + 2 * ib + il;
5021
+ device const half * dh = &xr->d;
5022
+
5023
+ for (int row = 0; row < N_DST; row++) {
5024
+
5025
+ constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); // 9-bit grid index: qs[0] plus bit 3 of sc[0] moved to bit 8
5026
+ constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); // 9-bit grid index: qs[1] plus bit 7 of sc[0] moved to bit 8
5027
+
5028
+ float2 sum = {0};
5029
+ for (int j = 0; j < 8; ++j) {
5030
+ sum[0] += yl[j+ 0] * grid1[j];
5031
+ sum[1] += yl[j+ 8] * grid2[j];
5032
+ }
5033
+ sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1)); // each 8-value sum is weighted by 2*s+1, s being the low 3 bits of one nibble of sc[0]
5034
+
5035
+ dh += nb*sizeof(block_iq1_s)/2; // advance by nb blocks (one row); dh is a half pointer, hence the /2
5036
+ qs += nb*sizeof(block_iq1_s); // byte pointers advance by nb whole blocks
5037
+ sc += nb*sizeof(block_iq1_s);
5038
+ }
5039
+
5040
+ y4 += 16 * 32; // follow the ib32 += 16 stride in src1
5041
+ }
5042
+
5043
+ for (int row = 0; row < N_DST; ++row) {
5044
+ all_sum = simd_sum(sumf[row]); // combine the partial sums of all lanes in the simdgroup
5045
+ if (tiisg == 0) { // only lane 0 writes the result
5046
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
5047
+ }
5048
+ }
5049
+ }
5050
+
5051
// 16-entry float lookup table indexed by a 4-bit quant value; the iq4_nl / iq4_xs
// routines in this file use it to turn each nibble into a float before scaling.
+ constexpr constant static float kvalues_iq4nl_f[16] = {
5052
+ -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
5053
+ };
5054
+
5055
// Matrix-vector helper for IQ4_NL data: each simdgroup accumulates the dot products of
// two consecutive rows of block_iq4_nl data (src0) with one float vector from src1.
// The nibble lookup table is first copied into shared_values (threadgroup memory);
// results are reduced with simd_sum and written to dst by lane 0.
+ void kernel_mul_mv_iq4_nl_f32_impl(
5056
+ device const void * src0,
5057
+ device const float * src1,
5058
+ device float * dst,
5059
+ constant int64_t & ne00,
5060
+ constant int64_t & ne01,
5061
+ constant int64_t & ne02,
5062
+ constant int64_t & ne10,
5063
+ constant int64_t & ne12,
5064
+ constant int64_t & ne0,
5065
+ constant int64_t & ne1,
5066
+ constant uint & r2,
5067
+ constant uint & r3,
5068
+ threadgroup float * shared_values [[threadgroup(0)]],
5069
+ uint3 tgpig[[threadgroup_position_in_grid]],
5070
+ uint tiisg[[thread_index_in_simdgroup]],
5071
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
5072
+
5073
+ const int nb = ne00/QK4_NL; // number of QK4_NL-sized blocks in one row of src0
5074
+ const int r0 = tgpig.x;
5075
+ const int r1 = tgpig.y;
5076
+ const int im = tgpig.z;
5077
+ const int first_row = (r0 * 2 + sgitg) * 2; // two rows per simdgroup; the r0*2 term presumably means two simdgroups per threadgroup - confirm with the dispatch code
5078
+ const int ib_row = first_row * nb;
5079
+
5080
+ const uint i12 = im%ne12; // split im into an (i12, i13) index pair
5081
+ const uint i13 = im/ne12;
5082
+
5083
+ const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // r2/r3 divide the indices before selecting the src0 slice (presumably broadcast ratios - confirm with host code)
5084
+ device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
5085
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
5086
+
5087
+ const int ix = tiisg/2; // 0...15
5088
+ const int it = tiisg%2; // 0 or 1
5089
+
5090
+ shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16]; // every lane stores one table entry at its own lane index (indices >= 16 repeat the table)
5091
+ threadgroup_barrier(mem_flags::mem_threadgroup); // make the table visible before any lane reads it
5092
+
5093
+ float4 yl[4];
5094
+ float sumf[2]={0.f}, all_sum;
5095
+
5096
+ device const float * yb = y + ix * QK4_NL + it * 8; // lane pair ix shares one block; `it` picks an 8-float offset within it
5097
+
5098
+ uint32_t aux32[2];
5099
+ thread const uint8_t * q8 = (thread const uint8_t *)aux32; // byte view of aux32: q8[0..3] = aux32[0], q8[4..7] = aux32[1]
5100
+
5101
+ float4 qf1, qf2;
5102
+
5103
+ for (int ib = ix; ib < nb; ib += 16) { // each lane pair visits every 16th block of the row
5104
+
5105
+ device const float4 * y4 = (device const float4 *)yb;
5106
+ yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5]; // floats 0-3, 16-19, 4-7 and 20-23 relative to yb
5107
+
5108
+ for (int row = 0; row < 2; ++row) {
5109
+
5110
+ device const block_iq4_nl & xb = x[row*nb + ib]; // same block position, `row` rows further down
5111
+ device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
5112
+
5113
+ float4 acc1 = {0.f}, acc2 = {0.f};
5114
+
5115
+ aux32[0] = q4[0] | (q4[1] << 16); // assemble four packed bytes
5116
+ aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; // high nibbles of those bytes
5117
+ aux32[0] &= 0x0f0f0f0f; // low nibbles of those bytes
5118
+ qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
5119
+ qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
5120
+ acc1 += yl[0] * qf1; // low nibbles pair with floats 0-3
5121
+ acc2 += yl[1] * qf2; // high nibbles pair with floats 16-19
5122
+
5123
+ aux32[0] = q4[2] | (q4[3] << 16); // next four packed bytes, decoded the same way
5124
+ aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
5125
+ aux32[0] &= 0x0f0f0f0f;
5126
+ qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
5127
+ qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
5128
+ acc1 += yl[2] * qf1;
5129
+ acc2 += yl[3] * qf2;
5130
+
5131
+ acc1 += acc2;
5132
+
5133
+ sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); // apply the block scale d to the 16-value partial dot product
5134
+
5135
+ }
5136
+
5137
+ yb += 16 * QK4_NL; // follow the ib += 16 stride in src1
5138
+ }
5139
+
5140
+ for (int row = 0; row < 2; ++row) {
5141
+ all_sum = simd_sum(sumf[row]); // combine the partial sums of all lanes in the simdgroup
5142
+ if (tiisg == 0) { // only lane 0 writes the result
5143
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
5144
+ }
5145
+ }
5146
+ }
5147
+
5148
// Matrix-vector helper for IQ4_XS data, compiled only when QK_K != 64. Each simdgroup
// accumulates the dot products of two consecutive rows of block_iq4_xs data (src0) with
// one float vector from src1. Nibbles are decoded through a copy of kvalues_iq4nl_f held
// in shared_values, and every 32-value sub-block has its own 6-bit scale.
+ #if QK_K != 64
5149
+ void kernel_mul_mv_iq4_xs_f32_impl(
5150
+ device const void * src0,
5151
+ device const float * src1,
5152
+ device float * dst,
5153
+ constant int64_t & ne00,
5154
+ constant int64_t & ne01,
5155
+ constant int64_t & ne02,
5156
+ constant int64_t & ne10,
5157
+ constant int64_t & ne12,
5158
+ constant int64_t & ne0,
5159
+ constant int64_t & ne1,
5160
+ constant uint & r2,
5161
+ constant uint & r3,
5162
+ threadgroup float * shared_values [[threadgroup(0)]],
5163
+ uint3 tgpig[[threadgroup_position_in_grid]],
5164
+ uint tiisg[[thread_index_in_simdgroup]],
5165
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
5166
+
5167
+ const int nb = ne00/QK_K; // number of QK_K-sized blocks in one row of src0
5168
+ const int r0 = tgpig.x;
5169
+ const int r1 = tgpig.y;
5170
+ const int im = tgpig.z;
5171
+ const int first_row = (r0 * 2 + sgitg) * 2; // two rows per simdgroup; the r0*2 term presumably means two simdgroups per threadgroup - confirm with the dispatch code
5172
+ const int ib_row = first_row * nb;
5173
+
5174
+ const uint i12 = im%ne12; // split im into an (i12, i13) index pair
5175
+ const uint i13 = im/ne12;
5176
+
5177
+ const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // r2/r3 divide the indices before selecting the src0 slice (presumably broadcast ratios - confirm with host code)
5178
+ device const block_iq4_xs * x = (device const block_iq4_xs *) src0 + ib_row + offset0;
5179
+ device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
5180
+
5181
+ const int ix = tiisg/16; // 0 or 1
5182
+ const int it = tiisg%16; // 0...15
5183
+ const int ib = it/2; // 32-value sub-block index inside the block
5184
+ const int il = it%2; // selects an 8-byte / 8-float offset inside that sub-block
5185
+
5186
+ shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16]; // every lane stores one table entry at its own lane index (indices >= 16 repeat the table)
5187
+ threadgroup_barrier(mem_flags::mem_threadgroup); // make the table visible before any lane reads it
5188
+
5189
+ float4 yl[4];
5190
+ float sumf[2]={0.f}, all_sum;
5191
+
5192
+ device const float * yb = y + ix * QK_K + ib * 32 + il * 8;
5193
+
5194
+ uint32_t aux32[2];
5195
+ thread const uint8_t * q8 = (thread const uint8_t *)aux32; // byte view of aux32: q8[0..3] = aux32[0], q8[4..7] = aux32[1]
5196
+
5197
+ float4 qf1, qf2;
5198
+
5199
+ for (int ibl = ix; ibl < nb; ibl += 2) { // the two 16-lane halves alternate over the blocks of the row
5200
+
5201
+ device const float4 * y4 = (device const float4 *)yb;
5202
+ yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5]; // floats 0-3, 16-19, 4-7 and 20-23 relative to yb
5203
+
5204
+ for (int row = 0; row < 2; ++row) {
5205
+
5206
+ device const block_iq4_xs & xb = x[row*nb + ibl]; // same block position, `row` rows further down
5207
+ device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);
5208
+
5209
+ float4 acc1 = {0.f}, acc2 = {0.f};
5210
+
5211
+ aux32[0] = q4[0] & 0x0f0f0f0f; // low nibbles of four packed bytes
5212
+ aux32[1] = (q4[0] >> 4) & 0x0f0f0f0f; // high nibbles of the same bytes
5213
+ qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
5214
+ qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
5215
+ acc1 += yl[0] * qf1; // low nibbles pair with floats 0-3
5216
+ acc2 += yl[1] * qf2; // high nibbles pair with floats 16-19
5217
+
5218
+ aux32[0] = q4[1] & 0x0f0f0f0f; // next four packed bytes, decoded the same way
5219
+ aux32[1] = (q4[1] >> 4) & 0x0f0f0f0f;
5220
+ qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
5221
+ qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
5222
+ acc1 += yl[2] * qf1;
5223
+ acc2 += yl[3] * qf2;
5224
+
5225
+ acc1 += acc2;
5226
+
5227
+ const int ls = (((xb.scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((xb.scales_h >> 2*ib) & 3) << 4)) - 32; // 6-bit sub-block scale: low 4 bits from a scales_l nibble, high 2 bits from scales_h, then offset by -32
5228
+ sumf[row] += (float)xb.d * ls * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
5229
+
5230
+ }
5231
+
5232
+ yb += 2 * QK_K; // follow the ibl += 2 stride in src1
5233
+ }
5234
+
5235
+ for (int row = 0; row < 2; ++row) {
5236
+ all_sum = simd_sum(sumf[row]); // combine the partial sums of all lanes in the simdgroup
5237
+ if (tiisg == 0) { // only lane 0 writes the result
5238
+ dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
5239
+ }
5240
+ }
5241
+ }
5242
+ #endif
5243
+
5244
// Kernel entry point exported under the host name "kernel_mul_mv_iq1_s_f32".
// It only forwards to kernel_mul_mv_iq1_s_f32_impl; the nb00..nb02, ne11 and
// nb10..nb12 arguments are part of the signature but are not passed on.
+ [[host_name("kernel_mul_mv_iq1_s_f32")]]
5245
+ kernel void kernel_mul_mv_iq1_s_f32(
5246
+ device const void * src0,
5247
+ device const float * src1,
5248
+ device float * dst,
5249
+ constant int64_t & ne00,
5250
+ constant int64_t & ne01,
5251
+ constant int64_t & ne02,
5252
+ constant uint64_t & nb00,
5253
+ constant uint64_t & nb01,
5254
+ constant uint64_t & nb02,
5255
+ constant int64_t & ne10,
5256
+ constant int64_t & ne11,
5257
+ constant int64_t & ne12,
5258
+ constant uint64_t & nb10,
5259
+ constant uint64_t & nb11,
5260
+ constant uint64_t & nb12,
5261
+ constant int64_t & ne0,
5262
+ constant int64_t & ne1,
5263
+ constant uint & r2,
5264
+ constant uint & r3,
5265
+ uint3 tgpig[[threadgroup_position_in_grid]],
5266
+ uint tiisg[[thread_index_in_simdgroup]],
5267
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
5268
+
5269
+ kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
5270
+ }
5271
+
5272
// Kernel entry point exported under the host name "kernel_mul_mv_iq4_nl_f32".
// It only forwards to kernel_mul_mv_iq4_nl_f32_impl, including the threadgroup
// buffer shared_values; the nb00..nb02, ne11 and nb10..nb12 arguments are not passed on.
+ [[host_name("kernel_mul_mv_iq4_nl_f32")]]
5273
+ kernel void kernel_mul_mv_iq4_nl_f32(
5274
+ device const void * src0,
5275
+ device const float * src1,
5276
+ device float * dst,
5277
+ constant int64_t & ne00,
5278
+ constant int64_t & ne01,
5279
+ constant int64_t & ne02,
5280
+ constant uint64_t & nb00,
5281
+ constant uint64_t & nb01,
5282
+ constant uint64_t & nb02,
5283
+ constant int64_t & ne10,
5284
+ constant int64_t & ne11,
5285
+ constant int64_t & ne12,
5286
+ constant uint64_t & nb10,
5287
+ constant uint64_t & nb11,
5288
+ constant uint64_t & nb12,
5289
+ constant int64_t & ne0,
5290
+ constant int64_t & ne1,
5291
+ constant uint & r2,
5292
+ constant uint & r3,
5293
+ threadgroup float * shared_values [[threadgroup(0)]],
5294
+ uint3 tgpig[[threadgroup_position_in_grid]],
5295
+ uint tiisg[[thread_index_in_simdgroup]],
5296
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
5297
+
5298
+ kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
5299
+ }
5300
+
5301
// Kernel entry point exported under the host name "kernel_mul_mv_iq4_xs_f32".
// When QK_K == 64 it forwards to the iq4_nl implementation, otherwise to the
// iq4_xs implementation; both receive the same argument list.
// The nb00..nb02, ne11 and nb10..nb12 arguments are not passed on.
+ [[host_name("kernel_mul_mv_iq4_xs_f32")]]
5302
+ kernel void kernel_mul_mv_iq4_xs_f32(
5303
+ device const void * src0,
5304
+ device const float * src1,
5305
+ device float * dst,
5306
+ constant int64_t & ne00,
5307
+ constant int64_t & ne01,
5308
+ constant int64_t & ne02,
5309
+ constant uint64_t & nb00,
5310
+ constant uint64_t & nb01,
5311
+ constant uint64_t & nb02,
5312
+ constant int64_t & ne10,
5313
+ constant int64_t & ne11,
5314
+ constant int64_t & ne12,
5315
+ constant uint64_t & nb10,
5316
+ constant uint64_t & nb11,
5317
+ constant uint64_t & nb12,
5318
+ constant int64_t & ne0,
5319
+ constant int64_t & ne1,
5320
+ constant uint & r2,
5321
+ constant uint & r3,
5322
+ threadgroup float * shared_values [[threadgroup(0)]],
5323
+ uint3 tgpig[[threadgroup_position_in_grid]],
5324
+ uint tiisg[[thread_index_in_simdgroup]],
5325
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
5326
+
5327
+ #if QK_K == 64
5328
+ kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
5329
+ #else
5330
+ kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
5331
+ #endif
5332
+ }
5333
+
5334
+ //============================= templates and their specializations =============================
5335
+
5336
+ // NOTE: this is not dequantizing - we are simply fitting the template
// Loads one float4x4 from src and copies its 16 elements into reg, converting each
// element to reg's element type on assignment. The il argument is not used.
5337
+ template <typename type4x4>
5338
+ void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
5339
+ float4x4 temp = *(((device float4x4 *)src));
5340
+ for (int i = 0; i < 16; i++){
5341
+ reg[i/4][i%4] = temp[i/4][i%4]; // first index i/4, second index i%4
5342
+ }
5343
+ }
5344
+
5345
// Same shape as dequantize_f32 but for half data: loads one half4x4 from src and
// copies its 16 elements into reg. The il argument is not used.
+ template <typename type4x4>
5346
+ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
5347
+ half4x4 temp = *(((device half4x4 *)src));
5348
+ for (int i = 0; i < 16; i++){
5349
+ reg[i/4][i%4] = temp[i/4][i%4]; // first index i/4, second index i%4
5350
+ }
5351
+ }
5352
+
5353
// Expands 16 values of a block_q4_0 into reg. il == 0 decodes the low nibble of each
// quant byte, il != 0 the high nibble. Each value is nibble * d - 8 * d; the masked
// bits are never shifted down, the scale factors are pre-divided instead.
+ template <typename type4x4>
5354
+ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
5355
+ device const uint16_t * qs = ((device const uint16_t *)xb + 1); // skip the first 2 bytes of the block (presumably the half scale d - confirm against the struct)
5356
+ const float d1 = il ? (xb->d / 16.h) : xb->d; // a high nibble left in place is 16x too large, so divide the scale by 16
5357
+ const float d2 = d1 / 256.f; // bits taken from the upper byte are 256x too large
5358
+ const float md = -8.h * xb->d; // constant offset added to every value
5359
+ const ushort mask0 = il ? 0x00F0 : 0x000F; // selected nibble of the lower byte
5360
+ const ushort mask1 = mask0 << 8; // same nibble of the upper byte
5361
+
5362
+ for (int i=0;i<8;i++) {
5363
+ reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
5364
+ reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
5365
+ }
5366
+ }
5367
+
5368
// Expands 16 values of a block_q4_1 into reg. il == 0 decodes the low nibble of each
// quant byte, il != 0 the high nibble. Each value is nibble * d + m; the masked bits
// are never shifted down, the scale factors are pre-divided instead.
+ template <typename type4x4>
5369
+ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
5370
+ device const uint16_t * qs = ((device const uint16_t *)xb + 2); // skip the first 4 bytes of the block (presumably the d and m fields - confirm against the struct)
5371
+ const float d1 = il ? (xb->d / 16.h) : xb->d; // a high nibble left in place is 16x too large, so divide the scale by 16
5372
+ const float d2 = d1 / 256.f; // bits taken from the upper byte are 256x too large
5373
+ const float m = xb->m; // constant offset added to every value
5374
+ const ushort mask0 = il ? 0x00F0 : 0x000F; // selected nibble of the lower byte
5375
+ const ushort mask1 = mask0 << 8; // same nibble of the upper byte
5376
+
5377
+ for (int i=0;i<8;i++) {
5378
+ reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
5379
+ reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
5380
+ }
5381
+ }
5382
+
5383
// Expands 16 values of a block_q5_0 into reg. Each value combines a 4-bit nibble from
// qs with a fifth bit taken from the 32-bit qh word, then is computed as x * d - 16 * d.
// il == 0 uses the low nibbles and qh bits 0..15, il != 0 the high nibbles and qh bits 16..31.
+ template <typename type4x4>
5384
+ void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
5385
+ device const uint16_t * qs = ((device const uint16_t *)xb + 3); // skip the first 6 bytes of the block (presumably d and qh - confirm against the struct)
5386
+ const float d = xb->d;
5387
+ const float md = -16.h * xb->d; // constant offset added to every value
5388
+ const ushort mask = il ? 0x00F0 : 0x000F; // nibble selector applied to a single byte
5389
+
5390
+ const uint32_t qh = *((device const uint32_t *)xb->qh); // the fifth bits, read as one 32-bit word
5391
+
5392
+ const int x_mv = il ? 4 : 0; // shift that brings the selected nibble down to bits 0..3
5393
+
5394
+ const int gh_mv = il ? 12 : 0; // with gh_bk these two shifts place the wanted qh bit at bit position 4 (0x10)
5395
+ const int gh_bk = il ? 0 : 4;
5396
+
5397
+ for (int i = 0; i < 8; i++) {
5398
+ // extract the 5-th bits for x0 and x1
5399
+ const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
5400
+ const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
5401
+
5402
+ // combine the 4-bits from qs with the 5th bit
5403
+ const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
5404
+ const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
5405
+
5406
+ reg[i/2][2*(i%2)+0] = d * x0 + md;
5407
+ reg[i/2][2*(i%2)+1] = d * x1 + md;
5408
+ }
5409
+ }
5410
+
5411
// Expands 16 values of a block_q5_1 into reg. Each value combines a 4-bit nibble from
// qs with a fifth bit taken from the 32-bit qh word, then is computed as x * d + m.
// il == 0 uses the low nibbles and qh bits 0..15, il != 0 the high nibbles and qh bits 16..31.
+ template <typename type4x4>
5412
+ void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
5413
+ device const uint16_t * qs = ((device const uint16_t *)xb + 4); // skip the first 8 bytes of the block (presumably d, m and qh - confirm against the struct)
5414
+ const float d = xb->d;
5415
+ const float m = xb->m; // constant offset added to every value
5416
+ const ushort mask = il ? 0x00F0 : 0x000F; // nibble selector applied to a single byte
5417
+
5418
+ const uint32_t qh = *((device const uint32_t *)xb->qh); // the fifth bits, read as one 32-bit word
5419
+
5420
+ const int x_mv = il ? 4 : 0; // shift that brings the selected nibble down to bits 0..3
5421
+
5422
+ const int gh_mv = il ? 12 : 0; // with gh_bk these two shifts place the wanted qh bit at bit position 4 (0x10)
5423
+ const int gh_bk = il ? 0 : 4;
5424
+
5425
+ for (int i = 0; i < 8; i++) {
5426
+ // extract the 5-th bits for x0 and x1
5427
+ const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
5428
+ const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
5429
+
5430
+ // combine the 4-bits from qs with the 5th bit
5431
+ const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
5432
+ const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
5433
+
5434
+ reg[i/2][2*(i%2)+0] = d * x0 + m;
5435
+ reg[i/2][2*(i%2)+1] = d * x1 + m;
5436
+ }
5437
+ }
5438
+
5439
// Expands 16 values of a block_q8_0 into reg: signed 8-bit quants number
// 16*il .. 16*il+15 are each multiplied by the block scale d.
+ template <typename type4x4>
5440
+ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
5441
+ device const int8_t * qs = ((device const int8_t *)xb->qs);
5442
+ const half d = xb->d;
5443
+
5444
+ for (int i = 0; i < 16; i++) {
5445
+ reg[i/4][i%4] = (qs[i + 16*il] * d); // il selects which run of 16 quants is decoded
5446
+ }
5447
+ }
5448
+
5449
+ template <typename type4x4>
5450
+ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
5451
+ const float d = xb->d;
5452
+ const float min = xb->dmin;
5453
+ device const uint8_t * q = (device const uint8_t *)xb->qs;
5454
+ float dl, ml;
5455
+ uint8_t sc = xb->scales[il];
4589
5456
 
4590
5457
  #if QK_K == 256
4591
5458
  q = q + 32*(il/8) + 16*(il&1);
@@ -4659,6 +5526,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
4659
5526
  const float dl = d * sc[0];
4660
5527
  const float ml = min * sc[1];
4661
5528
  #else
5529
+ (void) get_scale_min_k4_just2;
5530
+
4662
5531
  q = q + 16 * (il&1);
4663
5532
  device const uint8_t * s = xb->scales;
4664
5533
  device const half2 * dh = (device const half2 *)xb->d;
@@ -4808,6 +5677,50 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x
4808
5677
  }
4809
5678
  }
4810
5679
 
5680
// Expands 16 values of a block_iq3_s into reg. Four 4-value entries are looked up in
// iq3s_grid using 9-bit indices (a qs byte plus one bit of qh); each value is scaled by
// dl and negated when its bit in the signs bytes is set.
+ template <typename type4x4>
5681
+ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
5682
+ // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
5683
+ const float d = xb->d;
5684
+ const int ib32 = il/2;
5685
+ il = il%2;
5686
+ // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
5687
+ device const uint8_t * qs = xb->qs + 8*ib32;
5688
+ device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
5689
+ const uint8_t qh = xb->qh[ib32] >> 4*il; // after the shift, bits 0..3 belong to this half
5690
+ const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)); // odd multiplier 2*s+1 built from a 4-bit scale nibble
5691
+ constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256))); // qh bit 0 becomes index bit 8
5692
+ constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256))); // qh bit 1 becomes index bit 8
5693
+ for (int i = 0; i < 4; ++i) {
5694
+ reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); // select yields -1 when the masked sign bit is non-zero
5695
+ reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
5696
+ }
5697
+ grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256))); // qh bit 2 becomes index bit 8
5698
+ grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256))); // qh bit 3 becomes index bit 8
5699
+ for (int i = 0; i < 4; ++i) {
5700
+ reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
5701
+ reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
5702
+ }
5703
+ }
5704
+
5705
// Expands 16 values of a block_iq2_s into reg. Two 8-value entries are looked up in
// iq2s_grid using 10-bit indices (a qs byte plus two bits of qh); each value is scaled
// by dl and negated when its bit in the matching sign byte is set.
+ template <typename type4x4>
5706
+ void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
5707
+ // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
5708
+ const float d = xb->d;
5709
+ const int ib32 = il/2;
5710
+ il = il%2;
5711
+ // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
5712
+ device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
5713
+ device const uint8_t * signs = qs + QK_K/8; // sign bytes are read QK_K/8 bytes after the corresponding qs bytes
5714
+ const uint8_t qh = xb->qh[ib32] >> 4*il; // after the shift, bits 0..3 belong to this half
5715
+ const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; // (0.5 + 4-bit scale nibble) * 0.25, times the block scale
5716
+ constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300))); // qh bits 0..1 become index bits 8..9
5717
+ constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300))); // qh bits 2..3 become index bits 8..9
5718
+ for (int i = 0; i < 8; ++i) {
5719
+ reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]); // select yields -1 when the masked sign bit is non-zero
5720
+ reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
5721
+ }
5722
+ }
5723
+
4811
5724
  template <typename type4x4>
4812
5725
  void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
4813
5726
  // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
@@ -4824,6 +5737,45 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
4824
5737
  }
4825
5738
  }
4826
5739
 
5740
// Expands 16 values of a block_iq4_nl into reg. For each of the four reg entries it
// assembles four packed bytes, keeps the low (il == 0) or high (il == 1) nibble of each,
// and maps the nibbles through kvalues_iq4nl_f before applying the block scale d.
+ template <typename type4x4>
5741
+ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
5742
+ device const uint16_t * q4 = (device const uint16_t *)xb->qs;
5743
+ const float d = xb->d;
5744
+ uint32_t aux32;
5745
+ thread const uint8_t * q8 = (thread const uint8_t *)&aux32; // byte view of aux32, one table index per byte
5746
+ for (int i = 0; i < 4; ++i) {
5747
+ aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; // shift by 4*il picks low or high nibbles, mask keeps one nibble per byte
5748
+ reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
5749
+ reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
5750
+ reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
5751
+ reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
5752
+ }
5753
+ }
5754
+
5755
// Expands 16 values of a block_iq4_xs into reg. When QK_K == 64 it simply delegates to
// dequantize_iq4_nl. Otherwise it decodes one half of a 32-value sub-block: nibbles are
// mapped through kvalues_iq4nl_f and scaled by xb->d times a 6-bit sub-block scale
// offset by -32.
+ template <typename type4x4>
5756
+ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
5757
+ #if QK_K == 64
5758
+ dequantize_iq4_nl(xb, il, reg);
5759
+ #else
5760
+ // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
5761
+ const int ib32 = il/2;
5762
+ il = il%2;
5763
+ // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
5764
+ device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32; // four 32-bit words (16 bytes) per sub-block
5765
+ const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4); // low 4 bits from a scales_l nibble, high 2 bits from scales_h
5766
+ const float d = (float)xb->d * (ls - 32);
5767
+ uint32_t aux32;
5768
+ thread const uint8_t * q8 = (thread const uint8_t *)&aux32; // byte view of aux32, one table index per byte
5769
+ for (int i = 0; i < 4; ++i) {
5770
+ aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f; // shift by 4*il picks low or high nibbles, mask keeps one nibble per byte
5771
+ reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
5772
+ reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
5773
+ reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
5774
+ reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
5775
+ }
5776
+ #endif
5777
+ }
5778
+
4827
5779
  template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
4828
5780
  kernel void kernel_get_rows(
4829
5781
  device const void * src0,
@@ -5366,7 +6318,15 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows
5366
6318
  template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
5367
6319
  template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5368
6320
  template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
6321
+ template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
6322
+ template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
5369
6323
  template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
6324
+ template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
6325
+ #if QK_K == 64
6326
+ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, 2, dequantize_iq4_xs>;
6327
+ #else
6328
+ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
6329
+ #endif
5370
6330
 
5371
6331
  //
5372
6332
  // matrix-matrix multiplication
@@ -5406,7 +6366,15 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
5406
6366
  template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
5407
6367
  template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5408
6368
  template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
6369
+ template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
6370
+ template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
5409
6371
  template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
6372
+ template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
6373
+ #if QK_K == 64
6374
+ template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, 2, dequantize_iq4_xs>; // block type spelled block_iq4_xs, matching the get_rows and mul_mm_id iq4_xs instantiations for QK_K == 64
6375
+ #else
6376
+ template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
6377
+ #endif
5410
6378
 
5411
6379
  //
5412
6380
  // indirect matrix-matrix multiplication
@@ -5458,7 +6426,15 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu
5458
6426
  template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
5459
6427
  template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
5460
6428
  template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
6429
+ template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
6430
+ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
5461
6431
  template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
6432
+ template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
6433
+ #if QK_K == 64
6434
+ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, 2, dequantize_iq4_xs>;
6435
+ #else
6436
+ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
6437
+ #endif
5462
6438
 
5463
6439
  //
5464
6440
  // matrix-vector multiplication
@@ -6427,6 +7403,136 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32(
6427
7403
  sgitg);
6428
7404
  }
6429
7405
 
7406
// Indirect matrix-vector kernel exported as "kernel_mul_mv_id_iq3_s_f32". It reads an
// index from ids, uses it to pick one of the eight src0 candidates (src00..src07), and
// forwards to kernel_mul_mv_iq3_s_f32_impl with src1 and dst offset for the current bid.
// tiitg and several stride/extent arguments are declared but not used in this body.
+ [[host_name("kernel_mul_mv_id_iq3_s_f32")]]
7407
+ kernel void kernel_mul_mv_id_iq3_s_f32(
7408
+ device const char * ids,
7409
+ device const char * src1,
7410
+ device float * dst,
7411
+ constant uint64_t & nbi1,
7412
+ constant int64_t & ne00,
7413
+ constant int64_t & ne01,
7414
+ constant int64_t & ne02,
7415
+ constant uint64_t & nb00,
7416
+ constant uint64_t & nb01,
7417
+ constant uint64_t & nb02,
7418
+ constant int64_t & ne10,
7419
+ constant int64_t & ne11,
7420
+ constant int64_t & ne12,
7421
+ constant int64_t & ne13,
7422
+ constant uint64_t & nb10,
7423
+ constant uint64_t & nb11,
7424
+ constant uint64_t & nb12,
7425
+ constant int64_t & ne0,
7426
+ constant int64_t & ne1,
7427
+ constant uint64_t & nb1,
7428
+ constant uint & r2,
7429
+ constant uint & r3,
7430
+ constant int & idx,
7431
+ device const char * src00,
7432
+ device const char * src01,
7433
+ device const char * src02,
7434
+ device const char * src03,
7435
+ device const char * src04,
7436
+ device const char * src05,
7437
+ device const char * src06,
7438
+ device const char * src07,
7439
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
7440
+ uint3 tgpig[[threadgroup_position_in_grid]],
7441
+ uint tiitg[[thread_index_in_threadgroup]],
7442
+ uint tiisg[[thread_index_in_simdgroup]],
7443
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
7444
+ device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; // collect the candidates so one can be chosen by index
7445
+
7446
+ const int64_t bid = tgpig.z/(ne12*ne13); // high part of tgpig.z
7447
+
7448
+ tgpig.z = tgpig.z%(ne12*ne13); // the impl only sees the low part
7449
+
7450
+ const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; // entry idx of row bid of ids; rows are nbi1 bytes apart
7451
+
7452
+ kernel_mul_mv_iq3_s_f32_impl(
7453
+ src0[id],
7454
+ (device const float *) (src1 + bid*nb11), // byte offset into src1
7455
+ dst + bid*ne0, // element offset into dst
7456
+ ne00,
7457
+ ne01,
7458
+ ne02,
7459
+ ne10,
7460
+ ne12,
7461
+ ne0,
7462
+ ne1,
7463
+ r2,
7464
+ r3,
7465
+ shared_values,
7466
+ tgpig,
7467
+ tiisg,
7468
+ sgitg);
7469
+ }
7470
+
7471
// Indirect matrix-vector kernel exported as "kernel_mul_mv_id_iq2_s_f32". It reads an
// index from ids, uses it to pick one of the eight src0 candidates (src00..src07), and
// forwards to kernel_mul_mv_iq2_s_f32_impl with src1 and dst offset for the current bid.
// tiitg and several stride/extent arguments are declared but not used in this body.
+ [[host_name("kernel_mul_mv_id_iq2_s_f32")]]
7472
+ kernel void kernel_mul_mv_id_iq2_s_f32(
7473
+ device const char * ids,
7474
+ device const char * src1,
7475
+ device float * dst,
7476
+ constant uint64_t & nbi1,
7477
+ constant int64_t & ne00,
7478
+ constant int64_t & ne01,
7479
+ constant int64_t & ne02,
7480
+ constant uint64_t & nb00,
7481
+ constant uint64_t & nb01,
7482
+ constant uint64_t & nb02,
7483
+ constant int64_t & ne10,
7484
+ constant int64_t & ne11,
7485
+ constant int64_t & ne12,
7486
+ constant int64_t & ne13,
7487
+ constant uint64_t & nb10,
7488
+ constant uint64_t & nb11,
7489
+ constant uint64_t & nb12,
7490
+ constant int64_t & ne0,
7491
+ constant int64_t & ne1,
7492
+ constant uint64_t & nb1,
7493
+ constant uint & r2,
7494
+ constant uint & r3,
7495
+ constant int & idx,
7496
+ device const char * src00,
7497
+ device const char * src01,
7498
+ device const char * src02,
7499
+ device const char * src03,
7500
+ device const char * src04,
7501
+ device const char * src05,
7502
+ device const char * src06,
7503
+ device const char * src07,
7504
+ threadgroup int8_t * shared_values [[threadgroup(0)]],
7505
+ uint3 tgpig[[threadgroup_position_in_grid]],
7506
+ uint tiitg[[thread_index_in_threadgroup]],
7507
+ uint tiisg[[thread_index_in_simdgroup]],
7508
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
7509
+ device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; // collect the candidates so one can be chosen by index
7510
+
7511
+ const int64_t bid = tgpig.z/(ne12*ne13); // high part of tgpig.z
7512
+
7513
+ tgpig.z = tgpig.z%(ne12*ne13); // the impl only sees the low part
7514
+
7515
+ const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; // entry idx of row bid of ids; rows are nbi1 bytes apart
7516
+
7517
+ kernel_mul_mv_iq2_s_f32_impl(
7518
+ src0[id],
7519
+ (device const float *) (src1 + bid*nb11), // byte offset into src1
7520
+ dst + bid*ne0, // element offset into dst
7521
+ ne00,
7522
+ ne01,
7523
+ ne02,
7524
+ ne10,
7525
+ ne12,
7526
+ ne0,
7527
+ ne1,
7528
+ r2,
7529
+ r3,
7530
+ shared_values,
7531
+ tgpig,
7532
+ tiisg,
7533
+ sgitg);
7534
+ }
7535
+
6430
7536
  [[host_name("kernel_mul_mv_id_iq1_s_f32")]]
6431
7537
  kernel void kernel_mul_mv_id_iq1_s_f32(
6432
7538
  device const char * ids,
@@ -6489,3 +7595,137 @@ kernel void kernel_mul_mv_id_iq1_s_f32(
6489
7595
  tiisg,
6490
7596
  sgitg);
6491
7597
  }
7598
+
7599
+ [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
7600
+ kernel void kernel_mul_mv_id_iq4_nl_f32(
7601
+ device const char * ids,
7602
+ device const char * src1,
7603
+ device float * dst,
7604
+ constant uint64_t & nbi1,
7605
+ constant int64_t & ne00,
7606
+ constant int64_t & ne01,
7607
+ constant int64_t & ne02,
7608
+ constant uint64_t & nb00,
7609
+ constant uint64_t & nb01,
7610
+ constant uint64_t & nb02,
7611
+ constant int64_t & ne10,
7612
+ constant int64_t & ne11,
7613
+ constant int64_t & ne12,
7614
+ constant int64_t & ne13,
7615
+ constant uint64_t & nb10,
7616
+ constant uint64_t & nb11,
7617
+ constant uint64_t & nb12,
7618
+ constant int64_t & ne0,
7619
+ constant int64_t & ne1,
7620
+ constant uint64_t & nb1,
7621
+ constant uint & r2,
7622
+ constant uint & r3,
7623
+ constant int & idx,
7624
+ device const char * src00,
7625
+ device const char * src01,
7626
+ device const char * src02,
7627
+ device const char * src03,
7628
+ device const char * src04,
7629
+ device const char * src05,
7630
+ device const char * src06,
7631
+ device const char * src07,
7632
+ threadgroup float * shared_values [[threadgroup(0)]],
7633
+ uint3 tgpig[[threadgroup_position_in_grid]],
7634
+ uint tiitg[[thread_index_in_threadgroup]],
7635
+ uint tiisg[[thread_index_in_simdgroup]],
7636
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
7637
+ device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
7638
+
7639
+ const int64_t bid = tgpig.z/(ne12*ne13);
7640
+
7641
+ tgpig.z = tgpig.z%(ne12*ne13);
7642
+
7643
+ const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
7644
+
7645
+ kernel_mul_mv_iq4_nl_f32_impl(
7646
+ src0[id],
7647
+ (device const float *) (src1 + bid*nb11),
7648
+ dst + bid*ne0,
7649
+ ne00,
7650
+ ne01,
7651
+ ne02,
7652
+ ne10,
7653
+ ne12,
7654
+ ne0,
7655
+ ne1,
7656
+ r2,
7657
+ r3,
7658
+ shared_values,
7659
+ tgpig,
7660
+ tiisg,
7661
+ sgitg);
7662
+ }
7663
+
7664
+ [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]
7665
+ kernel void kernel_mul_mv_id_iq4_xs_f32(
7666
+ device const char * ids,
7667
+ device const char * src1,
7668
+ device float * dst,
7669
+ constant uint64_t & nbi1,
7670
+ constant int64_t & ne00,
7671
+ constant int64_t & ne01,
7672
+ constant int64_t & ne02,
7673
+ constant uint64_t & nb00,
7674
+ constant uint64_t & nb01,
7675
+ constant uint64_t & nb02,
7676
+ constant int64_t & ne10,
7677
+ constant int64_t & ne11,
7678
+ constant int64_t & ne12,
7679
+ constant int64_t & ne13,
7680
+ constant uint64_t & nb10,
7681
+ constant uint64_t & nb11,
7682
+ constant uint64_t & nb12,
7683
+ constant int64_t & ne0,
7684
+ constant int64_t & ne1,
7685
+ constant uint64_t & nb1,
7686
+ constant uint & r2,
7687
+ constant uint & r3,
7688
+ constant int & idx,
7689
+ device const char * src00,
7690
+ device const char * src01,
7691
+ device const char * src02,
7692
+ device const char * src03,
7693
+ device const char * src04,
7694
+ device const char * src05,
7695
+ device const char * src06,
7696
+ device const char * src07,
7697
+ threadgroup float * shared_values [[threadgroup(0)]],
7698
+ uint3 tgpig[[threadgroup_position_in_grid]],
7699
+ uint tiitg[[thread_index_in_threadgroup]],
7700
+ uint tiisg[[thread_index_in_simdgroup]],
7701
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
7702
+ device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
7703
+
7704
+ const int64_t bid = tgpig.z/(ne12*ne13);
7705
+
7706
+ tgpig.z = tgpig.z%(ne12*ne13);
7707
+
7708
+ const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
7709
+
7710
+ #if QK_K == 64
7711
+ kernel_mul_mv_iq4_nl_f32_impl(
7712
+ #else
7713
+ kernel_mul_mv_iq4_xs_f32_impl(
7714
+ #endif
7715
+ src0[id],
7716
+ (device const float *) (src1 + bid*nb11),
7717
+ dst + bid*ne0,
7718
+ ne00,
7719
+ ne01,
7720
+ ne02,
7721
+ ne10,
7722
+ ne12,
7723
+ ne0,
7724
+ ne1,
7725
+ r2,
7726
+ r3,
7727
+ shared_values,
7728
+ tgpig,
7729
+ tiisg,
7730
+ sgitg);
7731
+ }