llama_cpp 0.15.0 → 0.15.1

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
@@ -326,14 +326,20 @@ extern "C" {
  // get ggml_status name string
  GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
 
+ // ieee 754-2008 half-precision float16
+ // todo: make this not an integral type
  typedef uint16_t ggml_fp16_t;
-
- // convert FP16 <-> FP32
- GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
- GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+ GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+ // google brain half-precision bfloat16
+ typedef struct { uint16_t bits; } ggml_bf16_t;
+ GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+ GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+ GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+ GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
  struct ggml_object;
  struct ggml_context;
@@ -370,6 +376,7 @@ extern "C" {
  GGML_TYPE_I64 = 27,
  GGML_TYPE_F64 = 28,
  GGML_TYPE_IQ1_M = 29,
+ GGML_TYPE_BF16 = 30,
  GGML_TYPE_COUNT,
  };
 
@@ -410,6 +417,7 @@ extern "C" {
  GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
+ GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
  };
 
  // available tensor operations:
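
The ggml.h hunks above add a bfloat16 storage type, ggml_bf16_t (a struct wrapping a uint16_t), next to the existing ggml_fp16_t, together with scalar and row-wise conversion helpers and new GGML_TYPE_BF16 / GGML_FTYPE_MOSTLY_BF16 enum entries. As a rough, self-contained illustration of what that representation means (this is not ggml's code; it only mirrors the bit layout implied by the struct and by the "consider just doing << 16" comment, and uses plain truncation on the narrowing side where the library may round):

#include <cstdint>
#include <cstdio>
#include <cstring>

struct bf16 { uint16_t bits; };              // same layout as ggml_bf16_t

// Illustrative narrowing conversion: keep the top 16 bits of the binary32.
static bf16 fp32_to_bf16_trunc(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof u);
    return bf16{ static_cast<uint16_t>(u >> 16) };
}

// Widening conversion is exact: shift the stored bits back into the high half.
static float bf16_to_fp32(bf16 h) {
    uint32_t u = static_cast<uint32_t>(h.bits) << 16;
    float f;
    std::memcpy(&f, &u, sizeof f);
    return f;
}

int main() {
    float x = 3.14159f;
    bf16 h = fp32_to_bf16_trunc(x);
    std::printf("%.6f -> 0x%04x -> %.6f\n", x, h.bits, bf16_to_fp32(h));
    return 0;
}

Because bfloat16 keeps float32's 8-bit exponent but only 7 explicit mantissa bits, the round trip preserves range while losing precision, which is why the diff threads BF16 through the type and file-type enums as its own format rather than reusing F16.
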
@@ -3175,6 +3175,7 @@ struct llama_model_loader {
  switch (type_max) {
  case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
  case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+ case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
  case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
  case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
  case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3666,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  switch (ftype) {
  case LLAMA_FTYPE_ALL_F32: return "all F32";
  case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
  case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -4383,6 +4385,21 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "gpt-2") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "refact") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+ } else if (
+ tokenizer_pre == "command-r") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+ } else if (
+ tokenizer_pre == "qwen2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "olmo") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+ } else if (
+ tokenizer_pre == "dbrx") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -6120,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  || !(
  model.ftype == LLAMA_FTYPE_ALL_F32 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
  )
@@ -11952,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
  GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
  GGML_ASSERT(llama_is_byte_token(vocab, id));
- const auto& token_data = vocab.id_to_token.at(id);
+ const auto & token_data = vocab.id_to_token.at(id);
  switch (llama_vocab_get_type(vocab)) {
  case LLAMA_VOCAB_TYPE_SPM: {
  auto buf = token_data.text.substr(3, 2);
@@ -12188,6 +12206,7 @@ struct llm_tokenizer_bpe {
  case LLAMA_VOCAB_TYPE_BPE:
  switch (vocab.type_pre) {
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
  word_collection = unicode_regex_split(text, {
  // original regex from tokenizer.json
  //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12212,14 +12231,13 @@ struct llm_tokenizer_bpe {
  "\\s?\\p{L}+",
  "\\s?\\p{P}+",
  "[一-龥ࠀ-一가-퟿]+",
- "\\p{N}+",
+ "\\p{N}",
  });
  break;
  case LLAMA_VOCAB_PRE_TYPE_FALCON:
  word_collection = unicode_regex_split(text, {
  "[\\p{P}\\$\\+<=>\\^~\\|]+",
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- "\\p{N}+",
  "[0-9][0-9][0-9]",
  });
  break;
@@ -12235,11 +12253,26 @@ struct llm_tokenizer_bpe {
  });
  break;
  case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+ word_collection = unicode_regex_split(text, {
+ "\\p{N}",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ });
+ break;
  case LLAMA_VOCAB_PRE_TYPE_GPT2:
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
  word_collection = unicode_regex_split(text, {
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
  });
  break;
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+ word_collection = unicode_regex_split(text, {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ });
+ break;
  default:
  // default regex for BPE tokenization pre-processing
  word_collection = unicode_regex_split(text, {
@@ -12455,7 +12488,7 @@ struct llm_tokenizer_wpm {
  continue;
  }
  code = unicode_tolower(code);
- if (type == CODEPOINT_TYPE_WHITESPACE) {
+ if (type == CODEPOINT_TYPE_SEPARATOR) {
  code = ' ';
  }
  std::string s = unicode_cpt_to_utf8(code);
@@ -14142,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
  if (qtype.to_float == NULL) {
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
  }
- } else if (tensor->type != GGML_TYPE_F16) {
+ } else if (tensor->type != GGML_TYPE_F16 &&
+ tensor->type != GGML_TYPE_BF16) {
  throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
  }
 
  if (nthread < 2) {
  if (tensor->type == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+ } else if (tensor->type == GGML_TYPE_BF16) {
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
  } else if (ggml_is_quantized(tensor->type)) {
  qtype.to_float(tensor->data, f32_output, nelements);
  } else {
@@ -14157,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
  return;
  }
 
- size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+ size_t block_size;
+ if (tensor->type == GGML_TYPE_F16 ||
+ tensor->type == GGML_TYPE_BF16) {
+ block_size = 1;
+ } else {
+ block_size = (size_t)ggml_blck_size(tensor->type);
+ }
+
  size_t block_size_bytes = ggml_type_size(tensor->type);
 
  GGML_ASSERT(nelements % block_size == 0);
@@ -14176,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
  if (typ == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+ } else if (typ == GGML_TYPE_BF16) {
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
  } else {
  qtype.to_float(inbuf, outbuf, nels);
  }
@@ -14536,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
  case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
+ case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
  case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
 
  // K-quants
@@ -15473,13 +15519,6 @@ struct llama_context * llama_new_context_with_model(
  cparams.flash_attn = false;
  }
 
- #ifdef GGML_USE_HIPBLAS
- if (cparams.flash_attn) {
- LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
- cparams.flash_attn = false;
- }
- #endif
-
  if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
  }
@@ -17466,9 +17505,10 @@ int32_t llama_tokenize(
 
  static std::string llama_decode_text(const std::string & text) {
  std::string decoded_text;
- auto unicode_sequences = unicode_cpts_from_utf8(text);
- for (auto & unicode_sequence : unicode_sequences) {
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+ for (const auto cpt : cpts) {
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
  }
 
  return decoded_text;
@@ -17832,7 +17872,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
  /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
 
  /*.n_sample =*/ std::max(1, ctx->n_sample),
- /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
  /*.n_eval =*/ std::max(1, ctx->n_eval),
  };
 
@@ -79,6 +79,11 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_MPT = 5,
  LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
  LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+ LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+ LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
+ LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
+ LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
  };
 
  // note: these values should be synchronized with ggml_rope
@@ -134,6 +139,7 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
 
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
  };
@@ -171,7 +177,7 @@ extern "C" {
  bool sorted;
  } llama_token_data_array;
 
- typedef bool (*llama_progress_callback)(float progress, void *ctx);
+ typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
  // Input data for llama_decode
  // A llama_batch object can contain input about one or many sequences
@@ -1,6 +1,3 @@
- // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
- // vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
- //
  // Copyright 2024 Mozilla Foundation
  //
  // Permission is hereby granted, free of charge, to any person obtaining
@@ -585,15 +582,15 @@ class tinyBLAS_Q0_ARM {
  };
  #endif // __ARM_FEATURE_DOTPROD
 
- #if defined(__AVX2__) || defined(__AVX512F__)
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  template <typename TA, typename TB, typename TC>
- class tinyBLAS_Q0_AVX2 {
+ class tinyBLAS_Q0_AVX {
  public:
- tinyBLAS_Q0_AVX2(int64_t k,
- const TA *A, int64_t lda,
- const TB *B, int64_t ldb,
- TC *C, int64_t ldc,
- int ith, int nth)
+ tinyBLAS_Q0_AVX(int64_t k,
+ const TA *A, int64_t lda,
+ const TB *B, int64_t ldb,
+ TC *C, int64_t ldc,
+ int ith, int nth)
  : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
  }
 
@@ -728,14 +725,34 @@ class tinyBLAS_Q0_AVX2 {
  __m256 Cv[RN][RM] = {};
  for (int64_t l = 0; l < k; ++l)
  for (int64_t j = 0; j < RN; ++j)
- for (int64_t i = 0; i < RM; ++i)
+ for (int64_t i = 0; i < RM; ++i) {
+ #if defined(__AVX2__)
+ __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+ load(A + lda * (ii + i) + l)),
+ _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
+ load(A + lda * (ii + i) + l)));
+ #else
+ __m128i ali0 = load0(A + lda * (ii + i) + l);
+ __m128i ali1 = load1(A + lda * (ii + i) + l);
+ __m128i blj0 = load0(B + ldb * (jj + j) + l);
+ __m128i blj1 = load1(B + ldb * (jj + j) + l);
+
+ __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
+ __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
+ __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
+ __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
+
+ // updot
+ const __m128i oneFill = _mm_set1_epi16(1);
+ __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
+ __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
+ __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
+ #endif
  Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
  unhalf(B[ldb * (jj + j) + l].d)),
- updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
- load(A + lda * (ii + i) + l)),
- _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
- load(A + lda * (ii + i) + l))),
- Cv[j][i]);
+ udTmp,
+ Cv[j][i]);
+ }
  for (int64_t j = 0; j < RN; ++j)
  for (int64_t i = 0; i < RM; ++i)
  C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
@@ -746,10 +763,28 @@ class tinyBLAS_Q0_AVX2 {
  return _mm256_loadu_si256((const __m256i *)b->qs);
  }
 
+ inline __m128i load0(const block_q8_0 *b) {
+ return _mm_loadu_si128((const __m128i *)b->qs);
+ }
+
+ inline __m128i load1(const block_q8_0 *b) {
+ return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
+ }
+
  inline __m256i load(const block_q4_0 *b) {
  return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
  }
 
+ inline __m128i load0(const block_q4_0 *b) {
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+ return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
+ }
+
+ inline __m128i load1(const block_q4_0 *b) {
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+ return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
+ }
+
  inline __m256 updot(__m256i u, __m256i s) {
  __m256i res;
  #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
@@ -777,7 +812,7 @@ class tinyBLAS_Q0_AVX2 {
  const int ith;
  const int nth;
  };
- #endif // __AVX2__
+ #endif // __AVX__
 
  } // namespace
 
@@ -928,8 +963,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
  case GGML_TYPE_Q8_0: {
  if (Btype != GGML_TYPE_Q8_0)
  return false;
- #if defined(__AVX2__) || defined(__AVX512F__)
- tinyBLAS_Q0_AVX2<block_q8_0, block_q8_0, float> tb{
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+ tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
  k, (const block_q8_0 *)A, lda,
  (const block_q8_0 *)B, ldb,
  (float *)C, ldc,
@@ -952,8 +987,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
  case GGML_TYPE_Q4_0: {
  if (Btype != GGML_TYPE_Q8_0)
  return false;
- #if defined(__AVX2__) || defined(__AVX512F__)
- tinyBLAS_Q0_AVX2<block_q4_0, block_q8_0, float> tb{
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+ tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
  k, (const block_q4_0 *)A, lda,
  (const block_q8_0 *)B, ldb,
  (float *)C, ldc,
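
The sgemm.cpp hunks rename tinyBLAS_Q0_AVX2 to tinyBLAS_Q0_AVX and extend it to plain AVX (pre-AVX2) machines: the new load0()/load1() helpers fetch each quantized block as two 128-bit halves, and the #else branch rebuilds updot's dot product from _mm_maddubs_epi16/_mm_madd_epi16 on those halves. As a scalar sketch of the Q4_0 unpacking those helpers perform (illustrative only, not the library's code; the layout is read off the masks in the diff: each byte of qs packs two 4-bit weights, re-centered to [-8, 7] by subtracting 8):

#include <cstdint>
#include <cstdio>

int main() {
    // One packed Q4_0 byte: low nibble 0x3, high nibble 0xA.
    uint8_t packed = 0xA3;

    // load0() keeps the low nibbles:  (15 & x) - 8
    int lo = (packed & 0x0F) - 8;          //  3 - 8 = -5

    // load1() keeps the high nibbles: (15 & (x >> 4)) - 8
    int hi = ((packed >> 4) & 0x0F) - 8;   // 10 - 8 =  2

    std::printf("packed=0x%02X -> lo=%d hi=%d\n", packed, lo, hi);
    return 0;
}

Each pair of unpacked values is then scaled by the block's shared scale, which is the unhalf(...d) multiplication visible in the Cv update above.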