llama_cpp 0.14.7 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12383,3 +12383,287 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k)
12383
12383
  block_iq2_s * restrict y = vy;
12384
12384
  quantize_row_iq2_s_reference(x, y, k);
12385
12385
  }
12386
+
12387
+ static bool validate_float(float f, size_t i) {
12388
+ if (isinf(f)) {
12389
+ fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
12390
+ return false;
12391
+ }
12392
+
12393
+ if (isnan(f)) {
12394
+ fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
12395
+ return false;
12396
+ }
12397
+
12398
+ return true;
12399
+ }
12400
+
12401
+ static bool isinf_fp16(ggml_fp16_t f) {
12402
+ return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0;
12403
+ }
12404
+
12405
+ static bool isnan_fp16(ggml_fp16_t f) {
12406
+ return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0;
12407
+ }
12408
+
12409
+ static bool validate_fp16(ggml_fp16_t f, size_t i) {
12410
+ if (isinf_fp16(f)) {
12411
+ fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
12412
+ return false;
12413
+ }
12414
+
12415
+ if (isnan_fp16(f)) {
12416
+ fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
12417
+ return false;
12418
+ }
12419
+
12420
+ return true;
12421
+ }
12422
+
12423
+ #define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
12424
+ const type * q = (const type *) (data); \
12425
+ for (size_t i = 0; i < (nb); ++i) { \
12426
+ if (!validate_fp16(q[i].d, i)) { \
12427
+ return false; \
12428
+ } \
12429
+ }
12430
+
12431
+ #define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \
12432
+ const type * q = (const type *) (data); \
12433
+ for (size_t i = 0; i < (nb); ++i) { \
12434
+ if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \
12435
+ return false; \
12436
+ } \
12437
+ }
12438
+
12439
+ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
12440
+ if (type < 0 || type >= GGML_TYPE_COUNT) {
12441
+ fprintf(stderr, "%s: invalid type %d\n", __func__, type);
12442
+ return false;
12443
+ }
12444
+
12445
+ if (nbytes % ggml_type_size(type) != 0) {
12446
+ fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
12447
+ return false;
12448
+ }
12449
+
12450
+ const size_t nb = nbytes/ggml_type_size(type);
12451
+
12452
+ switch (type) {
12453
+ case GGML_TYPE_F16:
12454
+ {
12455
+ const ggml_fp16_t * f = (const ggml_fp16_t *) data;
12456
+ size_t i = 0;
12457
+ #if defined(__AVX2__)
12458
+ for (; i + 15 < nb; i += 16) {
12459
+ __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
12460
+ __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
12461
+ __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
12462
+ int mask = _mm256_movemask_epi8(cmp);
12463
+ if (mask) {
12464
+ for (size_t j = 0; j < 16; ++j) {
12465
+ if (!validate_fp16(f[i + j], i + j)) {
12466
+ return false;
12467
+ }
12468
+ }
12469
+ GGML_UNREACHABLE();
12470
+ }
12471
+ }
12472
+ #elif defined(__ARM_NEON)
12473
+ for (; i + 7 < nb; i += 8) {
12474
+ uint16x8_t v = vld1q_u16(f + i);
12475
+ uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00));
12476
+ uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00));
12477
+ uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0);
12478
+ if (mask) {
12479
+ for (size_t j = 0; j < 8; ++j) {
12480
+ if (!validate_fp16(f[i + j], i + j)) {
12481
+ return false;
12482
+ }
12483
+ }
12484
+ GGML_UNREACHABLE();
12485
+ }
12486
+ }
12487
+ #endif
12488
+ for (; i < nb; ++i) {
12489
+ if (!validate_fp16(f[i], i)) {
12490
+ return false;
12491
+ }
12492
+ }
12493
+ } break;
12494
+ case GGML_TYPE_F32:
12495
+ {
12496
+ const float * f = (const float *) data;
12497
+ size_t i = 0;
12498
+ #if defined(__AVX2__)
12499
+ for (; i + 7 < nb; i += 8) {
12500
+ __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
12501
+ __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
12502
+ __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
12503
+ int mask = _mm256_movemask_epi8(cmp);
12504
+ if (mask) {
12505
+ for (size_t j = 0; j < 8; ++j) {
12506
+ if (!validate_float(f[i + j], i + j)) {
12507
+ return false;
12508
+ }
12509
+ }
12510
+ GGML_UNREACHABLE();
12511
+ }
12512
+ }
12513
+ #elif defined(__ARM_NEON)
12514
+ for (; i + 3 < nb; i += 4) {
12515
+ uint32x4_t v = vld1q_u32((const uint32_t *)f + i);
12516
+ uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000));
12517
+ uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000));
12518
+ uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0);
12519
+ if (mask) {
12520
+ for (size_t j = 0; j < 4; ++j) {
12521
+ if (!validate_float(f[i + j], i + j)) {
12522
+ return false;
12523
+ }
12524
+ }
12525
+ GGML_UNREACHABLE();
12526
+ }
12527
+ }
12528
+ #endif
12529
+ for (; i < nb; ++i) {
12530
+ if (!validate_float(f[i], i)) {
12531
+ return false;
12532
+ }
12533
+ }
12534
+ } break;
12535
+ case GGML_TYPE_F64:
12536
+ {
12537
+ const double * f = (const double *) data;
12538
+ for (size_t i = 0; i < nb; ++i) {
12539
+ if (!validate_float(f[i], i)) {
12540
+ return false;
12541
+ }
12542
+ }
12543
+ } break;
12544
+ case GGML_TYPE_Q4_0:
12545
+ {
12546
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
12547
+ } break;
12548
+ case GGML_TYPE_Q4_1:
12549
+ {
12550
+ VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);
12551
+ } break;
12552
+ case GGML_TYPE_Q5_0:
12553
+ {
12554
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb);
12555
+ } break;
12556
+ case GGML_TYPE_Q5_1:
12557
+ {
12558
+ VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m);
12559
+ } break;
12560
+ case GGML_TYPE_Q8_0:
12561
+ {
12562
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
12563
+ } break;
12564
+ case GGML_TYPE_Q2_K:
12565
+ {
12566
+ VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
12567
+ } break;
12568
+ case GGML_TYPE_Q3_K:
12569
+ {
12570
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb);
12571
+ } break;
12572
+ case GGML_TYPE_Q4_K:
12573
+ {
12574
+ #ifdef GGML_QKK_64
12575
+ VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]);
12576
+ #else
12577
+ VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin);
12578
+ #endif
12579
+ } break;
12580
+ case GGML_TYPE_Q5_K:
12581
+ {
12582
+ #ifdef GGML_QKK_64
12583
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb);
12584
+ #else
12585
+ VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin);
12586
+ #endif
12587
+ } break;
12588
+ case GGML_TYPE_Q6_K:
12589
+ {
12590
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb);
12591
+ } break;
12592
+ case GGML_TYPE_Q8_K:
12593
+ {
12594
+ const block_q8_K * q = (const block_q8_K *) data;
12595
+ for (size_t i = 0; i < nb; ++i) {
12596
+ if (!validate_float(q[i].d, i)) {
12597
+ return false;
12598
+ }
12599
+ }
12600
+ } break;
12601
+ case GGML_TYPE_IQ1_S:
12602
+ {
12603
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
12604
+ } break;
12605
+ case GGML_TYPE_IQ1_M:
12606
+ {
12607
+ const block_iq1_m * q = (const block_iq1_m *) data;
12608
+ for (size_t i = 0; i < nb; ++i) {
12609
+ #if QK_K == 64
12610
+ if (!validate_fp16(q[i].d, i)) {
12611
+ return false;
12612
+ }
12613
+ #else
12614
+ iq1m_scale_t scale;
12615
+ const uint16_t * sc = (const uint16_t *)q[i].scales;
12616
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
12617
+ if (!validate_fp16(scale.f16, i)) {
12618
+ return false;
12619
+ }
12620
+ #endif
12621
+ }
12622
+ } break;
12623
+ case GGML_TYPE_IQ2_XXS:
12624
+ {
12625
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb);
12626
+ } break;
12627
+ case GGML_TYPE_IQ2_XS:
12628
+ {
12629
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb);
12630
+ } break;
12631
+ case GGML_TYPE_IQ2_S:
12632
+ {
12633
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb);
12634
+ } break;
12635
+ case GGML_TYPE_IQ3_XXS:
12636
+ {
12637
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb);
12638
+ } break;
12639
+
12640
+ case GGML_TYPE_IQ3_S:
12641
+ {
12642
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb);
12643
+ } break;
12644
+ case GGML_TYPE_IQ4_XS:
12645
+ #if QK_K != 64
12646
+ {
12647
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb);
12648
+ } break;
12649
+ #endif
12650
+ // with QK_K == 64, iq4_xs is iq4_nl
12651
+ case GGML_TYPE_IQ4_NL:
12652
+ {
12653
+ VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
12654
+ } break;
12655
+ case GGML_TYPE_I8:
12656
+ case GGML_TYPE_I16:
12657
+ case GGML_TYPE_I32:
12658
+ case GGML_TYPE_I64:
12659
+ // nothing to validate
12660
+ break;
12661
+ default:
12662
+ {
12663
+ fprintf(stderr, "%s: invalid type %d\n", __func__, type);
12664
+ return false;
12665
+ }
12666
+ }
12667
+
12668
+ return true;
12669
+ }
@@ -13416,11 +13416,16 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
13416
13416
  version += std::to_string(prop.get_minor_version());
13417
13417
 
13418
13418
  device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
13419
+ std::string name = std::string(prop.get_name());
13420
+ name = std::regex_replace(name, std::regex("\\(R\\)"), "");
13421
+ name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
13419
13422
 
13420
- fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
13421
- prop.get_name(), version.c_str(), prop.get_max_compute_units(),
13423
+ auto global_mem_size = prop.get_global_mem_size()/1000000;
13424
+
13425
+ fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
13426
+ name.c_str(), version.c_str(), prop.get_max_compute_units(),
13422
13427
  prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
13423
- prop.get_global_mem_size());
13428
+ global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
13424
13429
  }
13425
13430
 
13426
13431
  void ggml_backend_sycl_print_sycl_devices() {
@@ -13428,9 +13433,10 @@ void ggml_backend_sycl_print_sycl_devices() {
13428
13433
  int device_count = dpct::dev_mgr::instance().device_count();
13429
13434
  std::map<std::string, size_t> DeviceNums;
13430
13435
  fprintf(stderr, "found %d SYCL devices:\n", device_count);
13431
- fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
13432
- fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
13433
- fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
13436
+ fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
13437
+ fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
13438
+ fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
13439
+ fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
13434
13440
  for (int id = 0; id < device_count; ++id) {
13435
13441
  sycl::device device = dpct::dev_mgr::instance().get_device(id);
13436
13442
  sycl::backend backend = device.get_backend();
@@ -14738,7 +14744,12 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
14738
14744
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
14739
14745
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
14740
14746
 
14747
+ const ggml_tensor * src2 = dst->src[2];
14748
+
14749
+ #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
14750
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
14741
14751
  GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
14752
+ GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
14742
14753
 
14743
14754
  const int64_t ne00 = src0->ne[0];
14744
14755
  const int64_t nrows_x = ggml_nrows(src0);
@@ -14754,7 +14765,6 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
14754
14765
  float * src2_dd = nullptr;
14755
14766
  sycl_pool_alloc<float> src2_f;
14756
14767
 
14757
- ggml_tensor * src2 = dst->src[2];
14758
14768
  const bool use_src2 = src2 != nullptr;
14759
14769
 
14760
14770
  if (use_src2) {
@@ -3178,6 +3178,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3178
3178
  }
3179
3179
  return nullptr;
3180
3180
  case GGML_OP_SOFT_MAX:
3181
+ #pragma message("TODO: add ggml_vk_soft_max() F16 src1 and src2 support")
3182
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
3183
+ GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32);
3184
+ GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32);
3185
+
3181
3186
  if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && (src2 == nullptr || src2->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
3182
3187
  return ctx->device->pipeline_soft_max_f32;
3183
3188
  }