llama_cpp 0.14.7 → 0.15.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml-quants.c

@@ -12383,3 +12383,287 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k)
     block_iq2_s * restrict y = vy;
     quantize_row_iq2_s_reference(x, y, k);
 }
+
+static bool validate_float(float f, size_t i) {
+    if (isinf(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
+        return false;
+    }
+
+    if (isnan(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
+        return false;
+    }
+
+    return true;
+}
+
+static bool isinf_fp16(ggml_fp16_t f) {
+    return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0;
+}
+
+static bool isnan_fp16(ggml_fp16_t f) {
+    return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0;
+}
+
+static bool validate_fp16(ggml_fp16_t f, size_t i) {
+    if (isinf_fp16(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
+        return false;
+    }
+
+    if (isnan_fp16(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
+        return false;
+    }
+
+    return true;
+}
+
+#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        if (!validate_fp16(q[i].d, i)) { \
+            return false; \
+        } \
+    }
+
+#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \
+            return false; \
+        } \
+    }
+
+bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
+    if (type < 0 || type >= GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid type %d\n", __func__, type);
+        return false;
+    }
+
+    if (nbytes % ggml_type_size(type) != 0) {
+        fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
+        return false;
+    }
+
+    const size_t nb = nbytes/ggml_type_size(type);
+
+    switch (type) {
+        case GGML_TYPE_F16:
+            {
+                const ggml_fp16_t * f = (const ggml_fp16_t *) data;
+                size_t i = 0;
+#if defined(__AVX2__)
+                for (; i + 15 < nb; i += 16) {
+                    __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
+                    __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
+                    __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
+                    int mask = _mm256_movemask_epi8(cmp);
+                    if (mask) {
+                        for (size_t j = 0; j < 16; ++j) {
+                            if (!validate_fp16(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#elif defined(__ARM_NEON)
+                for (; i + 7 < nb; i += 8) {
+                    uint16x8_t v = vld1q_u16(f + i);
+                    uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00));
+                    uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00));
+                    uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0);
+                    if (mask) {
+                        for (size_t j = 0; j < 8; ++j) {
+                            if (!validate_fp16(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#endif
+                for (; i < nb; ++i) {
+                    if (!validate_fp16(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_F32:
+            {
+                const float * f = (const float *) data;
+                size_t i = 0;
+#if defined(__AVX2__)
+                for (; i + 7 < nb; i += 8) {
+                    __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
+                    __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
+                    __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
+                    int mask = _mm256_movemask_epi8(cmp);
+                    if (mask) {
+                        for (size_t j = 0; j < 8; ++j) {
+                            if (!validate_float(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#elif defined(__ARM_NEON)
+                for (; i + 3 < nb; i += 4) {
+                    uint32x4_t v = vld1q_u32((const uint32_t *)f + i);
+                    uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000));
+                    uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000));
+                    uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0);
+                    if (mask) {
+                        for (size_t j = 0; j < 4; ++j) {
+                            if (!validate_float(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#endif
+                for (; i < nb; ++i) {
+                    if (!validate_float(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_F64:
+            {
+                const double * f = (const double *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    if (!validate_float(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
+            } break;
+        case GGML_TYPE_Q4_1:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);
+            } break;
+        case GGML_TYPE_Q5_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb);
+            } break;
+        case GGML_TYPE_Q5_1:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m);
+            } break;
+        case GGML_TYPE_Q8_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
+            } break;
+        case GGML_TYPE_Q2_K:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
+            } break;
+        case GGML_TYPE_Q3_K:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb);
+            } break;
+        case GGML_TYPE_Q4_K:
+            {
+            #ifdef GGML_QKK_64
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]);
+            #else
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin);
+            #endif
+            } break;
+        case GGML_TYPE_Q5_K:
+            {
+            #ifdef GGML_QKK_64
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb);
+            #else
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin);
+            #endif
+            } break;
+        case GGML_TYPE_Q6_K:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb);
+            } break;
+        case GGML_TYPE_Q8_K:
+            {
+                const block_q8_K * q = (const block_q8_K *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    if (!validate_float(q[i].d, i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_IQ1_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ1_M:
+            {
+                const block_iq1_m * q = (const block_iq1_m *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                #if QK_K == 64
+                    if (!validate_fp16(q[i].d, i)) {
+                        return false;
+                    }
+                #else
+                    iq1m_scale_t scale;
+                    const uint16_t * sc = (const uint16_t *)q[i].scales;
+                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+                    if (!validate_fp16(scale.f16, i)) {
+                        return false;
+                    }
+                #endif
+                }
+            } break;
+        case GGML_TYPE_IQ2_XXS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb);
+            } break;
+        case GGML_TYPE_IQ2_XS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb);
+            } break;
+        case GGML_TYPE_IQ2_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ3_XXS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb);
+            } break;
+
+        case GGML_TYPE_IQ3_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ4_XS:
+    #if QK_K != 64
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb);
+            } break;
+    #endif
+            // with QK_K == 64, iq4_xs is iq4_nl
+        case GGML_TYPE_IQ4_NL:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
+            } break;
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+            // nothing to validate
+            break;
+        default:
+            {
+                fprintf(stderr, "%s: invalid type %d\n", __func__, type);
+                return false;
+            }
+    }
+
+    return true;
+}
data/vendor/tmp/llama.cpp/ggml-sycl.cpp

@@ -13416,11 +13416,16 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
     version += std::to_string(prop.get_minor_version());
 
     device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+    std::string name = std::string(prop.get_name());
+    name = std::regex_replace(name, std::regex("\\(R\\)"), "");
+    name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
 
-
-
+    auto global_mem_size = prop.get_global_mem_size()/1000000;
+
+    fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
+            name.c_str(), version.c_str(), prop.get_max_compute_units(),
             prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
-
+            global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
 }
 
 void ggml_backend_sycl_print_sycl_devices() {
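The name cleanup in this hunk is plain std::regex_replace, stripping the "(R)" and "(TM)" marks so device names fit the 39-character table column. A minimal sketch of the same idea (the device string below is made up for illustration):

#include <cstdio>
#include <regex>
#include <string>

int main() {
    // hypothetical device name, for illustration only
    std::string name = "Intel(R) Arc(TM) A770 Graphics";
    name = std::regex_replace(name, std::regex("\\(R\\)"), "");
    name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
    std::printf("%s\n", name.c_str()); // prints: Intel Arc A770 Graphics
}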
@@ -13428,9 +13433,10 @@ void ggml_backend_sycl_print_sycl_devices() {
     int device_count = dpct::dev_mgr::instance().device_count();
     std::map<std::string, size_t> DeviceNums;
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "| |
-    fprintf(stderr, "|
-    fprintf(stderr, "
+    fprintf(stderr, "|  |                   |                                       |       |Max    |        |Max  |Global |                     |\n");
+    fprintf(stderr, "|  |                   |                                       |       |compute|Max work|sub  |mem    |                     |\n");
+    fprintf(stderr, "|ID|        Device Type|                                   Name|Version|units  |group   |group|size   |       Driver version|\n");
+    fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);
         sycl::backend backend = device.get_backend();
@@ -14738,7 +14744,12 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
+    const ggml_tensor * src2 = dst->src[2];
+
+#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+    GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
 
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
@@ -14754,7 +14765,6 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     float * src2_dd = nullptr;
     sycl_pool_alloc<float> src2_f;
 
-    ggml_tensor * src2 = dst->src[2];
     const bool use_src2 = src2 != nullptr;
 
     if (use_src2) {
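Both SYCL hunks wire up src2, the optional positions tensor that the referenced PR 5021 introduced for ALiBi-style soft_max; src1 remains the optional additive mask. A rough CPU sketch of the operation these asserts guard, for one row of logits (the function name and the slope parameter are illustrative, not the ggml API):

#include <cmath>
#include <cstdio>
#include <vector>

// softmax over one row of scaled logits, with optional additive mask (src1)
// and optional position bias (src2), mirroring the F32-only case asserted above
static void soft_max_row(float * dst, const float * x, const float * mask,
                         const float * pos, float scale, float slope, size_t n) {
    float vmax = -INFINITY;
    for (size_t i = 0; i < n; ++i) {
        float v = x[i]*scale;
        if (mask) v += mask[i];        // src1: attention mask
        if (pos)  v += slope*pos[i];   // src2: ALiBi position bias
        dst[i] = v;
        vmax = std::fmax(vmax, v);
    }
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) { dst[i] = std::exp(dst[i] - vmax); sum += dst[i]; }
    for (size_t i = 0; i < n; ++i) { dst[i] /= sum; }
}

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f}, pos = {0.0f, -1.0f, -2.0f}, out(3);
    soft_max_row(out.data(), x.data(), nullptr, pos.data(), 1.0f, 0.5f, 3);
    for (float v : out) std::printf("%f\n", v);
}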
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp

@@ -3178,6 +3178,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
         return nullptr;
     case GGML_OP_SOFT_MAX:
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 and src2 support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
+        GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32);
+        GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32);
+
         if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && (src2 == nullptr || src2->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_soft_max_f32;
         }
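Tying the hunks together: ggml_validate_row_data() from the first hunk is non-static, and the +22-line change to ggml.h in the file list is presumably where its declaration lands, making it callable from the model loader. A hedged sketch of such a call site; only the ggml_validate_row_data signature comes from this diff, the wrapper around it is illustrative:

#include "ggml.h"   // assumed to declare ggml_validate_row_data() in this release
#include <cstdio>

// illustrative helper, not part of the library
static bool check_tensor_bytes(enum ggml_type type, const void * data, size_t nbytes) {
    if (!ggml_validate_row_data(type, data, nbytes)) {
        std::fprintf(stderr, "tensor rejected: inf/nan or bad size for type %d\n", (int) type);
        return false;
    }
    return true;
}

int main() {
    float row[4] = {0.5f, 1.5f, -2.0f, 3.0f};
    return check_tensor_bytes(GGML_TYPE_F32, row, sizeof(row)) ? 0 : 1;
}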