llama_cpp 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +396 -127
- data/ext/llama_cpp/src/ggml-metal.metal +290 -46
- data/ext/llama_cpp/src/ggml-opencl.cpp +47 -71
- data/ext/llama_cpp/src/ggml.c +71 -55
- data/ext/llama_cpp/src/ggml.h +15 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1851 -250
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +5 -3
| @@ -19,7 +19,7 @@ | |
| 19 19 | 
             
            #pragma warning(disable: 4244 4267) // possible loss of data
         | 
| 20 20 | 
             
            #endif
         | 
| 21 21 |  | 
| 22 | 
            -
            #define  | 
| 22 | 
            +
            #define CL_DMMV_LOCAL_SIZE 32
         | 
| 23 23 |  | 
| 24 24 | 
             
            #ifndef K_QUANTS_PER_ITERATION
         | 
| 25 25 | 
             
            #define K_QUANTS_PER_ITERATION 1
         | 
| @@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, | |
| 338 338 | 
             
                const int row = get_group_id(0);
         | 
| 339 339 |  | 
| 340 340 | 
             
                const int num_blocks_per_row = ncols / QK_K;
         | 
| 341 | 
            -
                const int ib0 = row*num_blocks_per_row;
         | 
| 341 | 
            +
                const int ib0 = row*num_blocks_per_row + get_global_offset(0);
         | 
| 342 342 |  | 
| 343 343 | 
             
                __global const struct block_q2_K * x = xx + ib0;
         | 
| 344 344 |  | 
| @@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, | |
| 413 413 | 
             
                const int row = get_group_id(0);
         | 
| 414 414 |  | 
| 415 415 | 
             
                const int num_blocks_per_row = ncols / QK_K;
         | 
| 416 | 
            -
                const int ib0 = row*num_blocks_per_row;
         | 
| 416 | 
            +
                const int ib0 = row*num_blocks_per_row + get_global_offset(0);
         | 
| 417 417 |  | 
| 418 418 | 
             
                __global const struct block_q3_K * x = xx + ib0;
         | 
| 419 419 |  | 
| @@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, | |
| 489 489 |  | 
| 490 490 | 
             
                const int row = get_group_id(0);
         | 
| 491 491 | 
             
                const int num_blocks_per_row = ncols / QK_K;
         | 
| 492 | 
            -
                const int ib0 = row*num_blocks_per_row;
         | 
| 492 | 
            +
                const int ib0 = row*num_blocks_per_row + get_global_offset(0);
         | 
| 493 493 |  | 
| 494 494 | 
             
                const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...15
         | 
| 495 495 | 
             
                const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;
         | 
| @@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, | |
| 562 562 |  | 
| 563 563 | 
             
                const int row = get_group_id(0);
         | 
| 564 564 | 
             
                const int num_blocks_per_row = ncols / QK_K;
         | 
| 565 | 
            -
                const int ib0 = row*num_blocks_per_row;
         | 
| 565 | 
            +
                const int ib0 = row*num_blocks_per_row + get_global_offset(0);
         | 
| 566 566 |  | 
| 567 567 | 
             
                const int tid = get_local_id(0)/2;  // 0...15
         | 
| 568 568 | 
             
                const int ix  = get_local_id(0)%2;
         | 
| @@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, | |
| 641 641 | 
             
                const int row = get_group_id(0);
         | 
| 642 642 |  | 
| 643 643 | 
             
                const int num_blocks_per_row = ncols / QK_K;
         | 
| 644 | 
            -
                const int ib0 = row*num_blocks_per_row;
         | 
| 644 | 
            +
                const int ib0 = row*num_blocks_per_row + get_global_offset(0);
         | 
| 645 645 |  | 
| 646 646 | 
             
                __global const struct block_q6_K * x = xx + ib0;
         | 
| 647 647 |  | 
| @@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { | |
| 745 745 |  | 
| 746 746 | 
             
            std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
         | 
| 747 747 | 
             
            __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
         | 
| 748 | 
            -
                const int  | 
| 748 | 
            +
                const int local_size = get_local_size(0);
         | 
| 749 749 | 
             
                const int row = get_group_id(0);
         | 
| 750 750 | 
             
                const int tid = get_local_id(0);
         | 
| 751 751 |  | 
| 752 752 | 
             
                const uint qk = QUANT_K;
         | 
| 753 753 | 
             
                const uint qr = QUANT_R;
         | 
| 754 754 |  | 
| 755 | 
            +
                const int col_step = local_size * 2;
         | 
| 755 756 | 
             
                const int y_offset = qr == 1 ? 1 : qk/2;
         | 
| 756 757 |  | 
| 758 | 
            +
                x += get_global_offset(0);
         | 
| 759 | 
            +
             | 
| 757 760 | 
             
                tmp[tid] = 0;
         | 
| 758 761 |  | 
| 759 | 
            -
                for (int  | 
| 760 | 
            -
                    const int col = i*block_size + 2*tid;
         | 
| 762 | 
            +
                for (int col = tid*2; col < ncols; col += col_step) {
         | 
| 761 763 | 
             
                    const int ib = (row*ncols + col)/qk; // block index
         | 
| 762 764 | 
             
                    const int iqs = (col%qk)/qr; // quant index
         | 
| 763 765 | 
             
                    const int iybs = col - col%qk; // y block start index
         | 
| @@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float | |
| 773 775 |  | 
| 774 776 | 
             
                // sum up partial sums and write back result
         | 
| 775 777 | 
             
                barrier(CLK_LOCAL_MEM_FENCE);
         | 
| 776 | 
            -
                for (int s= | 
| 778 | 
            +
                for (int s=local_size/2; s>0; s>>=1) {
         | 
| 777 779 | 
             
                    if (tid < s) {
         | 
| 778 780 | 
             
                        tmp[tid] += tmp[tid + s];
         | 
| 779 781 | 
             
                    }
         | 
| @@ -1393,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, | |
| 1393 1395 | 
             
                const int64_t ne01 = src0->ne[1];
         | 
| 1394 1396 | 
             
                const int64_t ne02 = src0->ne[2];
         | 
| 1395 1397 | 
             
                const int64_t ne03 = src0->ne[3];
         | 
| 1396 | 
            -
                const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
         | 
| 1397 1398 | 
             
                const int64_t ne10 = src1->ne[0];
         | 
| 1398 1399 | 
             
                const int64_t ne11 = src1->ne[1];
         | 
| 1399 1400 | 
             
                const int64_t ne12 = src1->ne[2];
         | 
| 1400 1401 | 
             
                const int64_t ne13 = src1->ne[3];
         | 
| 1401 | 
            -
                const int64_t nb10 = src1->nb[0];
         | 
| 1402 1402 | 
             
                const int nb2  = dst->nb[2];
         | 
| 1403 1403 | 
             
                const int nb3  = dst->nb[3];
         | 
| 1404 1404 | 
             
                size_t x_size;
         | 
| 1405 1405 | 
             
                size_t d_size;
         | 
| 1406 1406 |  | 
| 1407 | 
            -
                cl_mem d_X = ggml_cl_pool_malloc( | 
| 1407 | 
            +
                cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
         | 
| 1408 1408 | 
             
                cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
         | 
| 1409 | 
            -
                cl_mem d_D = ggml_cl_pool_malloc( | 
| 1409 | 
            +
                cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
         | 
| 1410 1410 |  | 
| 1411 1411 |  | 
| 1412 1412 | 
             
                for (int64_t i03 = 0; i03 < ne03; i03++) {
         | 
| 1413 1413 | 
             
                    for (int64_t i02 = 0; i02 < ne02; i02++) {
         | 
| 1414 | 
            -
                        const int i0 = i03*ne02 + i02;
         | 
| 1415 | 
            -
             | 
| 1416 1414 | 
             
                        cl_event ev;
         | 
| 1417 1415 |  | 
| 1418 1416 | 
             
                        // copy src0 to device
         | 
| 1419 | 
            -
                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X,  | 
| 1420 | 
            -
             | 
| 1421 | 
            -
                         | 
| 1422 | 
            -
             | 
| 1423 | 
            -
             | 
| 1424 | 
            -
             | 
| 1425 | 
            -
             | 
| 1426 | 
            -
             | 
| 1427 | 
            -
             | 
| 1428 | 
            -
             | 
| 1429 | 
            -
             | 
| 1430 | 
            -
             | 
| 1431 | 
            -
             | 
| 1432 | 
            -
             | 
| 1433 | 
            -
             | 
| 1434 | 
            -
             | 
| 1435 | 
            -
             | 
| 1436 | 
            -
             | 
| 1437 | 
            -
             | 
| 1438 | 
            -
             | 
| 1439 | 
            -
             | 
| 1440 | 
            -
                            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
         | 
| 1441 | 
            -
                        } else {
         | 
| 1442 | 
            -
                            for (int64_t i01 = 0; i01 < ne01; i01++) {
         | 
| 1443 | 
            -
                                const int64_t i13 = i03%ne13;
         | 
| 1444 | 
            -
                                const int64_t i12 = i02%ne12;
         | 
| 1445 | 
            -
                                const int64_t i11 = i01%ne11;
         | 
| 1446 | 
            -
                                const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
         | 
| 1447 | 
            -
             | 
| 1448 | 
            -
                                cl_int x_offset = i01*ne00;
         | 
| 1449 | 
            -
                                cl_int y_offset = i1*ne10;
         | 
| 1450 | 
            -
                                cl_int d_offset = i01*ne00;
         | 
| 1451 | 
            -
             | 
| 1452 | 
            -
                                // compute
         | 
| 1453 | 
            -
                                size_t global = ne00;
         | 
| 1454 | 
            -
                                cl_int ky = ne10;
         | 
| 1455 | 
            -
                                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
         | 
| 1456 | 
            -
                                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
         | 
| 1457 | 
            -
                                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
         | 
| 1458 | 
            -
                                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
         | 
| 1459 | 
            -
                                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
         | 
| 1460 | 
            -
                                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
         | 
| 1461 | 
            -
                                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
         | 
| 1462 | 
            -
                                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
         | 
| 1463 | 
            -
                            }
         | 
| 1464 | 
            -
                        }
         | 
| 1417 | 
            +
                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
         | 
| 1418 | 
            +
             | 
| 1419 | 
            +
                        const int64_t i13 = i03%ne13;
         | 
| 1420 | 
            +
                        const int64_t i12 = i02%ne12;
         | 
| 1421 | 
            +
                        const int i1 = i13*ne12*ne11 + i12*ne11;
         | 
| 1422 | 
            +
             | 
| 1423 | 
            +
                        cl_int x_offset = 0;
         | 
| 1424 | 
            +
                        cl_int y_offset = i1*ne10;
         | 
| 1425 | 
            +
                        cl_int d_offset = 0;
         | 
| 1426 | 
            +
             | 
| 1427 | 
            +
                        size_t global = ne00 * ne01;
         | 
| 1428 | 
            +
                        cl_int ky = ne10 * ne11;
         | 
| 1429 | 
            +
             | 
| 1430 | 
            +
                        CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
         | 
| 1431 | 
            +
                        CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
         | 
| 1432 | 
            +
                        CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
         | 
| 1433 | 
            +
                        CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
         | 
| 1434 | 
            +
                        CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
         | 
| 1435 | 
            +
                        CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
         | 
| 1436 | 
            +
                        CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
         | 
| 1437 | 
            +
                        CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
         | 
| 1465 1438 |  | 
| 1466 1439 | 
             
                        CL_CHECK(clReleaseEvent(ev));
         | 
| 1467 1440 | 
             
                        CL_CHECK(clFinish(queue));
         | 
| @@ -1566,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr | |
| 1566 1539 | 
             
                ggml_cl_pool_free(d_D, d_size);
         | 
| 1567 1540 | 
             
            }
         | 
| 1568 1541 |  | 
| 1569 | 
            -
            static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t  | 
| 1542 | 
            +
            static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
         | 
| 1570 1543 | 
             
                GGML_ASSERT(fp16_support);
         | 
| 1571 1544 |  | 
| 1572 1545 | 
             
                const int64_t ne00 = src0->ne[0];
         | 
| @@ -1596,6 +1569,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr | |
| 1596 1569 | 
             
                const int y_ne = ne11 * ne10;
         | 
| 1597 1570 | 
             
                const int d_ne = ne11 * ne01;
         | 
| 1598 1571 |  | 
| 1572 | 
            +
                GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
         | 
| 1573 | 
            +
                GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
         | 
| 1574 | 
            +
                ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
         | 
| 1575 | 
            +
             | 
| 1599 1576 | 
             
                size_t x_size;
         | 
| 1600 1577 | 
             
                size_t y_size;
         | 
| 1601 1578 | 
             
                size_t d_size;
         | 
| @@ -1632,7 +1609,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr | |
| 1632 1609 |  | 
| 1633 1610 | 
             
                        // convert src1 to fp16
         | 
| 1634 1611 | 
             
                        // TODO: use multiple threads
         | 
| 1635 | 
            -
                        ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
         | 
| 1636 1612 | 
             
                        char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
         | 
| 1637 1613 | 
             
                        if (src1_cont_rows) {
         | 
| 1638 1614 | 
             
                            if (src1_cont_cols) {
         | 
| @@ -1704,7 +1680,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | |
| 1704 1680 | 
             
                const int nb2  = dst->nb[2];
         | 
| 1705 1681 | 
             
                const int nb3  = dst->nb[3];
         | 
| 1706 1682 | 
             
                const ggml_type type = src0->type;
         | 
| 1707 | 
            -
                const bool mul_mat_vec = ne11 == 1;
         | 
| 1683 | 
            +
                const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
         | 
| 1708 1684 |  | 
| 1709 1685 | 
             
                const int64_t r2 = ne12 / ne02;
         | 
| 1710 1686 | 
             
                const int64_t r3 = ne13 / ne03;
         | 
| @@ -1737,7 +1713,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | |
| 1737 1713 | 
             
                GGML_ASSERT(to_fp32_cl != nullptr);
         | 
| 1738 1714 |  | 
| 1739 1715 | 
             
                const size_t global_denom = ggml_cl_global_denom(type);
         | 
| 1740 | 
            -
                const size_t local = ggml_cl_local_size(type);
         | 
| 1716 | 
            +
                const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
         | 
| 1741 1717 |  | 
| 1742 1718 | 
             
                size_t ev_idx = 0;
         | 
| 1743 1719 | 
             
                std::vector<cl_event> events;
         | 
| @@ -1770,8 +1746,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | |
| 1770 1746 | 
             
                            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
         | 
| 1771 1747 |  | 
| 1772 1748 | 
             
                            // compute
         | 
| 1773 | 
            -
                            const size_t global = ne01 *  | 
| 1774 | 
            -
                            const size_t  | 
| 1749 | 
            +
                            const size_t global = ne01 * local;
         | 
| 1750 | 
            +
                            const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
         | 
| 1775 1751 | 
             
                            const cl_int ncols = ne00;
         | 
| 1776 1752 | 
             
                            events.emplace_back();
         | 
| 1777 1753 | 
             
                            CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
         | 
| @@ -1779,7 +1755,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | |
| 1779 1755 | 
             
                            CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
         | 
| 1780 1756 | 
             
                            CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
         | 
| 1781 1757 | 
             
                            CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
         | 
| 1782 | 
            -
                            CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1,  | 
| 1758 | 
            +
                            CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
         | 
| 1783 1759 | 
             
                        } else { // general dequantization kernel + CLBlast matrix matrix multiplication
         | 
| 1784 1760 | 
             
                            // convert src0 to fp32 on device
         | 
| 1785 1761 | 
             
                            const size_t global = x_ne / global_denom;
         | 
| @@ -1895,8 +1871,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * | |
| 1895 1871 | 
             
            }
         | 
| 1896 1872 |  | 
| 1897 1873 | 
             
            size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
         | 
| 1898 | 
            -
                if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
         | 
| 1899 | 
            -
                    return  | 
| 1874 | 
            +
                if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
         | 
| 1875 | 
            +
                    return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
         | 
| 1900 1876 | 
             
                }
         | 
| 1901 1877 | 
             
                return 0;
         | 
| 1902 1878 | 
             
            }
         | 
    
        data/ext/llama_cpp/src/ggml.c
    CHANGED
    
    | @@ -162,40 +162,16 @@ typedef void * thread_ret_t; | |
| 162 162 |  | 
| 163 163 | 
             
            #define GGML_PRINT(...) printf(__VA_ARGS__)
         | 
| 164 164 |  | 
| 165 | 
            +
            //
         | 
| 166 | 
            +
            // end of logging block
         | 
| 167 | 
            +
            //
         | 
| 168 | 
            +
             | 
| 165 169 | 
             
            #ifdef GGML_USE_ACCELERATE
         | 
| 166 170 | 
             
            // uncomment to use vDSP for soft max computation
         | 
| 167 171 | 
             
            // note: not sure if it is actually faster
         | 
| 168 172 | 
             
            //#define GGML_SOFT_MAX_ACCELERATE
         | 
| 169 173 | 
             
            #endif
         | 
| 170 174 |  | 
| 171 | 
            -
            //
         | 
| 172 | 
            -
            // logging
         | 
| 173 | 
            -
            //
         | 
| 174 | 
            -
             | 
| 175 | 
            -
            #if (GGML_DEBUG >= 1)
         | 
| 176 | 
            -
            #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
         | 
| 177 | 
            -
            #else
         | 
| 178 | 
            -
            #define GGML_PRINT_DEBUG(...)
         | 
| 179 | 
            -
            #endif
         | 
| 180 | 
            -
             | 
| 181 | 
            -
            #if (GGML_DEBUG >= 5)
         | 
| 182 | 
            -
            #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
         | 
| 183 | 
            -
            #else
         | 
| 184 | 
            -
            #define GGML_PRINT_DEBUG_5(...)
         | 
| 185 | 
            -
            #endif
         | 
| 186 | 
            -
             | 
| 187 | 
            -
            #if (GGML_DEBUG >= 10)
         | 
| 188 | 
            -
            #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
         | 
| 189 | 
            -
            #else
         | 
| 190 | 
            -
            #define GGML_PRINT_DEBUG_10(...)
         | 
| 191 | 
            -
            #endif
         | 
| 192 | 
            -
             | 
| 193 | 
            -
            #define GGML_PRINT(...) printf(__VA_ARGS__)
         | 
| 194 | 
            -
             | 
| 195 | 
            -
            //
         | 
| 196 | 
            -
            // end of logging block
         | 
| 197 | 
            -
            //
         | 
| 198 | 
            -
             | 
| 199 175 | 
             
            #if defined(_MSC_VER) || defined(__MINGW32__)
         | 
| 200 176 | 
             
            #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
         | 
| 201 177 | 
             
            #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
         | 
| @@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( | |
| 4951 4927 | 
             
                *result = (struct ggml_tensor) {
         | 
| 4952 4928 | 
             
                    /*.type         =*/ type,
         | 
| 4953 4929 | 
             
                    /*.backend      =*/ GGML_BACKEND_CPU,
         | 
| 4930 | 
            +
                    /*.buffer       =*/ NULL,
         | 
| 4954 4931 | 
             
                    /*.n_dims       =*/ n_dims,
         | 
| 4955 4932 | 
             
                    /*.ne           =*/ { 1, 1, 1, 1 },
         | 
| 4956 4933 | 
             
                    /*.nb           =*/ { 0, 0, 0, 0 },
         | 
| @@ -5517,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor( | |
| 5517 5494 | 
             
                return result;
         | 
| 5518 5495 | 
             
            }
         | 
| 5519 5496 |  | 
| 5497 | 
            +
            struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
         | 
| 5498 | 
            +
                struct ggml_object * obj = ctx->objects_begin;
         | 
| 5499 | 
            +
             | 
| 5500 | 
            +
                char * const mem_buffer = ctx->mem_buffer;
         | 
| 5501 | 
            +
             | 
| 5502 | 
            +
                while (obj != NULL) {
         | 
| 5503 | 
            +
                    if (obj->type == GGML_OBJECT_TENSOR) {
         | 
| 5504 | 
            +
                        return (struct ggml_tensor *)(mem_buffer + obj->offs);
         | 
| 5505 | 
            +
                    }
         | 
| 5506 | 
            +
             | 
| 5507 | 
            +
                    obj = obj->next;
         | 
| 5508 | 
            +
                }
         | 
| 5509 | 
            +
             | 
| 5510 | 
            +
                return NULL;
         | 
| 5511 | 
            +
            }
         | 
| 5512 | 
            +
             | 
| 5513 | 
            +
            struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
         | 
| 5514 | 
            +
                struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
         | 
| 5515 | 
            +
                obj = obj->next;
         | 
| 5516 | 
            +
             | 
| 5517 | 
            +
                char * const mem_buffer = ctx->mem_buffer;
         | 
| 5518 | 
            +
             | 
| 5519 | 
            +
                while (obj != NULL) {
         | 
| 5520 | 
            +
                    if (obj->type == GGML_OBJECT_TENSOR) {
         | 
| 5521 | 
            +
                        return (struct ggml_tensor *)(mem_buffer + obj->offs);
         | 
| 5522 | 
            +
                    }
         | 
| 5523 | 
            +
             | 
| 5524 | 
            +
                    obj = obj->next;
         | 
| 5525 | 
            +
                }
         | 
| 5526 | 
            +
             | 
| 5527 | 
            +
                return NULL;
         | 
| 5528 | 
            +
            }
         | 
| 5529 | 
            +
             | 
| 5520 5530 | 
             
            struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
         | 
| 5521 5531 | 
             
                struct ggml_object * obj = ctx->objects_begin;
         | 
| 5522 5532 |  | 
| @@ -8670,6 +8680,7 @@ void ggml_set_param( | |
| 8670 8680 |  | 
| 8671 8681 | 
             
                GGML_ASSERT(tensor->grad == NULL);
         | 
| 8672 8682 | 
             
                tensor->grad = ggml_dup_tensor(ctx, tensor);
         | 
| 8683 | 
            +
                ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
         | 
| 8673 8684 | 
             
            }
         | 
| 8674 8685 |  | 
| 8675 8686 | 
             
            // ggml_compute_forward_dup
         | 
| @@ -11256,7 +11267,7 @@ static void ggml_compute_forward_silu_f32( | |
| 11256 11267 |  | 
| 11257 11268 | 
             
            #ifndef NDEBUG
         | 
| 11258 11269 | 
             
                    for (int k = 0; k < nc; k++) {
         | 
| 11259 | 
            -
                        const float x = ((float *) ((char *) dst->data + i1*( | 
| 11270 | 
            +
                        const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
         | 
| 11260 11271 | 
             
                        UNUSED(x);
         | 
| 11261 11272 | 
             
                        assert(!isnan(x));
         | 
| 11262 11273 | 
             
                        assert(!isinf(x));
         | 
| @@ -13082,24 +13093,22 @@ static void ggml_compute_forward_alibi_f32( | |
| 13082 13093 | 
             
                    return;
         | 
| 13083 13094 | 
             
                }
         | 
| 13084 13095 |  | 
| 13085 | 
            -
                const int n_past = ((int32_t *) dst->op_params)[0]; | 
| 13096 | 
            +
                //const int n_past = ((int32_t *) dst->op_params)[0];
         | 
| 13086 13097 | 
             
                const int n_head = ((int32_t *) dst->op_params)[1];
         | 
| 13087 13098 | 
             
                float max_bias;
         | 
| 13088 13099 | 
             
                memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
         | 
| 13089 13100 |  | 
| 13090 | 
            -
                 | 
| 13091 | 
            -
             | 
| 13092 | 
            -
                const  | 
| 13093 | 
            -
                const  | 
| 13094 | 
            -
                const int ne2 = src0->ne[2]; // n_head -> this is k
         | 
| 13095 | 
            -
                //const int ne3 = src0->ne[3]; // 1 -> bsz
         | 
| 13101 | 
            +
                const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
         | 
| 13102 | 
            +
                const int64_t ne1 = src0->ne[1]; // seq_len_without_past
         | 
| 13103 | 
            +
                const int64_t ne2 = src0->ne[2]; // n_head -> this is k
         | 
| 13104 | 
            +
                //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
         | 
| 13096 13105 |  | 
| 13097 | 
            -
                const  | 
| 13098 | 
            -
                const  | 
| 13106 | 
            +
                const int64_t n  = ggml_nrows(src0);
         | 
| 13107 | 
            +
                const int64_t ne2_ne3 = n/ne1; // ne2*ne3
         | 
| 13099 13108 |  | 
| 13100 | 
            -
                const  | 
| 13101 | 
            -
                const  | 
| 13102 | 
            -
                const  | 
| 13109 | 
            +
                const size_t nb0 = src0->nb[0];
         | 
| 13110 | 
            +
                const size_t nb1 = src0->nb[1];
         | 
| 13111 | 
            +
                const size_t nb2 = src0->nb[2];
         | 
| 13103 13112 | 
             
                //const int nb3 = src0->nb[3];
         | 
| 13104 13113 |  | 
| 13105 13114 | 
             
                GGML_ASSERT(nb0 == sizeof(float));
         | 
| @@ -13111,9 +13120,9 @@ static void ggml_compute_forward_alibi_f32( | |
| 13111 13120 | 
             
                const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
         | 
| 13112 13121 | 
             
                const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
         | 
| 13113 13122 |  | 
| 13114 | 
            -
                for ( | 
| 13115 | 
            -
                    for ( | 
| 13116 | 
            -
                        for ( | 
| 13123 | 
            +
                for (int64_t i = 0; i < ne0; i++) {
         | 
| 13124 | 
            +
                    for (int64_t j = 0; j < ne1; j++) {
         | 
| 13125 | 
            +
                        for (int64_t k = 0; k < ne2_ne3; k++) {
         | 
| 13117 13126 | 
             
                            float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
         | 
| 13118 13127 | 
             
                            float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
         | 
| 13119 13128 |  | 
| @@ -13128,7 +13137,6 @@ static void ggml_compute_forward_alibi_f32( | |
| 13128 13137 | 
             
                            }
         | 
| 13129 13138 |  | 
| 13130 13139 | 
             
                            pdst[0] = i * m_k + src[0];
         | 
| 13131 | 
            -
             | 
| 13132 13140 | 
             
                        }
         | 
| 13133 13141 | 
             
                    }
         | 
| 13134 13142 | 
             
                }
         | 
| @@ -13529,7 +13537,7 @@ static void ggml_compute_forward_rope_f16( | |
| 13529 13537 | 
             
                                    dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
         | 
| 13530 13538 | 
             
                                    dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
         | 
| 13531 13539 | 
             
                                }
         | 
| 13532 | 
            -
                            } if (!is_neox) {
         | 
| 13540 | 
            +
                            } else if (!is_neox) {
         | 
| 13533 13541 | 
             
                                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
         | 
| 13534 13542 | 
             
                                    const float cos_theta = cosf(theta);
         | 
| 13535 13543 | 
             
                                    const float sin_theta = sinf(theta);
         | 
| @@ -14454,7 +14462,7 @@ static void ggml_compute_forward_conv_2d_f16_f32( | |
| 14454 14462 | 
             
                int64_t t0 = ggml_perf_time_us();
         | 
| 14455 14463 | 
             
                UNUSED(t0);
         | 
| 14456 14464 |  | 
| 14457 | 
            -
                GGML_TENSOR_BINARY_OP_LOCALS
         | 
| 14465 | 
            +
                GGML_TENSOR_BINARY_OP_LOCALS;
         | 
| 14458 14466 |  | 
| 14459 14467 | 
             
                const int ith = params->ith;
         | 
| 14460 14468 | 
             
                const int nth = params->nth;
         | 
| @@ -19162,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { | |
| 19162 19170 |  | 
| 19163 19171 | 
             
                                        if (idx == -1) {
         | 
| 19164 19172 | 
             
                                            fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
         | 
| 19173 | 
            +
                                            fclose(fout);
         | 
| 19165 19174 | 
             
                                            return;
         | 
| 19166 19175 | 
             
                                        }
         | 
| 19167 19176 |  | 
| @@ -20203,6 +20212,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( | |
| 20203 20212 | 
             
                    ggml_vec_cpy_f32(nx, xp, x);
         | 
| 20204 20213 | 
             
                    ggml_vec_cpy_f32(nx, gp, g);
         | 
| 20205 20214 |  | 
| 20215 | 
            +
                    // TODO: instead of passing &cancel here, use the return code of the linesearch
         | 
| 20216 | 
            +
                    //       to determine if the optimization should be cancelled
         | 
| 20217 | 
            +
                    //       this is a simple change, but not doing this atm, since I don't have a nice
         | 
| 20218 | 
            +
                    //       way to test and don't want to break something with so many changes lined up
         | 
| 20206 20219 | 
             
                    ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
         | 
| 20207 20220 | 
             
                    if (cancel) {
         | 
| 20208 20221 | 
             
                        return GGML_OPT_CANCEL;
         | 
| @@ -20832,7 +20845,7 @@ struct gguf_kv { | |
| 20832 20845 | 
             
            };
         | 
| 20833 20846 |  | 
| 20834 20847 | 
             
            struct gguf_header {
         | 
| 20835 | 
            -
                 | 
| 20848 | 
            +
                char magic[4];
         | 
| 20836 20849 | 
             
                uint32_t version;
         | 
| 20837 20850 | 
             
                uint64_t n_tensors; // GGUFv2
         | 
| 20838 20851 | 
             
                uint64_t n_kv;      // GGUFv2
         | 
| @@ -20902,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) | |
| 20902 20915 | 
             
            struct gguf_context * gguf_init_empty(void) {
         | 
| 20903 20916 | 
             
                struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
         | 
| 20904 20917 |  | 
| 20905 | 
            -
                ctx->header.magic | 
| 20918 | 
            +
                memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
         | 
| 20906 20919 | 
             
                ctx->header.version   = GGUF_VERSION;
         | 
| 20907 20920 | 
             
                ctx->header.n_tensors = 0;
         | 
| 20908 20921 | 
             
                ctx->header.n_kv      = 0;
         | 
| @@ -20928,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p | |
| 20928 20941 | 
             
                // offset from start of file
         | 
| 20929 20942 | 
             
                size_t offset = 0;
         | 
| 20930 20943 |  | 
| 20931 | 
            -
                 | 
| 20944 | 
            +
                char magic[4];
         | 
| 20932 20945 |  | 
| 20933 20946 | 
             
                // check the magic before making allocations
         | 
| 20934 20947 | 
             
                {
         | 
| 20935 20948 | 
             
                    gguf_fread_el(file, &magic, sizeof(magic), &offset);
         | 
| 20936 20949 |  | 
| 20937 | 
            -
                     | 
| 20938 | 
            -
                         | 
| 20939 | 
            -
             | 
| 20940 | 
            -
             | 
| 20950 | 
            +
                    for (uint32_t i = 0; i < sizeof(magic); i++) {
         | 
| 20951 | 
            +
                        if (magic[i] != GGUF_MAGIC[i]) {
         | 
| 20952 | 
            +
                            fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
         | 
| 20953 | 
            +
                            fclose(file);
         | 
| 20954 | 
            +
                            return NULL;
         | 
| 20955 | 
            +
                        }
         | 
| 20941 20956 | 
             
                    }
         | 
| 20942 20957 | 
             
                }
         | 
| 20943 20958 |  | 
| @@ -20947,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p | |
| 20947 20962 |  | 
| 20948 20963 | 
             
                // read the header
         | 
| 20949 20964 | 
             
                {
         | 
| 20950 | 
            -
                    ctx->header.magic  | 
| 20965 | 
            +
                    strncpy(ctx->header.magic, magic, 4);
         | 
| 20966 | 
            +
             | 
| 20951 20967 |  | 
| 20952 20968 | 
             
                    ctx->kv    = NULL;
         | 
| 20953 20969 | 
             
                    ctx->infos = NULL;
         | 
    
        data/ext/llama_cpp/src/ggml.h
    CHANGED
    
    | @@ -231,8 +231,9 @@ | |
| 231 231 | 
             
            #define GGML_EXIT_SUCCESS 0
         | 
| 232 232 | 
             
            #define GGML_EXIT_ABORTED 1
         | 
| 233 233 |  | 
| 234 | 
            -
            #define GGUF_MAGIC | 
| 235 | 
            -
             | 
| 234 | 
            +
            #define GGUF_MAGIC "GGUF"
         | 
| 235 | 
            +
             | 
| 236 | 
            +
            #define GGUF_VERSION 3
         | 
| 236 237 |  | 
| 237 238 | 
             
            #define GGUF_DEFAULT_ALIGNMENT 32
         | 
| 238 239 |  | 
| @@ -326,7 +327,7 @@ extern "C" { | |
| 326 327 | 
             
                    GGML_TYPE_COUNT,
         | 
| 327 328 | 
             
                };
         | 
| 328 329 |  | 
| 329 | 
            -
                enum  | 
| 330 | 
            +
                enum ggml_backend_type {
         | 
| 330 331 | 
             
                    GGML_BACKEND_CPU = 0,
         | 
| 331 332 | 
             
                    GGML_BACKEND_GPU = 10,
         | 
| 332 333 | 
             
                    GGML_BACKEND_GPU_SPLIT = 20,
         | 
| @@ -479,8 +480,10 @@ extern "C" { | |
| 479 480 |  | 
| 480 481 | 
             
                // n-dimensional tensor
         | 
| 481 482 | 
             
                struct ggml_tensor {
         | 
| 482 | 
            -
                    enum ggml_type | 
| 483 | 
            -
                    enum  | 
| 483 | 
            +
                    enum ggml_type         type;
         | 
| 484 | 
            +
                    enum ggml_backend_type backend;
         | 
| 485 | 
            +
             | 
| 486 | 
            +
                    struct ggml_backend_buffer * buffer;
         | 
| 484 487 |  | 
| 485 488 | 
             
                    int     n_dims;
         | 
| 486 489 | 
             
                    int64_t ne[GGML_MAX_DIMS]; // number of elements
         | 
| @@ -514,7 +517,7 @@ extern "C" { | |
| 514 517 |  | 
| 515 518 | 
             
                    void * extra; // extra things e.g. for ggml-cuda.cu
         | 
| 516 519 |  | 
| 517 | 
            -
                    char padding[ | 
| 520 | 
            +
                    char padding[12];
         | 
| 518 521 | 
             
                };
         | 
| 519 522 |  | 
| 520 523 | 
             
                static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
         | 
| @@ -702,6 +705,9 @@ extern "C" { | |
| 702 705 | 
             
                GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
         | 
| 703 706 | 
             
                GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
         | 
| 704 707 |  | 
| 708 | 
            +
                // Context tensor enumeration and lookup
         | 
| 709 | 
            +
                GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
         | 
| 710 | 
            +
                GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
         | 
| 705 711 | 
             
                GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
         | 
| 706 712 |  | 
| 707 713 | 
             
                GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
         | 
| @@ -1358,7 +1364,7 @@ extern "C" { | |
| 1358 1364 |  | 
| 1359 1365 | 
             
                // alibi position embedding
         | 
| 1360 1366 | 
             
                // in-place, returns view(a)
         | 
| 1361 | 
            -
                struct ggml_tensor * ggml_alibi(
         | 
| 1367 | 
            +
                GGML_API struct ggml_tensor * ggml_alibi(
         | 
| 1362 1368 | 
             
                        struct ggml_context * ctx,
         | 
| 1363 1369 | 
             
                        struct ggml_tensor  * a,
         | 
| 1364 1370 | 
             
                        int                   n_past,
         | 
| @@ -1367,7 +1373,7 @@ extern "C" { | |
| 1367 1373 |  | 
| 1368 1374 | 
             
                // clamp
         | 
| 1369 1375 | 
             
                // in-place, returns view(a)
         | 
| 1370 | 
            -
                struct ggml_tensor * ggml_clamp(
         | 
| 1376 | 
            +
                GGML_API struct ggml_tensor * ggml_clamp(
         | 
| 1371 1377 | 
             
                        struct ggml_context * ctx,
         | 
| 1372 1378 | 
             
                        struct ggml_tensor  * a,
         | 
| 1373 1379 | 
             
                        float                 min,
         | 
| @@ -2102,7 +2108,7 @@ extern "C" { | |
| 2102 2108 | 
             
                    enum ggml_type    vec_dot_type;
         | 
| 2103 2109 | 
             
                } ggml_type_traits_t;
         | 
| 2104 2110 |  | 
| 2105 | 
            -
                ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
         | 
| 2111 | 
            +
                GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
         | 
| 2106 2112 |  | 
| 2107 2113 | 
             
            #ifdef  __cplusplus
         | 
| 2108 2114 | 
             
            }
         | 
| @@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) { | |
| 46 46 | 
             
            #if defined(_MSC_VER) || defined(__MINGW32__)
         | 
| 47 47 | 
             
            #include <intrin.h>
         | 
| 48 48 | 
             
            #else
         | 
| 49 | 
            -
            #if !defined(__riscv)
         | 
| 49 | 
            +
            #if !defined(__riscv) && !defined(__s390__)
         | 
| 50 50 | 
             
            #include <immintrin.h>
         | 
| 51 51 | 
             
            #endif
         | 
| 52 52 | 
             
            #endif
         | 
| @@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { | |
| 462 462 | 
             
            }
         | 
| 463 463 |  | 
| 464 464 | 
             
            size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
         | 
| 465 | 
            -
                 | 
| 466 | 
            -
             | 
| 467 | 
            -
                // TODO - collect histograms - although, at a second thought, I don't really care about them
         | 
| 468 | 
            -
                (void)hist;
         | 
| 465 | 
            +
                (void)hist; // TODO: collect histograms
         | 
| 469 466 |  | 
| 470 | 
            -
                for (int j = 0; j <  | 
| 467 | 
            +
                for (int j = 0; j < n; j += k) {
         | 
| 471 468 | 
             
                    block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
         | 
| 472 469 | 
             
                    quantize_row_q2_K_reference(src + j, y, k);
         | 
| 473 470 | 
             
                }
         | 
| @@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { | |
| 678 675 | 
             
            }
         | 
| 679 676 |  | 
| 680 677 | 
             
            size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
         | 
| 681 | 
            -
                 | 
| 682 | 
            -
             | 
| 683 | 
            -
                // TODO - collect histograms - although, at a second thought, I don't really care about them
         | 
| 684 | 
            -
                (void)hist;
         | 
| 678 | 
            +
                (void)hist; // TODO: collect histograms
         | 
| 685 679 |  | 
| 686 | 
            -
                for (int j = 0; j <  | 
| 680 | 
            +
                for (int j = 0; j < n; j += k) {
         | 
| 687 681 | 
             
                    block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
         | 
| 688 682 | 
             
                    quantize_row_q3_K_reference(src + j, y, k);
         | 
| 689 683 | 
             
                }
         | 
| @@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { | |
| 846 840 |  | 
| 847 841 | 
             
            size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
         | 
| 848 842 | 
             
                assert(k % QK_K == 0);
         | 
| 849 | 
            -
                const int nb = k / QK_K;
         | 
| 850 843 | 
             
                (void)hist; // TODO: collect histograms
         | 
| 851 | 
            -
             | 
| 844 | 
            +
             | 
| 845 | 
            +
                for (int j = 0; j < n; j += k) {
         | 
| 852 846 | 
             
                    block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
         | 
| 853 847 | 
             
                    quantize_row_q4_K_reference(src + j, y, k);
         | 
| 854 848 | 
             
                }
         | 
| @@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { | |
| 1052 1046 |  | 
| 1053 1047 | 
             
            size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
         | 
| 1054 1048 | 
             
                assert(k % QK_K == 0);
         | 
| 1055 | 
            -
                 | 
| 1056 | 
            -
             | 
| 1057 | 
            -
                for (int j = 0; j <  | 
| 1049 | 
            +
                (void)hist; // TODO: collect histograms
         | 
| 1050 | 
            +
             | 
| 1051 | 
            +
                for (int j = 0; j < n; j += k) {
         | 
| 1058 1052 | 
             
                    block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
         | 
| 1059 1053 | 
             
                    quantize_row_q5_K_reference(src + j, y, k);
         | 
| 1060 1054 | 
             
                }
         | 
| @@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { | |
| 1200 1194 |  | 
| 1201 1195 | 
             
            size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
         | 
| 1202 1196 | 
             
                assert(k % QK_K == 0);
         | 
| 1203 | 
            -
                 | 
| 1204 | 
            -
             | 
| 1205 | 
            -
                (void)hist; // TODO
         | 
| 1197 | 
            +
                (void)hist; // TODO: collect histograms
         | 
| 1206 1198 |  | 
| 1207 | 
            -
                for (int j = 0; j <  | 
| 1199 | 
            +
                for (int j = 0; j < n; j += k) {
         | 
| 1208 1200 | 
             
                    block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
         | 
| 1209 1201 | 
             
                    quantize_row_q6_K_reference(src + j, y, k);
         | 
| 1210 1202 | 
             
                }
         |