@fugood/llama.node 0.3.12 → 0.3.13
This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +13 -4
- package/src/llama.cpp/.github/workflows/build.yml +35 -3
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +20 -3
- package/src/llama.cpp/common/arg.cpp +180 -3
- package/src/llama.cpp/common/chat-template.hpp +21 -7
- package/src/llama.cpp/common/chat.cpp +220 -101
- package/src/llama.cpp/common/chat.hpp +3 -0
- package/src/llama.cpp/common/common.h +15 -7
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/minja.hpp +24 -9
- package/src/llama.cpp/common/sampling.cpp +52 -46
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/run/run.cpp +5 -12
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +58 -47
- package/src/llama.cpp/examples/server/utils.hpp +7 -5
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
- package/src/llama.cpp/ggml/src/ggml.c +1 -1
- package/src/llama.cpp/include/llama.h +14 -10
- package/src/llama.cpp/src/llama-grammar.cpp +1 -1
- package/src/llama.cpp/src/llama-grammar.h +1 -1
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +131 -57
- package/src/llama.cpp/src/llama.cpp +7 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
- package/src/llama.cpp/tests/test-chat.cpp +237 -69
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c:

@@ -7,10 +7,8 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
-#include "ggml-quants.h"
 #include "ggml-cpu-quants.h"
 #include "ggml-threading.h"
-#include "amx/amx.h"
 #include "ggml.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -1078,29 +1076,23 @@ do { \
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 8

-// F16 arithmetic is not supported by
+// F16 arithmetic is not supported by LASX, so we use F32 instead

 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-
-
-
-
-}
-
-    return (__m256)__lasx_xvld(tmp, 0);
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
-
-    __lasx_xvst(y, arr, 0);

-
-
-
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1297,7 +1289,7 @@ struct ggml_threadpool {
     atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
-    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+    atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop; // Used for stopping the threadpool altogether
@@ -1824,7 +1816,7 @@ inline static float ggml_silu_f32(float x) {

 #if __FINITE_MATH_ONLY__
 #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
-#error "ref: https://github.com/
+#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif

 #if defined(__ARM_NEON) && defined(__aarch64__)
@@ -7496,6 +7488,7 @@ UseGgmlGemm1:;
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;

+        const size_t nbw0 = ggml_type_size(vec_dot_type);
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
@@ -7503,6 +7496,7 @@ UseGgmlGemm1:;
         assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);

+#if 0
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
             for (int64_t i12 = 0; i12 < ne12; ++i12) {
                 for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
@@ -7512,6 +7506,20 @@ UseGgmlGemm1:;
                 }
             }
         }
+#else
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    size_t bs = ggml_blck_size(vec_dot_type);
+                    int64_t ne10_block_start = (ith * ne10/bs) / nth;
+                    int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
+                               (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
+                               (ne10_block_end - ne10_block_start) * bs);
+                }
+            }
+        }
+#endif
     }

     if (ith == 0) {
@@ -7566,7 +7574,7 @@ UseGgmlGemm2:;
     int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

     // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
-    // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/
+    // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
     // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
     if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
         // distribute the thread work across the inner or outer loop based on which one is larger
@@ -7599,7 +7607,6 @@ UseGgmlGemm2:;
     if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
         num_rows_per_vec_dot = 1;
     }
-
     ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);

     if (nth >= nchunk0 * nchunk1) {
@@ -7612,6 +7619,84 @@ UseGgmlGemm2:;

 // ggml_compute_forward_mul_mat_id

+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)]
+
+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+static void ggml_compute_forward_mul_mat_id_one_chunk(
+    struct ggml_tensor * dst,
+    const struct ggml_tensor * src0,
+    const struct ggml_tensor * src1,
+    const struct ggml_tensor * ids,
+    const int64_t cur_a,
+    const int64_t ir0_start,
+    const int64_t ir0_end,
+    const int64_t ir1_start,
+    const int64_t ir1_end,
+    const char * src0_cur,
+    const struct mmid_row_mapping * matrix_rows,
+    const size_t row_size,
+    const bool src1_cont,
+    const void * wdata) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
+    enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
+
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    float tmp[16];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
+                const int64_t _i12 = ir1; // logical row index for this expert
+
+                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                const int id = row_mapping.i1; // selected expert index
+
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = row_mapping.i2; // row index in src1
+
+                const int64_t i1 = id;  // selected expert index
+                const int64_t i2 = i12; // row
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
+                }
+
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
+            }
+        }
+    }
+}
+
+static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
+
+    void * ptr = *p;
+    ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
+    *p = (void *) ((char *) ptr + size);
+    return ptr;
+}
+
 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
@@ -7629,7 +7714,6 @@ static void ggml_compute_forward_mul_mat_id(

     const bool src1_cont = ggml_is_contiguous(src1);

-    ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
     enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
     ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;

@@ -7647,21 +7731,27 @@ static void ggml_compute_forward_mul_mat_id(
     const int n_ids = ids->ne[0]; // n_expert_used
     const int n_as = ne02; // n_expert

-
-        (char *) params->wdata :
-        (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    void * wdata_cur = params->wdata;

-
-
-
-    };
+    if (src1->type != vec_dot_type) {
+        incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    }

-    int64_t * matrix_row_counts =
-
+    int64_t * matrix_row_counts = // [n_as]
+        incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));
+
+    struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
+        incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));
+
+    char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
+        incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);
+
+    GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));

     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;

+        const size_t nbw0 = ggml_type_size(vec_dot_type);
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
@@ -7669,19 +7759,32 @@ static void ggml_compute_forward_mul_mat_id(
         assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);

+#if 0
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 =
-                for (int64_t i11 =
+            for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
                     from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
                                (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
                                ne10);
                 }
             }
         }
+#else
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    size_t bs = ggml_blck_size(vec_dot_type);
+                    int64_t ne10_block_start = (ith * ne10/bs) / nth;
+                    int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
+                               (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
+                               (ne10_block_end - ne10_block_start) * bs);
+                }
+            }
+        }
+#endif
     }

-#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
-
     if (ith == 0) {
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7699,9 +7802,14 @@ static void ggml_compute_forward_mul_mat_id(
         }
     }

+    // reset current_chunk
+    for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
+        atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
+        *current_chunk_ctr = nth;
+    }
+
     ggml_barrier(params->threadpool);

-    // compute each matrix multiplication in sequence
     for (int cur_a = 0; cur_a < n_as; ++cur_a) {
         const int64_t cne1 = matrix_row_counts[cur_a];

@@ -7709,84 +7817,64 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }

-        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
-
-        const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const char * src0_cur = (const char *) src0->data + cur_a * nb02;
+        const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);

-        const int64_t nr0 = ne01;
-        const int64_t nr1 = cne1;
-
-        // distribute the thread work across the inner or outer loop based on which one is larger
-
-        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-        const int64_t ith0 = ith % nth0;
-        const int64_t ith1 = ith / nth0;
+        const int64_t nr0 = ne01;
+        const int64_t nr1 = cne1;

-
-
-
-
-        const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-        const int64_t ir110 = dr1*ith1;
-        const int64_t ir111 = MIN(ir110 + dr1, nr1);
+        int chunk_size = 16;
+        if (nr0 == 1 || nr1 == 1) {
+            chunk_size = 64;
+        }

-
-        //
-
-
-        //
+#if defined(__aarch64__)
+        // disable for ARM
+        const bool disable_chunking = true;
+#else
+        // disable for NUMA
+        const bool disable_chunking = ggml_is_numa();
+#endif // defined(__aarch64__)

-
-
-        const int64_t blck_1 = 16;
+        int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+        int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

-
-
+        if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) {
+            nchunk0 = nr0 > nr1 ? nth : 1;
+            nchunk1 = nr0 > nr1 ? 1 : nth;
+        }

-
-
-        for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-            const int64_t _i12 = ir1; // logical row index for this expert
+        const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+        const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

-
-            const int id = row_mapping.i1; // selected expert index
+        int current_chunk = ith;

-
-            const int64_t i12 = row_mapping.i2; // row index in src1
+        atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);

-
-
+        while (current_chunk < nchunk0 * nchunk1) {
+            const int64_t ith0 = current_chunk % nchunk0;
+            const int64_t ith1 = current_chunk / nchunk0;

-
-
-            // the original src1 data pointer, so we should index using the indices directly
-            // TODO: this is a bit of a hack, we should probably have a better way to handle this
-            const char * src1_col = (const char *) wdata +
-                (src1_cont || src1->type != vec_dot_type
-                    ? (i11 + i12*ne11)*row_size
-                    : (i11*nb11 + i12*nb12));
+            const int64_t ir0_start = dr0 * ith0;
+            const int64_t ir0_end = MIN(ir0_start + dr0, nr0);

-
+            const int64_t ir1_start = dr1 * ith1;
+            const int64_t ir1_end = MIN(ir1_start + dr1, nr1);

-
-
-
+            ggml_compute_forward_mul_mat_id_one_chunk(
+                dst, src0, src1, ids, cur_a,
+                ir0_start, ir0_end, ir1_start, ir1_end,
+                src0_cur, matrix_rows, row_size, src1_cont, wdata
+            );

-
-
-                }
-
-                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-            }
+            if (nth >= nchunk0 * nchunk1) {
+                break;
             }
+
+            current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
         }
     }
-
-#undef MMID_MATRIX_ROW
 }

 // ggml_compute_forward_out_prod
@@ -9080,10 +9168,6 @@ static void ggml_compute_forward_clamp_f32(

     const struct ggml_tensor * src0 = dst->src[0];

-    if (params->ith != 0) {
-        return;
-    }
-
     float min;
     float max;
     memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
@@ -13723,14 +13807,19 @@ struct ggml_cplan ggml_graph_plan(
                    cur = 0;
                    const struct ggml_tensor * src0 = node->src[0];
                    const struct ggml_tensor * src1 = node->src[1];
+                    const struct ggml_tensor * ids = node->src[2];
                    const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
+                    const int n_as = src0->ne[2];
+                    // src1
                    if (src1->type != vec_dot_type) {
-                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t);
                    }
-
-                    cur +=
-
-                    cur += n_as
+                    // matrix_row_counts
+                    cur += n_as * sizeof(int64_t) + sizeof(int64_t);
+                    // matrix_rows
+                    cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
+                    // atomic_current_chunk
+                    cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
                } break;
            case GGML_OP_OUT_PROD:
                {
@@ -13862,9 +13951,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            tp->ec = GGML_STATUS_ABORTED;
        }

-
+        if (node_n + 1 < cgraph->n_nodes) {
+            ggml_barrier(state->threadpool);
+        }
    }

+    ggml_barrier(state->threadpool);
+
    return 0;
 }

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp:

@@ -284,14 +284,14 @@ struct ggml_backend_cpu_device_context {
                        &hKey) == ERROR_SUCCESS) {
            DWORD cpu_brand_size = 0;
            if (RegQueryValueExA(hKey,
-
+                                "ProcessorNameString",
                                NULL,
                                NULL,
                                NULL,
                                &cpu_brand_size) == ERROR_SUCCESS) {
                description.resize(cpu_brand_size);
                if (RegQueryValueExA(hKey,
-
+                                    "ProcessorNameString",
                                    NULL,
                                    NULL,
                                    (LPBYTE)&description[0], // NOLINT
@@ -534,9 +534,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
    if (ggml_cpu_has_dotprod()) {
        features.push_back({ "DOTPROD", "1" });
    }
-    if (ggml_cpu_has_matmul_int8()) {
-        features.push_back({ "MATMUL_INT8", "1" });
-    }
    if (ggml_cpu_get_sve_cnt() > 0) {
        static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
        features.push_back({ "SVE_CNT", sve_cnt.c_str() });
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp:

@@ -280,14 +280,6 @@ template <> inline __m256bh load(const float *p) {
 }
 #endif

-////////////////////////////////////////////////////////////////////////////////////////////////////
-// CONSTANTS
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
-#endif
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FLOATING POINT MATRIX MULTIPLICATION

@@ -614,6 +606,14 @@ class tinyBLAS_Q0_AVX {
           TC *C, int64_t ldc,
           int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+        const int8_t kvalues_iq4nl[16] = {
+            -127, -104, -83, -65,
+            -49, -35, -22, -10,
+            1, 13, 25, 38,
+            53, 69, 89, 113
+        };
+
+        iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
    }

    void matmul(int64_t m, int64_t n) {
@@ -1038,6 +1038,7 @@ class tinyBLAS_Q0_AVX {
    const int64_t ldc;
    const int ith;
    const int nth;
+    __m128i iq4nlt;
 };
 #endif // __AVX__

package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt:

@@ -15,9 +15,9 @@ if (CUDAToolkit_FOUND)
    if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
        set(CMAKE_CUDA_ARCHITECTURES "native")
    elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-        set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
+        set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80")
    else()
-        set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
+        set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75;80")
    endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp:

@@ -143,6 +143,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_rms_norm;
    cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
    cl_kernel kernel_soft_max, kernel_soft_max_4;
+    cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
    cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
@@ -614,6 +615,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
    CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
    CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4_f16", &err), err));
    CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
    CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
    CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
@@ -1044,8 +1047,16 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
            return true;
        case GGML_OP_DIAG_MASK_INF:
            return op->ne[3] == 1;
-        case GGML_OP_ROPE:
+        case GGML_OP_ROPE: {
+            const int mode = ((const int32_t *) op->op_params)[2];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                return false;
+            }
+            if (mode & GGML_ROPE_TYPE_VISION) {
+                return false;
+            }
            return true;
+        }
        default:
            return false;
    }
@@ -3666,6 +3677,8 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
    // Local size must be wave size. Each workgroup is a wave, working on a row,
    // where a row corresponds to leading dimension.
    int nth = MIN(32, ne00);
@@ -3683,9 +3696,17 @@
    cl_kernel kernel;

    if (ne00%4 == 0) {
-
+        if (use_f16) {
+            kernel = backend_ctx->kernel_soft_max_4_f16;
+        } else {
+            kernel = backend_ctx->kernel_soft_max_4;
+        }
    } else {
-
+        if (use_f16) {
+            kernel = backend_ctx->kernel_soft_max_f16;
+        } else {
+            kernel = backend_ctx->kernel_soft_max;
+        }
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
@@ -3766,7 +3787,8 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
    const int nb2 = dst ? dst->nb[2] : 0;
    const int nb3 = dst ? dst->nb[3] : 0;

-    GGML_ASSERT(ne10 ==
+    GGML_ASSERT(ne10 % ne02 == 0);
+    GGML_ASSERT(ne10 >= ne02);

    int nth = MIN(64, ne00);
