@fugood/llama.node 1.4.15 → 1.6.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +1 -5
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +76 -61
- package/src/LlamaContext.cpp +20 -32
- package/src/llama.cpp/common/CMakeLists.txt +12 -0
- package/src/llama.cpp/common/arg.cpp +20 -0
- package/src/llama.cpp/common/chat-parser.cpp +3 -3
- package/src/llama.cpp/common/chat-parser.h +4 -4
- package/src/llama.cpp/common/chat.cpp +289 -34
- package/src/llama.cpp/common/chat.h +32 -20
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +31 -25
- package/src/llama.cpp/common/download.cpp +19 -14
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/json-partial.h +1 -0
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +5 -1
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
|
@@ -630,10 +630,11 @@ extern "C" {
|
|
|
630
630
|
|
|
631
631
|
// this tensor...
|
|
632
632
|
enum ggml_tensor_flag {
|
|
633
|
-
GGML_TENSOR_FLAG_INPUT
|
|
634
|
-
GGML_TENSOR_FLAG_OUTPUT
|
|
635
|
-
GGML_TENSOR_FLAG_PARAM
|
|
636
|
-
GGML_TENSOR_FLAG_LOSS
|
|
633
|
+
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
|
|
634
|
+
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
|
|
635
|
+
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
|
|
636
|
+
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
|
637
|
+
GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
|
|
637
638
|
};
|
|
638
639
|
|
|
639
640
|
enum ggml_tri_type {
|
|
@@ -2577,11 +2578,42 @@ extern "C" {
|
|
|
2577
2578
|
struct ggml_tensor * grad,
|
|
2578
2579
|
struct ggml_tensor * sgd_params); // alpha, weight decay
|
|
2579
2580
|
|
|
2581
|
+
// build forward multiple tensors and select one of them for computing
|
|
2582
|
+
// this is useful for creating graphs that have constant topology but compute different things based on the input
|
|
2583
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
|
|
2580
2584
|
//
|
|
2581
|
-
//
|
|
2585
|
+
// nodes:
|
|
2586
|
+
// | - build forward into the graph but do not compute
|
|
2587
|
+
// c - build forward into the graph and compute
|
|
2582
2588
|
//
|
|
2589
|
+
// | | ... c ... |
|
|
2590
|
+
// | | ... c ... |
|
|
2591
|
+
// | | ... c ... |
|
|
2592
|
+
// [0 1 ... idx ... n-1] <-- ggml_build_forward_select(..., n, idx)
|
|
2593
|
+
// c
|
|
2594
|
+
// c
|
|
2595
|
+
//
|
|
2596
|
+
// example:
|
|
2597
|
+
// struct ggml_tensor * curs[3];
|
|
2598
|
+
//
|
|
2599
|
+
// curs[0] = compute0(...);
|
|
2600
|
+
// curs[1] = compute1(...);
|
|
2601
|
+
// curs[2] = compute2(...);
|
|
2602
|
+
//
|
|
2603
|
+
// int idx = select_branch(some_input);
|
|
2604
|
+
//
|
|
2605
|
+
// struct ggml_tensor * out = ggml_build_forward_select(cgraph, curs, 3, idx);
|
|
2606
|
+
//
|
|
2607
|
+
GGML_API struct ggml_tensor * ggml_build_forward_select(
|
|
2608
|
+
struct ggml_cgraph * cgraph,
|
|
2609
|
+
struct ggml_tensor ** tensors,
|
|
2610
|
+
int n_tensors,
|
|
2611
|
+
int idx);
|
|
2612
|
+
|
|
2613
|
+
GGML_API void ggml_build_forward_expand(
|
|
2614
|
+
struct ggml_cgraph * cgraph,
|
|
2615
|
+
struct ggml_tensor * tensor);
|
|
2583
2616
|
|
|
2584
|
-
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
|
2585
2617
|
GGML_API void ggml_build_backward_expand(
|
|
2586
2618
|
struct ggml_context * ctx, // context for gradient computation
|
|
2587
2619
|
struct ggml_cgraph * cgraph,
|
|
@@ -2613,7 +2645,7 @@ extern "C" {
|
|
|
2613
2645
|
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
|
2614
2646
|
|
|
2615
2647
|
// dump the graph into a file using the dot format
|
|
2616
|
-
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph *
|
|
2648
|
+
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename);
|
|
2617
2649
|
|
|
2618
2650
|
// TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
|
|
2619
2651
|
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
|
@@ -2943,6 +2943,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
|
2943
2943
|
continue;
|
|
2944
2944
|
}
|
|
2945
2945
|
|
|
2946
|
+
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
|
2947
|
+
continue;
|
|
2948
|
+
}
|
|
2949
|
+
|
|
2946
2950
|
ggml_compute_forward(&params, node);
|
|
2947
2951
|
|
|
2948
2952
|
if (state->ith == 0 && cplan->abort_callback &&
|
|
@@ -7,10 +7,9 @@
|
|
|
7
7
|
#include "unary-ops.h"
|
|
8
8
|
#include "vec.h"
|
|
9
9
|
|
|
10
|
-
#include <cfloat>
|
|
11
10
|
#include <algorithm>
|
|
11
|
+
#include <cfloat>
|
|
12
12
|
#include <cmath>
|
|
13
|
-
#include <functional>
|
|
14
13
|
|
|
15
14
|
// ggml_compute_forward_dup
|
|
16
15
|
|
|
@@ -7110,12 +7109,13 @@ void ggml_compute_forward_conv_2d_dw(
|
|
|
7110
7109
|
}
|
|
7111
7110
|
}
|
|
7112
7111
|
|
|
7113
|
-
//
|
|
7114
|
-
|
|
7115
|
-
static void ggml_compute_forward_pool_1d_sk_p0(
|
|
7112
|
+
// ggml_compute_forward_pool_1d_ksp
|
|
7113
|
+
static void ggml_compute_forward_pool_1d_ksp(
|
|
7116
7114
|
const ggml_compute_params * params,
|
|
7117
7115
|
const ggml_op_pool op,
|
|
7118
7116
|
const int k,
|
|
7117
|
+
const int s,
|
|
7118
|
+
const int p,
|
|
7119
7119
|
ggml_tensor * dst) {
|
|
7120
7120
|
|
|
7121
7121
|
const ggml_tensor * src = dst->src[0];
|
|
@@ -7126,39 +7126,56 @@ static void ggml_compute_forward_pool_1d_sk_p0(
|
|
|
7126
7126
|
return;
|
|
7127
7127
|
}
|
|
7128
7128
|
|
|
7129
|
-
const
|
|
7130
|
-
const
|
|
7131
|
-
float * drow = (float *)dst->data;
|
|
7129
|
+
const int64_t IW = src->ne[0];
|
|
7130
|
+
const int64_t OW = dst->ne[0];
|
|
7132
7131
|
|
|
7133
|
-
const int64_t
|
|
7132
|
+
const int64_t nr = ggml_nrows(src);
|
|
7134
7133
|
|
|
7135
|
-
|
|
7136
|
-
const
|
|
7137
|
-
|
|
7138
|
-
|
|
7134
|
+
for (int64_t ir = 0; ir < nr; ++ir) {
|
|
7135
|
+
const char * srow_bytes = (const char *) src->data + ir * src->nb[1];
|
|
7136
|
+
float * drow = (float *) (( char *) dst->data + ir * dst->nb[1]);
|
|
7137
|
+
|
|
7138
|
+
for (int64_t ow = 0; ow < OW; ++ow) {
|
|
7139
|
+
float res = 0;
|
|
7139
7140
|
switch (op) {
|
|
7140
|
-
case GGML_OP_POOL_AVG:
|
|
7141
|
-
case GGML_OP_POOL_MAX:
|
|
7141
|
+
case GGML_OP_POOL_AVG: res = 0.0f; break;
|
|
7142
|
+
case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
|
|
7142
7143
|
case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
|
|
7143
7144
|
}
|
|
7145
|
+
|
|
7146
|
+
int count = 0;
|
|
7147
|
+
const int base = (int) ow * s - p;
|
|
7148
|
+
|
|
7144
7149
|
for (int ki = 0; ki < k; ++ki) {
|
|
7145
|
-
const
|
|
7150
|
+
const int j = base + ki;
|
|
7151
|
+
if (j < 0 || j >= (int) IW) {
|
|
7152
|
+
continue;
|
|
7153
|
+
}
|
|
7154
|
+
|
|
7155
|
+
float v;
|
|
7156
|
+
if (src->type == GGML_TYPE_F32) {
|
|
7157
|
+
v = ((const float *) srow_bytes)[j];
|
|
7158
|
+
} else {
|
|
7159
|
+
v = GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) srow_bytes)[j]);
|
|
7160
|
+
}
|
|
7161
|
+
|
|
7146
7162
|
switch (op) {
|
|
7147
|
-
case GGML_OP_POOL_AVG:
|
|
7148
|
-
case GGML_OP_POOL_MAX:
|
|
7149
|
-
case GGML_OP_POOL_COUNT:
|
|
7163
|
+
case GGML_OP_POOL_AVG: res += v; break;
|
|
7164
|
+
case GGML_OP_POOL_MAX: res = std::max(v, res); break;
|
|
7165
|
+
case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
|
|
7150
7166
|
}
|
|
7151
|
-
|
|
7167
|
+
|
|
7168
|
+
++count;
|
|
7152
7169
|
}
|
|
7170
|
+
|
|
7153
7171
|
switch (op) {
|
|
7154
|
-
case GGML_OP_POOL_AVG:
|
|
7155
|
-
case GGML_OP_POOL_MAX:
|
|
7172
|
+
case GGML_OP_POOL_AVG: res = (count > 0) ? (res / count) : 0.0f; break;
|
|
7173
|
+
case GGML_OP_POOL_MAX: break;
|
|
7156
7174
|
case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
|
|
7157
7175
|
}
|
|
7158
|
-
}
|
|
7159
7176
|
|
|
7160
|
-
|
|
7161
|
-
|
|
7177
|
+
drow[ow] = res;
|
|
7178
|
+
}
|
|
7162
7179
|
}
|
|
7163
7180
|
}
|
|
7164
7181
|
|
|
@@ -7173,10 +7190,8 @@ void ggml_compute_forward_pool_1d(
|
|
|
7173
7190
|
const int k0 = opts[1];
|
|
7174
7191
|
const int s0 = opts[2];
|
|
7175
7192
|
const int p0 = opts[3];
|
|
7176
|
-
GGML_ASSERT(p0 == 0); // padding not supported
|
|
7177
|
-
GGML_ASSERT(k0 == s0); // only s = k supported
|
|
7178
7193
|
|
|
7179
|
-
|
|
7194
|
+
ggml_compute_forward_pool_1d_ksp(params, op, k0, s0, p0, dst);
|
|
7180
7195
|
}
|
|
7181
7196
|
|
|
7182
7197
|
// ggml_compute_forward_pool_2d
|
|
@@ -7194,6 +7209,7 @@ void ggml_compute_forward_pool_2d(
|
|
|
7194
7209
|
}
|
|
7195
7210
|
|
|
7196
7211
|
const int32_t * opts = (const int32_t *)dst->op_params;
|
|
7212
|
+
|
|
7197
7213
|
ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
|
|
7198
7214
|
const int k0 = opts[1];
|
|
7199
7215
|
const int k1 = opts[2];
|
|
@@ -7217,11 +7233,13 @@ void ggml_compute_forward_pool_2d(
|
|
|
7217
7233
|
while (cdata < data_end) {
|
|
7218
7234
|
for (int oy = 0; oy < py; ++oy) {
|
|
7219
7235
|
float * const drow = dplane + oy * px;
|
|
7236
|
+
float * const out = drow;
|
|
7237
|
+
|
|
7220
7238
|
for (int ox = 0; ox < px; ++ox) {
|
|
7221
|
-
float
|
|
7239
|
+
float res = 0;
|
|
7222
7240
|
switch (op) {
|
|
7223
|
-
case GGML_OP_POOL_AVG:
|
|
7224
|
-
case GGML_OP_POOL_MAX:
|
|
7241
|
+
case GGML_OP_POOL_AVG: res = 0; break;
|
|
7242
|
+
case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
|
|
7225
7243
|
case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
|
|
7226
7244
|
}
|
|
7227
7245
|
|
|
@@ -7229,24 +7247,32 @@ void ggml_compute_forward_pool_2d(
|
|
|
7229
7247
|
const int iy = offset1 + oy * s1;
|
|
7230
7248
|
|
|
7231
7249
|
for (int ky = 0; ky < k1; ++ky) {
|
|
7232
|
-
if (iy + ky < 0 || iy + ky >= src->ne[1])
|
|
7250
|
+
if (iy + ky < 0 || iy + ky >= src->ne[1]) {
|
|
7251
|
+
continue;
|
|
7252
|
+
}
|
|
7253
|
+
|
|
7233
7254
|
const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky));
|
|
7234
7255
|
for (int kx = 0; kx < k0; ++kx) {
|
|
7235
7256
|
int j = ix + kx;
|
|
7236
|
-
if (j < 0 || j >= src->ne[0])
|
|
7257
|
+
if (j < 0 || j >= src->ne[0]) {
|
|
7258
|
+
continue;
|
|
7259
|
+
}
|
|
7260
|
+
|
|
7237
7261
|
const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
|
|
7238
7262
|
switch (op) {
|
|
7239
|
-
case GGML_OP_POOL_AVG:
|
|
7240
|
-
case GGML_OP_POOL_MAX:
|
|
7263
|
+
case GGML_OP_POOL_AVG: res += srow_j; break;
|
|
7264
|
+
case GGML_OP_POOL_MAX: res = std::max(srow_j, res); break;
|
|
7241
7265
|
case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
|
|
7242
7266
|
}
|
|
7243
7267
|
}
|
|
7244
7268
|
}
|
|
7245
7269
|
switch (op) {
|
|
7246
|
-
case GGML_OP_POOL_AVG:
|
|
7247
|
-
case GGML_OP_POOL_MAX:
|
|
7270
|
+
case GGML_OP_POOL_AVG: res /= ka; break;
|
|
7271
|
+
case GGML_OP_POOL_MAX: break;
|
|
7248
7272
|
case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
|
|
7249
7273
|
}
|
|
7274
|
+
|
|
7275
|
+
out[ox] = res;
|
|
7250
7276
|
}
|
|
7251
7277
|
}
|
|
7252
7278
|
|
|
@@ -654,6 +654,14 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
|
654
654
|
vec_extract(x[0], 2) + \
|
|
655
655
|
vec_extract(x[0], 3); \
|
|
656
656
|
}
|
|
657
|
+
#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
|
|
658
|
+
{ \
|
|
659
|
+
vector float v = vec_add(vec_add(s0, s1), \
|
|
660
|
+
vec_add(s2, s3)); \
|
|
661
|
+
v = vec_add(v, vec_sld(v, v, 8)); \
|
|
662
|
+
v = vec_add(v, vec_sld(v, v, 4)); \
|
|
663
|
+
res += (ggml_float) vec_extract(v, 0); \
|
|
664
|
+
}
|
|
657
665
|
|
|
658
666
|
#define GGML_F32_VEC GGML_F32x4
|
|
659
667
|
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
|
@@ -690,6 +698,29 @@ static inline unsigned char ggml_endian_byte(int i) {
|
|
|
690
698
|
r[i - GGML_ENDIAN_BYTE(0)]), \
|
|
691
699
|
0, p - GGML_F16_EPR)
|
|
692
700
|
|
|
701
|
+
//BF16 POWER9
|
|
702
|
+
#define GGML_BF16_STEP 16
|
|
703
|
+
#define GGML_BF16_EPR 8
|
|
704
|
+
|
|
705
|
+
#define GGML_BF16x8 vector unsigned short
|
|
706
|
+
#define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
|
|
707
|
+
#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
|
|
708
|
+
|
|
709
|
+
#define GGML_BF16_VEC GGML_BF16x8
|
|
710
|
+
#define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
|
|
711
|
+
#define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
|
|
712
|
+
#if defined(__LITTLE_ENDIAN__)
|
|
713
|
+
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
|
|
714
|
+
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
|
|
715
|
+
#else
|
|
716
|
+
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
|
|
717
|
+
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
|
|
718
|
+
#endif
|
|
719
|
+
#define GGML_BF16_FMA_LO(acc, x, y) \
|
|
720
|
+
(acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
|
|
721
|
+
#define GGML_BF16_FMA_HI(acc, x, y) \
|
|
722
|
+
(acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
|
|
723
|
+
|
|
693
724
|
#elif defined(__wasm_simd128__)
|
|
694
725
|
|
|
695
726
|
#define GGML_SIMD
|
|
@@ -237,6 +237,24 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
|
|
|
237
237
|
sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
|
|
238
238
|
|
|
239
239
|
#endif
|
|
240
|
+
#if defined(__POWER9_VECTOR__)
|
|
241
|
+
const int np = (n & ~(GGML_BF16_STEP - 1));
|
|
242
|
+
if (np > 0) {
|
|
243
|
+
GGML_F32_VEC sum[4] = {GGML_F32_VEC_ZERO};
|
|
244
|
+
for (; i < np; i += GGML_BF16_STEP) {
|
|
245
|
+
GGML_BF16_VEC vx0 = GGML_BF16_VEC_LOAD(x + i);
|
|
246
|
+
GGML_BF16_VEC vx1 = GGML_BF16_VEC_LOAD(x + i + 8);
|
|
247
|
+
GGML_BF16_VEC vy0 = GGML_BF16_VEC_LOAD(y + i);
|
|
248
|
+
GGML_BF16_VEC vy1 = GGML_BF16_VEC_LOAD(y + i + 8);
|
|
249
|
+
GGML_BF16_FMA_LO(sum[0], vx0, vy0);
|
|
250
|
+
GGML_BF16_FMA_HI(sum[1], vx0, vy0);
|
|
251
|
+
GGML_BF16_FMA_LO(sum[2], vx1, vy1);
|
|
252
|
+
GGML_BF16_FMA_HI(sum[3], vx1, vy1);
|
|
253
|
+
}
|
|
254
|
+
GGML_F32x4_REDUCE_4(sumf, sum[0], sum[1], sum[2], sum[3]);
|
|
255
|
+
}
|
|
256
|
+
#endif
|
|
257
|
+
|
|
240
258
|
for (; i < n; ++i) {
|
|
241
259
|
sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
|
|
242
260
|
GGML_BF16_TO_FP32(y[i]));
|
|
@@ -21,7 +21,9 @@ struct llama_sampler_deleter {
|
|
|
21
21
|
};
|
|
22
22
|
|
|
23
23
|
struct llama_adapter_lora_deleter {
|
|
24
|
-
void operator()(llama_adapter_lora *
|
|
24
|
+
void operator()(llama_adapter_lora *) {
|
|
25
|
+
// llama_adapter_lora_free is deprecated
|
|
26
|
+
}
|
|
25
27
|
};
|
|
26
28
|
|
|
27
29
|
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
|
|
@@ -646,7 +646,8 @@ extern "C" {
|
|
|
646
646
|
|
|
647
647
|
// Manually free a LoRA adapter
|
|
648
648
|
// NOTE: loaded adapters will be freed when the associated model is deleted
|
|
649
|
-
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter)
|
|
649
|
+
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
|
|
650
|
+
"adapters are now freed together with the associated model");
|
|
650
651
|
|
|
651
652
|
// Get the invocation tokens if the current lora is an alora
|
|
652
653
|
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
|
|
@@ -1255,7 +1256,6 @@ extern "C" {
|
|
|
1255
1256
|
// [EXPERIMENTAL]
|
|
1256
1257
|
// attach a sampler to the context
|
|
1257
1258
|
// note: prefer initializing the context with llama_context_params.samplers when possible
|
|
1258
|
-
// note: changing the samplers of a context can cause graph reallocations and degraded performance
|
|
1259
1259
|
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
|
|
1260
1260
|
|
|
1261
1261
|
// mirror of llama_sampler_i:
|
|
@@ -1395,6 +1395,33 @@ extern "C" {
|
|
|
1395
1395
|
const char ** seq_breakers,
|
|
1396
1396
|
size_t num_breakers);
|
|
1397
1397
|
|
|
1398
|
+
/// adaptive-p: select tokens near a configurable target probability over time.
|
|
1399
|
+
///
|
|
1400
|
+
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
|
|
1401
|
+
/// that fall near a user-configurable probability target.
|
|
1402
|
+
///
|
|
1403
|
+
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
|
|
1404
|
+
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
|
|
1405
|
+
/// adapted target probability at each sampling step, thus maintaining the desired target
|
|
1406
|
+
/// probability over time.
|
|
1407
|
+
///
|
|
1408
|
+
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
|
|
1409
|
+
/// in the sampler chain (like mirostat, dist, greedy).
|
|
1410
|
+
///
|
|
1411
|
+
/// only mild truncation before this sampler is recommended. we suggest applying min-p
|
|
1412
|
+
/// before adaptive-p as the only other active sampler in the chain.
|
|
1413
|
+
///
|
|
1414
|
+
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
|
|
1415
|
+
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
|
|
1416
|
+
/// @param seed RNG seed
|
|
1417
|
+
///
|
|
1418
|
+
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
|
|
1419
|
+
///
|
|
1420
|
+
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
|
|
1421
|
+
float target,
|
|
1422
|
+
float decay,
|
|
1423
|
+
uint32_t seed);
|
|
1424
|
+
|
|
1398
1425
|
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
|
|
1399
1426
|
int32_t n_vocab,
|
|
1400
1427
|
int32_t n_logit_bias,
|
|
@@ -146,11 +146,9 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
|
|
|
146
146
|
return nullptr;
|
|
147
147
|
}
|
|
148
148
|
|
|
149
|
-
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
|
|
149
|
+
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
|
|
150
150
|
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
|
|
151
151
|
|
|
152
|
-
llama_model & model = adapter.model;
|
|
153
|
-
|
|
154
152
|
ggml_context * ctx_init;
|
|
155
153
|
gguf_init_params meta_gguf_params = {
|
|
156
154
|
/* .no_alloc = */ true,
|
|
@@ -413,17 +411,17 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
|
|
|
413
411
|
}
|
|
414
412
|
}
|
|
415
413
|
|
|
416
|
-
//
|
|
417
|
-
model.
|
|
414
|
+
// register adapter with model
|
|
415
|
+
model.loras.insert(&adapter);
|
|
418
416
|
|
|
419
417
|
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
|
|
420
418
|
}
|
|
421
419
|
|
|
422
420
|
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
|
|
423
|
-
llama_adapter_lora * adapter = new llama_adapter_lora(
|
|
421
|
+
llama_adapter_lora * adapter = new llama_adapter_lora();
|
|
424
422
|
|
|
425
423
|
try {
|
|
426
|
-
llama_adapter_lora_init_impl(path_lora, *adapter);
|
|
424
|
+
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
|
|
427
425
|
return adapter;
|
|
428
426
|
} catch (const std::exception & err) {
|
|
429
427
|
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
|
@@ -473,12 +471,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
|
|
|
473
471
|
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
|
474
472
|
}
|
|
475
473
|
|
|
476
|
-
void llama_adapter_lora_free(llama_adapter_lora *
|
|
477
|
-
//
|
|
478
|
-
GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
|
|
479
|
-
adapter->model.n_lora_nodes -= adapter->get_n_nodes();
|
|
480
|
-
|
|
481
|
-
delete adapter;
|
|
474
|
+
void llama_adapter_lora_free(llama_adapter_lora *) {
|
|
475
|
+
// deprecated: adapters are freed by llama_model's destructor
|
|
482
476
|
}
|
|
483
477
|
|
|
484
478
|
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
|
|
@@ -59,8 +59,6 @@ struct llama_adapter_lora_weight {
|
|
|
59
59
|
};
|
|
60
60
|
|
|
61
61
|
struct llama_adapter_lora {
|
|
62
|
-
llama_model & model;
|
|
63
|
-
|
|
64
62
|
// map tensor name to lora_a_b
|
|
65
63
|
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
|
|
66
64
|
|
|
@@ -75,7 +73,7 @@ struct llama_adapter_lora {
|
|
|
75
73
|
// activated lora (aLoRA)
|
|
76
74
|
std::vector<llama_token> alora_invocation_tokens;
|
|
77
75
|
|
|
78
|
-
llama_adapter_lora(
|
|
76
|
+
llama_adapter_lora() = default;
|
|
79
77
|
~llama_adapter_lora() = default;
|
|
80
78
|
|
|
81
79
|
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
|