llama_cpp 0.12.5 → 0.12.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
|
4
|
+
data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
|
7
|
+
data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,20 @@
|
|
1
|
+
## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
|
2
|
+
|
3
|
+
- Bump bundled llama.cpp from b2143 to b2249.
|
4
|
+
- Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
|
5
|
+
- Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
|
6
|
+
- Add `numa_init` module function to `LLaMACpp`.
|
7
|
+
- Remove unnecessary argument from `backend_init`.
|
8
|
+
|
9
|
+
Implementation of llama_chat_apply_template binding has been postponed for the time being.
|
10
|
+
|
11
|
+
## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17
|
12
|
+
|
13
|
+
- Bump bundled llama.cpp from b2106 to b2143.
|
14
|
+
- Add constant: `LLAMA_VOCAB_TYPE_WPM`.
|
15
|
+
- Add `do_pooling` accessors to ContextParams.
|
16
|
+
- Add `embeddings_ith` method to Context.
|
17
|
+
|
1
18
|
## [[0.12.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.4...v0.12.5)] - 2024-02-09
|
2
19
|
|
3
20
|
- Bump bundled llama.cpp from b2047 to b2106.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -978,6 +978,8 @@ public:
|
|
978
978
|
rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
|
979
979
|
rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
|
980
980
|
rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
|
981
|
+
rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
|
982
|
+
rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
|
981
983
|
}
|
982
984
|
|
983
985
|
private:
|
@@ -1220,6 +1222,18 @@ private:
|
|
1220
1222
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1221
1223
|
return ptr->params.offload_kqv ? Qtrue : Qfalse;
|
1222
1224
|
}
|
1225
|
+
|
1226
|
+
// do_pooling
|
1227
|
+
static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
|
1228
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1229
|
+
ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
|
1230
|
+
return ptr->params.do_pooling ? Qtrue : Qfalse;
|
1231
|
+
}
|
1232
|
+
|
1233
|
+
static VALUE _llama_context_params_get_do_pooling(VALUE self) {
|
1234
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1235
|
+
return ptr->params.do_pooling ? Qtrue : Qfalse;
|
1236
|
+
}
|
1223
1237
|
};
|
1224
1238
|
|
1225
1239
|
const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
|
@@ -2029,6 +2043,7 @@ public:
|
|
2029
2043
|
rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
|
2030
2044
|
rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
|
2031
2045
|
rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
|
2046
|
+
rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
|
2032
2047
|
rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
|
2033
2048
|
rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
|
2034
2049
|
rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
|
@@ -2286,6 +2301,36 @@ private:
|
|
2286
2301
|
return output;
|
2287
2302
|
}
|
2288
2303
|
|
2304
|
+
static VALUE _llama_context_embeddings_ith(VALUE self, VALUE ith) {
|
2305
|
+
if (!RB_INTEGER_TYPE_P(ith)) {
|
2306
|
+
rb_raise(rb_eArgError, "ith must be an integer");
|
2307
|
+
return Qnil;
|
2308
|
+
}
|
2309
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2310
|
+
if (ptr->ctx == NULL) {
|
2311
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2312
|
+
return Qnil;
|
2313
|
+
}
|
2314
|
+
VALUE params = rb_iv_get(self, "@params");
|
2315
|
+
LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
|
2316
|
+
if (!prms_ptr->params.embedding) {
|
2317
|
+
rb_raise(rb_eRuntimeError, "embedding parameter is false");
|
2318
|
+
return Qnil;
|
2319
|
+
}
|
2320
|
+
|
2321
|
+
VALUE model = rb_iv_get(self, "@model");
|
2322
|
+
LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
|
2323
|
+
const int n_embd = llama_n_embd(model_ptr->model);
|
2324
|
+
|
2325
|
+
VALUE output = rb_ary_new();
|
2326
|
+
const float* embd = llama_get_embeddings_ith(ptr->ctx, NUM2INT(ith));
|
2327
|
+
for (int i = 0; i < n_embd; i++) {
|
2328
|
+
rb_ary_push(output, DBL2NUM((double)(embd[i])));
|
2329
|
+
}
|
2330
|
+
|
2331
|
+
return output;
|
2332
|
+
}
|
2333
|
+
|
2289
2334
|
static VALUE _llama_context_n_ctx(VALUE self) {
|
2290
2335
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2291
2336
|
if (ptr->ctx == NULL) {
|
@@ -3198,15 +3243,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
|
|
3198
3243
|
|
3199
3244
|
// module functions
|
3200
3245
|
|
3201
|
-
static VALUE rb_llama_llama_backend_init(
|
3202
|
-
|
3203
|
-
ID kw_table[1] = { rb_intern("numa") };
|
3204
|
-
VALUE kw_values[1] = { Qundef };
|
3205
|
-
rb_scan_args(argc, argv, ":", &kw_args);
|
3206
|
-
rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
|
3207
|
-
|
3208
|
-
const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
|
3209
|
-
llama_backend_init(numa);
|
3246
|
+
static VALUE rb_llama_llama_backend_init(VALUE self) {
|
3247
|
+
llama_backend_init();
|
3210
3248
|
|
3211
3249
|
return Qnil;
|
3212
3250
|
}
|
@@ -3217,6 +3255,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
|
|
3217
3255
|
return Qnil;
|
3218
3256
|
}
|
3219
3257
|
|
3258
|
+
static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
|
3259
|
+
if (!RB_INTEGER_TYPE_P(strategy)) {
|
3260
|
+
rb_raise(rb_eArgError, "strategy must be an integer");
|
3261
|
+
return Qnil;
|
3262
|
+
}
|
3263
|
+
|
3264
|
+
llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
|
3265
|
+
|
3266
|
+
return Qnil;
|
3267
|
+
}
|
3268
|
+
|
3220
3269
|
static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
|
3221
3270
|
VALUE kw_args = Qnil;
|
3222
3271
|
ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
|
@@ -3300,8 +3349,9 @@ extern "C" void Init_llama_cpp(void) {
|
|
3300
3349
|
RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
|
3301
3350
|
RbLLaMAGrammar::define_class(rb_mLLaMACpp);
|
3302
3351
|
|
3303
|
-
rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init,
|
3352
|
+
rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
|
3304
3353
|
rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
|
3354
|
+
rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
|
3305
3355
|
rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
|
3306
3356
|
rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
|
3307
3357
|
rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
|
@@ -3314,6 +3364,7 @@ extern "C" void Init_llama_cpp(void) {
|
|
3314
3364
|
|
3315
3365
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
|
3316
3366
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
|
3367
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
|
3317
3368
|
|
3318
3369
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
|
3319
3370
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
|
@@ -3345,6 +3396,8 @@ extern "C" void Init_llama_cpp(void) {
|
|
3345
3396
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
|
3346
3397
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
|
3347
3398
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
|
3399
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
|
3400
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
|
3348
3401
|
|
3349
3402
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
|
3350
3403
|
|
@@ -3366,6 +3419,10 @@ extern "C" void Init_llama_cpp(void) {
|
|
3366
3419
|
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
|
3367
3420
|
rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
|
3368
3421
|
|
3422
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
|
3423
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
|
3424
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
|
3425
|
+
|
3369
3426
|
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
|
3370
3427
|
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
|
3371
3428
|
rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.12.
|
6
|
+
VERSION = '0.12.7'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = '
|
9
|
+
LLAMA_CPP_VERSION = 'b2249'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,10 @@ module LLaMACpp
|
|
3
3
|
LLAMA_CPP_VERSION: String
|
4
4
|
LLAMA_DEFALUT_SEED: String
|
5
5
|
|
6
|
+
LLAMA_VOCAB_TYPE_SPM: Integer
|
7
|
+
LLAMA_VOCAB_TYPE_BPE: Integer
|
8
|
+
LLAMA_VOCAB_TYPE_WPM: Integer
|
9
|
+
|
6
10
|
LLAMA_FTYPE_ALL_F32: Integer
|
7
11
|
LLAMA_FTYPE_MOSTLY_F16: Integer
|
8
12
|
LLAMA_FTYPE_MOSTLY_Q4_0: Integer
|
@@ -25,6 +29,8 @@ module LLaMACpp
|
|
25
29
|
LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
|
26
30
|
LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
|
27
31
|
LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
|
32
|
+
LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
|
33
|
+
LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
|
28
34
|
|
29
35
|
LLAMA_KV_OVERRIDE_INT: Integer
|
30
36
|
LLAMA_KV_OVERRIDE_FLOAT: Integer
|
@@ -44,12 +50,17 @@ module LLaMACpp
|
|
44
50
|
LLAMA_ROPE_SCALING_YARN: Integer
|
45
51
|
LLAMA_ROPE_SCALING_MAX_VALUE: Integer
|
46
52
|
|
53
|
+
LLAMA_POOLING_NONE: Integer
|
54
|
+
LLAMA_POOLING_MEAN: Integer
|
55
|
+
LLAMA_POOLING_CLS: Integer
|
56
|
+
|
47
57
|
LLAMA_SPLIT_NONE: Integer
|
48
58
|
LLAMA_SPLIT_LAYER: Integer
|
49
59
|
LLAMA_SPLIT_ROW: Integer
|
50
60
|
|
51
|
-
def self?.backend_init: (
|
61
|
+
def self?.backend_init: () -> void
|
52
62
|
def self?.backend_free: () -> void
|
63
|
+
def self?.numa_init: (Integer) -> void
|
53
64
|
def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
|
54
65
|
def self?.generate: (::LLaMACpp::Context, String,
|
55
66
|
?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
|
@@ -190,6 +201,7 @@ module LLaMACpp
|
|
190
201
|
|
191
202
|
def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
|
192
203
|
def embeddings: () -> Array[Float]
|
204
|
+
def embeddings_ith: (Integer) -> Array[Float]
|
193
205
|
def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
|
194
206
|
def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
|
195
207
|
def decode: (::LLaMACpp::Batch) -> void
|
@@ -270,6 +282,8 @@ module LLaMACpp
|
|
270
282
|
def embedding=: (bool) -> bool
|
271
283
|
def offload_kqv: () -> bool
|
272
284
|
def offload_kqv=: (bool) -> bool
|
285
|
+
def do_pooling: () -> bool
|
286
|
+
def do_pooling=: (bool) -> bool
|
273
287
|
end
|
274
288
|
|
275
289
|
class ModelQuantizeParams
|
@@ -97,9 +97,10 @@ endif
|
|
97
97
|
#
|
98
98
|
|
99
99
|
# keep standard at C11 and C++11
|
100
|
-
MK_CPPFLAGS
|
101
|
-
MK_CFLAGS
|
102
|
-
MK_CXXFLAGS
|
100
|
+
MK_CPPFLAGS = -I. -Icommon
|
101
|
+
MK_CFLAGS = -std=c11 -fPIC
|
102
|
+
MK_CXXFLAGS = -std=c++11 -fPIC
|
103
|
+
MK_NVCCFLAGS = -std=c++11
|
103
104
|
|
104
105
|
# -Ofast tends to produce faster code, but may not be available for some compilers.
|
105
106
|
ifdef LLAMA_FAST
|
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
|
|
172
173
|
MK_LDFLAGS += -g
|
173
174
|
|
174
175
|
ifeq ($(UNAME_S),Linux)
|
175
|
-
|
176
|
+
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
|
176
177
|
endif
|
177
178
|
else
|
178
179
|
MK_CPPFLAGS += -DNDEBUG
|
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
|
|
215
216
|
-Werror=implicit-function-declaration
|
216
217
|
MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
|
217
218
|
|
219
|
+
ifeq ($(LLAMA_FATAL_WARNINGS),1)
|
220
|
+
MK_CFLAGS += -Werror
|
221
|
+
MK_CXXFLAGS += -Werror
|
222
|
+
endif
|
223
|
+
|
218
224
|
# this version of Apple ld64 is buggy
|
219
225
|
ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
|
220
226
|
MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
|
@@ -381,6 +387,9 @@ ifdef LLAMA_CUBLAS
|
|
381
387
|
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
|
382
388
|
OBJS += ggml-cuda.o
|
383
389
|
MK_NVCCFLAGS += -use_fast_math
|
390
|
+
ifdef LLAMA_FATAL_WARNINGS
|
391
|
+
MK_NVCCFLAGS += -Werror all-warnings
|
392
|
+
endif # LLAMA_FATAL_WARNINGS
|
384
393
|
ifndef JETSON_EOL_MODULE_DETECT
|
385
394
|
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
|
386
395
|
endif # JETSON_EOL_MODULE_DETECT
|
@@ -439,9 +448,9 @@ ifdef LLAMA_CUDA_CCBIN
|
|
439
448
|
endif
|
440
449
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
441
450
|
ifdef JETSON_EOL_MODULE_DETECT
|
442
|
-
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
451
|
+
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
443
452
|
else
|
444
|
-
$(NVCC) $(
|
453
|
+
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
445
454
|
endif # JETSON_EOL_MODULE_DETECT
|
446
455
|
endif # LLAMA_CUBLAS
|
447
456
|
|
@@ -526,11 +535,29 @@ ifdef LLAMA_METAL
|
|
526
535
|
ifdef LLAMA_METAL_NDEBUG
|
527
536
|
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
528
537
|
endif
|
538
|
+
ifdef LLAMA_METAL_EMBED_LIBRARY
|
539
|
+
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
|
540
|
+
OBJS += ggml-metal-embed.o
|
541
|
+
endif
|
529
542
|
endif # LLAMA_METAL
|
530
543
|
|
531
544
|
ifdef LLAMA_METAL
|
532
545
|
ggml-metal.o: ggml-metal.m ggml-metal.h
|
533
546
|
$(CC) $(CFLAGS) -c $< -o $@
|
547
|
+
|
548
|
+
ifdef LLAMA_METAL_EMBED_LIBRARY
|
549
|
+
ggml-metal-embed.o: ggml-metal.metal
|
550
|
+
@echo "Embedding Metal library"
|
551
|
+
$(eval TEMP_ASSEMBLY=$(shell mktemp))
|
552
|
+
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
|
553
|
+
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
|
554
|
+
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
|
555
|
+
@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
|
556
|
+
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
|
557
|
+
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
|
558
|
+
@$(AS) $(TEMP_ASSEMBLY) -o $@
|
559
|
+
@rm -f ${TEMP_ASSEMBLY}
|
560
|
+
endif
|
534
561
|
endif # LLAMA_METAL
|
535
562
|
|
536
563
|
ifdef LLAMA_MPI
|
@@ -542,9 +569,10 @@ GF_CC := $(CC)
|
|
542
569
|
include scripts/get-flags.mk
|
543
570
|
|
544
571
|
# combine build flags with cmdline overrides
|
545
|
-
override
|
546
|
-
|
547
|
-
|
572
|
+
override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
|
573
|
+
override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
|
574
|
+
BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
|
575
|
+
override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
|
548
576
|
override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
|
549
577
|
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
550
578
|
|
@@ -552,7 +580,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
|
552
580
|
ifdef LLAMA_CUBLAS
|
553
581
|
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
|
554
582
|
include scripts/get-flags.mk
|
555
|
-
CUDA_CXXFLAGS := $(GF_CXXFLAGS)
|
583
|
+
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
556
584
|
endif
|
557
585
|
|
558
586
|
#
|
@@ -571,6 +599,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
|
571
599
|
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
572
600
|
ifdef LLAMA_CUBLAS
|
573
601
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
602
|
+
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
603
|
+
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
604
|
+
ifndef CUDA_DOCKER_ARCH
|
605
|
+
ifndef CUDA_POWER_ARCH
|
606
|
+
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
|
607
|
+
endif # CUDA_POWER_ARCH
|
608
|
+
endif # CUDA_DOCKER_ARCH
|
609
|
+
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
574
610
|
endif # LLAMA_CUBLAS
|
575
611
|
$(info )
|
576
612
|
|
@@ -625,7 +661,6 @@ lib: llama.o ggml.o $(OBJS)
|
|
625
661
|
|
626
662
|
clean:
|
627
663
|
rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
628
|
-
find examples pocs -type f -name "*.o" -delete
|
629
664
|
|
630
665
|
#
|
631
666
|
# Examples
|
@@ -689,7 +724,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
689
724
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
690
725
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
691
726
|
|
692
|
-
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
727
|
+
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
693
728
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
694
729
|
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
695
730
|
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
@@ -860,3 +895,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
|
|
860
895
|
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
|
861
896
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
862
897
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
898
|
+
|
899
|
+
tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
900
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
901
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|