llama_cpp 0.12.5 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
+  data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
+  data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,20 @@
+## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
+
+- Bump bundled llama.cpp from b2143 to b2249.
+- Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
+- Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
+- Add `numa_init` module function to `LLaMACpp`.
+- Remove unnecessary argument from `backend_init`.
+
+Implementation of the `llama_chat_apply_template` binding has been postponed for the time being.
+
+## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17
+
+- Bump bundled llama.cpp from b2106 to b2143.
+- Add constant: `LLAMA_VOCAB_TYPE_WPM`.
+- Add `do_pooling` accessors to ContextParams.
+- Add `embeddings_ith` method to Context.
+
 ## [[0.12.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.4...v0.12.5)] - 2024-02-09
 
 - Bump bundled llama.cpp from b2047 to b2106.
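As the 0.12.7 entries note, `backend_init` lost its `numa:` keyword and NUMA setup moved into a separate `numa_init` module function. A minimal Ruby migration sketch; the strategy number passed to `numa_init` is an assumption based on ggml's `ggml_numa_strategy` enum, since this gem version exposes no named constants for it:

```ruby
require 'llama_cpp'

# 0.12.6 and earlier: LLaMACpp.backend_init(numa: true)
# 0.12.7: backend_init takes no arguments; NUMA is configured separately.
LLaMACpp.backend_init
LLaMACpp.numa_init(1) # 1 = "distribute" in ggml_numa_strategy (assumed value)

# ... create a model and context, run inference ...

LLaMACpp.backend_free
```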
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -978,6 +978,8 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+    rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
+    rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
   }
 
 private:
@@ -1220,6 +1222,18 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.offload_kqv ? Qtrue : Qfalse;
   }
+
+  // do_pooling
+  static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
+    return ptr->params.do_pooling ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_context_params_get_do_pooling(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.do_pooling ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
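The accessor pair above simply round-trips a boolean into `llama_context_params.do_pooling`. A minimal sketch of the Ruby side, assuming nothing beyond the accessors shown in this diff:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.embedding  = true  # embeddings must be enabled for pooling to matter
params.do_pooling = true  # let llama.cpp pool token embeddings per sequence
puts params.do_pooling    # => true
```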
@@ -2029,6 +2043,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
+    rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
@@ -2286,6 +2301,36 @@ private:
     return output;
   }
 
+  static VALUE _llama_context_embeddings_ith(VALUE self, VALUE ith) {
+    if (!RB_INTEGER_TYPE_P(ith)) {
+      rb_raise(rb_eArgError, "ith must be an integer");
+      return Qnil;
+    }
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    VALUE params = rb_iv_get(self, "@params");
+    LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
+    if (!prms_ptr->params.embedding) {
+      rb_raise(rb_eRuntimeError, "embedding parameter is false");
+      return Qnil;
+    }
+
+    VALUE model = rb_iv_get(self, "@model");
+    LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
+    const int n_embd = llama_n_embd(model_ptr->model);
+
+    VALUE output = rb_ary_new();
+    const float* embd = llama_get_embeddings_ith(ptr->ctx, NUM2INT(ith));
+    for (int i = 0; i < n_embd; i++) {
+      rb_ary_push(output, DBL2NUM((double)(embd[i])));
+    }
+
+    return output;
+  }
+
   static VALUE _llama_context_n_ctx(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
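As the binding shows, `embeddings_ith` requires an Integer argument and an `embedding`-enabled context, then copies `n_embd` floats from `llama_get_embeddings_ith` into a Ruby array. A hedged usage sketch; the model path is a placeholder and the `Model` constructor keywords are assumed from the 0.12.x API:

```ruby
require 'llama_cpp'

ctx_params = LLaMACpp::ContextParams.new
ctx_params.embedding = true

model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: ctx_params)

# ... tokenize a prompt and run Context#decode on a Batch first ...

vec = context.embeddings_ith(0) # embedding for the i-th token/sequence
puts vec.length                 # => n_embd of the model
```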
@@ -3198,15 +3243,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
-static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
-  VALUE kw_args = Qnil;
-  ID kw_table[1] = { rb_intern("numa") };
-  VALUE kw_values[1] = { Qundef };
-  rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
-
-  const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
-  llama_backend_init(numa);
+static VALUE rb_llama_llama_backend_init(VALUE self) {
+  llama_backend_init();
 
   return Qnil;
 }
@@ -3217,6 +3255,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
   return Qnil;
 }
 
+static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
+  if (!RB_INTEGER_TYPE_P(strategy)) {
+    rb_raise(rb_eArgError, "strategy must be an integer");
+    return Qnil;
+  }
+
+  llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
+
+  return Qnil;
+}
+
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
   ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
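The binding casts a plain Ruby Integer to ggml's `enum ggml_numa_strategy`, and 0.12.7 exposes no named constants for the strategies, so callers pass raw numbers. A sketch with assumed enum values:

```ruby
# Assumed ggml_numa_strategy numbering at the time of this diff:
# 0 = disabled, 1 = distribute, 2 = isolate, 3 = numactl, 4 = mirror
LLaMACpp.numa_init(2) # e.g. keep work on a single NUMA node ("isolate")
```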
@@ -3300,8 +3349,9 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
   RbLLaMAGrammar::define_class(rb_mLLaMACpp);
 
-  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
+  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
   rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
+  rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
   rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
@@ -3314,6 +3364,7 @@ extern "C" void Init_llama_cpp(void) {
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3345,6 +3396,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3366,6 +3419,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
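For reference, the new pooling constants mirror `llama_pooling_type` in the bundled llama.h; their integer values below are an assumption based on that header:

```ruby
LLaMACpp::LLAMA_POOLING_NONE # => 0 (assumed)
LLaMACpp::LLAMA_POOLING_MEAN # => 1 (assumed)
LLaMACpp::LLAMA_POOLING_CLS  # => 2 (assumed)
```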
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.5'
+  VERSION = '0.12.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2106'
+  LLAMA_CPP_VERSION = 'b2249'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,10 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_VOCAB_TYPE_SPM: Integer
+  LLAMA_VOCAB_TYPE_BPE: Integer
+  LLAMA_VOCAB_TYPE_WPM: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -25,6 +29,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -44,12 +50,17 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_YARN: Integer
   LLAMA_ROPE_SCALING_MAX_VALUE: Integer
 
+  LLAMA_POOLING_NONE: Integer
+  LLAMA_POOLING_MEAN: Integer
+  LLAMA_POOLING_CLS: Integer
+
   LLAMA_SPLIT_NONE: Integer
   LLAMA_SPLIT_LAYER: Integer
   LLAMA_SPLIT_ROW: Integer
 
-  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_init: () -> void
   def self?.backend_free: () -> void
+  def self?.numa_init: (Integer) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -190,6 +201,7 @@ module LLaMACpp
 
   def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
   def embeddings: () -> Array[Float]
+  def embeddings_ith: (Integer) -> Array[Float]
   def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
   def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
   def decode: (::LLaMACpp::Batch) -> void
@@ -270,6 +282,8 @@ module LLaMACpp
   def embedding=: (bool) -> bool
   def offload_kqv: () -> bool
   def offload_kqv=: (bool) -> bool
+  def do_pooling: () -> bool
+  def do_pooling=: (bool) -> bool
 end
 
 class ModelQuantizeParams
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -97,9 +97,10 @@ endif
 #
 
 # keep standard at C11 and C++11
-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS   = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CPPFLAGS  = -I. -Icommon
+MK_CFLAGS    = -std=c11   -fPIC
+MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_NVCCFLAGS = -std=c++11
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
 	MK_LDFLAGS += -g
 
 	ifeq ($(UNAME_S),Linux)
-
+		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 	endif
 else
 	MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 			-Werror=implicit-function-declaration
 MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
 
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+	MK_CFLAGS   += -Werror
+	MK_CXXFLAGS += -Werror
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -381,6 +387,9 @@ ifdef LLAMA_CUBLAS
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
+ifdef LLAMA_FATAL_WARNINGS
+	MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
@@ -439,9 +448,9 @@ ifdef LLAMA_CUDA_CCBIN
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS
 
@@ -526,11 +535,29 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
+ifdef LLAMA_METAL_EMBED_LIBRARY
+	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+	OBJS        += ggml-metal-embed.o
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
+
+ifdef LLAMA_METAL_EMBED_LIBRARY
+ggml-metal-embed.o: ggml-metal.metal
+	@echo "Embedding Metal library"
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
@@ -542,9 +569,10 @@ GF_CC := $(CC)
 include scripts/get-flags.mk
 
 # combine build flags with cmdline overrides
-override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 
@@ -552,7 +580,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
 #
@@ -571,6 +599,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
+CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+ifndef CUDA_DOCKER_ARCH
+ifndef CUDA_POWER_ARCH
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+endif # CUDA_POWER_ARCH
+endif # CUDA_DOCKER_ARCH
+endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
 endif # LLAMA_CUBLAS
 $(info )
 
@@ -625,7 +661,6 @@ lib: llama.o ggml.o $(OBJS)
 
 clean:
 	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
-	find examples pocs -type f -name "*.o" -delete
 
 #
 # Examples
@@ -689,7 +724,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -860,3 +895,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)