llama_cpp 0.12.5 → 0.12.7

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 143fb1bb108c9cc679ed6eddaaca4cb8a52a5321ee4ffd965440a2c92aeeb99e
- data.tar.gz: f522cbf943f82143d1a4eae679473468a9920a6ef6fe6cf88147b82bc6a1f279
+ metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
+ data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
  SHA512:
- metadata.gz: 1646833e8e1ffd6dd22d809ce2c4f2b0f3de78d84504713da4e8d5ab1c2b466c5cbc47a3c787297753f6d56656635e12cf522acffbe37253bdae0c57f8cc51c9
- data.tar.gz: fbbf0372d52ba8862dcc4ff61f590f634cdcde039dc31f09a93ac6cd8e112c34a1c6d567d54a9ec2d0679e1c4ec8c2e8153071c6952f67af34fa0c4ccf49ac76
+ metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
+ data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,20 @@
+ ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
+
+ - Bump bundled llama.cpp from b2143 to b2249.
+ - Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
+ - Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
+ - Add `numa_init` module function to `LLaMACpp`.
+ - Remove unnecessary argument from `backend_init`.
+
+ Implementation of llama_chat_apply_template binding has been postponed for the time being.
+
+ ## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17
+
+ - Bump bundled llama.cpp from b2106 to b2143.
+ - Add constant: `LLAMA_VOCAB_TYPE_WPM`.
+ - Add `do_pooling` accessors to ContextParams.
+ - Add `embeddings_ith` method to Context.
+
  ## [[0.12.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.4...v0.12.5)] - 2024-02-09

  - Bump bundled llama.cpp from b2047 to b2106.
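For reference, a minimal usage sketch of the module-level API after the 0.12.7 changes. This is not part of the gem diff; the NUMA strategy value is an assumption (ggml's "distribute" strategy is typically 1), and model-specific setup is omitted.

```ruby
require 'llama_cpp'

# 0.12.7: backend_init no longer accepts a :numa keyword argument.
LLaMACpp.backend_init

# NUMA configuration is now a separate step; numa_init takes a plain Integer
# matching a ggml_numa_strategy value (1 = "distribute" is an assumption here).
LLaMACpp.numa_init(1)

# New pooling-type constants added in 0.12.7.
puts LLaMACpp::LLAMA_POOLING_MEAN

LLaMACpp.backend_free
```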
@@ -978,6 +978,8 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+ rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
+ rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
  }

  private:
@@ -1220,6 +1222,18 @@ private:
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return ptr->params.offload_kqv ? Qtrue : Qfalse;
  }
+
+ // do_pooling
+ static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
+ return ptr->params.do_pooling ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_context_params_get_do_pooling(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return ptr->params.do_pooling ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -2029,6 +2043,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
+ rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
@@ -2286,6 +2301,36 @@ private:
  return output;
  }

+ static VALUE _llama_context_embeddings_ith(VALUE self, VALUE ith) {
+ if (!RB_INTEGER_TYPE_P(ith)) {
+ rb_raise(rb_eArgError, "ith must be an integer");
+ return Qnil;
+ }
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ VALUE params = rb_iv_get(self, "@params");
+ LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
+ if (!prms_ptr->params.embedding) {
+ rb_raise(rb_eRuntimeError, "embedding parameter is false");
+ return Qnil;
+ }
+
+ VALUE model = rb_iv_get(self, "@model");
+ LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
+ const int n_embd = llama_n_embd(model_ptr->model);
+
+ VALUE output = rb_ary_new();
+ const float* embd = llama_get_embeddings_ith(ptr->ctx, NUM2INT(ith));
+ for (int i = 0; i < n_embd; i++) {
+ rb_ary_push(output, DBL2NUM((double)(embd[i])));
+ }
+
+ return output;
+ }
+
  static VALUE _llama_context_n_ctx(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -3198,15 +3243,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {

  // module functions

- static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("numa") };
- VALUE kw_values[1] = { Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
-
- const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
- llama_backend_init(numa);
+ static VALUE rb_llama_llama_backend_init(VALUE self) {
+ llama_backend_init();

  return Qnil;
  }
@@ -3217,6 +3255,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
  return Qnil;
  }

+ static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
+ if (!RB_INTEGER_TYPE_P(strategy)) {
+ rb_raise(rb_eArgError, "strategy must be an integer");
+ return Qnil;
+ }
+
+ llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
+
+ return Qnil;
+ }
+
  static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
@@ -3300,8 +3349,9 @@ extern "C" void Init_llama_cpp(void) {
  RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
  RbLLaMAGrammar::define_class(rb_mLLaMACpp);

- rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
+ rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
  rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
+ rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
  rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
@@ -3314,6 +3364,7 @@ extern "C" void Init_llama_cpp(void) {

  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));

  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3345,6 +3396,8 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

@@ -3366,6 +3419,10 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));

+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
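Below is a minimal sketch of how the embedding-related additions from 0.12.6 (`do_pooling`, `embeddings_ith`) might be used together. It is illustrative only: the model path is a placeholder, and the `Model`/`ModelParams` constructor keywords are assumptions not shown in this diff.

```ruby
require 'llama_cpp'

LLaMACpp.backend_init

# Assumed model loading; 'embedding-model.gguf' is a placeholder path.
model = LLaMACpp::Model.new(model_path: 'embedding-model.gguf',
                            params: LLaMACpp::ModelParams.new)

params = LLaMACpp::ContextParams.new
params.embedding  = true   # embeddings_ith raises unless embedding output is enabled
params.do_pooling = true   # accessor added in 0.12.6
context = LLaMACpp::Context.new(model: model, params: params)

# ... evaluate or decode a batch of tokens here ...

# Embedding vector for the i-th token/sequence of the last evaluation.
vec = context.embeddings_ith(0)
puts vec.size

LLaMACpp.backend_free
```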
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.12.5'
+ VERSION = '0.12.7'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2106'
+ LLAMA_CPP_VERSION = 'b2249'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,10 @@ module LLaMACpp
  LLAMA_CPP_VERSION: String
  LLAMA_DEFALUT_SEED: String

+ LLAMA_VOCAB_TYPE_SPM: Integer
+ LLAMA_VOCAB_TYPE_BPE: Integer
+ LLAMA_VOCAB_TYPE_WPM: Integer
+
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -25,6 +29,8 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
+ LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer

  LLAMA_KV_OVERRIDE_INT: Integer
  LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -44,12 +50,17 @@ module LLaMACpp
  LLAMA_ROPE_SCALING_YARN: Integer
  LLAMA_ROPE_SCALING_MAX_VALUE: Integer

+ LLAMA_POOLING_NONE: Integer
+ LLAMA_POOLING_MEAN: Integer
+ LLAMA_POOLING_CLS: Integer
+
  LLAMA_SPLIT_NONE: Integer
  LLAMA_SPLIT_LAYER: Integer
  LLAMA_SPLIT_ROW: Integer

- def self?.backend_init: (?numa: bool) -> void
+ def self?.backend_init: () -> void
  def self?.backend_free: () -> void
+ def self?.numa_init: (Integer) -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String,
  ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -190,6 +201,7 @@ module LLaMACpp

  def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
  def embeddings: () -> Array[Float]
+ def embeddings_ith: (Integer) -> Array[Float]
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
  def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
  def decode: (::LLaMACpp::Batch) -> void
@@ -270,6 +282,8 @@ module LLaMACpp
  def embedding=: (bool) -> bool
  def offload_kqv: () -> bool
  def offload_kqv=: (bool) -> bool
+ def do_pooling: () -> bool
+ def do_pooling=: (bool) -> bool
  end

  class ModelQuantizeParams
@@ -97,9 +97,10 @@ endif
  #

  # keep standard at C11 and C++11
- MK_CPPFLAGS = -I. -Icommon
- MK_CFLAGS = -std=c11 -fPIC
- MK_CXXFLAGS = -std=c++11 -fPIC
+ MK_CPPFLAGS = -I. -Icommon
+ MK_CFLAGS = -std=c11 -fPIC
+ MK_CXXFLAGS = -std=c++11 -fPIC
+ MK_NVCCFLAGS = -std=c++11

  # -Ofast tends to produce faster code, but may not be available for some compilers.
  ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
  MK_LDFLAGS += -g

  ifeq ($(UNAME_S),Linux)
- MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+ MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
  endif
  else
  MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
  -Werror=implicit-function-declaration
  MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn

+ ifeq ($(LLAMA_FATAL_WARNINGS),1)
+ MK_CFLAGS += -Werror
+ MK_CXXFLAGS += -Werror
+ endif
+
  # this version of Apple ld64 is buggy
  ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
  MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -381,6 +387,9 @@ ifdef LLAMA_CUBLAS
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  MK_NVCCFLAGS += -use_fast_math
+ ifdef LLAMA_FATAL_WARNINGS
+ MK_NVCCFLAGS += -Werror all-warnings
+ endif # LLAMA_FATAL_WARNINGS
  ifndef JETSON_EOL_MODULE_DETECT
  MK_NVCCFLAGS += --forward-unknown-to-host-compiler
  endif # JETSON_EOL_MODULE_DETECT
@@ -439,9 +448,9 @@ ifdef LLAMA_CUDA_CCBIN
  endif
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
  ifdef JETSON_EOL_MODULE_DETECT
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
  else
- $(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
  endif # JETSON_EOL_MODULE_DETECT
  endif # LLAMA_CUBLAS

@@ -526,11 +535,29 @@ ifdef LLAMA_METAL
  ifdef LLAMA_METAL_NDEBUG
  MK_CPPFLAGS += -DGGML_METAL_NDEBUG
  endif
+ ifdef LLAMA_METAL_EMBED_LIBRARY
+ MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+ OBJS += ggml-metal-embed.o
+ endif
  endif # LLAMA_METAL

  ifdef LLAMA_METAL
  ggml-metal.o: ggml-metal.m ggml-metal.h
  $(CC) $(CFLAGS) -c $< -o $@
+
+ ifdef LLAMA_METAL_EMBED_LIBRARY
+ ggml-metal-embed.o: ggml-metal.metal
+ @echo "Embedding Metal library"
+ $(eval TEMP_ASSEMBLY=$(shell mktemp))
+ @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+ @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+ @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+ @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+ @$(AS) $(TEMP_ASSEMBLY) -o $@
+ @rm -f ${TEMP_ASSEMBLY}
+ endif
  endif # LLAMA_METAL

  ifdef LLAMA_MPI
@@ -542,9 +569,10 @@ GF_CC := $(CC)
  include scripts/get-flags.mk

  # combine build flags with cmdline overrides
- override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
- BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
- override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+ override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+ override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+ BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+ override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
  override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
  override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

@@ -552,7 +580,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
  ifdef LLAMA_CUBLAS
  GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
  include scripts/get-flags.mk
- CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+ CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
  endif

  #
@@ -571,6 +599,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
  ifdef LLAMA_CUBLAS
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
+ CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+ ifndef CUDA_DOCKER_ARCH
+ ifndef CUDA_POWER_ARCH
+ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+ endif # CUDA_POWER_ARCH
+ endif # CUDA_DOCKER_ARCH
+ endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
  endif # LLAMA_CUBLAS
  $(info )

@@ -625,7 +661,6 @@ lib: llama.o ggml.o $(OBJS)

  clean:
  rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
- find examples pocs -type f -name "*.o" -delete

  #
  # Examples
@@ -689,7 +724,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
  $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -860,3 +895,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
  tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+ tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)