llama_cpp 0.12.5 → 0.12.7

This diff shows the changes between publicly released versions of the package as they appear in the supported public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 143fb1bb108c9cc679ed6eddaaca4cb8a52a5321ee4ffd965440a2c92aeeb99e
-  data.tar.gz: f522cbf943f82143d1a4eae679473468a9920a6ef6fe6cf88147b82bc6a1f279
+  metadata.gz: 350a80cc8b804b23ee7b0f4e90604110b09664892d3d7c4217c4cd48c77cf775
+  data.tar.gz: 7a127d3b83cb680969589368eb741c6a2ac6a9765adf9f57dd23c0c1b54ca13d
 SHA512:
-  metadata.gz: 1646833e8e1ffd6dd22d809ce2c4f2b0f3de78d84504713da4e8d5ab1c2b466c5cbc47a3c787297753f6d56656635e12cf522acffbe37253bdae0c57f8cc51c9
-  data.tar.gz: fbbf0372d52ba8862dcc4ff61f590f634cdcde039dc31f09a93ac6cd8e112c34a1c6d567d54a9ec2d0679e1c4ec8c2e8153071c6952f67af34fa0c4ccf49ac76
+  metadata.gz: dbf25eb8f0fd60332eb8452ea400294d5b9b2b09127d0f3c5ef347135f30f565b161123d0f76a8553bcabf9e35db9fac3fff6cdd9df407fb830ab124d0d85d47
+  data.tar.gz: 2bbefd5b502150f052ab556c372c4f37b9cf2de2e22e34f4b2153a3b7ff93d7fca768eec5572d5514d7c46dc2a9c03121487907adc5ede612ecb6cea72de682d
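Both archives inside the gem were rebuilt, so all four checksums rotate. As a quick sanity check, the members of a downloaded gem can be hashed with Ruby's standard library and compared against the values above; this is a minimal sketch, and the local file name is an assumption:

```ruby
require 'digest'
require 'rubygems/package'

# Assumed path to a locally downloaded gem archive.
gem_path = 'llama_cpp-0.12.7.gem'

# A .gem file is a tar archive whose members include metadata.gz and
# data.tar.gz, the two files covered by checksums.yaml above.
File.open(gem_path, 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end
```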
data/CHANGELOG.md CHANGED
@@ -1,3 +1,20 @@
+ ## [[0.12.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.6...v0.12.7)] - 2024-02-24
+
+ - Bump bundled llama.cpp from b2143 to b2249.
+ - Add constants for file type: `LLAMA_FTYPE_MOSTLY_IQ1_S` and `LLAMA_FTYPE_MOSTLY_IQ4_NL`.
+ - Add constants for pooling type: `LLAMA_POOLING_NONE`, `LLAMA_POOLING_MEAN`, and `LLAMA_POOLING_CLS`.
+ - Add `numa_init` module function to `LLaMACpp`.
+ - Remove unnecessary argument from `backend_init`.
+
+ Implementation of the `llama_chat_apply_template` binding has been postponed for the time being.
+
+ ## [[0.12.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.5...v0.12.6)] - 2024-02-17
+
+ - Bump bundled llama.cpp from b2106 to b2143.
+ - Add constant: `LLAMA_VOCAB_TYPE_WPM`.
+ - Add `do_pooling` accessors to ContextParams.
+ - Add `embeddings_ith` method to Context.
+
  ## [[0.12.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.4...v0.12.5)] - 2024-02-09
 
  - Bump bundled llama.cpp from b2047 to b2106.
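For callers, the notable breaking change above is `backend_init`: the `numa:` keyword is removed, and NUMA setup becomes an explicit call to the new `numa_init` module function, which takes a `ggml_numa_strategy` value as an Integer. A minimal migration sketch follows; the strategy value `1` ("distribute") is an assumption based on the `ggml_numa_strategy` enum in the bundled llama.cpp, since 0.12.7 does not export constants for it:

```ruby
require 'llama_cpp'

# 0.12.5 and earlier: LLaMACpp.backend_init(numa: true)
# 0.12.7: backend_init takes no arguments; NUMA is configured separately.
LLaMACpp.backend_init
LLaMACpp.numa_init(1) # assumed: 1 == GGML_NUMA_STRATEGY_DISTRIBUTE in llama.cpp b2249

# ... load a model and run inference ...

LLaMACpp.backend_free
```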
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -978,6 +978,8 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "embedding", RUBY_METHOD_FUNC(_llama_context_params_get_embedding), 0);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+    rb_define_method(rb_cLLaMAContextParams, "do_pooling=", RUBY_METHOD_FUNC(_llama_context_params_set_do_pooling), 1);
+    rb_define_method(rb_cLLaMAContextParams, "do_pooling", RUBY_METHOD_FUNC(_llama_context_params_get_do_pooling), 0);
   }
 
 private:
@@ -1220,6 +1222,18 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.offload_kqv ? Qtrue : Qfalse;
   }
+
+  // do_pooling
+  static VALUE _llama_context_params_set_do_pooling(VALUE self, VALUE do_pooling) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.do_pooling = RTEST(do_pooling) ? true : false;
+    return ptr->params.do_pooling ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_context_params_get_do_pooling(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.do_pooling ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
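From Ruby, `do_pooling` reads and writes like the other boolean accessors on ContextParams; when enabled, the bundled llama.cpp pools per-sequence embeddings (see the `LLAMA_POOLING_*` constants registered further down). A small sketch of the accessor pair:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.embedding = true  # embeddings must be enabled for pooling to apply
params.do_pooling = true # pool per-sequence embeddings at decode time

puts params.do_pooling # => true
```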
@@ -2029,6 +2043,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "decode", RUBY_METHOD_FUNC(_llama_context_decode), 1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
+    rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
@@ -2286,6 +2301,36 @@ private:
     return output;
   }
 
+  static VALUE _llama_context_embeddings_ith(VALUE self, VALUE ith) {
+    if (!RB_INTEGER_TYPE_P(ith)) {
+      rb_raise(rb_eArgError, "ith must be an integer");
+      return Qnil;
+    }
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    VALUE params = rb_iv_get(self, "@params");
+    LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(params);
+    if (!prms_ptr->params.embedding) {
+      rb_raise(rb_eRuntimeError, "embedding parameter is false");
+      return Qnil;
+    }
+
+    VALUE model = rb_iv_get(self, "@model");
+    LLaMAModelWrapper* model_ptr = RbLLaMAModel::get_llama_model(model);
+    const int n_embd = llama_n_embd(model_ptr->model);
+
+    VALUE output = rb_ary_new();
+    const float* embd = llama_get_embeddings_ith(ptr->ctx, NUM2INT(ith));
+    for (int i = 0; i < n_embd; i++) {
+      rb_ary_push(output, DBL2NUM((double)(embd[i])));
+    }
+
+    return output;
+  }
+
   static VALUE _llama_context_n_ctx(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
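`Context#embeddings_ith` wraps llama.cpp's `llama_get_embeddings_ith` and returns the embedding at position `i` of the last evaluated batch as an Array of `n_embd` Floats. A usage sketch, with the model path and prompt as placeholder assumptions; note that the context must be created with `embedding` enabled, otherwise the method raises:

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)

ctx_params = LLaMACpp::ContextParams.new
ctx_params.embedding = true
context = LLaMACpp::Context.new(model: model, params: ctx_params)

tokens = model.tokenize(text: 'hello world', add_bos: true)
context.eval(tokens: tokens, n_past: 0)

vec = context.embeddings_ith(tokens.size - 1) # embedding at the last token position
puts vec.size # => the model's n_embd
```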
@@ -3198,15 +3243,8 @@ const rb_data_type_t RbLLaMAContext::llama_context_type = {
 
 // module functions
 
-static VALUE rb_llama_llama_backend_init(int argc, VALUE* argv, VALUE self) {
-  VALUE kw_args = Qnil;
-  ID kw_table[1] = { rb_intern("numa") };
-  VALUE kw_values[1] = { Qundef };
-  rb_scan_args(argc, argv, ":", &kw_args);
-  rb_get_kwargs(kw_args, kw_table, 0, 1, kw_values);
-
-  const bool numa = kw_values[0] == Qundef ? false : (RTEST(kw_values[0]) ? true : false);
-  llama_backend_init(numa);
+static VALUE rb_llama_llama_backend_init(VALUE self) {
+  llama_backend_init();
 
   return Qnil;
 }
@@ -3217,6 +3255,17 @@ static VALUE rb_llama_llama_backend_free(VALUE self) {
   return Qnil;
 }
 
+static VALUE rb_llama_llama_numa_init(VALUE self, VALUE strategy) {
+  if (!RB_INTEGER_TYPE_P(strategy)) {
+    rb_raise(rb_eArgError, "strategy must be an integer");
+    return Qnil;
+  }
+
+  llama_numa_init(static_cast<enum ggml_numa_strategy>(NUM2INT(strategy)));
+
+  return Qnil;
+}
+
 static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
   VALUE kw_args = Qnil;
   ID kw_table[3] = { rb_intern("input_path"), rb_intern("output_path"), rb_intern("params") };
@@ -3300,8 +3349,9 @@ extern "C" void Init_llama_cpp(void) {
   RbLLaMAGrammarElement::define_class(rb_mLLaMACpp);
   RbLLaMAGrammar::define_class(rb_mLLaMACpp);
 
-  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
+  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, 0);
   rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
+  rb_define_module_function(rb_mLLaMACpp, "numa_init", rb_llama_llama_numa_init, 1);
   rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
   rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
   rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
@@ -3314,6 +3364,7 @@ extern "C" void Init_llama_cpp(void) {
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3345,6 +3396,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3366,6 +3419,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_YARN", INT2NUM(LLAMA_ROPE_SCALING_YARN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_NONE", INT2NUM(LLAMA_POOLING_NONE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_MEAN", INT2NUM(LLAMA_POOLING_MEAN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_CLS", INT2NUM(LLAMA_POOLING_CLS));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_NONE", INT2NUM(LLAMA_SPLIT_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_LAYER", INT2NUM(LLAMA_SPLIT_LAYER));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_ROW", INT2NUM(LLAMA_SPLIT_ROW));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.5'
+  VERSION = '0.12.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2106'
+  LLAMA_CPP_VERSION = 'b2249'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,10 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_VOCAB_TYPE_SPM: Integer
+  LLAMA_VOCAB_TYPE_BPE: Integer
+  LLAMA_VOCAB_TYPE_WPM: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -25,6 +29,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -44,12 +50,17 @@ module LLaMACpp
   LLAMA_ROPE_SCALING_YARN: Integer
   LLAMA_ROPE_SCALING_MAX_VALUE: Integer
 
+  LLAMA_POOLING_NONE: Integer
+  LLAMA_POOLING_MEAN: Integer
+  LLAMA_POOLING_CLS: Integer
+
   LLAMA_SPLIT_NONE: Integer
   LLAMA_SPLIT_LAYER: Integer
   LLAMA_SPLIT_ROW: Integer
 
-  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_init: () -> void
   def self?.backend_free: () -> void
+  def self?.numa_init: (Integer) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -190,6 +201,7 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
+    def embeddings_ith: (Integer) -> Array[Float]
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -270,6 +282,8 @@ module LLaMACpp
     def embedding=: (bool) -> bool
     def offload_kqv: () -> bool
     def offload_kqv=: (bool) -> bool
+    def do_pooling: () -> bool
+    def do_pooling=: (bool) -> bool
   end
 
   class ModelQuantizeParams
@@ -97,9 +97,10 @@ endif
 #
 
 # keep standard at C11 and C++11
-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS   = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CPPFLAGS  = -I. -Icommon
+MK_CFLAGS    = -std=c11   -fPIC
+MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_NVCCFLAGS = -std=c++11
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
 	MK_LDFLAGS += -g
 
 	ifeq ($(UNAME_S),Linux)
-		MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 	endif
 else
 	MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
                -Werror=implicit-function-declaration
 MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
 
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+	MK_CFLAGS   += -Werror
+	MK_CXXFLAGS += -Werror
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -381,6 +387,9 @@ ifdef LLAMA_CUBLAS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
+ifdef LLAMA_FATAL_WARNINGS
+	MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
@@ -439,9 +448,9 @@ ifdef LLAMA_CUDA_CCBIN
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS
 
@@ -526,11 +535,29 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
+ifdef LLAMA_METAL_EMBED_LIBRARY
+	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+	OBJS += ggml-metal-embed.o
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
+
+ifdef LLAMA_METAL_EMBED_LIBRARY
+ggml-metal-embed.o: ggml-metal.metal
+	@echo "Embedding Metal library"
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
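The new `LLAMA_METAL_EMBED_LIBRARY` path avoids shipping `ggml-metal.metal` alongside the binary: the rule writes a tiny assembly stub that `.incbin`s the shader source between two exported symbols, assembles it, and links the resulting object. The same idea, sketched in Ruby rather than Make, with illustrative file names:

```ruby
# Sketch of the embed trick: wrap an arbitrary file in an assembly stub
# so its bytes are linked into the binary between two known symbols.
asm = <<~ASM
  .section __DATA, __ggml_metallib
  .globl _ggml_metallib_start
  _ggml_metallib_start:
  .incbin "ggml-metal.metal"
  .globl _ggml_metallib_end
  _ggml_metallib_end:
ASM
File.write('ggml-metal-embed.s', asm)
system('as', 'ggml-metal-embed.s', '-o', 'ggml-metal-embed.o') or raise 'assembly failed'
```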
@@ -542,9 +569,10 @@ GF_CC := $(CC)
 include scripts/get-flags.mk
 
 # combine build flags with cmdline overrides
-override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
@@ -552,7 +580,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
 #
@@ -571,6 +599,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
+CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+ifndef CUDA_DOCKER_ARCH
+ifndef CUDA_POWER_ARCH
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+endif # CUDA_POWER_ARCH
+endif # CUDA_DOCKER_ARCH
+endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
 endif # LLAMA_CUBLAS
 $(info )
 
@@ -625,7 +661,6 @@ lib: llama.o ggml.o $(OBJS)
 
 clean:
 	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
-	find examples pocs -type f -name "*.o" -delete
 
 #
 # Examples
@@ -689,7 +724,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -860,3 +895,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)