llama_cpp 0.14.2 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
+  data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
+  data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
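These digests cover the metadata.gz and data.tar.gz archives packaged inside the gem. A minimal Ruby sketch for checking one of them by hand, using only the standard Digest library (the local file path is an assumption about where you unpacked the .gem archive):

    # Compare data.tar.gz from the unpacked .gem archive against the
    # SHA256 recorded in checksums.yaml above (path is hypothetical).
    require 'digest'

    expected = '571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df'
    actual = Digest::SHA256.file('data.tar.gz').hexdigest
    abort 'checksum mismatch' unless actual == expected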
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,17 @@
+## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+- Bump llama.cpp from b2496 to b2573.
+- Add file type constants.
+- Bump llama.cpp from b2573 to b2608.
+
+Implementation of llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
+
+## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
+
+- Bump llama.cpp from b2435 to b2496.
+- Add `n_layer` method to `Model`.
+- Add `apply_control_vector` method to `Context`.
+
 ## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16
 
 - Fix to use metal embed library on macOS.
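Both 0.14.3 additions surface existing llama.cpp accessors through the Ruby API; a hedged usage sketch (the model path is a placeholder, not taken from this changelog):

    # Sketch only: assumes a local GGUF model file at a hypothetical path.
    require 'llama_cpp'

    model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
    puts model.n_layer # new in 0.14.3: transformer layer count, via llama_n_layer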
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1466,6 +1466,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+    rb_define_method(rb_cLLaMAModel, "n_layer", RUBY_METHOD_FUNC(_llama_model_get_model_n_layer), 0);
     rb_define_method(rb_cLLaMAModel, "rope_freq_scale_train", RUBY_METHOD_FUNC(_llama_model_rope_freq_scale_train), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize), -1);
@@ -1613,6 +1614,11 @@ private:
     return INT2NUM(llama_n_embd(ptr->model));
   }
 
+  static VALUE _llama_model_get_model_n_layer(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_layer(ptr->model));
+  }
+
   static VALUE _llama_model_rope_freq_scale_train(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return DBL2NUM(llama_rope_freq_scale_train(ptr->model));
@@ -2083,6 +2089,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_token", RUBY_METHOD_FUNC(_llama_context_sample_token), 1);
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
   }
 
 private:
@@ -3153,6 +3160,59 @@ private:
 
     return Qnil;
   }
+
+  static VALUE _llama_context_apply_control_vector(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("data"), rb_intern("n_embd"), rb_intern("il_start"), rb_intern("il_end") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY) && !NIL_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "data must be an Array or nil");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_embd must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "il_start must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "il_end must be an Integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    std::vector<float> data(RARRAY_LEN(kw_values[0]));
+    for (size_t i = 0; i < data.size(); i++) {
+      data[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+    }
+    const int32_t n_embd = NUM2INT(kw_values[1]);
+    const int32_t il_start = NUM2INT(kw_values[2]);
+    const int32_t il_end = NUM2INT(kw_values[3]);
+
+    int32_t err = 0;
+    if (NIL_P(kw_values[0])) {
+      err = llama_control_vector_apply(ctx_ptr->ctx, NULL, 0, n_embd, il_start, il_end);
+    } else {
+      err = llama_control_vector_apply(ctx_ptr->ctx, data.data(), data.size(), n_embd, il_start, il_end);
+    }
+
+    if (err) {
+      rb_raise(rb_eRuntimeError, "Failed to apply control vector");
+      return Qnil;
+    }
+
+    return Qnil;
+  }
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -3311,6 +3371,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
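The new binding validates its four keyword arguments and forwards them to llama_control_vector_apply; with data: nil it passes NULL with length 0, which upstream treats as clearing the currently applied control vector. A minimal calling sketch from Ruby (the model/context setup and the vector values are assumptions, not part of this diff):

    # Sketch: `model` and `context` are assumed to be initialized already.
    n_embd = model.n_embd
    data = Array.new(n_embd * model.n_layer, 0.0) # placeholder control vector, one row per layer
    context.apply_control_vector(data: data, n_embd: n_embd, il_start: 1, il_end: model.n_layer)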
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.2'
+  VERSION = '0.14.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2435'
+  LLAMA_CPP_VERSION = 'b2608'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -32,6 +32,10 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
 
   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -108,6 +112,7 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx_train: () -> Integer
     def n_embd: () -> Integer
+    def n_layer: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
@@ -244,6 +249,7 @@ module LLaMACpp
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
+    def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
   end
 
   class ContextParams
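The four new IQ-series constants mirror quantization file types added upstream between b2496 and b2608. A hedged sketch of selecting one when quantizing, assuming the gem's ModelQuantizeParams#ftype= accessor and LLaMACpp.model_quantize module function behave as in earlier releases (paths are placeholders):

    # Sketch only: input/output paths are hypothetical.
    params = LLaMACpp::ModelQuantizeParams.new
    params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_XS
    LLaMACpp.model_quantize(input_path: 'model-f16.gguf', output_path: 'model-iq4_xs.gguf', params: params)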
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,15 +1,16 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+	tests/test-json-schema-to-grammar
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -391,14 +392,20 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+	LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -453,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-
-
-
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-
+
 ifdef JETSON_EOL_MODULE_DETECT
-
+define NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 else
+define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(NVCC_COMPILE)
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(NVCC_COMPILE)
+
+endif # LLAMA_CUDA
 
 ifdef LLAMA_CLBLAST
 
@@ -511,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 endif # LLAMA_VULKAN
 
 ifdef LLAMA_HIPBLAS
-
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH ?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -523,7 +540,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -536,9 +553,18 @@ endif # LLAMA_HIP_UMA
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS += ggml-cuda.o
-
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
 endif # LLAMA_HIPBLAS
 
 ifdef LLAMA_METAL
@@ -555,7 +581,7 @@ endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -591,12 +617,17 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
+
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS := $(LDFLAGS) -lcurl
+endif
 
 #
 # Print build information
 #
@@ -611,7 +642,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -621,8 +652,15 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
+$(info )
+
+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
 $(info )
+endif
 
 #
 # Build library
@@ -643,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+unicode-data.o: unicode-data.cpp unicode-data.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -663,9 +704,15 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
@@ -677,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf ggml-cuda/*.o
 
 #
 # Examples
@@ -745,7 +793,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
@@ -753,6 +801,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -790,6 +842,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -802,14 +858,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
 
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
@@ -861,6 +927,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 
             ggml_gallocr_hash_get(galloc, src)->n_children += 1;
 
-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
@@ -701,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
-        galloc->leaf_allocs[i].leaf.offset = hn->offset;
-        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.size_max = 0;
+        } else {
+            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        }
     }
 
     // reallocate buffers if needed
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);