llama_cpp 0.14.2 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
+  data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
+  data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
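The checksums above cover the metadata.gz and data.tar.gz entries inside the published gem (a .gem file is a tar archive containing those two members). For readers who want to verify a download against these values, here is a minimal Ruby sketch; the local filename is an assumption, not part of this diff:

    require 'digest'
    require 'rubygems/package'

    # Hash the metadata.gz and data.tar.gz entries of a downloaded gem so the
    # output can be compared with the SHA256 values in checksums.yaml.
    File.open('llama_cpp-0.14.4.gem', 'rb') do |io|  # hypothetical local path
      Gem::Package::TarReader.new(io) do |tar|
        tar.each do |entry|
          next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
          puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
        end
      end
    end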
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,17 @@
+## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+- Bump llama.cpp from b2496 to b2573.
+- Add file type constants.
+- Bump llama.cpp from b2573 to b2608.
+
+Implementation of llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
+
+## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
+
+- Bump llama.cpp from b2435 to b2496.
+- Add `n_layer` method to `Model`.
+- Add `apply_control_vector` method to `Context`.
+
 ## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16

 - Fix to use metal embed library on macOS.
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1466,6 +1466,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+    rb_define_method(rb_cLLaMAModel, "n_layer", RUBY_METHOD_FUNC(_llama_model_get_model_n_layer), 0);
     rb_define_method(rb_cLLaMAModel, "rope_freq_scale_train", RUBY_METHOD_FUNC(_llama_model_rope_freq_scale_train), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize), -1);
@@ -1613,6 +1614,11 @@ private:
     return INT2NUM(llama_n_embd(ptr->model));
   }

+  static VALUE _llama_model_get_model_n_layer(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_layer(ptr->model));
+  }
+
   static VALUE _llama_model_rope_freq_scale_train(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return DBL2NUM(llama_rope_freq_scale_train(ptr->model));
@@ -2083,6 +2089,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_token", RUBY_METHOD_FUNC(_llama_context_sample_token), 1);
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
   }

 private:
@@ -3153,6 +3160,59 @@ private:

     return Qnil;
   }
+
+  static VALUE _llama_context_apply_control_vector(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("data"), rb_intern("n_embd"), rb_intern("il_start"), rb_intern("il_end") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY) && !NIL_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "data must be an Array or nil");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_embd must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "il_start must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "il_end must be an Integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    std::vector<float> data(RARRAY_LEN(kw_values[0]));
+    for (size_t i = 0; i < data.size(); i++) {
+      data[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+    }
+    const int32_t n_embd = NUM2INT(kw_values[1]);
+    const int32_t il_start = NUM2INT(kw_values[2]);
+    const int32_t il_end = NUM2INT(kw_values[3]);
+
+    int32_t err = 0;
+    if (NIL_P(kw_values[0])) {
+      err = llama_control_vector_apply(ctx_ptr->ctx, NULL, 0, n_embd, il_start, il_end);
+    } else {
+      err = llama_control_vector_apply(ctx_ptr->ctx, data.data(), data.size(), n_embd, il_start, il_end);
+    }
+
+    if (err) {
+      rb_raise(rb_eRuntimeError, "Failed to apply control vector");
+      return Qnil;
+    }
+
+    return Qnil;
+  }
 };

 const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -3311,6 +3371,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));

   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
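The `apply_control_vector` binding added above forwards its keyword arguments straight to `llama_control_vector_apply`. A hedged usage sketch from the Ruby side (model and context setup elided; the zero-filled vector and layer range are placeholders, sized at n_embd floats per layer as the upstream API expects):

    # Apply a control vector across layers il_start..il_end; a zero vector is
    # a stand-in for real direction data.
    n_layers = model.n_layer
    vector = Array.new(model.n_embd * n_layers, 0.0)
    context.apply_control_vector(data: vector, n_embd: model.n_embd,
                                 il_start: 1, il_end: n_layers)

Per the `NIL_P` branch in the binding, passing `data: nil` maps to `llama_control_vector_apply(ctx, NULL, 0, ...)`, which clears a previously applied vector.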
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.2'
+  VERSION = '0.14.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2435'
+  LLAMA_CPP_VERSION = 'b2608'
 end
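A quick runtime check of the bumped constants after upgrading:

    require 'llama_cpp'

    puts LLaMACpp::VERSION           # => "0.14.4"
    puts LLaMACpp::LLAMA_CPP_VERSION # => "b2608"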
data/sig/llama_cpp.rbs
CHANGED
@@ -32,6 +32,10 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer

   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -108,6 +112,7 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx_train: () -> Integer
     def n_embd: () -> Integer
+    def n_layer: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
@@ -244,6 +249,7 @@ module LLaMACpp
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
+    def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
   end

   class ContextParams
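Per the signatures above, `Model#n_layer` returns an Integer and the new file types are plain Integer constants on the module. A minimal sketch; the model path and the `ModelParams` construction are assumptions not shown in this diff:

    require 'llama_cpp'

    model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf',  # hypothetical path
                                params: LLaMACpp::ModelParams.new)
    puts model.n_layer                        # layer count via the new binding
    puts LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_XS  # one of the new file-type constants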
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,15 +1,16 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+	tests/test-json-schema-to-grammar

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -391,14 +392,20 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+	LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -453,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-
-
-
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-
+
 ifdef JETSON_EOL_MODULE_DETECT
-
+define NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 else
+define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(NVCC_COMPILE)
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(NVCC_COMPILE)
+
+endif # LLAMA_CUDA

 ifdef LLAMA_CLBLAST

@@ -511,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 endif # LLAMA_VULKAN

 ifdef LLAMA_HIPBLAS
-
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH ?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -523,7 +540,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -536,9 +553,18 @@ endif # LLAMA_HIP_UMA
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS += ggml-cuda.o
-
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
 endif # LLAMA_HIPBLAS

 ifdef LLAMA_METAL
@@ -555,7 +581,7 @@ endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@

 ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -591,12 +617,17 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif

+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
@@ -611,7 +642,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -621,8 +652,15 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
+$(info )
+
+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
 $(info )
+endif

 #
 # Build library
@@ -643,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+unicode-data.o: unicode-data.cpp unicode-data.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o

 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -663,9 +704,15 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -677,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^

 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf ggml-cuda/*.o

 #
 # Examples
@@ -745,7 +793,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -753,6 +801,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -790,6 +842,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -802,14 +858,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
@@ -861,6 +927,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];

-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr

             ggml_gallocr_hash_get(galloc, src)->n_children += 1;

-            // allocate explicit inputs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
@@ -701,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
-        galloc->leaf_allocs[i].leaf.offset = hn->offset;
-        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.size_max = 0;
+        } else {
+            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        }
     }

     // reallocate buffers if needed
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);