llama_cpp 0.14.3 → 0.14.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 3b503998061ee4c8a87bc3148d41feda0b45b04cbe0cafdb3897d1d457b26e0a
- data.tar.gz: b761a18fd964ca0a4e871d01cc0a6058527c951413de7b110a8b07862ed64d8c
+ metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
+ data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
  SHA512:
- metadata.gz: 2951b2a59b0579f3afa983283a73853300f822891f0d1dfef292727d6f313392ddc68902144caaca33b173e43e95076dda02ffa97228cf7f65babc4ac82354c9
- data.tar.gz: cb655d32282b28ebaee30b87b882600c79a6666c306de2692a059da3de1bc21d3c988116fd1dd26d97c00ea0f22fdccc8b3f8d94b20cb01c819d9a578c71bd67
+ metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
+ data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
+ ## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
+
+ - Bump llama.cpp from b2608 to b2658.
+ - Add magic number constants.
+ - Add `token_cls` and `token_sep` methods to `Model`.
+
+ Bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file have been skipped.
+
+ ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+ - Bump llama.cpp from b2496 to b2573.
+ - Add file type constants.
+ - Bump llama.cpp from b2573 to b2608.
+
+ Bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 have been skipped.
+
  ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23

  - Bump llama.cpp from b2435 to b2496.
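The new `Model#token_cls` and `Model#token_sep` methods from 0.14.5 can be exercised with a minimal sketch like the one below. The GGUF path is a placeholder, the constructor keywords (`model_path:`, `params:`) follow the gem's README, and the CLS/SEP ids are only meaningful for models whose vocabulary defines them (e.g. BERT-style encoders):

```ruby
require 'llama_cpp'

# Placeholder path -- substitute a real GGUF model file.
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)

puts model.token_bos # beginning-of-sequence token id (existing API)
puts model.token_cls # classification token id (new in 0.14.5)
puts model.token_sep # separator token id (new in 0.14.5)
```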
data/examples/chat.rb CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  end

  if input_echo
- output = []
- embd.each { |token| output << context.model.token_to_piece(token) }
+ output = embd.map { |token| context.model.token_to_piece(token) }
  output_str = output.join
  output_str.chomp!(antiprompt) if first_input
  print(output_str)
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation

  if embd_input.size <= n_consumed
  if antiprompt.size.positive?
- last_output = []
- last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
+ last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
  last_output_str = last_output.join

  search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
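Both chat.rb hunks are behavior-preserving refactors: manual accumulation with `each` and `<<` becomes a single `map`. A quick illustration of the equivalence:

```ruby
tokens = [1, 2, 3]

pieces = []
tokens.each { |t| pieces << t.to_s } # before: build the array by hand

pieces = tokens.map { |t| t.to_s }   # after: same result, more idiomatic
```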
data/ext/llama_cpp/extconf.rb CHANGED
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
  abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
  end
  FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
+ FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
  end

  abort('libstdc++ is not found.') unless have_library('stdc++')
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -1478,6 +1478,8 @@ public:
  rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
  rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
  rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
+ rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
+ rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
  rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
  rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
  rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
@@ -1743,6 +1745,16 @@ private:
  return INT2NUM(llama_token_eos(ptr->model));
  }

+ static VALUE _llama_model_token_cls(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_token_cls(ptr->model));
+ }
+
+ static VALUE _llama_model_token_sep(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_token_sep(ptr->model));
+ }
+
  static VALUE _llama_model_token_nl(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return INT2NUM(llama_token_nl(ptr->model));
@@ -3371,6 +3383,10 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

@@ -3410,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));

+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
+ ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
+
  ss_magic.str("");
  ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));

+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
+ ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
+
  ss_magic.str("");
  ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));

  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
  }
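The constants registered above surface on the `LLaMACpp` module. A minimal sketch; the exact values come from the bundled llama.cpp headers (the GGSQ value shown is an assumption based on upstream's 'ggsq' magic):

```ruby
require 'llama_cpp'

# Magic numbers are exposed as hex strings, versions as strings.
puts LLaMACpp::LLAMA_FILE_MAGIC_GGSQ   # e.g. "0x67677371"
puts LLaMACpp::LLAMA_STATE_SEQ_MAGIC
puts LLaMACpp::LLAMA_STATE_SEQ_VERSION

# The new file type constants are plain integers.
puts LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_XS
```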
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.3'
+ VERSION = '0.14.5'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2496'
+ LLAMA_CPP_VERSION = 'b2658'
  end
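Both bumped constants can be verified at runtime:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.14.5"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2658"
```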
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
  LLAMA_CPP_VERSION: String
  LLAMA_DEFALUT_SEED: String

+ LLAMA_FILE_MAGIC_GGLA: String
+ LLAMA_FILE_MAGIC_GGSN: String
+ LLAMA_FILE_MAGIC_GGSQ: String
+ LLAMA_SESSION_MAGIC: String
+ LLAMA_SESSION_VERSION: String
+ LLAMA_STATE_SEQ_MAGIC: String
+ LLAMA_STATE_SEQ_VERSION: String
+
  LLAMA_VOCAB_TYPE_NONE: Integer
  LLAMA_VOCAB_TYPE_SPM: Integer
  LLAMA_VOCAB_TYPE_BPE: Integer
@@ -32,6 +40,10 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+ LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ1_M: Integer

  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -120,6 +132,8 @@ module LLaMACpp
  def type: (Integer) -> Integer
  def token_bos: () -> Integer
  def token_eos: () -> Integer
+ def token_cls: () -> Integer
+ def token_sep: () -> Integer
  def token_nl: () -> Integer
  def add_bos_token?: () -> bool
  def add_eos_token?: () -> bool
@@ -1,6 +1,6 @@
  MIT License

- Copyright (c) 2023 Georgi Gerganov
+ Copyright (c) 2023-2024 The ggml authors

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -1,8 +1,8 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
+ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

  # Binaries only useful for tests
  TEST_TARGETS = \
@@ -10,7 +10,7 @@ TEST_TARGETS = \
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
  tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
- tests/test-json-schema-to-grammar
+ tests/test-json-schema-to-grammar tests/test-grammar-integration

  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -392,14 +392,20 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS

  ifdef LLAMA_CUBLAS
+ # LLAMA_CUBLAS is deprecated and will be removed in the future
+ LLAMA_CUDA := 1
+ endif
+
+ ifdef LLAMA_CUDA
  ifneq ('', '$(wildcard /opt/cuda)')
  CUDA_PATH ?= /opt/cuda
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
  MK_NVCCFLAGS += -Werror all-warnings
@@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
  else
  MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
  endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
- #ifdef LLAMA_CUDA_CUBLAS
- # MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
- #endif # LLAMA_CUDA_CUBLAS
+ ifdef LLAMA_CUDA_NO_PEER_COPY
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+ endif # LLAMA_CUDA_NO_PEER_COPY
  ifdef LLAMA_CUDA_CCBIN
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
  endif
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
+
  ifdef JETSON_EOL_MODULE_DETECT
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ define NVCC_COMPILE
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ endef # NVCC_COMPILE
  else
+ define NVCC_COMPILE
  $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ endef # NVCC_COMPILE
  endif # JETSON_EOL_MODULE_DETECT
- endif # LLAMA_CUBLAS
+
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ $(NVCC_COMPILE)
+
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ $(NVCC_COMPILE)
+
+ endif # LLAMA_CUDA

  ifdef LLAMA_CLBLAST

@@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
  endif # LLAMA_VULKAN

  ifdef LLAMA_HIPBLAS
-
  ifeq ($(wildcard /opt/rocm),)
  ROCM_PATH ?= /usr
  GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS
  LLAMA_CUDA_DMMV_X ?= 32
  LLAMA_CUDA_MMV_Y ?= 1
  LLAMA_CUDA_KQUANTS_ITER ?= 2
- MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+ MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
  ifdef LLAMA_HIP_UMA
  MK_CPPFLAGS += -DGGML_HIP_UMA
  endif # LLAMA_HIP_UMA
@@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA
  ifdef LLAMA_CUDA_FORCE_DMMV
  HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
  endif # LLAMA_CUDA_FORCE_DMMV
+ ifdef LLAMA_CUDA_NO_PEER_COPY
+ HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+ endif # LLAMA_CUDA_NO_PEER_COPY
  OBJS += ggml-cuda.o
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
  endif # LLAMA_HIPBLAS

  ifdef LLAMA_METAL
@@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
  override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

  # identify CUDA host compiler
- ifdef LLAMA_CUBLAS
+ ifdef LLAMA_CUDA
  GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
  include scripts/get-flags.mk
  CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -617,19 +642,26 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
  $(info I LDFLAGS: $(LDFLAGS))
  $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
- ifdef LLAMA_CUBLAS
+ ifdef LLAMA_CUDA
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
  CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
  ifndef CUDA_DOCKER_ARCH
  ifndef CUDA_POWER_ARCH
- $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
  endif # CUDA_POWER_ARCH
  endif # CUDA_DOCKER_ARCH
  endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
- endif # LLAMA_CUBLAS
+ endif # LLAMA_CUDA
  $(info )

+ ifdef LLAMA_CUBLAS
+ $(info !!!!)
+ $(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+ $(info !!!!)
+ $(info )
+ endif
+
  #
  # Build library
  #
@@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
  unicode.o: unicode.cpp unicode.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+ unicode-data.o: unicode-data.cpp unicode-data.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o

  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
  train.o: common/train.cpp common/train.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

+ ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
  libllama.so: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf ggml-cuda/*.o

  #
  # Examples
@@ -766,6 +805,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -803,6 +846,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -815,14 +862,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

  passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  ifeq ($(UNAME_S),Darwin)
  swift: examples/batched.swift
  (cd examples/batched.swift; make build)
@@ -870,6 +927,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  struct ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
  galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
- galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ if (leaf->view_src || leaf->data) {
+ galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+ galloc->leaf_allocs[i].leaf.size_max = 0;
+ } else {
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ }
  }

  // reallocate buffers if needed
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
  ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);

  // add forward decls here to avoid including the backend headers
- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
  ggml_backend_cuda_reg_devices();
  #endif
@@ -137,7 +137,7 @@ extern "C" {
  /*
  Example usage:

- // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
+ // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
  // preferrably to run on the same backend as the buffer
  ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

@@ -377,6 +377,27 @@ typedef struct {
  } block_iq1_s;
  static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");

+ // 1.75 bpw
+ typedef struct {
+ uint8_t qs[QK_K/8]; // grid index, low 8 bits
+ uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
+ #if QK_K == 64
+ ggml_half d;
+ #endif
+ uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+ } block_iq1_m;
+ #if QK_K == 64
+ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+ #else
+ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+ #endif
+
+ // Used by IQ1_M quants
+ typedef union {
+ ggml_half f16;
+ uint16_t u16;
+ } iq1m_scale_t;
+
  // Non-linear quants
  #define QK4_NL 32
  typedef struct {
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_

  #define GGML_COMMON_IMPL
  #elif defined(GGML_COMMON_IMPL_SYCL)
+
  #include <cstdint>

- #define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
- #define GGML_TABLE_END() });
+ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+ #define GGML_TABLE_END() };

  #define GGML_COMMON_IMPL
  #endif
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()

  #define NGRID_IQ1S 2048
  #define IQ1S_DELTA 0.125f
+ #define IQ1M_DELTA 0.125f
  #if defined(GGML_COMMON_IMPL_C)
  GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
  0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
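For the common QK_K == 256 build (the `#else` branch above, with no `ggml_half d` member), the `block_iq1_m` layout works out to the advertised 1.75 bits per weight; a quick check of the arithmetic:

```ruby
qk_k  = 256                              # weights per block when QK_K != 64
bytes = qk_k / 8 + qk_k / 16 + qk_k / 32 # qs + qh + scales = 32 + 16 + 8 = 56
puts bytes * 8.0 / qk_k                  # => 1.75 bits per weight
```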