llama_cpp 0.14.3 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3b503998061ee4c8a87bc3148d41feda0b45b04cbe0cafdb3897d1d457b26e0a
-  data.tar.gz: b761a18fd964ca0a4e871d01cc0a6058527c951413de7b110a8b07862ed64d8c
+  metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
+  data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
 SHA512:
-  metadata.gz: 2951b2a59b0579f3afa983283a73853300f822891f0d1dfef292727d6f313392ddc68902144caaca33b173e43e95076dda02ffa97228cf7f65babc4ac82354c9
-  data.tar.gz: cb655d32282b28ebaee30b87b882600c79a6666c306de2692a059da3de1bc21d3c988116fd1dd26d97c00ea0f22fdccc8b3f8d94b20cb01c819d9a578c71bd67
+  metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
+  data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
+## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
+
+- Bump llama.cpp from b2608 to b2658.
+- Add magic number constants.
+- Add `token_cls` and `token_sep` methods to `Model`.
+
+Implementation bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file have been skipped.
+
+## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+- Bump llama.cpp from b2496 to b2573.
+- Add file type constants.
+- Bump llama.cpp from b2573 to b2608.
+
+Implementation bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 have been skipped.
+
 ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
 
 - Bump llama.cpp from b2435 to b2496.
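The 0.14.5 entry above adds CLS and SEP token accessors to `Model`. A minimal sketch of querying them from Ruby, assuming the model-loading flow from the gem's README; the model path and `Model.new` keyword arguments are illustrative, not taken from this diff:

```ruby
require 'llama_cpp'

# Load a model as in the gem's README; the path is a placeholder.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)

# New in 0.14.5: CLS and SEP token ids, next to the existing BOS/EOS accessors.
puts "cls: #{model.token_cls}"
puts "sep: #{model.token_sep}"
puts "bos: #{model.token_bos}, eos: #{model.token_eos}"
```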
data/examples/chat.rb CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
         end
 
         if input_echo
-          output = []
-          embd.each { |token| output << context.model.token_to_piece(token) }
+          output = embd.map { |token| context.model.token_to_piece(token) }
           output_str = output.join
           output_str.chomp!(antiprompt) if first_input
           print(output_str)
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
         if embd_input.size <= n_consumed
           if antiprompt.size.positive?
-            last_output = []
-            last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
+            last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
             last_output_str = last_output.join
 
             search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
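Both chat.rb hunks replace a manual accumulator loop with `Enumerable#map`. A standalone sketch of the equivalence, with `token_to_piece` stubbed out since no model is loaded here:

```ruby
# Stand-in for context.model.token_to_piece, purely for illustration.
to_piece = ->(token) { "<piece #{token}>" }

tokens = [1, 2, 3]

# Before: build the array by hand.
output = []
tokens.each { |token| output << to_piece.call(token) }

# After: map expresses the same transformation in one step.
output = tokens.map { |token| to_piece.call(token) }

puts output.join
```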
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
     abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
   end
   FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
+  FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
 end
 
 abort('libstdc++ is not found.') unless have_library('stdc++')
@@ -1478,6 +1478,8 @@ public:
     rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
     rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
     rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
+    rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
+    rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
     rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
     rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
     rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
@@ -1743,6 +1745,16 @@ private:
     return INT2NUM(llama_token_eos(ptr->model));
   }
 
+  static VALUE _llama_model_token_cls(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_token_cls(ptr->model));
+  }
+
+  static VALUE _llama_model_token_sep(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_token_sep(ptr->model));
+  }
+
   static VALUE _llama_model_token_nl(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_token_nl(ptr->model));
@@ -3371,6 +3383,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
@@ -3410,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
+
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
+
   ss_magic.str("");
   ss_magic.clear(std::stringstream::goodbit);
   ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
   rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
 }
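The hunk above exposes the new magic numbers on the `LLaMACpp` module as hex strings (formatted with `std::showbase`/`std::hex`) and the state-sequence version as a decimal string. A minimal sketch that only inspects the constants:

```ruby
require 'llama_cpp'

# Magic numbers come back as hex strings, versions as decimal strings.
puts LLaMACpp::LLAMA_FILE_MAGIC_GGSQ    # new in 0.14.5
puts LLaMACpp::LLAMA_STATE_SEQ_MAGIC    # new in 0.14.5
puts LLaMACpp::LLAMA_STATE_SEQ_VERSION  # new in 0.14.5
puts LLaMACpp::LLAMA_SESSION_MAGIC
puts LLaMACpp::LLAMA_SESSION_VERSION
```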
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.3'
+  VERSION = '0.14.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2496'
+  LLAMA_CPP_VERSION = 'b2658'
 end
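Both constants are readable at runtime, which is a quick way to confirm which bundled llama.cpp build an installed gem carries:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION            # => "0.14.5"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2658"
```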
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
+  LLAMA_FILE_MAGIC_GGLA: String
+  LLAMA_FILE_MAGIC_GGSN: String
+  LLAMA_FILE_MAGIC_GGSQ: String
+  LLAMA_SESSION_MAGIC: String
+  LLAMA_SESSION_VERSION: String
+  LLAMA_STATE_SEQ_MAGIC: String
+  LLAMA_STATE_SEQ_VERSION: String
+
   LLAMA_VOCAB_TYPE_NONE: Integer
   LLAMA_VOCAB_TYPE_SPM: Integer
   LLAMA_VOCAB_TYPE_BPE: Integer
@@ -32,6 +40,10 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
 
   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -120,6 +132,8 @@ module LLaMACpp
   def type: (Integer) -> Integer
   def token_bos: () -> Integer
   def token_eos: () -> Integer
+  def token_cls: () -> Integer
+  def token_sep: () -> Integer
   def token_nl: () -> Integer
   def add_bos_token?: () -> bool
   def add_eos_token?: () -> bool
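The new IQ3_S, IQ3_M, IQ4_XS, and IQ1_M file-type constants appear in both the C extension and the signature file. A hedged sketch of passing one of them to the gem's quantization API; the `ModelQuantizeParams#ftype=` writer, the `LLaMACpp.model_quantize` keyword arguments, and the file paths are assumed from the gem's existing quantization interface, not taken from this diff:

```ruby
require 'llama_cpp'

# Assumed quantization flow; paths are placeholders.
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_XS  # one of the newly exposed file types

LLaMACpp.model_quantize(input_path: 'model-f16.gguf',
                        output_path: 'model-iq4_xs.gguf',
                        params: params)
```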
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2023-2024 The ggml authors
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -10,7 +10,7 @@ TEST_TARGETS = \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
 	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-	tests/test-json-schema-to-grammar
+	tests/test-json-schema-to-grammar tests/test-grammar-integration
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -392,14 +392,20 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+	LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-#ifdef LLAMA_CUDA_CUBLAS
-#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
-#endif # LLAMA_CUDA_CUBLAS
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
+
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+define NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 else
+define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-endif # LLAMA_CUBLAS
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(NVCC_COMPILE)
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(NVCC_COMPILE)
+
+endif # LLAMA_CUDA
 
 ifdef LLAMA_CLBLAST
 
@@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 endif # LLAMA_VULKAN
 
 ifdef LLAMA_HIPBLAS
-
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH ?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS += ggml-cuda.o
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
 endif # LLAMA_HIPBLAS
 
 ifdef LLAMA_METAL
@@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -617,19 +642,26 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 $(info )
 
+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #
@@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+unicode-data.o: unicode-data.cpp unicode-data.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
@@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf ggml-cuda/*.o
 
 #
 # Examples
@@ -766,6 +805,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -803,6 +846,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -815,14 +862,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
 
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
@@ -870,6 +927,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
-        galloc->leaf_allocs[i].leaf.offset = hn->offset;
-        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.size_max = 0;
+        } else {
+            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        }
     }
 
     // reallocate buffers if needed
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
 
     // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
@@ -137,7 +137,7 @@ extern "C" {
     /*
       Example usage:
 
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
         // preferrably to run on the same backend as the buffer
         ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
@@ -377,6 +377,27 @@ typedef struct {
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 
+// 1.75 bpw
+typedef struct {
+    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+#if QK_K == 64
+    ggml_half d;
+#endif
+    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
+
+// Used by IQ1_M quants
+typedef union {
+    ggml_half f16;
+    uint16_t  u16;
+} iq1m_scale_t;
+
 // Non-linear quants
 #define QK4_NL 32
 typedef struct {
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_SYCL)
+
 #include <cstdint>
 
-#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
-#define GGML_TABLE_END() });
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
 
 #define GGML_COMMON_IMPL
 #endif
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()
 
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
 #if defined(GGML_COMMON_IMPL_C)
 GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
 0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,