llama_cpp 0.14.3 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
+  data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
+  data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
+## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+- Bump llama.cpp from b2496 to b2573.
+- Add file type constants.
+- Bump llama.cpp from b2573 to b2608.
+
+Implementation of the llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 bindings has been skipped.
+
 ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23

 - Bump llama.cpp from b2435 to b2496.
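The file type constants added here track upstream's llama_ftype enum. As a hypothetical C sketch against the bundled llama.h (the GGUF file names are placeholders, and this code is illustrative rather than part of the gem), one of the new types would be selected at quantization time like so:

    // Hedged sketch: request one of the newly exposed file types when
    // quantizing a model through the bundled llama.cpp C API.
    #include <stdint.h>
    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        llama_backend_init();

        llama_model_quantize_params qp = llama_model_quantize_default_params();
        qp.ftype   = LLAMA_FTYPE_MOSTLY_IQ4_XS; // one of the constants added in this release
        qp.nthread = 4;                         // <= 0 would use all hardware threads

        // returns 0 on success
        uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &qp);
        if (rc != 0) {
            fprintf(stderr, "quantization failed (code %u)\n", rc);
        }

        llama_backend_free();
        return rc == 0 ? 0 : 1;
    }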
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -3371,6 +3371,10 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));

   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

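On the Ruby side, each of these lands as an integer constant on the LLaMACpp module (see the llama_cpp.rbs signatures below), e.g. LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_XS, carrying the value of the corresponding llama_ftype enumerator.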
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.3'
+  VERSION = '0.14.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2496'
+  LLAMA_CPP_VERSION = 'b2608'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -32,6 +32,10 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
   LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+  LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ1_M: Integer

   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -392,14 +392,20 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+	LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
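Note the backward-compatibility shim above: make LLAMA_CUBLAS=1 now merely sets LLAMA_CUDA := 1, so existing build invocations keep working while a deprecation banner (added further down in this Makefile diff) steers users toward make LLAMA_CUDA=1. The new $(wildcard ggml-cuda/*.cu) objects reflect the upstream split of the formerly monolithic ggml-cuda.cu, which is why that file shrinks by some 9,000 lines in the summary above.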
@@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-
-
-
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-
+
 ifdef JETSON_EOL_MODULE_DETECT
-
+define NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 else
+define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-endif # LLAMA_CUBLAS
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(NVCC_COMPILE)
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(NVCC_COMPILE)
+
+endif # LLAMA_CUDA

 ifdef LLAMA_CLBLAST

@@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 endif # LLAMA_VULKAN

 ifdef LLAMA_HIPBLAS
-
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH ?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS += ggml-cuda.o
-
+	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
 endif # LLAMA_HIPBLAS

 ifdef LLAMA_METAL
@@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -617,7 +642,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -627,9 +652,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 $(info )

+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #
@@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+unicode-data.o: unicode-data.cpp unicode-data.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o

 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^

 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf ggml-cuda/*.o

 #
 # Examples
@@ -803,6 +842,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -815,14 +858,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
-        galloc->leaf_allocs[i].leaf.offset = hn->offset;
-        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.size_max = 0;
+        } else {
+            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        }
     }

     // reallocate buffers if needed
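The effect of this hunk: leaves that are views (view_src set) or that already own memory (data set) are recorded with offset = SIZE_MAX and size_max = 0, so the graph allocator no longer reserves buffer space for tensors that are pre-allocated elsewhere; only genuinely unallocated leaves keep a measured offset and size.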
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);

     // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
data/vendor/tmp/llama.cpp/ggml-common.h
CHANGED
@@ -377,6 +377,27 @@ typedef struct {
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");

+// 1.75 bpw
+typedef struct {
+    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+#if QK_K == 64
+    ggml_half d;
+#endif
+    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
+
+// Used by IQ1_M quants
+typedef union {
+    ggml_half f16;
+    uint16_t  u16;
+} iq1m_scale_t;
+
 // Non-linear quants
 #define QK4_NL 32
 typedef struct {
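The "1.75 bpw" figure follows from the field sizes in the usual QK_K == 256 build: 32 + 16 + 8 = 56 bytes per block of 256 weights, i.e. 56 * 8 / 256 = 1.75 bits per weight. A self-contained C sketch of that arithmetic (not code from the package; the QK_K == 64 branch and its ggml_half scale are omitted since they do not enter this case):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define QK_K 256

    typedef struct {
        uint8_t qs[QK_K/8];      // grid index, low 8 bits    -> 32 bytes
        uint8_t qh[QK_K/16];     // high 3 bits + shift bit   -> 16 bytes
        uint8_t scales[QK_K/32]; // 3-bit block scales        ->  8 bytes
    } block_iq1_m;

    static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32,
                  "wrong iq1_m block size/padding");

    int main(void) {
        // 56 bytes per 256 weights: 56 * 8 / 256 = 1.75 bits per weight
        printf("%zu bytes per %d weights = %.2f bpw\n",
               sizeof(block_iq1_m), QK_K, 8.0 * sizeof(block_iq1_m) / QK_K);
        return 0;
    }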
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_

 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_SYCL)
+
 #include <cstdint>

-#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
-#define GGML_TABLE_END() });
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };

 #define GGML_COMMON_IMPL
 #endif
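With this change the SYCL branch expands lookup tables the same way as the plain C branch: a GGML_TABLE_BEGIN/GGML_TABLE_END pair is simply a const-array initializer. A minimal illustration reusing the new macro bodies (demo_grid is a made-up name; the values are the first four entries of iq1s_grid from the next hunk):

    #include <stdint.h>

    #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
    #define GGML_TABLE_END() };

    GGML_TABLE_BEGIN(uint64_t, demo_grid, 4)
        0xffffffffffffffff, 0xffffffffffffff01,
        0xffffffffffff0000, 0xffffffffffff01ff,
    GGML_TABLE_END()

    // After preprocessing this is just:
    //   static const uint64_t demo_grid[4] = { 0xffffffffffffffff, ... };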
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()

 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
 #if defined(GGML_COMMON_IMPL_C)
 GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
     0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
|