llama_cpp 0.14.3 → 0.14.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
|
4
|
+
data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
|
7
|
+
data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2496 to b2573.
|
4
|
+
- Add file type constants.
|
5
|
+
- Bump llama.cpp from b2573 to b2608.
|
6
|
+
|
7
|
+
Implementation of llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
|
8
|
+
|
1
9
|
## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
|
2
10
|
|
3
11
|
- Bump llama.cpp from b2435 to b2496.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -3371,6 +3371,10 @@ extern "C" void Init_llama_cpp(void) {
|
|
3371
3371
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
|
3372
3372
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
|
3373
3373
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
|
3374
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
|
3375
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
|
3376
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
|
3377
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
|
3374
3378
|
|
3375
3379
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
|
3376
3380
|
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.14.3'
|
6
|
+
VERSION = '0.14.4'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2496'
|
9
|
+
LLAMA_CPP_VERSION = 'b2608'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -32,6 +32,10 @@ module LLaMACpp
|
|
32
32
|
LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
|
33
33
|
LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
|
34
34
|
LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
|
35
|
+
LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
|
36
|
+
LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
|
37
|
+
LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
|
38
|
+
LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
|
35
39
|
|
36
40
|
LLAMA_KV_OVERRIDE_TYPE_INT: Integer
|
37
41
|
LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
|
@@ -1,8 +1,8 @@
|
|
1
1
|
# Define the default target now so that it is always the first target
|
2
2
|
BUILD_TARGETS = \
|
3
3
|
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
4
|
-
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
5
|
-
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
4
|
+
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
|
5
|
+
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
8
8
|
TEST_TARGETS = \
|
@@ -392,14 +392,20 @@ ifdef LLAMA_BLIS
|
|
392
392
|
endif # LLAMA_BLIS
|
393
393
|
|
394
394
|
ifdef LLAMA_CUBLAS
|
395
|
+
# LLAMA_CUBLAS is deprecated and will be removed in the future
|
396
|
+
LLAMA_CUDA := 1
|
397
|
+
endif
|
398
|
+
|
399
|
+
ifdef LLAMA_CUDA
|
395
400
|
ifneq ('', '$(wildcard /opt/cuda)')
|
396
401
|
CUDA_PATH ?= /opt/cuda
|
397
402
|
else
|
398
403
|
CUDA_PATH ?= /usr/local/cuda
|
399
404
|
endif
|
400
|
-
MK_CPPFLAGS += -
|
405
|
+
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
401
406
|
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
402
407
|
OBJS += ggml-cuda.o
|
408
|
+
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
403
409
|
MK_NVCCFLAGS += -use_fast_math
|
404
410
|
ifdef LLAMA_FATAL_WARNINGS
|
405
411
|
MK_NVCCFLAGS += -Werror all-warnings
|
@@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
|
454
460
|
else
|
455
461
|
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
456
462
|
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
457
|
-
|
458
|
-
|
459
|
-
|
463
|
+
ifdef LLAMA_CUDA_NO_PEER_COPY
|
464
|
+
MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
465
|
+
endif # LLAMA_CUDA_NO_PEER_COPY
|
460
466
|
ifdef LLAMA_CUDA_CCBIN
|
461
467
|
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
462
468
|
endif
|
463
|
-
|
469
|
+
|
464
470
|
ifdef JETSON_EOL_MODULE_DETECT
|
465
|
-
|
471
|
+
define NVCC_COMPILE
|
472
|
+
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
473
|
+
endef # NVCC_COMPILE
|
466
474
|
else
|
475
|
+
define NVCC_COMPILE
|
467
476
|
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
477
|
+
endef # NVCC_COMPILE
|
468
478
|
endif # JETSON_EOL_MODULE_DETECT
|
469
|
-
|
479
|
+
|
480
|
+
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
|
481
|
+
$(NVCC_COMPILE)
|
482
|
+
|
483
|
+
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
484
|
+
$(NVCC_COMPILE)
|
485
|
+
|
486
|
+
endif # LLAMA_CUDA
|
470
487
|
|
471
488
|
ifdef LLAMA_CLBLAST
|
472
489
|
|
@@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
|
|
512
529
|
endif # LLAMA_VULKAN
|
513
530
|
|
514
531
|
ifdef LLAMA_HIPBLAS
|
515
|
-
|
516
532
|
ifeq ($(wildcard /opt/rocm),)
|
517
533
|
ROCM_PATH ?= /usr
|
518
534
|
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
|
@@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS
|
|
524
540
|
LLAMA_CUDA_DMMV_X ?= 32
|
525
541
|
LLAMA_CUDA_MMV_Y ?= 1
|
526
542
|
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
527
|
-
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -
|
543
|
+
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
|
528
544
|
ifdef LLAMA_HIP_UMA
|
529
545
|
MK_CPPFLAGS += -DGGML_HIP_UMA
|
530
546
|
endif # LLAMA_HIP_UMA
|
@@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA
|
|
537
553
|
ifdef LLAMA_CUDA_FORCE_DMMV
|
538
554
|
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
539
555
|
endif # LLAMA_CUDA_FORCE_DMMV
|
556
|
+
ifdef LLAMA_CUDA_NO_PEER_COPY
|
557
|
+
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
558
|
+
endif # LLAMA_CUDA_NO_PEER_COPY
|
540
559
|
OBJS += ggml-cuda.o
|
541
|
-
|
560
|
+
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
561
|
+
|
562
|
+
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
563
|
+
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
564
|
+
|
565
|
+
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
|
542
566
|
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
567
|
+
|
543
568
|
endif # LLAMA_HIPBLAS
|
544
569
|
|
545
570
|
ifdef LLAMA_METAL
|
@@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
|
|
592
617
|
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
593
618
|
|
594
619
|
# identify CUDA host compiler
|
595
|
-
ifdef LLAMA_CUBLAS
|
620
|
+
ifdef LLAMA_CUDA
|
596
621
|
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
|
597
622
|
include scripts/get-flags.mk
|
598
623
|
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
@@ -617,7 +642,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
|
|
617
642
|
$(info I LDFLAGS: $(LDFLAGS))
|
618
643
|
$(info I CC: $(shell $(CC) --version | head -n 1))
|
619
644
|
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
620
|
-
ifdef LLAMA_CUBLAS
|
645
|
+
ifdef LLAMA_CUDA
|
621
646
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
622
647
|
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
623
648
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
@@ -627,9 +652,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
|
|
627
652
|
endif # CUDA_POWER_ARCH
|
628
653
|
endif # CUDA_DOCKER_ARCH
|
629
654
|
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
630
|
-
endif # LLAMA_CUBLAS
|
655
|
+
endif # LLAMA_CUDA
|
631
656
|
$(info )
|
632
657
|
|
658
|
+
ifdef LLAMA_CUBLAS
|
659
|
+
$(info !!!!)
|
660
|
+
$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
|
661
|
+
$(info !!!!)
|
662
|
+
$(info )
|
663
|
+
endif
|
664
|
+
|
633
665
|
#
|
634
666
|
# Build library
|
635
667
|
#
|
@@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
|
|
649
681
|
unicode.o: unicode.cpp unicode.h
|
650
682
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
651
683
|
|
652
|
-
|
684
|
+
unicode-data.o: unicode-data.cpp unicode-data.h
|
685
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
686
|
+
|
687
|
+
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
653
688
|
|
654
689
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
655
690
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
@@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
|
|
675
710
|
train.o: common/train.cpp common/train.h
|
676
711
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
677
712
|
|
713
|
+
ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
|
714
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
715
|
+
|
678
716
|
libllama.so: llama.o ggml.o $(OBJS)
|
679
717
|
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
680
718
|
|
@@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
|
|
686
724
|
ar rcs libllama.a $^
|
687
725
|
|
688
726
|
clean:
|
689
|
-
rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
727
|
+
rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
728
|
+
rm -vrf ggml-cuda/*.o
|
690
729
|
|
691
730
|
#
|
692
731
|
# Examples
|
@@ -803,6 +842,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
|
803
842
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
804
843
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
805
844
|
|
845
|
+
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
846
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
847
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
848
|
+
|
806
849
|
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
807
850
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
808
851
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -815,14 +858,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
|
|
815
858
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
816
859
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
817
860
|
|
818
|
-
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
861
|
+
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
819
862
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
820
863
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
864
|
+
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
|
865
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
|
866
|
+
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
|
867
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
|
868
|
+
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
|
869
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
|
821
870
|
|
822
871
|
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
823
872
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
824
873
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
825
874
|
|
875
|
+
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
876
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
877
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
878
|
+
|
826
879
|
ifeq ($(UNAME_S),Darwin)
|
827
880
|
swift: examples/batched.swift
|
828
881
|
(cd examples/batched.swift; make build)
|
@@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
705
705
|
struct ggml_tensor * leaf = graph->leafs[i];
|
706
706
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
707
707
|
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
708
|
-
|
709
|
-
|
708
|
+
if (leaf->view_src || leaf->data) {
|
709
|
+
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
710
|
+
galloc->leaf_allocs[i].leaf.size_max = 0;
|
711
|
+
} else {
|
712
|
+
galloc->leaf_allocs[i].leaf.offset = hn->offset;
|
713
|
+
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
714
|
+
}
|
710
715
|
}
|
711
716
|
|
712
717
|
// reallocate buffers if needed
|
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|
420
420
|
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
|
421
421
|
|
422
422
|
// add forward decls here to avoid including the backend headers
|
423
|
-
#ifdef
|
423
|
+
#ifdef GGML_USE_CUDA
|
424
424
|
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
425
425
|
ggml_backend_cuda_reg_devices();
|
426
426
|
#endif
|
@@ -377,6 +377,27 @@ typedef struct {
|
|
377
377
|
} block_iq1_s;
|
378
378
|
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
379
379
|
|
380
|
+
// 1.75 bpw
|
381
|
+
typedef struct {
|
382
|
+
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
383
|
+
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
384
|
+
#if QK_K == 64
|
385
|
+
ggml_half d;
|
386
|
+
#endif
|
387
|
+
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
388
|
+
} block_iq1_m;
|
389
|
+
#if QK_K == 64
|
390
|
+
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
391
|
+
#else
|
392
|
+
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
393
|
+
#endif
|
394
|
+
|
395
|
+
// Used by IQ1_M quants
|
396
|
+
typedef union {
|
397
|
+
ggml_half f16;
|
398
|
+
uint16_t u16;
|
399
|
+
} iq1m_scale_t;
|
400
|
+
|
380
401
|
// Non-linear quants
|
381
402
|
#define QK4_NL 32
|
382
403
|
typedef struct {
|
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
|
|
426
447
|
|
427
448
|
#define GGML_COMMON_IMPL
|
428
449
|
#elif defined(GGML_COMMON_IMPL_SYCL)
|
450
|
+
|
429
451
|
#include <cstdint>
|
430
452
|
|
431
|
-
#define GGML_TABLE_BEGIN(type, name, size) static
|
432
|
-
#define GGML_TABLE_END() }
|
453
|
+
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
|
454
|
+
#define GGML_TABLE_END() };
|
433
455
|
|
434
456
|
#define GGML_COMMON_IMPL
|
435
457
|
#endif
|
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()
|
|
1050
1072
|
|
1051
1073
|
#define NGRID_IQ1S 2048
|
1052
1074
|
#define IQ1S_DELTA 0.125f
|
1075
|
+
#define IQ1M_DELTA 0.125f
|
1053
1076
|
#if defined(GGML_COMMON_IMPL_C)
|
1054
1077
|
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
|
1055
1078
|
0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
|