llama_cpp 0.14.3 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 3b503998061ee4c8a87bc3148d41feda0b45b04cbe0cafdb3897d1d457b26e0a
- data.tar.gz: b761a18fd964ca0a4e871d01cc0a6058527c951413de7b110a8b07862ed64d8c
+ metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
+ data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
  SHA512:
- metadata.gz: 2951b2a59b0579f3afa983283a73853300f822891f0d1dfef292727d6f313392ddc68902144caaca33b173e43e95076dda02ffa97228cf7f65babc4ac82354c9
- data.tar.gz: cb655d32282b28ebaee30b87b882600c79a6666c306de2692a059da3de1bc21d3c988116fd1dd26d97c00ea0f22fdccc8b3f8d94b20cb01c819d9a578c71bd67
+ metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
+ data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
+ ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+ - Bump llama.cpp from b2496 to b2573.
+ - Add file type constants.
+ - Bump llama.cpp from b2573 to b2608.
+
+ Implementation of llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
+
  ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
 
  - Bump llama.cpp from b2435 to b2496.
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -3371,6 +3371,10 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
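The four new file type constants are registered on the LLaMACpp module as plain Integers (the signature hunk further below declares them as such). A minimal sketch of reading them from Ruby — only the constant names from the hunk above are used, nothing else is assumed:

    require 'llama_cpp'

    # Each LLAMA_FTYPE_* constant is an Integer mirroring the llama.cpp enum value.
    %w[IQ3_S IQ3_M IQ4_XS IQ1_M].each do |suffix|
      name = "LLAMA_FTYPE_MOSTLY_#{suffix}"
      puts "#{name} = #{LLaMACpp.const_get(name)}"
    end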
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.3'
+ VERSION = '0.14.4'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2496'
+ LLAMA_CPP_VERSION = 'b2608'
  end
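After upgrading, both version bumps can be confirmed at runtime from the two constants changed above; a minimal check:

    require 'llama_cpp'

    # Expect '0.14.4' and 'b2608' for this release.
    puts LLaMACpp::VERSION
    puts LLaMACpp::LLAMA_CPP_VERSION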
data/sig/llama_cpp.rbs CHANGED
@@ -32,6 +32,10 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+ LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
 
  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -1,8 +1,8 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+ simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
  # Binaries only useful for tests
  TEST_TARGETS = \
@@ -392,14 +392,20 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS
 
  ifdef LLAMA_CUBLAS
+ # LLAMA_CUBLAS is deprecated and will be removed in the future
+ LLAMA_CUDA := 1
+ endif
+
+ ifdef LLAMA_CUDA
  ifneq ('', '$(wildcard /opt/cuda)')
  CUDA_PATH ?= /opt/cuda
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
  MK_NVCCFLAGS += -Werror all-warnings
@@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
  else
  MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
  endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
- #ifdef LLAMA_CUDA_CUBLAS
- # MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
- #endif # LLAMA_CUDA_CUBLAS
+ ifdef LLAMA_CUDA_NO_PEER_COPY
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+ endif # LLAMA_CUDA_NO_PEER_COPY
  ifdef LLAMA_CUDA_CCBIN
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
  endif
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
+
  ifdef JETSON_EOL_MODULE_DETECT
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ define NVCC_COMPILE
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ endef # NVCC_COMPILE
  else
+ define NVCC_COMPILE
  $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ endef # NVCC_COMPILE
  endif # JETSON_EOL_MODULE_DETECT
- endif # LLAMA_CUBLAS
+
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ $(NVCC_COMPILE)
+
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ $(NVCC_COMPILE)
+
+ endif # LLAMA_CUDA
 
  ifdef LLAMA_CLBLAST
 
@@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
  endif # LLAMA_VULKAN
 
  ifdef LLAMA_HIPBLAS
-
  ifeq ($(wildcard /opt/rocm),)
  ROCM_PATH ?= /usr
  GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS
  LLAMA_CUDA_DMMV_X ?= 32
  LLAMA_CUDA_MMV_Y ?= 1
  LLAMA_CUDA_KQUANTS_ITER ?= 2
- MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+ MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
  ifdef LLAMA_HIP_UMA
  MK_CPPFLAGS += -DGGML_HIP_UMA
  endif # LLAMA_HIP_UMA
@@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA
  ifdef LLAMA_CUDA_FORCE_DMMV
  HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
  endif # LLAMA_CUDA_FORCE_DMMV
+ ifdef LLAMA_CUDA_NO_PEER_COPY
+ HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+ endif # LLAMA_CUDA_NO_PEER_COPY
  OBJS += ggml-cuda.o
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
  endif # LLAMA_HIPBLAS
 
  ifdef LLAMA_METAL
@@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
  override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
  # identify CUDA host compiler
- ifdef LLAMA_CUBLAS
+ ifdef LLAMA_CUDA
  GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
  include scripts/get-flags.mk
  CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -617,7 +642,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
  $(info I LDFLAGS: $(LDFLAGS))
  $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
- ifdef LLAMA_CUBLAS
+ ifdef LLAMA_CUDA
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
  CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -627,9 +652,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
  endif # CUDA_POWER_ARCH
  endif # CUDA_DOCKER_ARCH
  endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
- endif # LLAMA_CUBLAS
+ endif # LLAMA_CUDA
  $(info )
 
+ ifdef LLAMA_CUBLAS
+ $(info !!!!)
+ $(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+ $(info !!!!)
+ $(info )
+ endif
+
  #
  # Build library
  #
@@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
  unicode.o: unicode.cpp unicode.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
 
- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+ unicode-data.o: unicode-data.cpp unicode-data.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
  train.o: common/train.cpp common/train.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
 
+ ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
  libllama.so: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
@@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^
 
  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf ggml-cuda/*.o
 
  #
  # Examples
@@ -803,6 +842,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+ retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -815,14 +858,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
- lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
 
  passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+ gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  ifeq ($(UNAME_S),Darwin)
  swift: examples/batched.swift
  (cd examples/batched.swift; make build)
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  struct ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
  galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
- galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ if (leaf->view_src || leaf->data) {
+ galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+ galloc->leaf_allocs[i].leaf.size_max = 0;
+ } else {
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ }
  }
 
  // reallocate buffers if needed
data/vendor/tmp/llama.cpp/ggml-backend.c CHANGED
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
  ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
 
  // add forward decls here to avoid including the backend headers
- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
  ggml_backend_cuda_reg_devices();
  #endif
data/vendor/tmp/llama.cpp/ggml-common.h CHANGED
@@ -377,6 +377,27 @@ typedef struct {
  } block_iq1_s;
  static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 
+ // 1.75 bpw
+ typedef struct {
+ uint8_t qs[QK_K/8]; // grid index, low 8 bits
+ uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
+ #if QK_K == 64
+ ggml_half d;
+ #endif
+ uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+ } block_iq1_m;
+ #if QK_K == 64
+ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+ #else
+ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+ #endif
+
+ // Used by IQ1_M quants
+ typedef union {
+ ggml_half f16;
+ uint16_t u16;
+ } iq1m_scale_t;
+
  // Non-linear quants
  #define QK4_NL 32
  typedef struct {
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 
  #define GGML_COMMON_IMPL
  #elif defined(GGML_COMMON_IMPL_SYCL)
+
  #include <cstdint>
 
- #define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
- #define GGML_TABLE_END() });
+ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+ #define GGML_TABLE_END() };
 
  #define GGML_COMMON_IMPL
  #endif
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()
 
  #define NGRID_IQ1S 2048
  #define IQ1S_DELTA 0.125f
+ #define IQ1M_DELTA 0.125f
  #if defined(GGML_COMMON_IMPL_C)
  GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
  0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
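As a sanity check of the "// 1.75 bpw" comment on block_iq1_m above: assuming the default QK_K = 256 build (where the ggml_half scale field is compiled out), the struct holds QK_K/8 + QK_K/16 + QK_K/32 = 56 bytes per 256 weights, matching the static_assert in the hunk. A one-off Ruby sketch of that arithmetic:

    qk_k = 256
    bytes = qk_k / 8 + qk_k / 16 + qk_k / 32   # qs (32) + qh (16) + scales (8)
    puts bytes * 8.0 / qk_k                    # => 1.75 bits per weight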