llama_cpp 0.14.3 → 0.14.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3b503998061ee4c8a87bc3148d41feda0b45b04cbe0cafdb3897d1d457b26e0a
4
- data.tar.gz: b761a18fd964ca0a4e871d01cc0a6058527c951413de7b110a8b07862ed64d8c
3
+ metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
4
+ data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
5
5
  SHA512:
6
- metadata.gz: 2951b2a59b0579f3afa983283a73853300f822891f0d1dfef292727d6f313392ddc68902144caaca33b173e43e95076dda02ffa97228cf7f65babc4ac82354c9
7
- data.tar.gz: cb655d32282b28ebaee30b87b882600c79a6666c306de2692a059da3de1bc21d3c988116fd1dd26d97c00ea0f22fdccc8b3f8d94b20cb01c819d9a578c71bd67
6
+ metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
7
+ data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
2
+
3
+ - Bump llama.cpp from b2496 to b2573.
4
+ - Add file type constants.
5
+ - Bump llama.cpp from b2573 to b2608.
6
+
7
+ Implementation of bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 has been skipped.
8
+
1
9
  ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
2
10
 
3
11
  - Bump llama.cpp from b2435 to b2496.
@@ -3371,6 +3371,10 @@ extern "C" void Init_llama_cpp(void) {
3371
3371
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
3372
3372
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
3373
3373
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
3374
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
3375
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
3376
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
3377
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));
3374
3378
 
3375
3379
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
3376
3380
 
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.14.3'
6
+ VERSION = '0.14.4'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b2496'
9
+ LLAMA_CPP_VERSION = 'b2608'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -32,6 +32,10 @@ module LLaMACpp
32
32
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
33
33
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
34
34
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
35
+ LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
36
+ LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
37
+ LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
38
+ LLAMA_FTYPE_MOSTLY_IQ1_M: Integer
35
39
 
36
40
  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
37
41
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -1,8 +1,8 @@
1
1
  # Define the default target now so that it is always the first target
2
2
  BUILD_TARGETS = \
3
3
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
4
- simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
5
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
4
+ simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
5
+ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
6
6
 
7
7
  # Binaries only useful for tests
8
8
  TEST_TARGETS = \
@@ -392,14 +392,20 @@ ifdef LLAMA_BLIS
392
392
  endif # LLAMA_BLIS
393
393
 
394
394
  ifdef LLAMA_CUBLAS
395
+ # LLAMA_CUBLAS is deprecated and will be removed in the future
396
+ LLAMA_CUDA := 1
397
+ endif
398
+
399
+ ifdef LLAMA_CUDA
395
400
  ifneq ('', '$(wildcard /opt/cuda)')
396
401
  CUDA_PATH ?= /opt/cuda
397
402
  else
398
403
  CUDA_PATH ?= /usr/local/cuda
399
404
  endif
400
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
405
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
401
406
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
402
407
  OBJS += ggml-cuda.o
408
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
403
409
  MK_NVCCFLAGS += -use_fast_math
404
410
  ifdef LLAMA_FATAL_WARNINGS
405
411
  MK_NVCCFLAGS += -Werror all-warnings
@@ -454,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
454
460
  else
455
461
  MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
456
462
  endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
457
- #ifdef LLAMA_CUDA_CUBLAS
458
- # MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
459
- #endif # LLAMA_CUDA_CUBLAS
463
+ ifdef LLAMA_CUDA_NO_PEER_COPY
464
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
465
+ endif # LLAMA_CUDA_NO_PEER_COPY
460
466
  ifdef LLAMA_CUDA_CCBIN
461
467
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
462
468
  endif
463
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
469
+
464
470
  ifdef JETSON_EOL_MODULE_DETECT
465
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
471
+ define NVCC_COMPILE
472
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
473
+ endef # NVCC_COMPILE
466
474
  else
475
+ define NVCC_COMPILE
467
476
  $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
477
+ endef # NVCC_COMPILE
468
478
  endif # JETSON_EOL_MODULE_DETECT
469
- endif # LLAMA_CUBLAS
479
+
480
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
481
+ $(NVCC_COMPILE)
482
+
483
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
484
+ $(NVCC_COMPILE)
485
+
486
+ endif # LLAMA_CUDA
470
487
 
471
488
  ifdef LLAMA_CLBLAST
472
489
 
@@ -512,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
512
529
  endif # LLAMA_VULKAN
513
530
 
514
531
  ifdef LLAMA_HIPBLAS
515
-
516
532
  ifeq ($(wildcard /opt/rocm),)
517
533
  ROCM_PATH ?= /usr
518
534
  GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -524,7 +540,7 @@ ifdef LLAMA_HIPBLAS
524
540
  LLAMA_CUDA_DMMV_X ?= 32
525
541
  LLAMA_CUDA_MMV_Y ?= 1
526
542
  LLAMA_CUDA_KQUANTS_ITER ?= 2
527
- MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
543
+ MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
528
544
  ifdef LLAMA_HIP_UMA
529
545
  MK_CPPFLAGS += -DGGML_HIP_UMA
530
546
  endif # LLAMA_HIP_UMA
@@ -537,9 +553,18 @@ endif # LLAMA_HIP_UMA
537
553
  ifdef LLAMA_CUDA_FORCE_DMMV
538
554
  HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
539
555
  endif # LLAMA_CUDA_FORCE_DMMV
556
+ ifdef LLAMA_CUDA_NO_PEER_COPY
557
+ HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
558
+ endif # LLAMA_CUDA_NO_PEER_COPY
540
559
  OBJS += ggml-cuda.o
541
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
560
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
561
+
562
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
563
+ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
564
+
565
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
542
566
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
567
+
543
568
  endif # LLAMA_HIPBLAS
544
569
 
545
570
  ifdef LLAMA_METAL
@@ -592,7 +617,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
592
617
  override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
593
618
 
594
619
  # identify CUDA host compiler
595
- ifdef LLAMA_CUBLAS
620
+ ifdef LLAMA_CUDA
596
621
  GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
597
622
  include scripts/get-flags.mk
598
623
  CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -617,7 +642,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
617
642
  $(info I LDFLAGS: $(LDFLAGS))
618
643
  $(info I CC: $(shell $(CC) --version | head -n 1))
619
644
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
620
- ifdef LLAMA_CUBLAS
645
+ ifdef LLAMA_CUDA
621
646
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
622
647
  CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
623
648
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -627,9 +652,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
627
652
  endif # CUDA_POWER_ARCH
628
653
  endif # CUDA_DOCKER_ARCH
629
654
  endif # ifeq CUDA_VERSION < 11.7 (checked via awk)
630
- endif # LLAMA_CUBLAS
655
+ endif # LLAMA_CUDA
631
656
  $(info )
632
657
 
658
+ ifdef LLAMA_CUBLAS
659
+ $(info !!!!)
660
+ $(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
661
+ $(info !!!!)
662
+ $(info )
663
+ endif
664
+
633
665
  #
634
666
  # Build library
635
667
  #
@@ -649,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
649
681
  unicode.o: unicode.cpp unicode.h
650
682
  $(CXX) $(CXXFLAGS) -c $< -o $@
651
683
 
652
- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
684
+ unicode-data.o: unicode-data.cpp unicode-data.h
685
+ $(CXX) $(CXXFLAGS) -c $< -o $@
686
+
687
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
653
688
 
654
689
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
655
690
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -675,6 +710,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
675
710
  train.o: common/train.cpp common/train.h
676
711
  $(CXX) $(CXXFLAGS) -c $< -o $@
677
712
 
713
+ ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
714
+ $(CXX) $(CXXFLAGS) -c $< -o $@
715
+
678
716
  libllama.so: llama.o ggml.o $(OBJS)
679
717
  $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
680
718
 
@@ -686,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
686
724
  ar rcs libllama.a $^
687
725
 
688
726
  clean:
689
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
727
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
728
+ rm -vrf ggml-cuda/*.o
690
729
 
691
730
  #
692
731
  # Examples
@@ -803,6 +842,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
803
842
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
804
843
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
805
844
 
845
+ retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
846
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
847
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
848
+
806
849
  speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
807
850
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
808
851
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -815,14 +858,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
815
858
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
816
859
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
817
860
 
818
- lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
861
+ lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
819
862
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
820
863
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
864
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
865
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
866
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
867
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
868
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
869
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
821
870
 
822
871
  passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
823
872
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
824
873
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
825
874
 
875
+ gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
876
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
877
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
878
+
826
879
  ifeq ($(UNAME_S),Darwin)
827
880
  swift: examples/batched.swift
828
881
  (cd examples/batched.swift; make build)
@@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
705
705
  struct ggml_tensor * leaf = graph->leafs[i];
706
706
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
707
707
  galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
708
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
709
- galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
708
+ if (leaf->view_src || leaf->data) {
709
+ galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
710
+ galloc->leaf_allocs[i].leaf.size_max = 0;
711
+ } else {
712
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
713
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
714
+ }
710
715
  }
711
716
 
712
717
  // reallocate buffers if needed
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
420
420
  ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
421
421
 
422
422
  // add forward decls here to avoid including the backend headers
423
- #ifdef GGML_USE_CUBLAS
423
+ #ifdef GGML_USE_CUDA
424
424
  extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
425
425
  ggml_backend_cuda_reg_devices();
426
426
  #endif
@@ -377,6 +377,27 @@ typedef struct {
377
377
  } block_iq1_s;
378
378
  static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
379
379
 
380
+ // 1.75 bpw
381
+ typedef struct {
382
+ uint8_t qs[QK_K/8]; // grid index, low 8 bits
383
+ uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
384
+ #if QK_K == 64
385
+ ggml_half d;
386
+ #endif
387
+ uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
388
+ } block_iq1_m;
389
+ #if QK_K == 64
390
+ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
391
+ #else
392
+ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
393
+ #endif
394
+
395
+ // Used by IQ1_M quants
396
+ typedef union {
397
+ ggml_half f16;
398
+ uint16_t u16;
399
+ } iq1m_scale_t;
400
+
380
401
  // Non-linear quants
381
402
  #define QK4_NL 32
382
403
  typedef struct {
@@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
426
447
 
427
448
  #define GGML_COMMON_IMPL
428
449
  #elif defined(GGML_COMMON_IMPL_SYCL)
450
+
429
451
  #include <cstdint>
430
452
 
431
- #define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
432
- #define GGML_TABLE_END() });
453
+ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
454
+ #define GGML_TABLE_END() };
433
455
 
434
456
  #define GGML_COMMON_IMPL
435
457
  #endif
@@ -1050,6 +1072,7 @@ GGML_TABLE_END()
1050
1072
 
1051
1073
  #define NGRID_IQ1S 2048
1052
1074
  #define IQ1S_DELTA 0.125f
1075
+ #define IQ1M_DELTA 0.125f
1053
1076
  #if defined(GGML_COMMON_IMPL_C)
1054
1077
  GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
1055
1078
  0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,