llama_cpp 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ext/llama_cpp/src/ggml.h CHANGED
@@ -204,7 +204,9 @@ enum ggml_type {
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q8_0 = 4,
+        GGML_TYPE_Q4_2 = 4,
+        GGML_TYPE_Q4_3 = 5,
+        GGML_TYPE_Q8_0 = 6,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -359,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);
 
     size_t ggml_element_size(const struct ggml_tensor * tensor);
 
+    bool ggml_is_quantized(enum ggml_type type);
+
     struct ggml_context * ggml_init(struct ggml_init_params params);
     void ggml_free(struct ggml_context * ctx);
 
@@ -626,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(
 
     // rotary position embedding
    // in-place, returns view(a)
-    // if mode == 1, skip n_past elements
+    // if mode & 1 == 1, skip n_past elements
+    // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
     struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
@@ -806,6 +811,10 @@ enum ggml_opt_result ggml_opt(
 
     size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+    size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+    size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
     //
     // system info
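
ggml_quantize_chunk() folds the per-type entry points behind a single dispatcher, which is what lets the quantization loop in llama.cpp further down stay type-agnostic and split work by offset. Below is a rough, self-contained sketch of a single call against the signature declared above; the element count being a multiple of the quantization block size, and the 1 << 4 histogram size, are assumptions carried over from how llama.cpp uses the API, not requirements stated in this header.

    #include <cstdint>
    #include <cstdio>
    #include <vector>
    #include "ggml.h"

    int main() {
        const int n = 32 * 512;                       // assumed multiple of the quantization block size
        std::vector<float>   src(n, 0.125f);          // dummy f32 input
        std::vector<uint8_t> dst(n * sizeof(float));  // 4-bit output is far smaller than f32, so this is ample
        std::vector<int64_t> hist(1 << 4, 0);         // same histogram size llama.cpp allocates

        // quantize all n values in one chunk starting at offset 0
        const size_t bytes = ggml_quantize_chunk(GGML_TYPE_Q4_0, src.data(), dst.data(), 0, n, hist.data());
        std::printf("quantized %d floats into %zu bytes\n", n, bytes);
        return 0;
    }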
@@ -823,6 +832,7 @@ int ggml_cpu_has_f16c(void);
     int ggml_cpu_has_fp16_va(void);
     int ggml_cpu_has_wasm_simd(void);
     int ggml_cpu_has_blas(void);
+    int ggml_cpu_has_cublas(void);
     int ggml_cpu_has_sse3(void);
     int ggml_cpu_has_vsx(void);
 
ext/llama_cpp/src/llama.cpp CHANGED
@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -478,6 +481,8 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +555,8 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -838,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         default: return "unknown, may not work";
     }
 }
@@ -1066,7 +1075,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
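
With this change, gf.n_threads stays at n_threads when cuBLAS is available: the heavy matrix multiplications run on the GPU, so the CPU threads are no longer just spin-waiting on BLAS. The probe itself comes from the ggml.h hunk above. A trivial illustrative dump of a few of those compile-time probes (each returns 0 or 1; sketch only, not code from the package):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // each probe reflects how the bundled ggml was compiled
        std::printf("blas:   %d\n", ggml_cpu_has_blas());
        std::printf("cublas: %d\n", ggml_cpu_has_cublas());
        std::printf("sse3:   %d\n", ggml_cpu_has_sse3());
        std::printf("vsx:    %d\n", ggml_cpu_has_vsx());
        return 0;
    }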
@@ -1566,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1582,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1600,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
+        // GG: uncomment this to keep the output layer in FP16
+        //if (tensor.name.rfind("output")) {
+        //    quantize = false;
+        //}
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -1635,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1767,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
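
The next hunk swaps the hand-written Q4_0/Q4_1 check in the LoRA path for the new ggml_is_quantized() predicate declared in ggml.h above, so the warning also covers Q4_2 and Q4_3. As a minimal illustration of that predicate together with ggml_type_name() (sketch only, not part of the package; it assumes linking against the bundled ggml):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // the type list follows the enum ggml_type hunk at the top of this diff
        const enum ggml_type types[] = {
            GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
            GGML_TYPE_Q4_2, GGML_TYPE_Q4_3, GGML_TYPE_Q8_0,
        };
        for (const enum ggml_type t : types) {
            std::printf("%-6s quantized: %s\n", ggml_type_name(t), ggml_is_quantized(t) ? "yes" : "no");
        }
        return 0;
    }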
@@ -1955,7 +1999,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                 base_t = dest_t;
             }
 
-            if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+            if (ggml_is_quantized(base_t->type)) {
                 if (!warned) {
                     fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                                     "use a f16 or f32 base model with --lora-base\n", __func__);
ext/llama_cpp/src/llama.h CHANGED
@@ -72,6 +72,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,12 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
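
Because the ftype argument is now followed by nthread, existing C/C++ callers of llama_model_quantize() need to pass a thread count; the gem's own binding surfaces it as the optional n_threads keyword in the RBS signature further down. A hedged sketch of a direct call against this header — the model paths are placeholders, not files shipped with the package:

    #include <cstdio>
    #include "llama.h"

    int main() {
        // placeholder paths, adjust to your own model files
        const char * fname_inp = "models/7B/ggml-model-f16.bin";
        const char * fname_out = "models/7B/ggml-model-q4_2.bin";

        // nthread <= 0 means "use std::thread::hardware_concurrency()" per the comment above
        const int ret = llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_2, 0);
        if (ret != 0) {
            std::fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }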
ext/llama_cpp/src/llama_util.h CHANGED
@@ -202,7 +202,6 @@ struct llama_mmap {
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);
 
         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.5'
+  VERSION = '0.0.6'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-315a95a'
+  LLAMA_CPP_VERSION = 'master-12b5900'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -5,6 +5,15 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
@@ -18,7 +27,7 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-20 00:00:00.000000000 Z
+date: 2023-04-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,6 +26,7 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp