llama_cpp 0.0.5 → 0.0.6

@@ -204,7 +204,9 @@ enum ggml_type {
     GGML_TYPE_F16 = 1,
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
-    GGML_TYPE_Q8_0 = 4,
+    GGML_TYPE_Q4_2 = 4,
+    GGML_TYPE_Q4_3 = 5,
+    GGML_TYPE_Q8_0 = 6,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
@@ -359,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);

  size_t ggml_element_size(const struct ggml_tensor * tensor);

+ bool ggml_is_quantized(enum ggml_type type);
+
  struct ggml_context * ggml_init(struct ggml_init_params params);
  void ggml_free(struct ggml_context * ctx);

@@ -626,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(

  // rotary position embedding
  // in-place, returns view(a)
- // if mode == 1, skip n_past elements
+ // if mode & 1 == 1, skip n_past elements
+ // if mode & 2 == 1, GPT-NeoX style
  // TODO: avoid creating a new tensor every time
  struct ggml_tensor * ggml_rope(
          struct ggml_context * ctx,
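The rope mode argument is now a bit field rather than a boolean flag. Below is a minimal usage sketch (not taken from the patch), assuming the ggml_rope(ctx, a, n_past, n_dims, mode) signature and ggml_init_params layout of this ggml revision; tensor shapes and sizes are illustrative.

    #include "ggml.h"

    int main() {
        // Small scratch arena; sizes here are illustrative, not tuned.
        struct ggml_init_params params = {};
        params.mem_size = 16 * 1024 * 1024;
        struct ggml_context * ctx = ggml_init(params);

        // A (head_dim, n_head, n_tokens) activation, shaped the way llama.cpp feeds ggml_rope.
        struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 8, 4);

        const int n_past = 0;   // tokens already in the KV cache
        const int n_dims = 64;  // rotary dimensions
        const int mode   = 2;   // bit 1 set: GPT-NeoX style; bit 0 would skip n_past elements

        struct ggml_tensor * x_rot = ggml_rope(ctx, x, n_past, n_dims, mode);
        (void) x_rot;           // only the graph node is built here; nothing is evaluated

        ggml_free(ctx);
        return 0;
    }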
@@ -806,6 +811,10 @@ enum ggml_opt_result ggml_opt(

  size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
  size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

  //
  // system info
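ggml_quantize_chunk is the single entry point that the threaded quantization loop in llama.cpp (further down) drives. Here is a minimal standalone sketch of calling it (not taken from the patch); the destination sizing and the 16-bin histogram follow the llama.cpp code, while the element counts are illustrative.

    #include "ggml.h"

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // 8 rows of 4096 floats; dummy values stand in for real model weights.
        const int nelements = 8 * 4096;
        std::vector<float> src(nelements, 0.5f);

        // The quantized blocks are smaller than the f32 input, so an f32-sized
        // destination buffer is a safe (if generous) upper bound.
        std::vector<char> dst(nelements * sizeof(float));

        // 16 histogram bins, matching the (1 << 4) vectors used in llama.cpp.
        std::vector<int64_t> hist(1 << 4, 0);

        const size_t bytes = ggml_quantize_chunk(GGML_TYPE_Q4_2, src.data(), dst.data(),
                                                 /*start*/ 0, /*n*/ nelements, hist.data());
        printf("quantized %d floats into %zu bytes\n", nelements, bytes);
        return 0;
    }

The start/n pair is what allows disjoint element ranges of the same tensor to be handed to different threads.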
@@ -823,6 +832,7 @@ int ggml_cpu_has_f16c(void);
  int ggml_cpu_has_fp16_va(void);
  int ggml_cpu_has_wasm_simd(void);
  int ggml_cpu_has_blas(void);
+ int ggml_cpu_has_cublas(void);
  int ggml_cpu_has_sse3(void);
  int ggml_cpu_has_vsx(void);

@@ -24,6 +24,9 @@
  #include <memory>
  #include <algorithm>
  #include <initializer_list>
+ #include <thread>
+ #include <atomic>
+ #include <mutex>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -478,6 +481,8 @@ struct llama_file_loader {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
+ case GGML_TYPE_Q4_3:
      break;
  default: {
      throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +555,8 @@ struct llama_file_saver {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
+ case GGML_TYPE_Q4_3:
      break;
  default: LLAMA_ASSERT(false);
  }
@@ -838,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
      return "mostly Q4_1, some F16";
+ case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+ case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
  default: return "unknown, may not work";
  }
  }
@@ -1066,7 +1075,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
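The new ggml_cpu_has_cublas() probe exists so that the single-thread fallback above is applied only when a CPU BLAS backend is doing the matrix work. A small sketch of the same check in isolation (not taken from the patch; thread and batch counts are illustrative):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        printf("BLAS:   %d\n", ggml_cpu_has_blas());
        printf("cuBLAS: %d\n", ggml_cpu_has_cublas());

        // Mirror of the heuristic in llama_eval_internal: with a big batch and a
        // CPU BLAS backend, drop to one thread; with cuBLAS, keep all threads.
        const int n_threads = 8;   // illustrative
        const int n_batch   = 64;  // illustrative
        const int n_use = (n_batch >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas())
                              ? 1 : n_threads;
        printf("threads used for this batch: %d\n", n_use);
        return 0;
    }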
@@ -1566,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
  // quantization
  //

- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
      ggml_type quantized_type;
      switch (ftype) {
          case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
          case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+         case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
          default: throw format("invalid output file type %d\n", ftype);
      };

+     if (nthread <= 0) {
+         nthread = std::thread::hardware_concurrency();
+     }
+
      std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                              /*vocab_only*/ false));
      llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1582,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      size_t total_size_new = 0;
      std::vector<int64_t> hist_all(1 << 4, 0);

+     std::vector<std::thread> workers;
+     std::mutex mutex;
+
      size_t idx = 0;
      for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
          llama_buffer read_data;
@@ -1600,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          // quantize only 2D tensors
          quantize &= (tensor.ne.size() == 2);

+         // GG: uncomment this to keep the output layer in FP16
+         //if (tensor.name.rfind("output")) {
+         //    quantize = false;
+         //}
+
          enum ggml_type new_type;
          void * new_data;
          size_t new_size;
@@ -1635,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          new_data = work.addr;
          std::vector<int64_t> hist_cur(1 << 4, 0);

-         switch (new_type) {
-             case GGML_TYPE_Q4_0:
-                 {
-                     new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                 } break;
-             case GGML_TYPE_Q4_1:
-                 {
-                     new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                 } break;
-             default:
-                 LLAMA_ASSERT(false);
+         int chunk_size = 32 * 512;
+         const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+         const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+         if (nthread_use < 2) {
+             new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+         } else {
+             size_t counter = 0;
+             new_size = 0;
+             auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                 std::vector<int64_t> local_hist;
+                 size_t local_size = 0;
+                 while (true) {
+                     std::unique_lock<std::mutex> lock(mutex);
+                     size_t first = counter; counter += chunk_size;
+                     if (first >= nelements) {
+                         if (!local_hist.empty()) {
+                             for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                             new_size += local_size;
+                         }
+                         break;
+                     }
+                     lock.unlock();
+                     size_t last = std::min(nelements, first + chunk_size);
+                     if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                     local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                 }
+             };
+             if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+             for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+             compute();
+             for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
          }

          printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
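The loop above hands out fixed-size chunks through a mutex-protected counter; each worker accumulates into thread-local state and merges once when the work runs out. A stripped-down, self-contained sketch of the same pattern (not taken from the patch) applied to a trivial array sum, with all names and sizes illustrative:

    #include <algorithm>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const size_t nelements  = 1000000;
        const size_t chunk_size = 32 * 512;       // same chunk granularity as above
        std::vector<float> data(nelements, 1.0f);

        std::mutex mutex;
        size_t counter = 0;                        // next element index to hand out
        double total   = 0.0;                      // merged result

        auto compute = [&]() {
            double local = 0.0;                    // thread-local accumulator
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                const size_t first = counter;
                counter += chunk_size;
                if (first >= nelements) {
                    total += local;                // merge once, under the lock
                    break;
                }
                lock.unlock();
                const size_t last = std::min(nelements, first + chunk_size);
                for (size_t i = first; i < last; ++i) local += data[i];
            }
        };

        const int nthread = 4;
        std::vector<std::thread> workers;
        for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(compute);
        compute();                                 // the calling thread joins in
        for (auto & w : workers) w.join();

        printf("sum = %.0f (expected %zu)\n", total, nelements);
        return 0;
    }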
@@ -1767,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
  int llama_model_quantize(
          const char * fname_inp,
          const char * fname_out,
-         enum llama_ftype ftype) {
+         enum llama_ftype ftype,
+         int nthread) {
      try {
-         llama_model_quantize_internal(fname_inp, fname_out, ftype);
+         llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
          return 0;
      } catch (const std::string & err) {
          fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@@ -1955,7 +1999,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
      base_t = dest_t;
  }

- if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+ if (ggml_is_quantized(base_t->type)) {
      if (!warned) {
          fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                          "use a f16 or f32 base model with --lora-base\n", __func__);
@@ -72,6 +72,8 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,12 @@ extern "C" {

  // TODO: not great API - very likely to change
  // Returns 0 on success
+ // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
  LLAMA_API int llama_model_quantize(
          const char * fname_inp,
          const char * fname_out,
-         enum llama_ftype ftype);
+         enum llama_ftype ftype,
+         int nthread);

  // Apply a LoRA adapter to a loaded model
  // path_base_model is the path to a higher quality model to use as a base for
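Existing C/C++ callers of llama_model_quantize now have to pass the extra nthread argument. A usage sketch (not taken from the patch) with placeholder file names, selecting the new Q4_2 file type:

    #include "llama.h"
    #include <cstdio>

    int main() {
        // Paths are placeholders; nthread <= 0 falls back to
        // std::thread::hardware_concurrency(), per the comment above.
        const int rc = llama_model_quantize(
            "models/7B/ggml-model-f16.bin",
            "models/7B/ggml-model-q4_2.bin",
            LLAMA_FTYPE_MOSTLY_Q4_2,
            /*nthread*/ 0);
        if (rc != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        printf("wrote models/7B/ggml-model-q4_2.bin\n");
        return 0;
    }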
@@ -202,7 +202,6 @@ struct llama_mmap {

  HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
  DWORD error = GetLastError();
- CloseHandle(hFile);

  if (hMapping == NULL) {
      throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.0.5'
+   VERSION = '0.0.6'

    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'master-315a95a'
+   LLAMA_CPP_VERSION = 'master-12b5900'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -5,6 +5,15 @@ module LLaMACpp
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String

+ LLAMA_FTYPE_ALL_F32: Integer
+ LLAMA_FTYPE_MOSTLY_F16: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+ def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
@@ -18,7 +27,7 @@ module LLaMACpp
    def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                  | () -> void
    def embeddings: () -> Array[Float]
-   def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+   def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
    def free: () -> void
    def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
    def logits: () -> Array[Float]
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
-   version: 0.0.5
+   version: 0.0.6
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-20 00:00:00.000000000 Z
+ date: 2023-04-22 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -26,6 +26,7 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-cuda.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
  - ext/llama_cpp/src/llama.cpp