llama_cpp 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,10 @@
  #include &lt;memory&gt;
  #include &lt;algorithm&gt;
  #include &lt;initializer_list&gt;
+ #include &lt;thread&gt;
+ #include &lt;atomic&gt;
+ #include &lt;mutex&gt;
+ #include &lt;sstream&gt;

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -50,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH0;
  }
@@ -61,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH1;
- };
+ }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -76,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  { MODEL_65B, 5120ull * MB },
  };
  return _MEM_REQ_KV_SELF;
- };
+ }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
@@ -89,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  { MODEL_65B, 1536ull * MB },
  };
  return _MEM_REQ_EVAL;
- };
+ }

  // default hparams (LLaMA 7B)
  struct llama_hparams {
@@ -478,6 +482,11 @@ struct llama_file_loader {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
+ case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: {
  throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +559,11 @@ struct llama_file_saver {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
+ case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -838,6 +852,11 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
+ case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+ case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
  default: return "unknown, may not work";
  }
  }
@@ -1066,7 +1085,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1240,9 +1259,11 @@ static bool llama_eval_internal(
  ggml_build_forward_expand(&gf, inpL);
  ggml_graph_compute (ctx0, &gf);

+ #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
  // requires GGML_PERF to be defined
- //ggml_graph_print(&gf);
+ ggml_graph_print(&gf);
+ #endif

  // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
@@ -1566,14 +1587,23 @@ static llama_vocab::id llama_sample_top_p_top_k(
  // quantization
  //

- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
  ggml_type quantized_type;
  switch (ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
  default: throw format("invalid output file type %d\n", ftype);
  };

+ if (nthread <= 0) {
+ nthread = std::thread::hardware_concurrency();
+ }
+
  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1582,6 +1612,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  size_t total_size_new = 0;
  std::vector<int64_t> hist_all(1 << 4, 0);

+ std::vector<std::thread> workers;
+ std::mutex mutex;
+
  size_t idx = 0;
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
  llama_buffer read_data;
@@ -1600,6 +1633,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);

+ // uncomment this to keep the output layer in FP16
+ //if (tensor.name == "output.weight") {
+ // quantize = false;
+ //}
+
  enum ggml_type new_type;
  void * new_data;
  size_t new_size;
@@ -1635,17 +1673,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_data = work.addr;
  std::vector<int64_t> hist_cur(1 << 4, 0);

- switch (new_type) {
- case GGML_TYPE_Q4_0:
- {
- new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1:
- {
- new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- default:
- LLAMA_ASSERT(false);
+ int chunk_size = 32 * 512;
+ const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+ const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+ if (nthread_use < 2) {
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+ } else {
+ size_t counter = 0;
+ new_size = 0;
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+ std::vector<int64_t> local_hist;
+ size_t local_size = 0;
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex);
+ size_t first = counter; counter += chunk_size;
+ if (first >= nelements) {
+ if (!local_hist.empty()) {
+ for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+ new_size += local_size;
+ }
+ break;
+ }
+ lock.unlock();
+ size_t last = std::min(nelements, first + chunk_size);
+ if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+ }
+ };
+ if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+ compute();
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
  }

  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
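Note on the hunk above: the per-type switch is replaced by ggml_quantize_chunk plus a simple work-sharing scheme in which a mutex-guarded counter hands out fixed-size chunks to worker threads, and each thread merges its local histogram and byte count at the end. The standalone sketch below is not code from the package; it only illustrates that pattern under invented names (process_chunk, run_chunked), with a dummy per-chunk result instead of real quantization.

// Minimal sketch of the mutex-guarded chunk counter pattern (hypothetical names).
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Stand-in for the real per-chunk work; here each element just "costs" one byte.
static size_t process_chunk(size_t /*first*/, size_t count) {
    return count;
}

static size_t run_chunked(size_t nelements, size_t chunk_size, int nthread) {
    std::mutex mutex;
    size_t counter = 0; // next chunk offset, shared by all threads
    size_t total   = 0; // merged result

    auto compute = [&]() {
        size_t local_total = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            size_t first = counter; counter += chunk_size;
            if (first >= nelements) {
                total += local_total; // merge local result while the lock is held
                break;
            }
            lock.unlock();
            size_t last = std::min(nelements, first + chunk_size);
            local_total += process_chunk(first, last - first);
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(compute);
    compute(); // the calling thread participates as well
    for (auto & w : workers) w.join();
    return total;
}

int main() {
    printf("processed %zu elements\n", run_chunked(100000, 32 * 512, 4));
    return 0;
}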
@@ -1744,7 +1802,7 @@ struct llama_context * llama_init_from_file(
  if (params.logits_all) {
  ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
  } else {
- ctx->logits.reserve(hparams.n_ctx);
+ ctx->logits.reserve(hparams.n_vocab);
  }

  if (params.embedding){
@@ -1767,9 +1825,10 @@ void llama_free(struct llama_context * ctx) {
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype) {
+ enum llama_ftype ftype,
+ int nthread) {
  try {
- llama_model_quantize_internal(fname_inp, fname_out, ftype);
+ llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
  return 0;
  } catch (const std::string & err) {
  fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@@ -1955,7 +2014,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  base_t = dest_t;
  }

- if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+ if (ggml_is_quantized(base_t->type)) {
  if (!warned) {
  fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
  "use a f16 or f32 base model with --lora-base\n", __func__);
@@ -2025,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.addr;
+ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ return ctx->model.kv_self.n;
  }

- // Returns the size of the KV cache
- size_t llama_get_kv_cache_size(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.size;
+ #define LLAMA_MAX_RNG_STATE 64*1024
+
+ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+ if (seed <= 0) {
+ seed = time(NULL);
+ }
+ ctx->rng.seed(seed);
  }

- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ // Returns the size of the state
+ size_t llama_get_state_size(struct llama_context * ctx) {
+ // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+ // for reference, std::mt19937(1337) serializes to 6701 bytes.
+ const size_t s_rng_size = sizeof(size_t);
+ const size_t s_rng = LLAMA_MAX_RNG_STATE;
+ const size_t s_logits_capacity = sizeof(size_t);
+ const size_t s_logits_size = sizeof(size_t);
+ const size_t s_logits = ctx->logits.capacity() * sizeof(float);
+ const size_t s_embedding_size = sizeof(size_t);
+ const size_t s_embedding = ctx->embedding.size() * sizeof(float);
+ const size_t s_kv_size = sizeof(size_t);
+ const size_t s_kv_ntok = sizeof(int);
+ const size_t s_kv = ctx->model.kv_self.buf.size;
+
+ const size_t s_total = (
+ + s_rng_size
+ + s_rng
+ + s_logits_capacity
+ + s_logits_size
+ + s_logits
+ + s_embedding_size
+ + s_embedding
+ + s_kv_size
+ + s_kv_ntok
+ + s_kv
+ );
+
+ return s_total;
  }

- // Sets the KV cache containing the current context for the model
- void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count) {
- // Make sure we have the same kv cache setup
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
- memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
- ctx->model.kv_self.n = n_token_count;
+ // Copies the state to the specified destination address
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+ uint8_t * out = dest;
+
+ // copy rng
+ {
+ std::stringstream rng_ss;
+ rng_ss << ctx->rng;
+
+ const size_t rng_size = rng_ss.str().size();
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+ memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+ memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+ memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+ }
+
+ // copy logits
+ {
+ const size_t logits_cap = ctx->logits.capacity();
+ const size_t logits_size = ctx->logits.size();
+
+ memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
+ memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+ if (logits_size) {
+ memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+ }
+
+ out += logits_cap * sizeof(float);
+ }
+
+ // copy embeddings
+ {
+ const size_t embedding_size = ctx->embedding.size();
+
+ memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+ if (embedding_size) {
+ memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+ out += embedding_size * sizeof(float);
+ }
+ }
+
+ // copy kv cache
+ {
+ const size_t kv_size = ctx->model.kv_self.buf.size;
+ const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+ memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+ memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+ if (kv_size) {
+ memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+ }
+ }
+
+ const size_t written = out - dest;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(written == expected);
+
+ return written;
+ }
+
+ // Sets the state reading from the specified source address
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ const uint8_t * in = src;
+
+ // set rng
+ {
+ size_t rng_size;
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
+ memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+ std::stringstream rng_ss;
+ rng_ss.str(std::string(&rng_buf[0], rng_size));
+ rng_ss >> ctx->rng;
+
+ LLAMA_ASSERT(rng_ss.fail() == false);
+ }
+
+ // set logits
+ {
+ size_t logits_cap;
+ size_t logits_size;
+
+ memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
+ memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+ LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+ if (logits_size) {
+ ctx->logits.resize(logits_size);
+ memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ }
+
+ in += logits_cap * sizeof(float);
+ }
+
+ // set embeddings
+ {
+ size_t embedding_size;
+
+ memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+ LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+ if (embedding_size) {
+ memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+ in += embedding_size * sizeof(float);
+ }
+ }
+
+ // set kv cache
+ {
+ size_t kv_size;
+ int kv_ntok;
+
+ memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+ memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+ if (kv_size) {
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+ void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+ void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+ memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+ ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+ ctx->model.kv_self.v->data = v_data;
+
+ }
+
+ ctx->model.kv_self.n = kv_ntok;
+ }
+
+ const size_t nread = in - src;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(nread == expected);
+
+ return nread;
  }

  int llama_eval(
@@ -2204,3 +2430,4 @@ const char * llama_print_system_info(void) {
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
+
@@ -72,6 +72,11 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +96,12 @@ extern "C" {

  // TODO: not great API - very likely to change
  // Returns 0 on success
+ // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype);
+ enum llama_ftype ftype,
+ int nthread);

  // Apply a LoRA adapter to a loaded model
  // path_base_model is the path to a higher quality model to use as a base for
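A brief usage sketch for the updated declaration above, showing the extra nthread argument and one of the newly added ftypes. The file paths are placeholders, and passing 0 for nthread defers to std::thread::hardware_concurrency() as documented in the new comment.

#include "llama.h"
#include <cstdio>

int main() {
    const int ret = llama_model_quantize(
        "models/7B/ggml-model-f16.bin",   // placeholder input path
        "models/7B/ggml-model-q5_1.bin",  // placeholder output path
        LLAMA_FTYPE_MOSTLY_Q5_1,          // one of the ftypes added in this release
        0);                               // <=0: use std::thread::hardware_concurrency()
    if (ret != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}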
@@ -108,22 +115,23 @@ extern "C" {
  const char * path_base_model,
  int n_threads);

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
- // Returns the size of the KV cache
- LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
  // Returns the number of tokens in the KV cache
  LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

- // Sets the KV cache containing the current context for the model
- LLAMA_API void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count);
+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+ // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+ LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+ // Copies the state to the specified destination address.
+ // Destination needs to have allocated enough memory.
+ // Returns the number of bytes copied
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+ // Set the state reading from the specified address
+ // Returns the number of bytes read
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);

  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
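A round-trip sketch for the new state API declared above, replacing the removed llama_get_kv_cache/llama_set_kv_cache calls: size a caller-owned buffer with llama_get_state_size, snapshot the full context state (rng, logits, embedding, kv cache) with llama_copy_state_data, and restore it later with llama_set_state_data. The helper name and the assumption that ctx was created elsewhere with llama_init_from_file are illustrative, not code from the package.

#include "llama.h"
#include <cstdint>
#include <vector>

// Save the context state, do some throwaway work, then roll the state back.
void snapshot_and_restore(struct llama_context * ctx) {
    const size_t n_state = llama_get_state_size(ctx);

    std::vector<uint8_t> state(n_state);
    const size_t written = llama_copy_state_data(ctx, state.data()); // snapshot

    // ... run speculative llama_eval calls here ...

    const size_t read = llama_set_state_data(ctx, state.data());     // roll back

    (void) written; (void) read; // both should equal n_state
}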
@@ -21,6 +21,9 @@
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
  #endif
+ #if defined(_POSIX_MEMLOCK_RANGE)
+ #include <sys/resource.h>
+ #endif
  #endif
  #endif

@@ -202,7 +205,6 @@ struct llama_mmap {

  HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
  DWORD error = GetLastError();
- CloseHandle(hFile);

  if (hMapping == NULL) {
  throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -304,8 +306,18 @@ struct llama_mlock {
  if (!mlock(addr, size)) {
  return true;
  } else {
- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
- size, this->size, std::strerror(errno));
+ char* errmsg = std::strerror(errno);
+ bool suggest = (errno == ENOMEM);
+
+ // Check if the resource limit is fine after all
+ struct rlimit lock_limit;
+ if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+ suggest = false;
+ if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+ suggest = false;
+
+ fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+ size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  return false;
  }
  }
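The reworked warning above now shows MLOCK_SUGGESTION only when RLIMIT_MEMLOCK plausibly caused the failure. As a hedged, POSIX-only illustration of the limit the suggestion refers to (not code from this package; the function name and sizes are invented), a caller could inspect and raise the soft limit before locking a large model buffer:

#include <sys/resource.h>
#include <cstddef>
#include <cstdio>

// Try to make sure the RLIMIT_MEMLOCK soft limit covers want_bytes.
// Raising the soft limit beyond the hard limit requires extra privileges.
bool try_raise_memlock_limit(size_t want_bytes) {
    struct rlimit lim;
    if (getrlimit(RLIMIT_MEMLOCK, &lim) != 0) {
        return false;
    }
    const rlim_t want = (rlim_t) want_bytes;
    if (lim.rlim_cur != RLIM_INFINITY && lim.rlim_cur < want) {
        // Raise the soft limit, but never past the hard limit.
        lim.rlim_cur = (lim.rlim_max == RLIM_INFINITY || lim.rlim_max > want) ? want : lim.rlim_max;
        if (setrlimit(RLIMIT_MEMLOCK, &lim) != 0) {
            return false;
        }
    }
    return lim.rlim_cur == RLIM_INFINITY || lim.rlim_cur >= want;
}

int main() {
    const size_t model_bytes = 4ull * 1024 * 1024 * 1024; // e.g. a ~4 GiB model
    if (!try_raise_memlock_limit(model_bytes)) {
        fprintf(stderr, "RLIMIT_MEMLOCK still too small; mlock may fail\n");
    }
    return 0;
}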