llama_cpp 0.0.5 → 0.0.7

@@ -24,6 +24,10 @@
  #include <memory>
  #include <algorithm>
  #include <initializer_list>
+ #include <thread>
+ #include <atomic>
+ #include <mutex>
+ #include <sstream>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -50,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH0;
  }
@@ -61,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH1;
- };
+ }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -76,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  { MODEL_65B, 5120ull * MB },
  };
  return _MEM_REQ_KV_SELF;
- };
+ }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
@@ -89,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  { MODEL_65B, 1536ull * MB },
  };
  return _MEM_REQ_EVAL;
- };
+ }

  // default hparams (LLaMA 7B)
  struct llama_hparams {
@@ -478,6 +482,11 @@ struct llama_file_loader {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
+ case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: {
  throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +559,11 @@ struct llama_file_saver {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
+ case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -838,6 +852,11 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
+ case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+ case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
  default: return "unknown, may not work";
  }
  }
@@ -1066,7 +1085,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1240,9 +1259,11 @@ static bool llama_eval_internal(
  ggml_build_forward_expand(&gf, inpL);
  ggml_graph_compute (ctx0, &gf);

+ #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
  // requires GGML_PERF to be defined
- //ggml_graph_print(&gf);
+ ggml_graph_print(&gf);
+ #endif

  // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
@@ -1566,14 +1587,23 @@ static llama_vocab::id llama_sample_top_p_top_k(
  // quantization
  //

- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
  ggml_type quantized_type;
  switch (ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
  default: throw format("invalid output file type %d\n", ftype);
  };

+ if (nthread <= 0) {
+ nthread = std::thread::hardware_concurrency();
+ }
+
  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1582,6 +1612,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  size_t total_size_new = 0;
  std::vector<int64_t> hist_all(1 << 4, 0);

+ std::vector<std::thread> workers;
+ std::mutex mutex;
+
  size_t idx = 0;
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
  llama_buffer read_data;
@@ -1600,6 +1633,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);

+ // uncomment this to keep the output layer in FP16
+ //if (tensor.name == "output.weight") {
+ // quantize = false;
+ //}
+
  enum ggml_type new_type;
  void * new_data;
  size_t new_size;
@@ -1635,17 +1673,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_data = work.addr;
  std::vector<int64_t> hist_cur(1 << 4, 0);

- switch (new_type) {
- case GGML_TYPE_Q4_0:
- {
- new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1:
- {
- new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- default:
- LLAMA_ASSERT(false);
+ int chunk_size = 32 * 512;
+ const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+ const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+ if (nthread_use < 2) {
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+ } else {
+ size_t counter = 0;
+ new_size = 0;
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+ std::vector<int64_t> local_hist;
+ size_t local_size = 0;
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex);
+ size_t first = counter; counter += chunk_size;
+ if (first >= nelements) {
+ if (!local_hist.empty()) {
+ for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+ new_size += local_size;
+ }
+ break;
+ }
+ lock.unlock();
+ size_t last = std::min(nelements, first + chunk_size);
+ if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+ }
+ };
+ if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+ compute();
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
  }

  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1744,7 +1802,7 @@ struct llama_context * llama_init_from_file(
  if (params.logits_all) {
  ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
  } else {
- ctx->logits.reserve(hparams.n_ctx);
+ ctx->logits.reserve(hparams.n_vocab);
  }

  if (params.embedding){
@@ -1767,9 +1825,10 @@ void llama_free(struct llama_context * ctx) {
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype) {
+ enum llama_ftype ftype,
+ int nthread) {
  try {
- llama_model_quantize_internal(fname_inp, fname_out, ftype);
+ llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
  return 0;
  } catch (const std::string & err) {
  fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@@ -1955,7 +2014,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  base_t = dest_t;
  }

- if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+ if (ggml_is_quantized(base_t->type)) {
  if (!warned) {
  fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
  "use a f16 or f32 base model with --lora-base\n", __func__);
@@ -2025,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.addr;
+ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ return ctx->model.kv_self.n;
  }

- // Returns the size of the KV cache
- size_t llama_get_kv_cache_size(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.size;
+ #define LLAMA_MAX_RNG_STATE 64*1024
+
+ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+ if (seed <= 0) {
+ seed = time(NULL);
+ }
+ ctx->rng.seed(seed);
  }

- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ // Returns the size of the state
+ size_t llama_get_state_size(struct llama_context * ctx) {
+ // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+ // for reference, std::mt19937(1337) serializes to 6701 bytes.
+ const size_t s_rng_size = sizeof(size_t);
+ const size_t s_rng = LLAMA_MAX_RNG_STATE;
+ const size_t s_logits_capacity = sizeof(size_t);
+ const size_t s_logits_size = sizeof(size_t);
+ const size_t s_logits = ctx->logits.capacity() * sizeof(float);
+ const size_t s_embedding_size = sizeof(size_t);
+ const size_t s_embedding = ctx->embedding.size() * sizeof(float);
+ const size_t s_kv_size = sizeof(size_t);
+ const size_t s_kv_ntok = sizeof(int);
+ const size_t s_kv = ctx->model.kv_self.buf.size;
+
+ const size_t s_total = (
+ + s_rng_size
+ + s_rng
+ + s_logits_capacity
+ + s_logits_size
+ + s_logits
+ + s_embedding_size
+ + s_embedding
+ + s_kv_size
+ + s_kv_ntok
+ + s_kv
+ );
+
+ return s_total;
  }

- // Sets the KV cache containing the current context for the model
- void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count) {
- // Make sure we have the same kv cache setup
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
- memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
- ctx->model.kv_self.n = n_token_count;
+ // Copies the state to the specified destination address
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+ uint8_t * out = dest;
+
+ // copy rng
+ {
+ std::stringstream rng_ss;
+ rng_ss << ctx->rng;
+
+ const size_t rng_size = rng_ss.str().size();
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+ memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+ memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+ memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+ }
+
+ // copy logits
+ {
+ const size_t logits_cap = ctx->logits.capacity();
+ const size_t logits_size = ctx->logits.size();
+
+ memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
+ memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+ if (logits_size) {
+ memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+ }
+
+ out += logits_cap * sizeof(float);
+ }
+
+ // copy embeddings
+ {
+ const size_t embedding_size = ctx->embedding.size();
+
+ memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+ if (embedding_size) {
+ memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+ out += embedding_size * sizeof(float);
+ }
+ }
+
+ // copy kv cache
+ {
+ const size_t kv_size = ctx->model.kv_self.buf.size;
+ const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+ memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+ memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+ if (kv_size) {
+ memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+ }
+ }
+
+ const size_t written = out - dest;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(written == expected);
+
+ return written;
+ }
+
+ // Sets the state reading from the specified source address
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ const uint8_t * in = src;
+
+ // set rng
+ {
+ size_t rng_size;
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
+ memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+ std::stringstream rng_ss;
+ rng_ss.str(std::string(&rng_buf[0], rng_size));
+ rng_ss >> ctx->rng;
+
+ LLAMA_ASSERT(rng_ss.fail() == false);
+ }
+
+ // set logits
+ {
+ size_t logits_cap;
+ size_t logits_size;
+
+ memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
+ memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+ LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+ if (logits_size) {
+ ctx->logits.resize(logits_size);
+ memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ }
+
+ in += logits_cap * sizeof(float);
+ }
+
+ // set embeddings
+ {
+ size_t embedding_size;
+
+ memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+ LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+ if (embedding_size) {
+ memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+ in += embedding_size * sizeof(float);
+ }
+ }
+
+ // set kv cache
+ {
+ size_t kv_size;
+ int kv_ntok;
+
+ memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+ memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+ if (kv_size) {
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+ void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+ void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+ memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+ ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+ ctx->model.kv_self.v->data = v_data;
+
+ }
+
+ ctx->model.kv_self.n = kv_ntok;
+ }
+
+ const size_t nread = in - src;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(nread == expected);
+
+ return nread;
  }

  int llama_eval(
@@ -2204,3 +2430,4 @@ const char * llama_print_system_info(void) {
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
+
@@ -72,6 +72,11 @@ extern "C" {
72
72
  LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
73
73
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
74
74
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
75
+ LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
76
+ LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
77
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
78
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
79
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
75
80
  };
76
81
 
77
82
  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +96,12 @@ extern "C" {
91
96
 
92
97
  // TODO: not great API - very likely to change
93
98
  // Returns 0 on success
99
+ // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
94
100
  LLAMA_API int llama_model_quantize(
95
101
  const char * fname_inp,
96
102
  const char * fname_out,
97
- enum llama_ftype ftype);
103
+ enum llama_ftype ftype,
104
+ int nthread);
98
105
 
99
106
  // Apply a LoRA adapter to a loaded model
100
107
  // path_base_model is the path to a higher quality model to use as a base for
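llama_model_quantize now takes the thread count explicitly; passing a value <= 0 defers to std::thread::hardware_concurrency() as documented in the hunk above. A minimal sketch of calling the updated entry point, with placeholder model paths:

    #include "llama.h"
    #include <cstdio>

    int main() {
        // nthread = 0 lets the library pick std::thread::hardware_concurrency()
        const int ret = llama_model_quantize("ggml-model-f16.bin",   // placeholder input path
                                             "ggml-model-q4_0.bin",  // placeholder output path
                                             LLAMA_FTYPE_MOSTLY_Q4_0,
                                             /*nthread =*/ 0);
        if (ret != 0) {
            std::fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }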
@@ -108,22 +115,23 @@ extern "C" {
108
115
  const char * path_base_model,
109
116
  int n_threads);
110
117
 
111
- // Returns the KV cache that will contain the context for the
112
- // ongoing prediction with the model.
113
- LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
114
-
115
- // Returns the size of the KV cache
116
- LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
117
-
118
118
  // Returns the number of tokens in the KV cache
119
119
  LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
120
120
 
121
- // Sets the KV cache containing the current context for the model
122
- LLAMA_API void llama_set_kv_cache(
123
- struct llama_context * ctx,
124
- const uint8_t * kv_cache,
125
- size_t n_size,
126
- int n_token_count);
121
+ // Sets the current rng seed.
122
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
123
+
124
+ // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
125
+ LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
126
+
127
+ // Copies the state to the specified destination address.
128
+ // Destination needs to have allocated enough memory.
129
+ // Returns the number of bytes copied
130
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
131
+
132
+ // Set the state reading from the specified address
133
+ // Returns the number of bytes read
134
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
127
135
 
128
136
  // Run the llama inference to obtain the logits and probabilities for the next token.
129
137
  // tokens + n_tokens is the provided batch of new tokens to process
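Together these declarations replace the old llama_get_kv_cache/llama_set_kv_cache pair: the caller sizes a buffer with llama_get_state_size, serializes the whole context (rng, logits, embedding, kv cache) with llama_copy_state_data, and restores it later with llama_set_state_data. A minimal round-trip sketch, assuming ctx is an already-initialized llama_context:

    #include "llama.h"
    #include <cstdint>
    #include <vector>

    // Snapshot the full context state and restore it into the same context later.
    void snapshot_and_restore(struct llama_context * ctx) {
        const size_t n_state = llama_get_state_size(ctx);

        std::vector<uint8_t> state(n_state);
        const size_t n_copied = llama_copy_state_data(ctx, state.data()); // returns bytes written

        // ... evaluate more tokens here; the snapshot buffer is unaffected ...

        const size_t n_read = llama_set_state_data(ctx, state.data());    // returns bytes read
        (void) n_copied;
        (void) n_read;
    }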
@@ -21,6 +21,9 @@
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
  #endif
+ #if defined(_POSIX_MEMLOCK_RANGE)
+ #include <sys/resource.h>
+ #endif
  #endif
  #endif

@@ -202,7 +205,6 @@ struct llama_mmap {

  HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
  DWORD error = GetLastError();
- CloseHandle(hFile);

  if (hMapping == NULL) {
  throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -304,8 +306,18 @@ struct llama_mlock {
  if (!mlock(addr, size)) {
  return true;
  } else {
- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
- size, this->size, std::strerror(errno));
+ char* errmsg = std::strerror(errno);
+ bool suggest = (errno == ENOMEM);
+
+ // Check if the resource limit is fine after all
+ struct rlimit lock_limit;
+ if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+ suggest = false;
+ if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+ suggest = false;
+
+ fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+ size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  return false;
  }
  }
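The mlock failure path above now only prints MLOCK_SUGGESTION when errno is ENOMEM and RLIMIT_MEMLOCK actually looks like the culprit. A standalone sketch of that check on a POSIX system; the buffer size and the assumption that mlock just failed with ENOMEM are illustrative:

    #include <cstddef>
    #include <cstdio>
    #include <sys/resource.h>

    int main() {
        const size_t size = 4ull * 1024 * 1024; // hypothetical buffer size that failed to lock

        bool suggest = true;                    // pretend mlock() just failed with ENOMEM
        struct rlimit lock_limit;
        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
            suggest = false;                    // could not query the limit at all
        }
        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
            suggest = false;                    // same test as the diff: the limit looks fine after all
        }

        std::printf("print MLOCK_SUGGESTION: %s\n", suggest ? "yes" : "no");
        return 0;
    }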