llama_cpp 0.0.6 → 0.0.7

@@ -27,6 +27,7 @@
  #include <thread>
  #include <atomic>
  #include <mutex>
+ #include <sstream>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -53,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH0;
  }
@@ -64,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH1;
- };
+ }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -79,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  { MODEL_65B, 5120ull * MB },
  };
  return _MEM_REQ_KV_SELF;
- };
+ }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
@@ -92,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  { MODEL_65B, 1536ull * MB },
  };
  return _MEM_REQ_EVAL;
- };
+ }

  // default hparams (LLaMA 7B)
  struct llama_hparams {
@@ -483,6 +484,9 @@ struct llama_file_loader {
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: {
  throw format("unrecognized tensor type %u\n", shard.type);
@@ -557,6 +561,9 @@ struct llama_file_saver {
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -847,6 +854,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  return "mostly Q4_1, some F16";
  case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
  case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
  default: return "unknown, may not work";
  }
  }
@@ -1075,7 +1085,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1249,9 +1259,11 @@ static bool llama_eval_internal(
  ggml_build_forward_expand(&gf, inpL);
  ggml_graph_compute (ctx0, &gf);

+ #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
  // requires GGML_PERF to be defined
- //ggml_graph_print(&gf);
+ ggml_graph_print(&gf);
+ #endif

  // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
@@ -1582,6 +1594,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
  case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
  default: throw format("invalid output file type %d\n", ftype);
  };

@@ -1618,8 +1633,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);

- // GG: uncomment this to keep the output layer in FP16
- //if (tensor.name.rfind("output")) {
+ // uncomment this to keep the output layer in FP16
+ //if (tensor.name == "output.weight") {
  // quantize = false;
  //}

@@ -1787,7 +1802,7 @@ struct llama_context * llama_init_from_file(
  if (params.logits_all) {
  ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
  } else {
- ctx->logits.reserve(hparams.n_ctx);
+ ctx->logits.reserve(hparams.n_vocab);
  }

  if (params.embedding){
@@ -2069,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.addr;
+ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ return ctx->model.kv_self.n;
  }

- // Returns the size of the KV cache
- size_t llama_get_kv_cache_size(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.size;
+ #define LLAMA_MAX_RNG_STATE 64*1024
+
+ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+ if (seed <= 0) {
+ seed = time(NULL);
+ }
+ ctx->rng.seed(seed);
  }

- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ // Returns the size of the state
+ size_t llama_get_state_size(struct llama_context * ctx) {
+ // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+ // for reference, std::mt19937(1337) serializes to 6701 bytes.
+ const size_t s_rng_size = sizeof(size_t);
+ const size_t s_rng = LLAMA_MAX_RNG_STATE;
+ const size_t s_logits_capacity = sizeof(size_t);
+ const size_t s_logits_size = sizeof(size_t);
+ const size_t s_logits = ctx->logits.capacity() * sizeof(float);
+ const size_t s_embedding_size = sizeof(size_t);
+ const size_t s_embedding = ctx->embedding.size() * sizeof(float);
+ const size_t s_kv_size = sizeof(size_t);
+ const size_t s_kv_ntok = sizeof(int);
+ const size_t s_kv = ctx->model.kv_self.buf.size;
+
+ const size_t s_total = (
+ + s_rng_size
+ + s_rng
+ + s_logits_capacity
+ + s_logits_size
+ + s_logits
+ + s_embedding_size
+ + s_embedding
+ + s_kv_size
+ + s_kv_ntok
+ + s_kv
+ );
+
+ return s_total;
  }

- // Sets the KV cache containing the current context for the model
- void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count) {
- // Make sure we have the same kv cache setup
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
- memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
- ctx->model.kv_self.n = n_token_count;
+ // Copies the state to the specified destination address
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+ uint8_t * out = dest;
+
+ // copy rng
+ {
+ std::stringstream rng_ss;
+ rng_ss << ctx->rng;
+
+ const size_t rng_size = rng_ss.str().size();
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+ memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+ memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+ memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+ }
+
+ // copy logits
+ {
+ const size_t logits_cap = ctx->logits.capacity();
+ const size_t logits_size = ctx->logits.size();
+
+ memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
+ memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+ if (logits_size) {
+ memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+ }
+
+ out += logits_cap * sizeof(float);
+ }
+
+ // copy embeddings
+ {
+ const size_t embedding_size = ctx->embedding.size();
+
+ memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+ if (embedding_size) {
+ memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+ out += embedding_size * sizeof(float);
+ }
+ }
+
+ // copy kv cache
+ {
+ const size_t kv_size = ctx->model.kv_self.buf.size;
+ const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+ memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+ memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+ if (kv_size) {
+ memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+ }
+ }
+
+ const size_t written = out - dest;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(written == expected);
+
+ return written;
+ }
+
+ // Sets the state reading from the specified source address
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ const uint8_t * in = src;
+
+ // set rng
+ {
+ size_t rng_size;
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
+ memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+ std::stringstream rng_ss;
+ rng_ss.str(std::string(&rng_buf[0], rng_size));
+ rng_ss >> ctx->rng;
+
+ LLAMA_ASSERT(rng_ss.fail() == false);
+ }
+
+ // set logits
+ {
+ size_t logits_cap;
+ size_t logits_size;
+
+ memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
+ memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+ LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+ if (logits_size) {
+ ctx->logits.resize(logits_size);
+ memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ }
+
+ in += logits_cap * sizeof(float);
+ }
+
+ // set embeddings
+ {
+ size_t embedding_size;
+
+ memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+ LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+ if (embedding_size) {
+ memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+ in += embedding_size * sizeof(float);
+ }
+ }
+
+ // set kv cache
+ {
+ size_t kv_size;
+ int kv_ntok;
+
+ memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+ memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+ if (kv_size) {
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+ void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+ void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+ memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+ ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+ ctx->model.kv_self.v->data = v_data;
+
+ }
+
+ ctx->model.kv_self.n = kv_ntok;
+ }
+
+ const size_t nread = in - src;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(nread == expected);
+
+ return nread;
  }

  int llama_eval(
@@ -2248,3 +2430,4 @@ const char * llama_print_system_info(void) {
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
+
@@ -74,6 +74,9 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
  LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +115,23 @@ extern "C" {
  const char * path_base_model,
  int n_threads);

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
- // Returns the size of the KV cache
- LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
  // Returns the number of tokens in the KV cache
  LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

- // Sets the KV cache containing the current context for the model
- LLAMA_API void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count);
+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+ // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+ LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+ // Copies the state to the specified destination address.
+ // Destination needs to have allocated enough memory.
+ // Returns the number of bytes copied
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+ // Set the state reading from the specified address
+ // Returns the number of bytes read
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);

  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
@@ -21,6 +21,9 @@
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
  #endif
+ #if defined(_POSIX_MEMLOCK_RANGE)
+ #include <sys/resource.h>
+ #endif
  #endif
  #endif

@@ -303,8 +306,18 @@ struct llama_mlock {
  if (!mlock(addr, size)) {
  return true;
  } else {
- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
- size, this->size, std::strerror(errno));
+ char* errmsg = std::strerror(errno);
+ bool suggest = (errno == ENOMEM);
+
+ // Check if the resource limit is fine after all
+ struct rlimit lock_limit;
+ if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+ suggest = false;
+ if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+ suggest = false;
+
+ fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+ size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  return false;
  }
  }
@@ -0,0 +1,151 @@
+ # frozen_string_literal: true
+
+ module LLaMACpp
+ # Client provides a high-level interface to the LLM model.
+ class Client
+ # Creates a new client.
+ #
+ # @param model_path [String] The path to the model file.
+ # @param lora_adapter_path [String] The path to the LoRA adapter file.
+ # @param lora_base_path [String] The path to the LoRA base model file.
+ # @param n_ctx [Integer] The context size.
+ # @param n_parts [Integer] The number of model parts (-1 = determine from model dimensions).
+ # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for the memory KV cache.
+ # @param use_mmap [Boolean] The flag whether to use mmap.
+ # @param use_mlock [Boolean] The flag whether to use mlock.
+ # @param embedding [Boolean] The flag whether to calculate embeddings.
+ # @param n_threads [Integer] The number of threads to use.
+ # @param seed [Integer] The seed for the random number generator.
+ # @return [Client]
+ # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+ def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+ n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+ embedding: false,
+ n_threads: 1, seed: 0)
+ @params = {
+ model_path: model_path,
+ lora_adapter_path: lora_adapter_path,
+ lora_base_path: lora_base_path,
+ n_ctx: n_ctx,
+ n_parts: n_parts,
+ memory_f16: memory_f16,
+ use_mmap: use_mmap,
+ use_mlock: use_mlock,
+ embedding: embedding,
+ n_threads: n_threads,
+ seed: seed
+ }
+ @context_params = ContextParams.new
+ @context_params.n_ctx = n_ctx
+ @context_params.n_parts = n_parts
+ @context_params.f16_kv = memory_f16
+ @context_params.use_mmap = use_mmap
+ @context_params.use_mlock = use_mlock
+ @context_params.embedding = embedding
+ @context_params.seed = seed
+ @context = Context.new(model_path: model_path, params: @context_params)
+ return unless lora_adapter_path.is_a?(String)
+
+ if lora_base_path.is_a?(String)
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+ else
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+ end
+ end
+ # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+ # Generates completions for a given prompt.
+ #
+ # @param prompt [String] The prompt to generate completions for.
+ # @param max_tokens [Integer] The maximum number of tokens to generate.
+ # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+ # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+ # @param n_batch [Integer] The batch size.
+ # @param top_k [Integer] The top-k value.
+ # @param top_p [Float] The top-p value.
+ # @param temperature [Float] The temperature value.
+ # @param repeat_penalty [Float] The repeat penalty value.
+ # @return [String]
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+ top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+ embd_input = tokenize_prompt(prompt)
+
+ n_ctx = @context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+ last_n_tokens = [0] * n_ctx
+
+ embd = []
+ n_consumed = 0
+ n_past = 0
+ n_remain = max_tokens
+ output = []
+
+ while n_remain != 0
+ unless embd.empty?
+ if n_past + embd.size > n_ctx
+ n_left = n_past - n_keep
+ n_past = n_keep
+ embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+ end
+
+ @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+ end
+
+ n_past += embd.size
+ embd.clear
+
+ if embd_input.size <= n_consumed
+ start = n_ctx - repeat_last_n
+ id = @context.sample_top_p_top_k(
+ last_n_tokens[start...(start + repeat_last_n)],
+ top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+ )
+ last_n_tokens.shift
+ last_n_tokens.push(id)
+
+ embd.push(id)
+ n_remain -= 1
+ else
+ while embd_input.size > n_consumed
+ embd.push(embd_input[n_consumed])
+ last_n_tokens.shift
+ last_n_tokens.push(embd_input[n_consumed])
+ n_consumed += 1
+ break if embd.size >= n_batch
+ end
+ end
+
+ embd.each { |token| output << @context.token_to_str(token) }
+
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+ end
+
+ output.join.delete_prefix(" #{prompt}").strip
+ end
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+ # def chat(prompt); end
+
+ # Obtains the embedding for a given text.
+ #
+ # @param text [String] The text to obtain the embedding for.
+ # @return [Array<Float>]
+ def embeddings(text)
+ raise 'The embedding option is set to false' unless @params[:embedding]
+
+ embd_input = tokenize_prompt(text)
+ raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+ @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+ @context.embeddings
+ end
+
+ private
+
+ def tokenize_prompt(prompt)
+ @context.tokenize(text: " #{prompt}", add_bos: true)
+ end
+ end
+ end
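
The new LLaMACpp::Client wraps model loading, tokenization, sampling, and decoding behind a single object. A minimal usage sketch, assuming a locally available GGML model file (the path below is a placeholder, not part of the gem):

require 'llama_cpp'

# Placeholder model path; point this at any GGML-format LLaMA model usable by the bundled llama.cpp.
client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin', n_ctx: 512, n_threads: 4, seed: 12)
puts client.completions('Hello, my name is', max_tokens: 32)

# Embeddings are only available when the client was created with embedding: true.
embedding_client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin', embedding: true, n_threads: 4)
vector = embedding_client.embeddings('Hello, world.')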
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.6'
+ VERSION = '0.0.7'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-12b5900'
+ LLAMA_CPP_VERSION = 'master-11d9023'
  end
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@

  require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'
+ require_relative 'llama_cpp/client'

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp

  # Generates sentences following the given prompt for operation check.
  #
- # @param context [LLaMACpp::Context]
- # @param prompt [String]
- # @param n_threads [Integer]
+ # @param context [LLaMACpp::Context] The context to use.
+ # @param prompt [String] The prompt to start generation with.
+ # @param n_predict [Integer] The number of tokens to predict.
+ # @param n_threads [Integer] The number of threads.
  # @return [String]
- def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
- spaced_prompt = " #{prompt}"
+ def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+ raise ArgumentError, 'context must have loaded the model' if context.empty?
+ raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

+ spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
- n_remain = 128
+ n_remain = n_predict
  repeat_last_n = 64
+ n_batch = 512
  output = []

  while n_remain != 0
@@ -62,13 +70,13 @@ module LLaMACpp
  last_n_tokens.shift
  last_n_tokens.push(embd_input[n_consumed])
  n_consumed += 1
- break if embd.size >= 512
+ break if embd.size >= n_batch
  end
  end

  embd.each { |token| output << context.token_to_str(token) }

- break if embd[-1] == LLaMACpp.token_eos
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
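
For comparison with the Client API above, a sketch of calling the updated module-level LLaMACpp.generate, which now accepts n_predict instead of a hard-coded 128-token limit. The context setup mirrors what Client#initialize does; the model path is again a placeholder:

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 12
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# n_predict caps the number of generated tokens (previously fixed at 128 inside generate).
puts LLaMACpp.generate(context, 'Hello, my name is', n_predict: 64, n_threads: 4)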