llama_cpp 0.0.6 → 0.0.7

This diff reflects the changes between the publicly released 0.0.6 and 0.0.7 versions of the package as they appear in its public registry, and is provided for informational purposes only.
@@ -27,6 +27,7 @@
  #include <thread>
  #include <atomic>
  #include <mutex>
+ #include <sstream>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -53,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH0;
  }
@@ -64,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH1;
- };
+ }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -79,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  { MODEL_65B, 5120ull * MB },
  };
  return _MEM_REQ_KV_SELF;
- };
+ }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
@@ -92,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  { MODEL_65B, 1536ull * MB },
  };
  return _MEM_REQ_EVAL;
- };
+ }

  // default hparams (LLaMA 7B)
  struct llama_hparams {
@@ -483,6 +484,9 @@ struct llama_file_loader {
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: {
  throw format("unrecognized tensor type %u\n", shard.type);
@@ -557,6 +561,9 @@ struct llama_file_saver {
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -847,6 +854,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  return "mostly Q4_1, some F16";
  case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
  case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
  default: return "unknown, may not work";
  }
  }
@@ -1075,7 +1085,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1249,9 +1259,11 @@ static bool llama_eval_internal(
  ggml_build_forward_expand(&gf, inpL);
  ggml_graph_compute (ctx0, &gf);

+ #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
  // requires GGML_PERF to be defined
- //ggml_graph_print(&gf);
+ ggml_graph_print(&gf);
+ #endif

  // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
@@ -1582,6 +1594,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
  case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
  default: throw format("invalid output file type %d\n", ftype);
  };

@@ -1618,8 +1633,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);

- // GG: uncomment this to keep the output layer in FP16
- //if (tensor.name.rfind("output")) {
+ // uncomment this to keep the output layer in FP16
+ //if (tensor.name == "output.weight") {
  // quantize = false;
  //}

@@ -1787,7 +1802,7 @@ struct llama_context * llama_init_from_file(
  if (params.logits_all) {
  ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
  } else {
- ctx->logits.reserve(hparams.n_ctx);
+ ctx->logits.reserve(hparams.n_vocab);
  }

  if (params.embedding){
@@ -2069,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.addr;
+ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ return ctx->model.kv_self.n;
  }

- // Returns the size of the KV cache
- size_t llama_get_kv_cache_size(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.size;
+ #define LLAMA_MAX_RNG_STATE 64*1024
+
+ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+ if (seed <= 0) {
+ seed = time(NULL);
+ }
+ ctx->rng.seed(seed);
  }

- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ // Returns the size of the state
+ size_t llama_get_state_size(struct llama_context * ctx) {
+ // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+ // for reference, std::mt19937(1337) serializes to 6701 bytes.
+ const size_t s_rng_size = sizeof(size_t);
+ const size_t s_rng = LLAMA_MAX_RNG_STATE;
+ const size_t s_logits_capacity = sizeof(size_t);
+ const size_t s_logits_size = sizeof(size_t);
+ const size_t s_logits = ctx->logits.capacity() * sizeof(float);
+ const size_t s_embedding_size = sizeof(size_t);
+ const size_t s_embedding = ctx->embedding.size() * sizeof(float);
+ const size_t s_kv_size = sizeof(size_t);
+ const size_t s_kv_ntok = sizeof(int);
+ const size_t s_kv = ctx->model.kv_self.buf.size;
+
+ const size_t s_total = (
+ + s_rng_size
+ + s_rng
+ + s_logits_capacity
+ + s_logits_size
+ + s_logits
+ + s_embedding_size
+ + s_embedding
+ + s_kv_size
+ + s_kv_ntok
+ + s_kv
+ );
+
+ return s_total;
  }

- // Sets the KV cache containing the current context for the model
- void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count) {
- // Make sure we have the same kv cache setup
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
- memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
- ctx->model.kv_self.n = n_token_count;
+ // Copies the state to the specified destination address
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+ uint8_t * out = dest;
+
+ // copy rng
+ {
+ std::stringstream rng_ss;
+ rng_ss << ctx->rng;
+
+ const size_t rng_size = rng_ss.str().size();
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+ memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+ memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+ memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+ }
+
+ // copy logits
+ {
+ const size_t logits_cap = ctx->logits.capacity();
+ const size_t logits_size = ctx->logits.size();
+
+ memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
+ memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+ if (logits_size) {
+ memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+ }
+
+ out += logits_cap * sizeof(float);
+ }
+
+ // copy embeddings
+ {
+ const size_t embedding_size = ctx->embedding.size();
+
+ memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+ if (embedding_size) {
+ memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+ out += embedding_size * sizeof(float);
+ }
+ }
+
+ // copy kv cache
+ {
+ const size_t kv_size = ctx->model.kv_self.buf.size;
+ const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+ memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+ memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+ if (kv_size) {
+ memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+ }
+ }
+
+ const size_t written = out - dest;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(written == expected);
+
+ return written;
+ }
+
+ // Sets the state reading from the specified source address
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ const uint8_t * in = src;
+
+ // set rng
+ {
+ size_t rng_size;
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
+ memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+ std::stringstream rng_ss;
+ rng_ss.str(std::string(&rng_buf[0], rng_size));
+ rng_ss >> ctx->rng;
+
+ LLAMA_ASSERT(rng_ss.fail() == false);
+ }
+
+ // set logits
+ {
+ size_t logits_cap;
+ size_t logits_size;
+
+ memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
+ memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+ LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+ if (logits_size) {
+ ctx->logits.resize(logits_size);
+ memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ }
+
+ in += logits_cap * sizeof(float);
+ }
+
+ // set embeddings
+ {
+ size_t embedding_size;
+
+ memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+ LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+ if (embedding_size) {
+ memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+ in += embedding_size * sizeof(float);
+ }
+ }
+
+ // set kv cache
+ {
+ size_t kv_size;
+ int kv_ntok;
+
+ memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+ memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+ if (kv_size) {
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+ void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+ void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+ memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+ ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+ ctx->model.kv_self.v->data = v_data;
+
+ }
+
+ ctx->model.kv_self.n = kv_ntok;
+ }
+
+ const size_t nread = in - src;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(nread == expected);
+
+ return nread;
  }

  int llama_eval(
@@ -2248,3 +2430,4 @@ const char * llama_print_system_info(void) {
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
+
@@ -74,6 +74,9 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
  LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +115,23 @@ extern "C" {
  const char * path_base_model,
  int n_threads);

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
- // Returns the size of the KV cache
- LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
  // Returns the number of tokens in the KV cache
  LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

- // Sets the KV cache containing the current context for the model
- LLAMA_API void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count);
+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+ // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+ LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+ // Copies the state to the specified destination address.
+ // Destination needs to have allocated enough memory.
+ // Returns the number of bytes copied
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+ // Set the state reading from the specified address
+ // Returns the number of bytes read
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);

  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
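
Note: the hunk above replaces the old llama_get_kv_cache/llama_set_kv_cache pair with an opaque state blob covering the rng, logits, embeddings and KV cache. The following is a minimal sketch of how a caller might use the new functions; it is not part of the diff and assumes ctx is a valid llama_context and that saving and restoring happen on contexts created with identical parameters.

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Serialize the full context state into a byte buffer.
    std::vector<uint8_t> save_state(struct llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx)); // size reported before copying
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written); // in this version the copy writes exactly the reported size
        return buf;
    }

    // Restore a previously saved state into a context created with the same parameters.
    void restore_state(struct llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());
    }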
@@ -21,6 +21,9 @@
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
  #endif
+ #if defined(_POSIX_MEMLOCK_RANGE)
+ #include <sys/resource.h>
+ #endif
  #endif
  #endif

@@ -303,8 +306,18 @@ struct llama_mlock {
  if (!mlock(addr, size)) {
  return true;
  } else {
- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
- size, this->size, std::strerror(errno));
+ char* errmsg = std::strerror(errno);
+ bool suggest = (errno == ENOMEM);
+
+ // Check if the resource limit is fine after all
+ struct rlimit lock_limit;
+ if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+ suggest = false;
+ if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+ suggest = false;
+
+ fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+ size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  return false;
  }
  }
@@ -0,0 +1,151 @@
+ # frozen_string_literal: true
+
+ module LLaMACpp
+ # Client provides a high-level interface to the LLM model.
+ class Client
+ # Creates a new client.
+ #
+ # @param model_path [String] The path to the model file.
+ # @param lora_adapter_path [String] The path to the LoRA adapter file.
+ # @param lora_base_path [String] The path to the LoRA base model file.
+ # @param n_ctx [Integer] The context size.
+ # @param n_parts [Integer] The number of model parts (-1 = determine from model dimensions).
+ # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
+ # @param use_mmap [Boolean] The flag whether to use mmap.
+ # @param use_mlock [Boolean] The flag whether to use mlock.
+ # @param embedding [Boolean] The flag whether to calculate embedding.
+ # @param n_threads [Integer] The number of threads to use.
+ # @param seed [Integer] The seed for the random number generator.
+ # @return [Client]
+ # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+ def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+ n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+ embedding: false,
+ n_threads: 1, seed: 0)
+ @params = {
+ model_path: model_path,
+ lora_adapter_path: lora_adapter_path,
+ lora_base_path: lora_base_path,
+ n_ctx: n_ctx,
+ n_parts: n_parts,
+ memory_f16: memory_f16,
+ use_mmap: use_mmap,
+ use_mlock: use_mlock,
+ embedding: embedding,
+ n_threads: n_threads,
+ seed: seed
+ }
+ @context_params = ContextParams.new
+ @context_params.n_ctx = n_ctx
+ @context_params.n_parts = n_parts
+ @context_params.f16_kv = memory_f16
+ @context_params.use_mmap = use_mmap
+ @context_params.use_mlock = use_mlock
+ @context_params.embedding = embedding
+ @context_params.seed = seed
+ @context = Context.new(model_path: model_path, params: @context_params)
+ return unless lora_adapter_path.is_a?(String)
+
+ if lora_base_path.is_a?(String)
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+ else
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+ end
+ end
+ # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+ # Generates completions for a given prompt.
+ #
+ # @param prompt [String] The prompt to generate completions for.
+ # @param max_tokens [Integer] The maximum number of tokens to generate.
+ # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+ # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+ # @param n_batch [Integer] The batch size.
+ # @param top_k [Integer] The top-k value.
+ # @param top_p [Float] The top-p value.
+ # @param temperature [Float] The temperature value.
+ # @param repeat_penalty [Float] The repeat penalty value.
+ # @return [String]
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+ top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+ embd_input = tokenize_prompt(prompt)
+
+ n_ctx = @context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+ last_n_tokens = [0] * n_ctx
+
+ embd = []
+ n_consumed = 0
+ n_past = 0
+ n_remain = max_tokens
+ output = []
+
+ while n_remain != 0
+ unless embd.empty?
+ if n_past + embd.size > n_ctx
+ n_left = n_past - n_keep
+ n_past = n_keep
+ embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+ end
+
+ @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+ end
+
+ n_past += embd.size
+ embd.clear
+
+ if embd_input.size <= n_consumed
+ start = n_ctx - repeat_last_n
+ id = @context.sample_top_p_top_k(
+ last_n_tokens[start...(start + repeat_last_n)],
+ top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+ )
+ last_n_tokens.shift
+ last_n_tokens.push(id)
+
+ embd.push(id)
+ n_remain -= 1
+ else
+ while embd_input.size > n_consumed
+ embd.push(embd_input[n_consumed])
+ last_n_tokens.shift
+ last_n_tokens.push(embd_input[n_consumed])
+ n_consumed += 1
+ break if embd.size >= n_batch
+ end
+ end
+
+ embd.each { |token| output << @context.token_to_str(token) }
+
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+ end
+
+ output.join.delete_prefix(" #{prompt}").strip
+ end
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+ # def chat(prompt); end
+
+ # Obtains the embedding for a given text.
+ #
+ # @param text [String] The text to obtain the embedding for.
+ # @return [Array<Float>]
+ def embeddings(text)
+ raise 'The embedding option is set to false' unless @params[:embedding]
+
+ embd_input = tokenize_prompt(text)
+ raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+ @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+ @context.embeddings
+ end
+
+ private
+
+ def tokenize_prompt(prompt)
+ @context.tokenize(text: " #{prompt}", add_bos: true)
+ end
+ end
+ end
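
Note: the new LLaMACpp::Client class added above wraps ContextParams/Context setup, optional LoRA loading, tokenization and sampling behind two calls. A short usage sketch, not part of the diff; the model path is a placeholder and the option values are only examples.

    require 'llama_cpp'

    # '/path/to/model.bin' is a placeholder for a GGML model file supported by this build.
    client = LLaMACpp::Client.new(model_path: '/path/to/model.bin', n_threads: 4, seed: 42)

    # Text completion with a smaller budget than the default max_tokens: 64.
    puts client.completions('Ruby is a programming language that', max_tokens: 32)

    # Embeddings require a client created with embedding: true.
    embedder = LLaMACpp::Client.new(model_path: '/path/to/model.bin', embedding: true, n_threads: 4)
    vector = embedder.embeddings('Hello, world.')
    puts vector.length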
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.6'
+ VERSION = '0.0.7'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-12b5900'
+ LLAMA_CPP_VERSION = 'master-11d9023'
  end
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@

  require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'
+ require_relative 'llama_cpp/client'

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp

  # Generates sentences following the given prompt for operation check.
  #
- # @param context [LLaMACpp::Context]
- # @param prompt [String]
- # @param n_threads [Integer]
+ # @param context [LLaMACpp::Context] The context to use.
+ # @param prompt [String] The prompt to start generation with.
+ # @param n_predict [Integer] The number of tokens to predict.
+ # @param n_threads [Integer] The number of threads.
  # @return [String]
- def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
- spaced_prompt = " #{prompt}"
+ def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+ raise ArgumentError, 'context must have loaded the model' if context.empty?
+ raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

+ spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
- n_remain = 128
+ n_remain = n_predict
  repeat_last_n = 64
+ n_batch = 512
  output = []

  while n_remain != 0
@@ -62,13 +70,13 @@ module LLaMACpp
  last_n_tokens.shift
  last_n_tokens.push(embd_input[n_consumed])
  n_consumed += 1
- break if embd.size >= 512
+ break if embd.size >= n_batch
  end
  end

  embd.each { |token| output << context.token_to_str(token) }

- break if embd[-1] == LLaMACpp.token_eos
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
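
Note: with the hunks above, LLaMACpp.generate now validates its arguments, rejects prompts longer than n_ctx - 4 tokens, and exposes the prediction length as n_predict instead of the hard-coded 128. A brief usage sketch, not part of the diff; the model path is a placeholder.

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.seed = 123
    context = LLaMACpp::Context.new(model_path: '/path/to/model.bin', params: params)

    # n_predict caps the number of generated tokens (previously fixed at 128).
    puts LLaMACpp.generate(context, 'Getting started with Ruby is', n_predict: 32, n_threads: 4)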