llama_cpp 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -26,6 +26,14 @@
|
|
26
26
|
# define LLAMA_API
|
27
27
|
#endif
|
28
28
|
|
29
|
+
#ifdef __GNUC__
|
30
|
+
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
31
|
+
#elif defined(_MSC_VER)
|
32
|
+
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
33
|
+
#else
|
34
|
+
# define DEPRECATED(func, hint) func
|
35
|
+
#endif
|
36
|
+
|
29
37
|
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
30
38
|
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
31
39
|
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
@@ -53,6 +61,7 @@ extern "C" {
|
|
53
61
|
// TODO: show sample usage
|
54
62
|
//
|
55
63
|
|
64
|
+
struct llama_model;
|
56
65
|
struct llama_context;
|
57
66
|
|
58
67
|
typedef int llama_token;
|
@@ -71,28 +80,27 @@ extern "C" {
|
|
71
80
|
|
72
81
|
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
73
82
|
|
74
|
-
|
83
|
+
struct llama_context_params {
|
84
|
+
int seed; // RNG seed, -1 for random
|
75
85
|
int n_ctx; // text context
|
76
86
|
int n_batch; // prompt processing batch size
|
77
87
|
int n_gpu_layers; // number of layers to store in VRAM
|
78
88
|
int main_gpu; // the GPU that is used for scratch and small tensors
|
79
89
|
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
80
|
-
|
81
|
-
|
90
|
+
// called with a progress value between 0 and 1, pass NULL to disable
|
91
|
+
llama_progress_callback progress_callback;
|
92
|
+
// context pointer passed to the progress callback
|
93
|
+
void * progress_callback_user_data;
|
82
94
|
|
95
|
+
// Keep the booleans together to avoid misalignment during copy-by-value.
|
96
|
+
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
83
97
|
bool f16_kv; // use fp16 for KV cache
|
84
98
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
85
99
|
bool vocab_only; // only load the vocabulary, no weights
|
86
100
|
bool use_mmap; // use mmap if possible
|
87
101
|
bool use_mlock; // force system to keep model in RAM
|
88
102
|
bool embedding; // embedding mode only
|
89
|
-
|
90
|
-
// called with a progress value between 0 and 1, pass NULL to disable
|
91
|
-
llama_progress_callback progress_callback;
|
92
|
-
// context pointer passed to the progress callback
|
93
|
-
void * progress_callback_user_data;
|
94
103
|
};
|
95
|
-
|
96
104
|
// model file types
|
97
105
|
enum llama_ftype {
|
98
106
|
LLAMA_FTYPE_ALL_F32 = 0,
|
@@ -132,17 +140,29 @@ extern "C" {
|
|
132
140
|
|
133
141
|
// TODO: not great API - very likely to change
|
134
142
|
// Initialize the llama + ggml backend
|
143
|
+
// If numa is true, use NUMA optimizations
|
135
144
|
// Call once at the start of the program
|
136
|
-
LLAMA_API void llama_init_backend();
|
145
|
+
LLAMA_API void llama_init_backend(bool numa);
|
137
146
|
|
138
147
|
LLAMA_API int64_t llama_time_us();
|
139
148
|
|
149
|
+
LLAMA_API struct llama_model * llama_load_model_from_file(
|
150
|
+
const char * path_model,
|
151
|
+
struct llama_context_params params);
|
152
|
+
|
153
|
+
LLAMA_API void llama_free_model(struct llama_model * model);
|
154
|
+
|
155
|
+
LLAMA_API struct llama_context * llama_new_context_with_model(
|
156
|
+
struct llama_model * model,
|
157
|
+
struct llama_context_params params);
|
158
|
+
|
140
159
|
// Various functions for loading a ggml llama model.
|
141
160
|
// Allocate (almost) all memory needed for the model.
|
142
161
|
// Return NULL on failure
|
143
|
-
LLAMA_API struct llama_context * llama_init_from_file(
|
162
|
+
LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
|
144
163
|
const char * path_model,
|
145
|
-
struct llama_context_params params)
|
164
|
+
struct llama_context_params params),
|
165
|
+
"please use llama_load_model_from_file combined with llama_new_context_with_model instead");
|
146
166
|
|
147
167
|
// Frees all allocated memory
|
148
168
|
LLAMA_API void llama_free(struct llama_context * ctx);
|
@@ -159,8 +179,15 @@ extern "C" {
|
|
159
179
|
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
160
180
|
// will be applied on top of the previous one
|
161
181
|
// Returns 0 on success
|
162
|
-
LLAMA_API int llama_apply_lora_from_file(
|
182
|
+
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
|
163
183
|
struct llama_context * ctx,
|
184
|
+
const char * path_lora,
|
185
|
+
const char * path_base_model,
|
186
|
+
int n_threads),
|
187
|
+
"please use llama_model_apply_lora_from_file instead");
|
188
|
+
|
189
|
+
LLAMA_API int llama_model_apply_lora_from_file(
|
190
|
+
const struct llama_model * model,
|
164
191
|
const char * path_lora,
|
165
192
|
const char * path_base_model,
|
166
193
|
int n_threads);
|
@@ -311,7 +338,7 @@ extern "C" {
|
|
311
338
|
#include <string>
|
312
339
|
struct ggml_tensor;
|
313
340
|
|
314
|
-
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
341
|
+
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
315
342
|
|
316
343
|
#endif
|
317
344
|
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.2.1'
|
6
|
+
VERSION = '0.3.0'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'master-
|
9
|
+
LLAMA_CPP_VERSION = 'master-9d23589'
|
10
10
|
end
|
data/lib/llama_cpp.rb
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
|
3
3
|
require_relative 'llama_cpp/version'
|
4
4
|
require_relative 'llama_cpp/llama_cpp'
|
5
|
-
require_relative 'llama_cpp/client'
|
6
5
|
|
7
6
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
8
7
|
module LLaMACpp
|
@@ -20,7 +19,6 @@ module LLaMACpp
|
|
20
19
|
# @return [String]
|
21
20
|
def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
22
21
|
raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
|
23
|
-
raise ArgumentError, 'context must have loaded the model' if context.empty?
|
24
22
|
raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
|
25
23
|
|
26
24
|
spaced_prompt = " #{prompt}"
|
data/sig/llama_cpp.rbs
CHANGED
@@ -25,7 +25,7 @@ module LLaMACpp
|
|
25
25
|
LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
|
26
26
|
LLAMA_FTYPE_MOSTLY_Q6_K: Integer
|
27
27
|
|
28
|
-
def self?.init_backend: () -> void
|
28
|
+
def self?.init_backend: (?numa: bool) -> void
|
29
29
|
def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
|
30
30
|
def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
|
31
31
|
def self?.print_system_info: () -> void
|
@@ -55,17 +55,24 @@ module LLaMACpp
|
|
55
55
|
def sorted: () -> bool
|
56
56
|
end
|
57
57
|
|
58
|
-
class Context
|
58
|
+
class Model
|
59
59
|
public
|
60
60
|
|
61
61
|
def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
|
62
62
|
| () -> void
|
63
|
-
def embeddings: () -> Array[Float]
|
64
63
|
def empty?: () -> bool
|
65
|
-
def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
|
66
|
-
def eval_export: (String) -> bool
|
67
64
|
def free: () -> void
|
68
65
|
def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
|
66
|
+
def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
|
67
|
+
end
|
68
|
+
|
69
|
+
class Context
|
70
|
+
public
|
71
|
+
|
72
|
+
def initialize: (model: ::LLaMACpp::Model) -> void
|
73
|
+
def embeddings: () -> Array[Float]
|
74
|
+
def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
|
75
|
+
def eval_export: (String) -> bool
|
69
76
|
def logits: () -> Array[Float]
|
70
77
|
def n_ctx: () -> Integer
|
71
78
|
def n_embd: () -> Integer
|
@@ -75,7 +82,6 @@ module LLaMACpp
|
|
75
82
|
def reset_timings: () -> void
|
76
83
|
def token_to_str: (Integer) -> String
|
77
84
|
def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
|
78
|
-
def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
|
79
85
|
def kv_cache_token_count: () -> Integer
|
80
86
|
def set_rng_seed: (Integer) -> void
|
81
87
|
def load_session_file: (session_path: String) -> void
|
@@ -138,15 +144,4 @@ module LLaMACpp
|
|
138
144
|
end
|
139
145
|
|
140
146
|
class Params = ContextParams
|
141
|
-
|
142
|
-
class Client
|
143
|
-
def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
|
144
|
-
?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
|
145
|
-
?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
|
146
|
-
def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
|
147
|
-
?frequency: Float, ?presence: Float,
|
148
|
-
?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
|
149
|
-
?repeat_penalty: Float) -> String
|
150
|
-
def embeddings(String) -> Array[Float]
|
151
|
-
end
|
152
147
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-06-
|
11
|
+
date: 2023-06-29 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -24,6 +24,7 @@ files:
|
|
24
24
|
- README.md
|
25
25
|
- examples/README.md
|
26
26
|
- examples/chat.rb
|
27
|
+
- examples/embedding.rb
|
27
28
|
- ext/llama_cpp/extconf.rb
|
28
29
|
- ext/llama_cpp/llama_cpp.cpp
|
29
30
|
- ext/llama_cpp/llama_cpp.h
|
@@ -43,7 +44,6 @@ files:
|
|
43
44
|
- ext/llama_cpp/src/llama.cpp
|
44
45
|
- ext/llama_cpp/src/llama.h
|
45
46
|
- lib/llama_cpp.rb
|
46
|
-
- lib/llama_cpp/client.rb
|
47
47
|
- lib/llama_cpp/version.rb
|
48
48
|
- sig/llama_cpp.rbs
|
49
49
|
homepage: https://github.com/yoshoku/llama_cpp.rb
|
data/lib/llama_cpp/client.rb
DELETED
@@ -1,172 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module LLaMACpp
|
4
|
-
# Client provides a high-level interface to the LLM model.
|
5
|
-
class Client # rubocop:disable Metrics/ClassLength
|
6
|
-
# Creates a new client.
|
7
|
-
#
|
8
|
-
# @param model_path [String] The path to the model file.
|
9
|
-
# @param lora_adapter_path [String] The path to the LoRA adapter file.
|
10
|
-
# @param lora_base_path [String] The path to the LoRA base model file.
|
11
|
-
# @param n_ctx [Integer] The context size.
|
12
|
-
# @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
|
13
|
-
# @param use_mmap [Boolean] The flag whether to use mmap.
|
14
|
-
# @param use_mlock [Boolean] The flag whether to use mlock.
|
15
|
-
# @param embedding [Boolean] The flag whether to calculate embedding.
|
16
|
-
# @param n_threads [Integer] The number of threads to use.
|
17
|
-
# @param seed [Integer] The seed for the random number generator.
|
18
|
-
# @return [Client]
|
19
|
-
# rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
|
20
|
-
def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
|
21
|
-
n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
|
22
|
-
embedding: false,
|
23
|
-
n_threads: 1, seed: 0)
|
24
|
-
@params = {
|
25
|
-
model_path: model_path,
|
26
|
-
lora_adapter_path: lora_adapter_path,
|
27
|
-
lora_base_path: lora_base_path,
|
28
|
-
n_ctx: n_ctx,
|
29
|
-
memory_f16: memory_f16,
|
30
|
-
use_mmap: use_mmap,
|
31
|
-
use_mlock: use_mlock,
|
32
|
-
embedding: embedding,
|
33
|
-
n_threads: n_threads,
|
34
|
-
seed: seed
|
35
|
-
}
|
36
|
-
@context_params = ContextParams.new
|
37
|
-
@context_params.n_ctx = n_ctx
|
38
|
-
@context_params.n_parts = n_parts
|
39
|
-
@context_params.f16_kv = memory_f16
|
40
|
-
@context_params.use_mmap = use_mmap
|
41
|
-
@context_params.use_mlock = use_mlock
|
42
|
-
@context_params.embedding = embedding
|
43
|
-
@context_params.seed = seed
|
44
|
-
@context = Context.new(model_path: model_path, params: @context_params)
|
45
|
-
return unless lora_adapter_path.is_a?(String)
|
46
|
-
|
47
|
-
if lora_base_path.is_a?(String)
|
48
|
-
@context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
|
49
|
-
else
|
50
|
-
@context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
|
51
|
-
end
|
52
|
-
end
|
53
|
-
# rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
|
54
|
-
|
55
|
-
# Generates completions for a given prompt.
|
56
|
-
#
|
57
|
-
# @param prompt [String] The prompt to generate completions for.
|
58
|
-
# @param max_tokens [Integer] The maximum number of tokens to generate.
|
59
|
-
# @param n_keep [Integer] The number of tokens to keep from the initial prompt.
|
60
|
-
# @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
|
61
|
-
# @param n_batch [Integer] The batch size.
|
62
|
-
# @param frequency [Float] The frequency penalty value.
|
63
|
-
# @param presence [Float] The presence penalty value.
|
64
|
-
# @param top_k [Integer] The top-k value.
|
65
|
-
# @param top_p [Float] The top-p value.
|
66
|
-
# @param tfs_z [Float] The tail free sampling parameter.
|
67
|
-
# @param typical_p [Float] The typical probability value.
|
68
|
-
# @param temperature [Float] The temperature value.
|
69
|
-
# @param repeat_penalty [Float] The repeat penalty value.
|
70
|
-
# @return [String]
|
71
|
-
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
|
72
|
-
def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
|
73
|
-
frequency: 0.0, presence: 0.0,
|
74
|
-
top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
|
75
|
-
embd_input = tokenize_prompt(prompt)
|
76
|
-
|
77
|
-
n_ctx = @context.n_ctx
|
78
|
-
raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
|
79
|
-
|
80
|
-
last_n_tokens = [0] * n_ctx
|
81
|
-
|
82
|
-
embd = []
|
83
|
-
n_consumed = 0
|
84
|
-
n_past = 0
|
85
|
-
n_remain = max_tokens
|
86
|
-
n_vocab = @context.n_vocab
|
87
|
-
output = []
|
88
|
-
|
89
|
-
while n_remain != 0
|
90
|
-
unless embd.empty?
|
91
|
-
if n_past + embd.size > n_ctx
|
92
|
-
n_left = n_past - n_keep
|
93
|
-
n_past = n_keep
|
94
|
-
embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
|
95
|
-
end
|
96
|
-
|
97
|
-
@context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
|
98
|
-
end
|
99
|
-
|
100
|
-
n_past += embd.size
|
101
|
-
embd.clear
|
102
|
-
|
103
|
-
if embd_input.size <= n_consumed
|
104
|
-
logits = @context.logits
|
105
|
-
base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
|
106
|
-
candidates = LLaMACpp::TokenDataArray.new(base_candidates)
|
107
|
-
|
108
|
-
# apply penalties
|
109
|
-
last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
|
110
|
-
@context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
|
111
|
-
@context.sample_frequency_and_presence_penalties(
|
112
|
-
candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
|
113
|
-
)
|
114
|
-
|
115
|
-
# temperature sampling
|
116
|
-
@context.sample_top_k(candidates, k: top_k)
|
117
|
-
@context.sample_tail_free(candidates, z: tfs_z)
|
118
|
-
@context.sample_typical(candidates, prob: typical_p)
|
119
|
-
@context.sample_top_p(candidates, prob: top_p)
|
120
|
-
@context.sample_temperature(candidates, temperature: temperature)
|
121
|
-
id = @context.sample_token(candidates)
|
122
|
-
|
123
|
-
last_n_tokens.shift
|
124
|
-
last_n_tokens.push(id)
|
125
|
-
|
126
|
-
last_n_tokens.shift
|
127
|
-
last_n_tokens.push(id)
|
128
|
-
|
129
|
-
embd.push(id)
|
130
|
-
n_remain -= 1
|
131
|
-
else
|
132
|
-
while embd_input.size > n_consumed
|
133
|
-
embd.push(embd_input[n_consumed])
|
134
|
-
last_n_tokens.shift
|
135
|
-
last_n_tokens.push(embd_input[n_consumed])
|
136
|
-
n_consumed += 1
|
137
|
-
break if embd.size >= n_batch
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
embd.each { |token| output << @context.token_to_str(token) }
|
142
|
-
|
143
|
-
break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
|
144
|
-
end
|
145
|
-
|
146
|
-
output.join.delete_prefix(" #{prompt}").strip
|
147
|
-
end
|
148
|
-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
|
149
|
-
|
150
|
-
# def chat(prompt); end
|
151
|
-
|
152
|
-
# Obtains the embedding for a given text.
|
153
|
-
#
|
154
|
-
# @param text [String] The text to obtain the embedding for.
|
155
|
-
# @return [Array<Float>]
|
156
|
-
def embeddings(text)
|
157
|
-
raise 'The embedding option is set to false' unless @params[:embedding]
|
158
|
-
|
159
|
-
embd_input = tokenize_prompt(text)
|
160
|
-
raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
|
161
|
-
|
162
|
-
@context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
|
163
|
-
@context.embeddings
|
164
|
-
end
|
165
|
-
|
166
|
-
private
|
167
|
-
|
168
|
-
def tokenize_prompt(prompt)
|
169
|
-
@context.tokenize(text: " #{prompt}", add_bos: true)
|
170
|
-
end
|
171
|
-
end
|
172
|
-
end
|