llama_cpp 0.0.5 → 0.0.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/ext/llama_cpp/extconf.rb +24 -1
- data/ext/llama_cpp/llama_cpp.cpp +72 -0
- data/ext/llama_cpp/src/ggml-cuda.h +44 -0
- data/ext/llama_cpp/src/ggml-opencl.c +216 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +2324 -969
- data/ext/llama_cpp/src/ggml.h +656 -619
- data/ext/llama_cpp/src/llama.cpp +269 -42
- data/ext/llama_cpp/src/llama.h +22 -14
- data/ext/llama_cpp/src/llama_util.h +15 -3
- data/lib/llama_cpp/client.rb +151 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -8
- data/sig/llama_cpp.rbs +26 -2
- metadata +6 -2
```diff
data/lib/llama_cpp/client.rb ADDED
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module LLaMACpp
+  # Client provides a high-level interface to the LLM model.
+  class Client
+    # Creates a new client.
+    #
+    # @param model_path [String] The path to the model file.
+    # @param lora_adapter_path [String] The path to the LoRA adapter file.
+    # @param lora_base_path [String] The path to the LoRA base model file.
+    # @param n_ctx [Integer] The context size.
+    # @param n_parts [Integer] The number of model parts (-1 = determine from model dimensions).
+    # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
+    # @param use_mmap [Boolean] The flag whether to use mmap.
+    # @param use_mlock [Boolean] The flag whether to use mlock.
+    # @param embedding [Boolean] The flag whether to calculate embedding.
+    # @param n_threads [Integer] The number of threads to use.
+    # @param seed [Integer] The seed for the random number generator.
+    # @return [Client]
+    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+                   n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                   embedding: false,
+                   n_threads: 1, seed: 0)
+      @params = {
+        model_path: model_path,
+        lora_adapter_path: lora_adapter_path,
+        lora_base_path: lora_base_path,
+        n_ctx: n_ctx,
+        n_parts: n_parts,
+        memory_f16: memory_f16,
+        use_mmap: use_mmap,
+        use_mlock: use_mlock,
+        embedding: embedding,
+        n_threads: n_threads,
+        seed: seed
+      }
+      @context_params = ContextParams.new
+      @context_params.n_ctx = n_ctx
+      @context_params.n_parts = n_parts
+      @context_params.f16_kv = memory_f16
+      @context_params.use_mmap = use_mmap
+      @context_params.use_mlock = use_mlock
+      @context_params.embedding = embedding
+      @context_params.seed = seed
+      @context = Context.new(model_path: model_path, params: @context_params)
+      return unless lora_adapter_path.is_a?(String)
+
+      if lora_base_path.is_a?(String)
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+      else
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+      end
+    end
+    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+    # Generates completions for a given prompt.
+    #
+    # @param prompt [String] The prompt to generate completions for.
+    # @param max_tokens [Integer] The maximum number of tokens to generate.
+    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+    # @param n_batch [Integer] The batch size.
+    # @param top_k [Integer] The top-k value.
+    # @param top_p [Float] The top-p value.
+    # @param temperature [Float] The temperature value.
+    # @param repeat_penalty [Float] The repeat penalty value.
+    # @return [String]
+    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+                    top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+      embd_input = tokenize_prompt(prompt)
+
+      n_ctx = @context.n_ctx
+      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+      last_n_tokens = [0] * n_ctx
+
+      embd = []
+      n_consumed = 0
+      n_past = 0
+      n_remain = max_tokens
+      output = []
+
+      while n_remain != 0
+        unless embd.empty?
+          if n_past + embd.size > n_ctx
+            n_left = n_past - n_keep
+            n_past = n_keep
+            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+          end
+
+          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+        end
+
+        n_past += embd.size
+        embd.clear
+
+        if embd_input.size <= n_consumed
+          start = n_ctx - repeat_last_n
+          id = @context.sample_top_p_top_k(
+            last_n_tokens[start...(start + repeat_last_n)],
+            top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+          )
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
+          embd.push(id)
+          n_remain -= 1
+        else
+          while embd_input.size > n_consumed
+            embd.push(embd_input[n_consumed])
+            last_n_tokens.shift
+            last_n_tokens.push(embd_input[n_consumed])
+            n_consumed += 1
+            break if embd.size >= n_batch
+          end
+        end
+
+        embd.each { |token| output << @context.token_to_str(token) }
+
+        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      end
+
+      output.join.delete_prefix(" #{prompt}").strip
+    end
+    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+    # def chat(prompt); end
+
+    # Obtains the embedding for a given text.
+    #
+    # @param text [String] The text to obtain the embedding for.
+    # @return [Array<Float>]
+    def embeddings(text)
+      raise 'The embedding option is set to false' unless @params[:embedding]
+
+      embd_input = tokenize_prompt(text)
+      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+      @context.embeddings
+    end
+
+    private
+
+    def tokenize_prompt(prompt)
+      @context.tokenize(text: " #{prompt}", add_bos: true)
+    end
+  end
+end
```
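To make the new high-level API concrete, here is a minimal usage sketch of `LLaMACpp::Client` based on the code added above. The model path is a placeholder, and the keyword arguments simply echo the defaults from the signature:

```ruby
require 'llama_cpp'

# Placeholder path; point this at a GGML-format LLaMA model file.
client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin',
                              n_threads: 4, seed: 42)

# Tokenizes the prompt, evaluates it, and samples with top-k/top-p until
# max_tokens tokens are generated or an EOS token appears.
puts client.completions('Hello, my name is', max_tokens: 32)

# Embeddings are only available when the client is built with embedding: true.
embedder = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin',
                                embedding: true, n_threads: 4)
vector = embedder.embeddings('Hello, world.') # => Array of Floats
```

Note that `completions` prepends a space to the prompt during tokenization and strips it back off (`delete_prefix(" #{prompt}")`) before returning the generated text.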
```diff
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.5'
+  VERSION = '0.0.7'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-…'
+  LLAMA_CPP_VERSION = 'master-11d9023'
end
```
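As a quick sanity check after upgrading, both version constants can be read directly; the expected values come from the diff above:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.0.7"
puts LLaMACpp::LLAMA_CPP_VERSION # => "master-11d9023"
```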
```diff
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@

 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
+require_relative 'llama_cpp/client'

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp

   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context]
-  # @param prompt [String]
-  # @param …
+  # @param context [LLaMACpp::Context] The context to use.
+  # @param prompt [String] The prompt to start generation with.
+  # @param n_predict [Integer] The number of tokens to predict.
+  # @param n_threads [Integer] The number of threads.
   # @return [String]
-  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+    raise ArgumentError, 'context must have loaded the model' if context.empty?
+    raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

+    spaced_prompt = " #{prompt}"
     embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

     n_ctx = context.n_ctx
+    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
     last_n_tokens = [0] * n_ctx

     embd = []
     n_consumed = 0
     n_keep = 10
     n_past = 0
-    n_remain = …
+    n_remain = n_predict
     repeat_last_n = 64
+    n_batch = 512
     output = []

     while n_remain != 0
@@ -62,13 +70,13 @@ module LLaMACpp
           last_n_tokens.shift
           last_n_tokens.push(embd_input[n_consumed])
           n_consumed += 1
-          break if embd.size >= …
+          break if embd.size >= n_batch
         end
       end

       embd.each { |token| output << context.token_to_str(token) }

-      break if embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
     end

     output.join.delete_prefix(spaced_prompt).strip
```
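The updated `LLaMACpp.generate` now validates its inputs and exposes `n_predict` as a keyword instead of a hard-coded count. A minimal sketch of calling it directly, with a placeholder model path:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 123
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# n_predict was previously fixed inside the method; it is now configurable.
puts LLaMACpp.generate(context, 'Hello, my name is', n_predict: 64, n_threads: 4)
```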
```diff
data/sig/llama_cpp.rbs CHANGED
@@ -5,7 +5,19 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String

-  …
+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -18,7 +30,8 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def …
+    def empty?: () -> bool
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -50,9 +63,20 @@ module LLaMACpp
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def use_mmap: () -> bool
+    def use_mmap=: (bool) -> bool
     def vocab_only: () -> bool
     def vocab_only=: (bool) -> bool
   end

   class Params = ContextParams
+
+  class Client
+    def initialize: (model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
+                     ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+                     ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
+    def completions: (String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
+                      ?top_k: Integer, ?top_p: Float, ?temperature: Float, ?repeat_penalty: Float) -> String
+    def embeddings: (String) -> Array[Float]
+  end
 end
```
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-04-
|
11
|
+
date: 2023-04-29 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -26,12 +26,16 @@ files:
|
|
26
26
|
- ext/llama_cpp/llama_cpp.cpp
|
27
27
|
- ext/llama_cpp/llama_cpp.h
|
28
28
|
- ext/llama_cpp/src/LICENSE
|
29
|
+
- ext/llama_cpp/src/ggml-cuda.h
|
30
|
+
- ext/llama_cpp/src/ggml-opencl.c
|
31
|
+
- ext/llama_cpp/src/ggml-opencl.h
|
29
32
|
- ext/llama_cpp/src/ggml.c
|
30
33
|
- ext/llama_cpp/src/ggml.h
|
31
34
|
- ext/llama_cpp/src/llama.cpp
|
32
35
|
- ext/llama_cpp/src/llama.h
|
33
36
|
- ext/llama_cpp/src/llama_util.h
|
34
37
|
- lib/llama_cpp.rb
|
38
|
+
- lib/llama_cpp/client.rb
|
35
39
|
- lib/llama_cpp/version.rb
|
36
40
|
- sig/llama_cpp.rbs
|
37
41
|
homepage: https://github.com/yoshoku/llama_cpp.rb
|