llama_cpp 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/ext/llama_cpp/extconf.rb +24 -1
- data/ext/llama_cpp/llama_cpp.cpp +72 -0
- data/ext/llama_cpp/src/ggml-cuda.h +44 -0
- data/ext/llama_cpp/src/ggml-opencl.c +216 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +2324 -969
- data/ext/llama_cpp/src/ggml.h +656 -619
- data/ext/llama_cpp/src/llama.cpp +269 -42
- data/ext/llama_cpp/src/llama.h +22 -14
- data/ext/llama_cpp/src/llama_util.h +15 -3
- data/lib/llama_cpp/client.rb +151 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -8
- data/sig/llama_cpp.rbs +26 -2
- metadata +6 -2
data/lib/llama_cpp/client.rb
ADDED
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module LLaMACpp
+  # Client provides a high-level interface to the LLM model.
+  class Client
+    # Creates a new client.
+    #
+    # @param model_path [String] The path to the model file.
+    # @param lora_adapter_path [String] The path to the LoRA adapter file.
+    # @param lora_base_path [String] The path to the LoRA base model file.
+    # @param n_ctx [Integer] The context size.
+    # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
+    # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
+    # @param use_mmap [Boolean] The flag whether to use mmap.
+    # @param use_mlock [Boolean] The flag whether to use mlock.
+    # @param embedding [Boolean] The flag whether to calculate embedding.
+    # @param n_threads [Integer] The number of threads to use.
+    # @param seed [Integer] The seed for the random number generator.
+    # @return [Client]
+    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+                   n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                   embedding: false,
+                   n_threads: 1, seed: 0)
+      @params = {
+        model_path: model_path,
+        lora_adapter_path: lora_adapter_path,
+        lora_base_path: lora_base_path,
+        n_ctx: n_ctx,
+        n_parts: n_parts,
+        memory_f16: memory_f16,
+        use_mmap: use_mmap,
+        use_mlock: use_mlock,
+        embedding: embedding,
+        n_threads: n_threads,
+        seed: seed
+      }
+      @context_params = ContextParams.new
+      @context_params.n_ctx = n_ctx
+      @context_params.n_parts = n_parts
+      @context_params.f16_kv = memory_f16
+      @context_params.use_mmap = use_mmap
+      @context_params.use_mlock = use_mlock
+      @context_params.embedding = embedding
+      @context_params.seed = seed
+      @context = Context.new(model_path: model_path, params: @context_params)
+      return unless lora_adapter_path.is_a?(String)
+
+      if lora_base_path.is_a?(String)
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+      else
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+      end
+    end
+    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+    # Generates completions for a given prompt.
+    #
+    # @param prompt [String] The prompt to generate completions for.
+    # @param max_tokens [Integer] The maximum number of tokens to generate.
+    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+    # @param n_batch [Integer] The batch size.
+    # @param top_k [Integer] The top-k value.
+    # @param top_p [Float] The top-p value.
+    # @param temperature [Float] The temperature value.
+    # @param repeat_penalty [Float] The repeat penalty value.
+    # @return [String]
+    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+                    top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+      embd_input = tokenize_prompt(prompt)
+
+      n_ctx = @context.n_ctx
+      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+      last_n_tokens = [0] * n_ctx
+
+      embd = []
+      n_consumed = 0
+      n_past = 0
+      n_remain = max_tokens
+      output = []
+
+      while n_remain != 0
+        unless embd.empty?
+          if n_past + embd.size > n_ctx
+            n_left = n_past - n_keep
+            n_past = n_keep
+            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+          end
+
+          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+        end
+
+        n_past += embd.size
+        embd.clear
+
+        if embd_input.size <= n_consumed
+          start = n_ctx - repeat_last_n
+          id = @context.sample_top_p_top_k(
+            last_n_tokens[start...(start + repeat_last_n)],
+            top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+          )
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
+          embd.push(id)
+          n_remain -= 1
+        else
+          while embd_input.size > n_consumed
+            embd.push(embd_input[n_consumed])
+            last_n_tokens.shift
+            last_n_tokens.push(embd_input[n_consumed])
+            n_consumed += 1
+            break if embd.size >= n_batch
+          end
+        end
+
+        embd.each { |token| output << @context.token_to_str(token) }
+
+        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      end
+
+      output.join.delete_prefix(" #{prompt}").strip
+    end
+    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+    # def chat(prompt); end
+
+    # Obtains the embedding for a given text.
+    #
+    # @param text [String] The text to obtain the embedding for.
+    # @return [Array<Float>]
+    def embeddings(text)
+      raise 'The embedding option is set to false' unless @params[:embedding]
+
+      embd_input = tokenize_prompt(text)
+      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+      @context.embeddings
+    end
+
+    private
+
+    def tokenize_prompt(prompt)
+      @context.tokenize(text: " #{prompt}", add_bos: true)
+    end
+  end
+end
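For orientation, here is a minimal usage sketch of the new `LLaMACpp::Client` class, based only on the signatures shown in the diff above. The model path is a placeholder, and the keyword defaults are the ones declared in `#initialize` and `#completions`.

```ruby
require 'llama_cpp'

# Placeholder path; point this at a local GGML model file.
model_path = '/path/to/ggml-model-q4_0.bin'

# Text completion with the defaults from the diff (n_ctx: 512, top_k: 40, top_p: 0.95, ...).
client = LLaMACpp::Client.new(model_path: model_path, n_threads: 4)
puts client.completions('Hello, World.', max_tokens: 32)

# Embeddings require the client to be constructed with embedding: true.
embed_client = LLaMACpp::Client.new(model_path: model_path, embedding: true, n_threads: 4)
vector = embed_client.embeddings('Hello, World.')
puts vector.size
```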
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.5'
+  VERSION = '0.0.7'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-11d9023'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -2,6 +2,7 @@

 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
+require_relative 'llama_cpp/client'

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp

   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context]
-  # @param prompt [String]
-  # @param
+  # @param context [LLaMACpp::Context] The context to use.
+  # @param prompt [String] The prompt to start generation with.
+  # @param n_predict [Integer] The number of tokens to predict.
+  # @param n_threads [Integer] The number of threads.
   # @return [String]
-  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+    raise ArgumentError, 'context must have loaded the model' if context.empty?
+    raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

+    spaced_prompt = " #{prompt}"
     embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

     n_ctx = context.n_ctx
+    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
     last_n_tokens = [0] * n_ctx

     embd = []
     n_consumed = 0
     n_keep = 10
     n_past = 0
-    n_remain =
+    n_remain = n_predict
     repeat_last_n = 64
+    n_batch = 512
     output = []

     while n_remain != 0
@@ -62,13 +70,13 @@ module LLaMACpp
           last_n_tokens.shift
           last_n_tokens.push(embd_input[n_consumed])
           n_consumed += 1
-          break if embd.size >=
+          break if embd.size >= n_batch
         end
       end

       embd.each { |token| output << context.token_to_str(token) }

-      break if embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
     end

     output.join.delete_prefix(spaced_prompt).strip
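The module-level `generate` helper now takes an `n_predict` keyword and validates its arguments. A short sketch of calling it, assuming a locally available model file (the path below is a placeholder):

```ruby
require 'llama_cpp'

# Build a context; ContextParams and Context.new(model_path:, params:) appear in the signatures above.
params = LLaMACpp::ContextParams.new
params.seed = 12
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# n_predict caps the number of generated tokens; n_threads controls evaluation threads.
puts LLaMACpp.generate(context, 'Hello, World.', n_predict: 32, n_threads: 4)
```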
data/sig/llama_cpp.rbs
CHANGED
@@ -5,7 +5,19 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String

-
+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -18,7 +30,8 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def
+    def empty?: () -> bool
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -50,9 +63,20 @@ module LLaMACpp
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def use_mmap: () -> bool
+    def use_mmap=: (bool) -> bool
     def vocab_only: () -> bool
     def vocab_only=: (bool) -> bool
   end

   class Params = ContextParams
+
+  class Client
+    def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
+                   ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+                   ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
+    def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
+                    ?top_k: Integer, ?top_p: Float, ?temperature: Float, ?repeat_penalty: Float) -> String
+    def embeddings(String) -> Array[Float]
+  end
 end
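The new signatures also expose `model_quantize` and the `LLAMA_FTYPE_*` constants at module level. A hedged sketch of how they might be combined; the paths are placeholders and the constant choice depends on the target quantization format:

```ruby
require 'llama_cpp'

# Quantize an f16 GGML model down to Q4_0, per the model_quantize signature above.
LLaMACpp.model_quantize(input_path: '/path/to/ggml-model-f16.bin',
                        output_path: '/path/to/ggml-model-q4_0.bin',
                        ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0)
```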
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.7
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
+date: 2023-04-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,12 +26,16 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
 - ext/llama_cpp/src/llama_util.h
 - lib/llama_cpp.rb
+- lib/llama_cpp/client.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb