llama-rb 0.1.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: ca78f6c05c53323ba5bd78ccdee815a77c4df10fde7c5497563e48281949cc3e
+   data.tar.gz: 7e225474cc183d2e50f3936d5bee984d394708ff7c48ad040b8b629c6f21fbb9
+ SHA512:
+   metadata.gz: ea82a87539c0511175c6c5afe3c93e6bc5c141ea27bc4af0a4c9c9a8574736de59169bd8d847ca3afd385f27aeb306944f27a4e822233b54f3f47033be92d5ed
+   data.tar.gz: ca92bfd00bea78d88d90c93418a7cf86e9b6a3b436b86f6a7c87cf1906fed59a539085f549a0cdbaa0bf16815c3a221b51ce23313fab8adb6a6310a75fbbe8f5
data/Gemfile ADDED
@@ -0,0 +1,10 @@
+ source "https://rubygems.org"
+
+ gemspec
+
+ gem 'bundler-audit'
+ gem 'gnar-style'
+ gem 'rake'
+ gem 'rspec'
+ gem 'rubocop-rake', require: false
+ gem 'rubocop-rspec', require: false
data/Gemfile.lock ADDED
@@ -0,0 +1,85 @@
+ PATH
+   remote: .
+   specs:
+     llama-rb (0.1.0)
+       rice (~> 4.0.4)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     ast (2.4.2)
+     bundler-audit (0.9.1)
+       bundler (>= 1.2.0, < 3)
+       thor (~> 1.0)
+     diff-lcs (1.5.0)
+     gnar-style (0.13.0)
+       rubocop (>= 1.0.0, < 2.0)
+       rubocop-performance
+       rubocop-rails (~> 2.2.0)
+       thor
+     json (2.6.3)
+     parallel (1.22.1)
+     parser (3.2.2.0)
+       ast (~> 2.4.1)
+     rack (3.0.7)
+     rainbow (3.1.1)
+     rake (13.0.6)
+     regexp_parser (2.7.0)
+     rexml (3.2.5)
+     rice (4.0.4)
+     rspec (3.12.0)
+       rspec-core (~> 3.12.0)
+       rspec-expectations (~> 3.12.0)
+       rspec-mocks (~> 3.12.0)
+     rspec-core (3.12.1)
+       rspec-support (~> 3.12.0)
+     rspec-expectations (3.12.2)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.12.0)
+     rspec-mocks (3.12.5)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.12.0)
+     rspec-support (3.12.0)
+     rubocop (1.48.1)
+       json (~> 2.3)
+       parallel (~> 1.10)
+       parser (>= 3.2.0.0)
+       rainbow (>= 2.2.2, < 4.0)
+       regexp_parser (>= 1.8, < 3.0)
+       rexml (>= 3.2.5, < 4.0)
+       rubocop-ast (>= 1.26.0, < 2.0)
+       ruby-progressbar (~> 1.7)
+       unicode-display_width (>= 2.4.0, < 3.0)
+     rubocop-ast (1.28.0)
+       parser (>= 3.2.1.0)
+     rubocop-capybara (2.17.1)
+       rubocop (~> 1.41)
+     rubocop-performance (1.16.0)
+       rubocop (>= 1.7.0, < 2.0)
+       rubocop-ast (>= 0.4.0)
+     rubocop-rails (2.2.1)
+       rack (>= 1.1)
+       rubocop (>= 0.72.0)
+     rubocop-rake (0.6.0)
+       rubocop (~> 1.0)
+     rubocop-rspec (2.19.0)
+       rubocop (~> 1.33)
+       rubocop-capybara (~> 2.17)
+     ruby-progressbar (1.13.0)
+     thor (1.2.1)
+     unicode-display_width (2.4.2)
+
+ PLATFORMS
+   arm64-darwin-21
+
+ DEPENDENCIES
+   bundler-audit
+   gnar-style
+   llama-rb!
+   rake
+   rspec
+   rubocop-rake
+   rubocop-rspec
+
+ BUNDLED WITH
+    2.4.5
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2023
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,81 @@
+ # Llama-rb
+
+ Ruby wrapper for
+ [llama.cpp](https://github.com/ggerganov/llama.cpp).
+
+ This was hacked together in a weekend, and versions `0.x.x` should be considered unstable.
+
+ ## Installation
+
+ Install the gem and add it to the application's Gemfile by executing:
+
+ ```
+ $ bundle add llama-rb
+ ```
+
+ If bundler is not being used to manage dependencies, install the gem by executing:
+
+ ```
+ $ gem install llama-rb
+ ```
+
+ ## Usage
+
+ ### Models
+
+ Before using this code, you will need to download and process at least one model. See
+ [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp#obtaining-and-verifying-the-facebook-llama-original-model-and-stanford-alpaca-model-data).
+
+ ### Example
+
+ ```ruby
+ require 'llama'
+
+ m = Llama::Model.new('models/7B/ggml-model-q4_0.bin')
+ m.predict('hello world')
+ ```
+
+ ### API
+
+ #### Llama::Model.new
+
+ ```ruby
+ def self.new(
+   model,               # path to model file, e.g. "models/7B/ggml-model-q4_0.bin"
+   n_ctx: 512,          # context size
+   n_parts: -1,         # number of model parts (-1 = determine from model dimensions)
+   seed: Time.now.to_i, # RNG seed
+   memory_f16: true,    # use f16 instead of f32 for memory kv
+   use_mlock: false     # use mlock to keep model in memory
+ )
+ ```
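+
+ For example, to load a model with a larger context window and a fixed seed (the path and values below are illustrative, not defaults):
+
+ ```ruby
+ require 'llama'
+
+ # n_ctx and seed are the keyword arguments documented above;
+ # fixing the seed makes generations reproducible across runs.
+ m = Llama::Model.new(
+   'models/7B/ggml-model-q4_0.bin',
+   n_ctx: 1024,
+   seed: 42
+ )
+ ```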
+
+ #### Llama::Model#predict
+
+ ```ruby
+ def predict(
+   prompt,        # string used as prompt
+   n_predict: 128 # number of tokens to predict
+ )
+ ```
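+
+ For example, to request a longer completion than the default 128 tokens (the prompt below is illustrative):
+
+ ```ruby
+ # generate up to 256 tokens continuing the given prompt
+ m.predict('Once upon a time', n_predict: 256)
+ ```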
+
+ ## Development
+
+ ```
+ git clone --recurse-submodules https://github.com/zfletch/llama-rb
+ cd llama-rb
+ ./bin/setup
+ ```
+
+ After checking out the repo, run `bin/setup` to install dependencies.
+ Then, run `rake spec` to run the tests.
+ You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`.
+ To release a new version, update the version number in `version.rb`, and then run
+ `bundle exec rake release`, which will create a git tag for the version, push git
+ commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
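+
+ Put together, a typical development session uses the commands described above:
+
+ ```
+ bin/setup                  # install dependencies
+ rake spec                  # run the tests
+ bin/console                # interactive prompt for experimentation
+ bundle exec rake install   # install the gem onto your local machine
+ bundle exec rake release   # tag the version and publish to rubygems.org
+ ```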
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/zfletch/llama-rb.
data/Rakefile ADDED
@@ -0,0 +1,10 @@
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ require 'rubocop/rake_task'
+
+ RuboCop::RakeTask.new
+
+ task default: [:spec, :rubocop]
@@ -0,0 +1,311 @@
+ #include "common.h"
+
+ #include "ggml.h"
+
+ #include <cassert>
+ #include <cstring>
+ #include <fstream>
+ #include <string>
+ #include <iterator>
+ #include <algorithm>
+
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+ #include <alloca.h>
+ #endif
+
+ #if defined (_WIN32)
+ #pragma comment(lib,"kernel32.lib")
+ extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
+ extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
+ extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
+ extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
+ extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
+ #endif
+
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+     // determine sensible default number of threads.
+     // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+ #ifdef __linux__
+     std::ifstream cpuinfo("/proc/cpuinfo");
+     params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                   std::istream_iterator<std::string>(),
+                                   std::string("processor"));
+ #endif
+     if (params.n_threads == 0) {
+         params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+     }
+
+     bool invalid_param = false;
+     std::string arg;
+     for (int i = 1; i < argc; i++) {
+         arg = argv[i];
+
+         if (arg == "-s" || arg == "--seed") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.seed = std::stoi(argv[i]);
+         } else if (arg == "-t" || arg == "--threads") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.n_threads = std::stoi(argv[i]);
+         } else if (arg == "-p" || arg == "--prompt") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.prompt = argv[i];
+         } else if (arg == "-f" || arg == "--file") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             std::ifstream file(argv[i]);
+             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+             if (params.prompt.back() == '\n') {
+                 params.prompt.pop_back();
+             }
+         } else if (arg == "-n" || arg == "--n_predict") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.n_predict = std::stoi(argv[i]);
+         } else if (arg == "--top_k") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.top_k = std::stoi(argv[i]);
+         } else if (arg == "-c" || arg == "--ctx_size") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.n_ctx = std::stoi(argv[i]);
+         } else if (arg == "--memory_f32") {
+             params.memory_f16 = false;
+         } else if (arg == "--top_p") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.top_p = std::stof(argv[i]);
+         } else if (arg == "--temp") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.temp = std::stof(argv[i]);
+         } else if (arg == "--repeat_last_n") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.repeat_last_n = std::stoi(argv[i]);
+         } else if (arg == "--repeat_penalty") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.repeat_penalty = std::stof(argv[i]);
+         } else if (arg == "-b" || arg == "--batch_size") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.n_batch = std::stoi(argv[i]);
+             params.n_batch = std::min(512, params.n_batch);
+         } else if (arg == "--keep") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.n_keep = std::stoi(argv[i]);
+         } else if (arg == "-m" || arg == "--model") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.model = argv[i];
+         } else if (arg == "-i" || arg == "--interactive") {
+             params.interactive = true;
+         } else if (arg == "--embedding") {
+             params.embedding = true;
+         } else if (arg == "--interactive-start") {
+             params.interactive = true;
+         } else if (arg == "--interactive-first") {
+             params.interactive_start = true;
+         } else if (arg == "-ins" || arg == "--instruct") {
+             params.instruct = true;
+         } else if (arg == "--color") {
+             params.use_color = true;
+         } else if (arg == "--mlock") {
+             params.use_mlock = true;
+         } else if (arg == "--mtest") {
+             params.mem_test = true;
+         } else if (arg == "--verbose-prompt") {
+             params.verbose_prompt = true;
+         } else if (arg == "-r" || arg == "--reverse-prompt") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.antiprompt.push_back(argv[i]);
+         } else if (arg == "--perplexity") {
+             params.perplexity = true;
+         } else if (arg == "--ignore-eos") {
+             params.ignore_eos = true;
+         } else if (arg == "--n_parts") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.n_parts = std::stoi(argv[i]);
+         } else if (arg == "-h" || arg == "--help") {
+             gpt_print_usage(argc, argv, params);
+             exit(0);
+         } else if (arg == "--random-prompt") {
+             params.random_prompt = true;
+         } else if (arg == "--in-prefix") {
+             if (++i >= argc) {
+                 invalid_param = true;
+                 break;
+             }
+             params.input_prefix = argv[i];
+         } else {
+             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+             gpt_print_usage(argc, argv, params);
+             exit(1);
+         }
+     }
+     if (invalid_param) {
+         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+         gpt_print_usage(argc, argv, params);
+         exit(1);
+     }
+
+     return true;
+ }
+
+ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+     fprintf(stderr, "usage: %s [options]\n", argv[0]);
+     fprintf(stderr, "\n");
+     fprintf(stderr, "options:\n");
+     fprintf(stderr, "  -h, --help            show this help message and exit\n");
+     fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
+     fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
+     fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+     fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
+     fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
+     fprintf(stderr, "                        specified more than once for multiple prompts).\n");
+     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
+     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
+     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
+     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+     fprintf(stderr, "  -f FNAME, --file FNAME\n");
+     fprintf(stderr, "                        prompt file to start generation.\n");
+     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+     fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
+     fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", (double)params.top_p);
+     fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
+     fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
+     fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
+     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating\n");
+     fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
+     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+     fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
+     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
+     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+     if (ggml_mlock_supported()) {
+         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+     }
+     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
+     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+     fprintf(stderr, "  -m FNAME, --model FNAME\n");
+     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+     fprintf(stderr, "\n");
+ }
+
+ std::string gpt_random_prompt(std::mt19937 & rng) {
+     const int r = rng() % 10;
+     switch (r) {
+         case 0: return "So";
+         case 1: return "Once upon a time";
+         case 2: return "When";
+         case 3: return "The";
+         case 4: return "After";
+         case 5: return "If";
+         case 6: return "import";
+         case 7: return "He";
+         case 8: return "She";
+         case 9: return "They";
+         default: return "To";
+     }
+
+     return "The";
+ }
+
258
+ // TODO: not great allocating this every time
259
+ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
260
+ // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
261
+ std::vector<llama_token> res(text.size() + (int)add_bos);
262
+ int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
263
+ assert(n >= 0);
264
+ res.resize(n);
265
+
266
+ return res;
267
+ }
268
+
269
+ /* Keep track of current color of output, and emit ANSI code if it changes. */
270
+ void set_console_color(console_state & con_st, console_color_t color) {
271
+ if (con_st.use_color && con_st.color != color) {
272
+ switch(color) {
273
+ case CONSOLE_COLOR_DEFAULT:
274
+ printf(ANSI_COLOR_RESET);
275
+ break;
276
+ case CONSOLE_COLOR_PROMPT:
277
+ printf(ANSI_COLOR_YELLOW);
278
+ break;
279
+ case CONSOLE_COLOR_USER_INPUT:
280
+ printf(ANSI_BOLD ANSI_COLOR_GREEN);
281
+ break;
282
+ }
283
+ con_st.color = color;
284
+ }
285
+ }
286
+
287
+ #if defined (_WIN32)
288
+ void win32_console_init(bool enable_color) {
289
+ unsigned long dwMode = 0;
290
+ void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
291
+ if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
292
+ hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
293
+ if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
294
+ hConOut = 0;
295
+ }
296
+ }
297
+ if (hConOut) {
298
+ // Enable ANSI colors on Windows 10+
299
+ if (enable_color && !(dwMode & 0x4)) {
300
+ SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
301
+ }
302
+ // Set console output codepage to UTF8
303
+ SetConsoleOutputCP(65001); // CP_UTF8
304
+ }
305
+ void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
306
+ if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
307
+ // Set console input codepage to UTF8
308
+ SetConsoleCP(65001); // CP_UTF8
309
+ }
310
+ }
311
+ #endif
@@ -0,0 +1,95 @@
+ // Various helper functions and utilities
+
+ #pragma once
+
+ #include "llama.h"
+
+ #include <string>
+ #include <vector>
+ #include <random>
+ #include <thread>
+
+ //
+ // CLI argument parsing
+ //
+
+ struct gpt_params {
+     int32_t seed          = -1;  // RNG seed
+     int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+     int32_t n_predict     = 128; // new tokens to predict
+     int32_t repeat_last_n = 64;  // last n tokens to penalize
+     int32_t n_parts       = -1;  // number of model parts (-1 = determine from model dimensions)
+     int32_t n_ctx         = 512; // context size
+     int32_t n_batch       = 8;   // batch size for prompt processing
+     int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+
+     // sampling parameters
+     int32_t top_k = 40;
+     float   top_p = 0.95f;
+     float   temp  = 0.80f;
+     float   repeat_penalty = 1.10f;
+
+     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
+     std::string prompt = "";
+     std::string input_prefix = ""; // string to prefix user inputs with
+
+     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+
+     bool memory_f16 = true; // use f16 instead of f32 for memory kv
+     bool random_prompt = false; // do not randomize prompt if none provided
+     bool use_color = false; // use color to distinguish generations and inputs
+     bool interactive = false; // interactive mode
+
+     bool embedding = false; // get only sentence embedding
+     bool interactive_start = false; // wait for user input immediately
+
+     bool instruct = false; // instruction mode (used for Alpaca models)
+     bool ignore_eos = false; // do not stop generating after eos
+     bool perplexity = false; // compute perplexity over the prompt
+     bool use_mlock = false; // use mlock to keep model in memory
+     bool mem_test = false; // compute maximum memory usage
+     bool verbose_prompt = false; // print prompt tokens before generation
+ };
+
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+ std::string gpt_random_prompt(std::mt19937 & rng);
+
+ //
+ // Vocab utils
+ //
+
+ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
+
+ //
+ // Console utils
+ //
+
+ #define ANSI_COLOR_RED     "\x1b[31m"
+ #define ANSI_COLOR_GREEN   "\x1b[32m"
+ #define ANSI_COLOR_YELLOW  "\x1b[33m"
+ #define ANSI_COLOR_BLUE    "\x1b[34m"
+ #define ANSI_COLOR_MAGENTA "\x1b[35m"
+ #define ANSI_COLOR_CYAN    "\x1b[36m"
+ #define ANSI_COLOR_RESET   "\x1b[0m"
+ #define ANSI_BOLD          "\x1b[1m"
+
+ enum console_color_t {
+     CONSOLE_COLOR_DEFAULT=0,
+     CONSOLE_COLOR_PROMPT,
+     CONSOLE_COLOR_USER_INPUT
+ };
+
+ struct console_state {
+     bool use_color = false;
+     console_color_t color = CONSOLE_COLOR_DEFAULT;
+ };
+
+ void set_console_color(console_state & con_st, console_color_t color);
+
+ #if defined (_WIN32)
+ void win32_console_init(bool enable_color);
+ #endif
@@ -0,0 +1,12 @@
+ require 'mkmf-rice'
+
+ # Compile llama.cpp
+ # root = File.expand_path(File.join(File.dirname(__FILE__), "..", ".."))
+ # llama_cpp = File.join(root, 'llama.cpp')
+ #
+ # Dir.chdir(llama_cpp) do
+ #   system("make", exception: true)
+ # end
+
+ # Create Makefile for Ruby bindings
+ create_makefile 'llama/model'