llama-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +85 -0
- data/LICENSE +21 -0
- data/README.md +81 -0
- data/Rakefile +10 -0
- data/ext/llama/common.cpp +311 -0
- data/ext/llama/common.h +95 -0
- data/ext/llama/extconf.rb +12 -0
- data/ext/llama/ggml.c +10642 -0
- data/ext/llama/ggml.h +778 -0
- data/ext/llama/llama.cpp +1815 -0
- data/ext/llama/llama.h +152 -0
- data/ext/llama/model.cpp +192 -0
- data/lib/llama/model.rb +86 -0
- data/lib/llama/version.rb +3 -0
- data/lib/llama.rb +6 -0
- data/llama-rb.gemspec +50 -0
- data/models/.gitkeep +0 -0
- metadata +80 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: ca78f6c05c53323ba5bd78ccdee815a77c4df10fde7c5497563e48281949cc3e
+  data.tar.gz: 7e225474cc183d2e50f3936d5bee984d394708ff7c48ad040b8b629c6f21fbb9
+SHA512:
+  metadata.gz: ea82a87539c0511175c6c5afe3c93e6bc5c141ea27bc4af0a4c9c9a8574736de59169bd8d847ca3afd385f27aeb306944f27a4e822233b54f3f47033be92d5ed
+  data.tar.gz: ca92bfd00bea78d88d90c93418a7cf86e9b6a3b436b86f6a7c87cf1906fed59a539085f549a0cdbaa0bf16815c3a221b51ce23313fab8adb6a6310a75fbbe8f5
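These are the standard RubyGems digests of the two archives packed inside the `.gem` file. A minimal sketch of re-verifying them with Ruby's standard `digest` library, assuming the fetched `llama-rb-0.1.0.gem` has been unpacked so that `checksums.yaml`, `metadata.gz`, and `data.tar.gz` sit in the current directory:

```ruby
require 'digest'
require 'yaml'

# Hypothetical re-verification of the published archives against checksums.yaml.
sums = YAML.load_file('checksums.yaml')

%w[metadata.gz data.tar.gz].each do |name|
  sha256 = Digest::SHA256.file(name).hexdigest
  sha512 = Digest::SHA512.file(name).hexdigest
  ok = sha256 == sums['SHA256'][name] && sha512 == sums['SHA512'][name]
  puts "#{name}: #{ok ? 'OK' : 'MISMATCH'}"
end
```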
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,85 @@
+PATH
+  remote: .
+  specs:
+    llama-rb (0.1.0)
+      rice (~> 4.0.4)
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    ast (2.4.2)
+    bundler-audit (0.9.1)
+      bundler (>= 1.2.0, < 3)
+      thor (~> 1.0)
+    diff-lcs (1.5.0)
+    gnar-style (0.13.0)
+      rubocop (>= 1.0.0, < 2.0)
+      rubocop-performance
+      rubocop-rails (~> 2.2.0)
+      thor
+    json (2.6.3)
+    parallel (1.22.1)
+    parser (3.2.2.0)
+      ast (~> 2.4.1)
+    rack (3.0.7)
+    rainbow (3.1.1)
+    rake (13.0.6)
+    regexp_parser (2.7.0)
+    rexml (3.2.5)
+    rice (4.0.4)
+    rspec (3.12.0)
+      rspec-core (~> 3.12.0)
+      rspec-expectations (~> 3.12.0)
+      rspec-mocks (~> 3.12.0)
+    rspec-core (3.12.1)
+      rspec-support (~> 3.12.0)
+    rspec-expectations (3.12.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.12.0)
+    rspec-mocks (3.12.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.12.0)
+    rspec-support (3.12.0)
+    rubocop (1.48.1)
+      json (~> 2.3)
+      parallel (~> 1.10)
+      parser (>= 3.2.0.0)
+      rainbow (>= 2.2.2, < 4.0)
+      regexp_parser (>= 1.8, < 3.0)
+      rexml (>= 3.2.5, < 4.0)
+      rubocop-ast (>= 1.26.0, < 2.0)
+      ruby-progressbar (~> 1.7)
+      unicode-display_width (>= 2.4.0, < 3.0)
+    rubocop-ast (1.28.0)
+      parser (>= 3.2.1.0)
+    rubocop-capybara (2.17.1)
+      rubocop (~> 1.41)
+    rubocop-performance (1.16.0)
+      rubocop (>= 1.7.0, < 2.0)
+      rubocop-ast (>= 0.4.0)
+    rubocop-rails (2.2.1)
+      rack (>= 1.1)
+      rubocop (>= 0.72.0)
+    rubocop-rake (0.6.0)
+      rubocop (~> 1.0)
+    rubocop-rspec (2.19.0)
+      rubocop (~> 1.33)
+      rubocop-capybara (~> 2.17)
+    ruby-progressbar (1.13.0)
+    thor (1.2.1)
+    unicode-display_width (2.4.2)
+
+PLATFORMS
+  arm64-darwin-21
+
+DEPENDENCIES
+  bundler-audit
+  gnar-style
+  llama-rb!
+  rake
+  rspec
+  rubocop-rake
+  rubocop-rspec
+
+BUNDLED WITH
+   2.4.5
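The PATH block pins the gem built from this source tree and its single runtime dependency, rice (the C++ binding layer used by the extension); everything under GEM is development tooling for tests and linting. A hedged sketch of the runtime-dependency declaration this lockfile implies (the real `llama-rb.gemspec` ships in the package but is not excerpted in this diff):

```ruby
# Hypothetical fragment; the actual gemspec is included in the gem but not shown above.
Gem::Specification.new do |spec|
  spec.name    = 'llama-rb'
  spec.version = '0.1.0'
  spec.summary = 'Ruby wrapper for llama.cpp'  # wording assumed from the README

  # The only runtime dependency recorded for llama-rb (0.1.0) in Gemfile.lock:
  spec.add_dependency 'rice', '~> 4.0.4'
end
```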
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2023
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,81 @@
+# Llama-rb
+
+Ruby wrapper for
+[llama.cpp](https://github.com/ggerganov/llama.cpp).
+
+This was hacked together in a weekend and versions `0.x.x` should be considered unstable.
+
+## Installation
+
+Install the gem and add to the application's Gemfile by executing:
+
+```
+$ bundle add llama-rb
+```
+
+If bundler is not being used to manage dependencies, install the gem by executing:
+
+```
+$ gem install llama-rb
+```
+
+## Usage
+
+### Models
+
+Before using this code, you will need to download and process at least one model. See
+[ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp#obtaining-and-verifying-the-facebook-llama-original-model-and-stanford-alpaca-model-data).
+
+### Example
+
+```ruby
+require 'llama'
+
+m = Llama::Model.new('models/7B/ggml-model-q4_0.bin')
+m.predict('hello world')
+```
+
+### API
+
+#### Llama::Model.new
+
+```ruby
+def self.new(
+  model,               # path to model file, e.g. "models/7B/ggml-model-q4_0.bin"
+  n_ctx: 512,          # context size
+  n_parts: -1,         # amount of model parts (-1 = determine from model dimensions)
+  seed: Time.now.to_i, # RNG seed
+  memory_f16: true,    # use f16 instead of f32 for memory kv
+  use_mlock: false     # use mlock to keep model in memory
+)
+```
+
+#### Llama::Model#predict
+
+```ruby
+def predict(
+  prompt,         # string used as prompt
+  n_predict: 128  # number of tokens to predict
+)
+```
+
+## Development
+
+```
+git clone --recurse-submodules https://github.com/zfletch/llama-rb
+cd llama-rb
+./bin/setup
+```
+
+After checking out the repo, run `bin/setup` to install dependencies.
+Then, run `rake spec` to run the tests.
+You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+To install this gem onto your local machine, run `bundle exec rake install`.
+To release a new version, update the version number in `version.rb`, and then run
+`bundle exec rake release`, which will create a git tag for the version, push git
+commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+## Contributing
+
+Bug reports and pull requests are welcome on GitHub at https://github.com/zfletch/llama-rb.
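Taken together, the constructor options and `predict` cover the whole public surface documented in the README above. A small usage sketch, assuming (as the README example suggests) that `predict` returns the generated text; the model path, seed, and prompt are illustrative:

```ruby
require 'llama'

# Load a converted GGML model with an explicit context size and seed.
model = Llama::Model.new(
  'models/7B/ggml-model-q4_0.bin',
  n_ctx: 512,
  seed: 42,
  memory_f16: true,
  use_mlock: false
)

# Continue a prompt with up to 64 new tokens.
puts model.predict('Ruby is a programming language that', n_predict: 64)
```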
data/Rakefile
ADDED
data/ext/llama/common.cpp
ADDED
@@ -0,0 +1,311 @@
+#include "common.h"
+
+#include "ggml.h"
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h> // using malloc.h with MSC/MINGW
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif
+
+#if defined (_WIN32)
+#pragma comment(lib,"kernel32.lib")
+extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
+extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
+extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
+extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
+extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
+#endif
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-p" || arg == "--prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        } else if (arg == "-n" || arg == "--n_predict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_predict = std::stoi(argv[i]);
+        } else if (arg == "--top_k") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_k = std::stoi(argv[i]);
+        } else if (arg == "-c" || arg == "--ctx_size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--memory_f32") {
+            params.memory_f16 = false;
+        } else if (arg == "--top_p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_p = std::stof(argv[i]);
+        } else if (arg == "--temp") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.temp = std::stof(argv[i]);
+        } else if (arg == "--repeat_last_n") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_last_n = std::stoi(argv[i]);
+        } else if (arg == "--repeat_penalty") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_penalty = std::stof(argv[i]);
+        } else if (arg == "-b" || arg == "--batch_size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
+            params.n_batch = std::min(512, params.n_batch);
+        } else if (arg == "--keep") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-i" || arg == "--interactive") {
+            params.interactive = true;
+        } else if (arg == "--embedding") {
+            params.embedding = true;
+        } else if (arg == "--interactive-start") {
+            params.interactive = true;
+        } else if (arg == "--interactive-first") {
+            params.interactive_start = true;
+        } else if (arg == "-ins" || arg == "--instruct") {
+            params.instruct = true;
+        } else if (arg == "--color") {
+            params.use_color = true;
+        } else if (arg == "--mlock") {
+            params.use_mlock = true;
+        } else if (arg == "--mtest") {
+            params.mem_test = true;
+        } else if (arg == "--verbose-prompt") {
+            params.verbose_prompt = true;
+        } else if (arg == "-r" || arg == "--reverse-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.antiprompt.push_back(argv[i]);
+        } else if (arg == "--perplexity") {
+            params.perplexity = true;
+        } else if (arg == "--ignore-eos") {
+            params.ignore_eos = true;
+        } else if (arg == "--n_parts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_parts = std::stoi(argv[i]);
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else if (arg == "--random-prompt") {
+            params.random_prompt = true;
+        } else if (arg == "--in-prefix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.input_prefix = argv[i];
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(1);
+    }
+
+    return true;
+}
+
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
+    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
+    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
+    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
+    fprintf(stderr, "                        specified more than once for multiple prompts).\n");
+    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
+    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        prompt file to start generation.\n");
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", (double)params.top_p);
+    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating\n");
+    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
+    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
+    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    if (ggml_mlock_supported()) {
+        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+    }
+    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
+    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
+
+std::string gpt_random_prompt(std::mt19937 & rng) {
+    const int r = rng() % 10;
+    switch (r) {
+        case 0: return "So";
+        case 1: return "Once upon a time";
+        case 2: return "When";
+        case 3: return "The";
+        case 4: return "After";
+        case 5: return "If";
+        case 6: return "import";
+        case 7: return "He";
+        case 8: return "She";
+        case 9: return "They";
+        default: return "To";
+    }
+
+    return "The";
+}
+
+// TODO: not great allocating this every time
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
+    std::vector<llama_token> res(text.size() + (int)add_bos);
+    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    assert(n >= 0);
+    res.resize(n);
+
+    return res;
+}
+
+/* Keep track of current color of output, and emit ANSI code if it changes. */
+void set_console_color(console_state & con_st, console_color_t color) {
+    if (con_st.use_color && con_st.color != color) {
+        switch(color) {
+            case CONSOLE_COLOR_DEFAULT:
+                printf(ANSI_COLOR_RESET);
+                break;
+            case CONSOLE_COLOR_PROMPT:
+                printf(ANSI_COLOR_YELLOW);
+                break;
+            case CONSOLE_COLOR_USER_INPUT:
+                printf(ANSI_BOLD ANSI_COLOR_GREEN);
+                break;
+        }
+        con_st.color = color;
+    }
+}
+
+#if defined (_WIN32)
+void win32_console_init(bool enable_color) {
+    unsigned long dwMode = 0;
+    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
+    if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
+        hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
+        if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
+            hConOut = 0;
+        }
+    }
+    if (hConOut) {
+        // Enable ANSI colors on Windows 10+
+        if (enable_color && !(dwMode & 0x4)) {
+            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
+        }
+        // Set console output codepage to UTF8
+        SetConsoleOutputCP(65001); // CP_UTF8
+    }
+    void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
+    if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
+        // Set console input codepage to UTF8
+        SetConsoleCP(65001); // CP_UTF8
+    }
+}
+#endif
data/ext/llama/common.h
ADDED
@@ -0,0 +1,95 @@
+// Various helper functions and utilities
+
+#pragma once
+
+#include "llama.h"
+
+#include <string>
+#include <vector>
+#include <random>
+#include <thread>
+
+//
+// CLI argument parsing
+//
+
+struct gpt_params {
+    int32_t seed          = -1;  // RNG seed
+    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict     = 128; // new tokens to predict
+    int32_t repeat_last_n = 64;  // last n tokens to penalize
+    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 8;   // batch size for prompt processing
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+
+    // sampling parameters
+    int32_t top_k = 40;
+    float   top_p = 0.95f;
+    float   temp  = 0.80f;
+    float   repeat_penalty = 1.10f;
+
+    std::string model = "models/lamma-7B/ggml-model.bin"; // model path
+    std::string prompt = "";
+    std::string input_prefix = ""; // string to prefix user inputs with
+
+
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+
+    bool memory_f16    = true;  // use f16 instead of f32 for memory kv
+    bool random_prompt = false; // do not randomize prompt if none provided
+    bool use_color     = false; // use color to distinguish generations and inputs
+    bool interactive   = false; // interactive mode
+
+    bool embedding         = false; // get only sentence embedding
+    bool interactive_start = false; // wait for user input immediately
+
+    bool instruct       = false; // instruction mode (used for Alpaca models)
+    bool ignore_eos     = false; // do not stop generating after eos
+    bool perplexity     = false; // compute perplexity over the prompt
+    bool use_mlock      = false; // use mlock to keep model in memory
+    bool mem_test       = false; // compute maximum memory usage
+    bool verbose_prompt = false; // print prompt tokens before generation
+};
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
+
+//
+// Console utils
+//
+
+#define ANSI_COLOR_RED     "\x1b[31m"
+#define ANSI_COLOR_GREEN   "\x1b[32m"
+#define ANSI_COLOR_YELLOW  "\x1b[33m"
+#define ANSI_COLOR_BLUE    "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_RESET   "\x1b[0m"
+#define ANSI_BOLD          "\x1b[1m"
+
+enum console_color_t {
+    CONSOLE_COLOR_DEFAULT=0,
+    CONSOLE_COLOR_PROMPT,
+    CONSOLE_COLOR_USER_INPUT
+};
+
+struct console_state {
+    bool use_color = false;
+    console_color_t color = CONSOLE_COLOR_DEFAULT;
+};
+
+void set_console_color(console_state & con_st, console_color_t color);
+
+#if defined (_WIN32)
+void win32_console_init(bool enable_color);
+#endif
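The `gpt_params` defaults above line up with the keyword defaults documented for `Llama::Model.new` in the README (n_ctx 512, n_parts -1, memory_f16 true, use_mlock false); only the seed differs, since the Ruby wrapper defaults it to `Time.now.to_i` while the C++ default of -1 means "pick a random seed". A hedged sketch of that correspondence (the real glue code lives in `ext/llama/model.cpp`, which this excerpt does not show):

```ruby
# Assumed mapping from Llama::Model.new keywords (README) to gpt_params fields
# (common.h). Illustrative only; the actual binding is in ext/llama/model.cpp.
MODEL_NEW_TO_GPT_PARAMS = {
  n_ctx:      :n_ctx,       # context size, 512 in both defaults
  n_parts:    :n_parts,     # -1 = determine from model dimensions
  seed:       :seed,        # Ruby default Time.now.to_i; C++ default -1 (randomize)
  memory_f16: :memory_f16,  # f16 KV cache, true by default
  use_mlock:  :use_mlock    # keep the model mlock'd in RAM, false by default
}.freeze
```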
data/ext/llama/extconf.rb
ADDED
@@ -0,0 +1,12 @@
+require 'mkmf-rice'
+
+# Compile llama.cpp
+# root = File.expand_path(File.join(File.dirname(__FILE__), "..", ".."))
+# llama_cpp = File.join(root, 'llama.cpp')
+#
+# Dir.chdir(llama_cpp) do
+#   system("make", exception: true)
+# end
+
+# Create Makefile for Ruby bindings
+create_makefile 'llama/model'
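`extconf.rb` relies on mkmf-rice to generate the Makefile for the native extension; the llama.cpp compile step is left commented out in this release. A minimal sketch of building the extension by hand (normally `rake` or gem installation drives this; the output name follows `create_makefile 'llama/model'`):

```ruby
# Hypothetical manual build; paths assume a checkout of the gem source.
Dir.chdir('ext/llama') do
  system('ruby', 'extconf.rb', exception: true) # writes the Makefile via mkmf-rice
  system('make', exception: true)               # builds llama/model.{so,bundle}
end
```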