@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52

@@ -9,6 +9,7 @@
 #include <climits>
 #include <cstring>
 #include <cstdarg>
+#include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
@@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
 const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
 try {
 w->token_embedding_table.resize(p->vocab_size * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
 
 w->rms_att_weight.resize(p->n_layers * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
 
 w->rms_ffn_weight.resize(p->n_layers * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
 
 w->wq.resize(p->n_layers * p->dim * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
 
 w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
 
 w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
 
 w->wo.resize(p->n_layers * p->dim * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
 
 w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
 w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
 
 w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
 w->rms_final_weight.resize(p->dim);
-
+LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
 
 if (shared_weights) {
 w->wcls = {};
 } else {
 w->wcls.resize(p->vocab_size * p->dim);
-
+LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
 }
 }
 catch (std::length_error &) {
@@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 fseek(f, 0, SEEK_END);
 auto end = ftell(f);
 if (curr != end) {
-
+LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
 return 1;
 }
 
@@ -181,26 +182,26 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 }
 
 static void print_sample_weights(TransformerWeights *w){
-
-
-
-
-
-
-
-
-
-
-
-
-
-if (!w->wcls.empty())
+LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
+LOG_INF("%f\n", w->token_embedding_table[0]);
+LOG_INF("%f\n", w->rms_att_weight[0]);
+LOG_INF("%f\n", w->rms_ffn_weight[0]);
+
+LOG_INF("%f\n", w->wq[0]);
+LOG_INF("%f\n", w->wk[0]);
+LOG_INF("%f\n", w->wv[0]);
+LOG_INF("%f\n", w->wo[0]);
+LOG_INF("%f\n", w->w1[0]);
+LOG_INF("%f\n", w->w2[0]);
+LOG_INF("%f\n", w->w3[0]);
+LOG_INF("%f\n", w->rms_att_weight[0]);
+if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
 
-struct
+struct my_llama_vocab {
 using id = int32_t;
 using token = std::string;
 using ttype = llama_token_type;
@@ -318,20 +319,20 @@ struct train_params {
 };
 
 static void print_params(struct my_llama_hparams * params) {
-
-
-
-
-
-
-
-
-
+LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
+LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
+LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
+LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
+LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
+LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
+LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
+LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
 }
 
 static void print_tensor_info(const struct ggml_context * ctx) {
 for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-
+LOG_INF("%s: Allocating ", __func__);
 int64_t total = 1;
 int i = 0;
 for (; i < ggml_n_dims(t); ++i) {
@@ -524,9 +525,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
 return out.str();
 }
 
-static void load_vocab(const char * filename, const Config * config, struct
+static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
 if (is_ggml_file(filename)) {
-
+LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
 struct ggml_context * ctx_data = NULL;
 
 struct gguf_init_params params = {
@@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
 gguf_free(ctx);
 } else {
 // assume llama2.c vocabulary
-
+LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
 llama_file file(filename, "rb");
 if (!file.fp) {
 die_fmt("%s: %s", strerror(errno), filename);
@@ -582,13 +583,13 @@ static void load_vocab(const char * filename, const Config * config, struct llam
 const int n_vocab = config->vocab_size;
 /* uint32_t max_token_length = */ file.read_u32(); // unused
 vocab->id_to_token.resize(n_vocab);
-for (
+for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
 float_t score = file.read_f32();
 uint32_t len = file.read_u32();
 std::string text = file.read_string(len);
 
 unsigned char byte_val;
-
+my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
 if (id == UNKNOWN_TOKEN_ID) {
 text = "<unk>";
 type = LLAMA_TOKEN_TYPE_UNKNOWN;
@@ -630,7 +631,7 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa
 }
 
 static void save_as_llama_model(
-struct
+struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
 ) {
 // convert AK weights into GG weights one by one.
 // w->token_embedding_table -> model->tok_embeddings
@@ -670,7 +671,7 @@ static void save_as_llama_model(
 std::vector<const char*> tokens;
 std::vector<float> scores;
 std::vector<llama_token_type> token_types;
-for (const
+for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
 tokens.push_back(token_data.text.c_str());
 scores.push_back(token_data.score);
 token_types.push_back(token_data.type);
@@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
 }
 
 int main(int argc, char ** argv) {
+gpt_init();
+
 struct train_params params = get_default_train_params();
 if (!params_parse(argc, argv, &params)) {
 return 1;
 }
-
+
 Config config;
 TransformerWeights weights = {};
 {
-
+LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
 FILE * file = fopen(params.fn_llama2c_model, "rb");
 if (!file) {
-
+LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
 return 1;
 }
 // read in the config header
 if (fread(&config, sizeof(Config), 1, file) != 1) {
-
+LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
 return 1;
 }
 auto shared_weights = config.vocab_size > 0;
@@ -896,13 +899,13 @@ int main(int argc, char ** argv) {
 // read in the Transformer weights
 alloc_weights(&weights, &config, shared_weights);
 if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
-
+LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
 return 1;
 }
 fclose(file);
 }
 
-struct
+struct my_llama_vocab vocab;
 load_vocab(params.fn_vocab_model, &config, &vocab);
 
 struct my_llama_model model;
@@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
 model.name = basename(params.fn_llama2c_model);
 save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
 
-
+LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
 
 ggml_free(model.ctx);
 return 0;
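
The convert-llama2c-to-ggml changes above track the logging refactor elsewhere in this release (the new common/log.cpp and the slimmed-down common/log.h): the example now calls gpt_init() once at startup and reports through the printf-style LOG_INF / LOG_ERR macros. A minimal sketch of that pattern, assuming the common headers from this tree; the messages and the argument check are illustrative only:

```cpp
// Minimal sketch of the logging pattern adopted above (illustrative only).
// gpt_init() and the LOG_* macros come from this tree's common library
// ("common.h" / "log.h"); the message text here is made up.
#include "common.h"
#include "log.h"

int main(int argc, char ** argv) {
    gpt_init();  // set up the common logger before the first LOG_* call

    if (argc < 2) {
        LOG_ERR("%s: missing checkpoint path\n", __func__);
        return 1;
    }
    LOG_INF("%s: loading checkpoint from %s\n", __func__, argv[1]);
    return 0;
}
```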
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15

@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@@ -12,14 +13,15 @@
 #include "ggml-metal.h"
 #endif
 
+#include <algorithm>
+#include <climits>
 #include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <climits>
 
 
 //////////////////////////////////////////////////
@@ -35,9 +37,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 return ret;
 }
 
-static void print_usage(int
-gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
 printf("\nexample usage:\n");
 printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
 printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@@ -271,7 +271,7 @@ struct tokenized_prompt {
 size_t max_seq_len;
 
 tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-const bool add_bos =
+const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
 tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
 tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
 max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -390,8 +390,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
 gpt_params params;
 
-if (!gpt_params_parse(argc, argv, params)) {
-print_usage(argc, argv, params);
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
 return 1;
 }
 
@@ -414,9 +413,10 @@ int main(int argc, char ** argv) {
 llama_numa_init(params.numa);
 
 // load the model to get hparams
-
-
-
+llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+llama_model * model = llama_init.model;
+llama_context * ctx = llama_init.context;
 
 // int n_ctx = llama_n_ctx(ctx);
 int n_layers = llama_n_layer(model);
@@ -485,8 +485,8 @@ int main(int argc, char ** argv) {
 if (use_pca) {
 // run PCA
 PCA::pca_params pca_params;
-pca_params.n_threads
-pca_params.n_batch
+pca_params.n_threads = params.cpuparams.n_threads;
+pca_params.n_batch = params.n_pca_batch;
 pca_params.n_iterations = params.n_pca_iterations;
 PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
 } else {
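
Two API shifts recur in the cvector-generator hunks (and in the other updated examples): gpt_params_parse(), now backed by the new common/arg.cpp, takes an example identifier plus a usage callback, and llama_init_from_gpt_params() returns a llama_init_result carrying both the model and the context. A condensed sketch of that flow, based on the '+' lines above; backend setup is reduced to the minimum and the example's actual work is omitted:

```cpp
// Condensed sketch of the updated setup flow shown above; not a drop-in
// replacement for the full cvector-generator example.
#include "arg.h"
#include "common.h"
#include "llama.h"

#include <cstdio>

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n\n    %s -m ./llama-3.Q4_K_M.gguf [-ngl 99]\n\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // the parser now takes the example id and a usage callback
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
    }

    gpt_init();
    llama_backend_init();

    // model and context are returned together in a llama_init_result
    llama_init_result llama_init = llama_init_from_gpt_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    // ... the example's work with model/ctx goes here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```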
package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13

@@ -12,12 +12,9 @@
 
 #include <cstdio>
 #include <ctime>
+#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
 
 #define DEBUG_POS 5
 
@@ -207,13 +204,6 @@ static ggml_status compute_piter(
 ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
 }
 
-// TODO: enable GPU support when support for GGML_OP_SQRT is added
-//#ifdef GGML_USE_METAL
-// if (ggml_backend_is_metal(model.backend)) {
-// ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
-// }
-//#endif
-
 ggml_status res = ggml_backend_graph_compute(model.backend, gf);
 if (res == GGML_STATUS_SUCCESS) {
 auto extract_i = [](std::string prefix, std::string str) -> int {
@@ -229,8 +219,8 @@ static ggml_status compute_piter(
 result.eigenvectors.resize(params.n_batch);
 result.distances.resize(params.n_batch);
 // get output nodes
-for (int i = 0; i < gf
-auto node = gf
+for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+auto node = ggml_graph_node(gf, i);
 int iter = -1;
 // find b_tensor (without copying data from device)
 if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {