@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp

@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }
 
 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {
@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
     if (use_reference) {
         qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
     } else {
-
+        qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
     }
     qfns.to_float(quantized_scratch, output_scratch, chunk_size);
 
@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
 
 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {
@@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;
 
     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
             output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
-        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
             &quantized_scratch, &output_scratch, chunk_size] () {
             error_stats local_stats {};
             while (true) {
@@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
                 }
                 lock.unlock();
                 uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                     quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
         };
@@ -371,8 +371,9 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
        }
-
-
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
            if (params.verbose) {
                printf("testing %s ...\n", ggml_type_name(type));
            }
@@ -393,7 +394,7 @@ int main(int argc, char ** argv) {
            test_roundtrip_on_layer(
                layer_name,
                params.per_layer_stats,
-                qfns,
+                *qfns, *qfns_cpu,
                params.reference,
                kv_tensor.second,
                input_scratch,
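These quantize-stats changes track the upstream ggml backend split: the generic conversion callbacks (`to_float`, `from_float_ref`) stay in `ggml_type_traits` from `ggml.h`, while the optimized `from_float` path now lives in `ggml_type_traits_cpu` from the new `ggml-cpu.h`. A minimal round-trip sketch of that split, assuming a quantizable type such as `GGML_TYPE_Q4_0` and an element count that is a multiple of the type's block size:

```cpp
// Sketch only: quantize and dequantize a small buffer using the split type traits.
// Assumes linking against ggml with the CPU backend; the buffer length is chosen
// to be a multiple of the block size of GGML_TYPE_Q4_0.
#include "ggml.h"
#include "ggml-cpu.h"

#include <cstdio>
#include <vector>

int main() {
    const ggml_type type = GGML_TYPE_Q4_0;
    const int64_t n = 256; // multiple of ggml_blck_size(type)

    const auto * traits     = ggml_get_type_traits(type);     // to_float, from_float_ref
    const auto * traits_cpu = ggml_get_type_traits_cpu(type); // optimized from_float

    if (!traits_cpu->from_float || !traits->to_float) {
        fprintf(stderr, "type %s has no round-trip support\n", ggml_type_name(type));
        return 1;
    }

    std::vector<float> input(n), output(n);
    for (int64_t i = 0; i < n; i++) {
        input[i] = 0.01f * (float) i;
    }

    std::vector<char> quantized(ggml_row_size(type, n));

    traits_cpu->from_float(input.data(), quantized.data(), n); // quantize (CPU-optimized path)
    traits->to_float(quantized.data(), output.data(), n);      // dequantize

    printf("%s round-trip: input[5] = %f, output[5] = %f\n",
           ggml_type_name(type), input[5], output[5]);
    return 0;
}
```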
package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -77,7 +77,7 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
-
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
@@ -107,18 +107,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + batch.seq_id[i][0] * n_embd;
-
+        common_embd_normalize(embd, out, n_embd);
     }
 }
 
 int main(int argc, char ** argv) {
-
+    common_params params;
 
-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
         return 1;
     }
 
-
+    common_init();
 
     // For BERT models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
@@ -149,7 +149,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model
-
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n",
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     // max batch size
@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompts and trim
     for (auto & chunk : chunks) {
-        auto inp =
+        auto inp = common_tokenize(ctx, chunk.textdata, true, false);
         if (inp.size() > n_batch) {
             LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -204,7 +204,7 @@ int main(int argc, char ** argv) {
         LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
         LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
         for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-            LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j],
+            LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
        }
        LOG_INF("\n\n");
    }
@@ -232,7 +232,7 @@ int main(int argc, char ** argv) {
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
            batch_decode(ctx, batch, out, s, n_embd);
-
+            common_batch_clear(batch);
            p += s;
            s = 0;
        }
@@ -260,20 +260,20 @@ int main(int argc, char ** argv) {
    while (true) {
        LOG("Enter query: ");
        std::getline(std::cin, query);
-        std::vector<int32_t> query_tokens =
+        std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);
 
        batch_add_seq(query_batch, query_tokens, 0);
 
        std::vector<float> query_emb(n_embd, 0);
        batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
 
-
+        common_batch_clear(query_batch);
 
        // compute cosine similarities
        {
            std::vector<std::pair<int, float>> similarities;
            for (int i = 0; i < n_chunks; i++) {
-                float sim =
+                float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
                similarities.push_back(std::make_pair(i, sim));
            }
 
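retrieval.cpp now goes through the renamed `common_*` helpers (`common_params_parse`, `common_tokenize`, `common_batch_add`, `common_embd_normalize`, `common_embd_similarity_cos`) rather than the older prefixed variants. A small sketch of just the embedding-comparison step, assuming only `common.h` from this vendored llama.cpp revision (the raw vectors are made-up stand-ins for model output):

```cpp
// Sketch: normalize two raw embedding vectors and score them with the
// cosine-similarity helper used by retrieval.cpp. Input values are placeholders,
// not real model embeddings.
#include "common.h"

#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4;

    std::vector<float> raw_a = { 0.2f, 0.1f, 0.7f, 0.3f };
    std::vector<float> raw_b = { 0.1f, 0.2f, 0.6f, 0.4f };

    std::vector<float> a(n_embd), b(n_embd);
    common_embd_normalize(raw_a.data(), a.data(), n_embd); // normalize, as batch_decode does per sequence
    common_embd_normalize(raw_b.data(), b.data(), n_embd);

    const float sim = common_embd_similarity_cos(a.data(), b.data(), n_embd);
    printf("cosine similarity: %.4f\n", sim);
    return 0;
}
```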
package/src/llama.cpp/examples/rpc/rpc-server.cpp

@@ -1,3 +1,5 @@
+#include "ggml-cpu.h"
+
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif
@@ -151,7 +153,7 @@ int main(int argc, char * argv[]) {
         get_backend_memory(&free_mem, &total_mem);
     }
     printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
 }
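The RPC server now includes `ggml-cpu.h` directly (the CPU backend moved out of core ggml in this sync) and passes both free and total memory to `ggml_backend_rpc_start_server`. A stripped-down sketch of that call path, assuming the CPU backend and hard-coded endpoint and memory figures:

```cpp
// Sketch: minimal RPC server over the CPU backend. The endpoint and the
// reported memory sizes are hard-coded assumptions for illustration.
#include "ggml-cpu.h"
#include "ggml-rpc.h"

#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (!backend) {
        fprintf(stderr, "failed to init CPU backend\n");
        return 1;
    }

    const char * endpoint  = "0.0.0.0:50052";
    const size_t free_mem  = 8ull * 1024 * 1024 * 1024; // pretend 8 GiB is available
    const size_t total_mem = free_mem;

    printf("Starting RPC server on %s\n", endpoint);
    ggml_backend_rpc_start_server(backend, endpoint, free_mem, total_mem); // blocks, serving requests

    ggml_backend_free(backend);
    return 0;
}
```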
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp

@@ -6,12 +6,12 @@
 #include <cstdio>
 
 int main(int argc, char ** argv) {
-
+    common_params params;
 
     params.prompt = "The quick brown fox";
     params.sparams.seed = 1234;
 
-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -28,7 +28,7 @@ int main(int argc, char ** argv) {
     std::string result2;
 
     // init
-
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -42,15 +42,21 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
     llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
 
     // tokenize prompt
-    auto tokens =
+    auto tokens = common_tokenize(ctx, params.prompt, true);
+
+    // prepare the batch
+    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {0}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true; // generate next token
 
     // evaluate prompt
-    llama_decode(ctx,
-    n_past +=
+    llama_decode(ctx, batch);
+    n_past += batch.n_tokens;
 
     // save state (rng, logits, embedding and kv_cache) to file
     {
@@ -72,13 +78,17 @@ int main(int argc, char ** argv) {
 
     for (auto i = 0; i < params.n_predict; i++) {
         auto next_token = llama_sampler_sample(smpl, ctx, -1);
-        auto next_token_str =
+        auto next_token_str = common_token_to_piece(ctx, next_token);
 
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;
 
-
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {0}, true);
+
+        if (llama_decode(ctx, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_batch_free(batch);
             llama_free(ctx);
             llama_free_model(model);
             return 1;
@@ -92,11 +102,10 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
 
     // make new context
-    auto * ctx2 = llama_new_context_with_model(model,
+    auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));
 
     llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
     llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
 
     printf("\nsecond run: %s", params.prompt.c_str());
@@ -128,13 +137,17 @@ int main(int argc, char ** argv) {
     // second run
     for (auto i = 0; i < params.n_predict; i++) {
         auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
-        auto next_token_str =
+        auto next_token_str = common_token_to_piece(ctx2, next_token);
 
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;
 
-
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {0}, true);
+
+        if (llama_decode(ctx2, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_batch_free(batch);
             llama_free(ctx2);
             llama_free_model(model);
             return 1;
@@ -152,11 +165,10 @@ int main(int argc, char ** argv) {
     }
 
     // make new context
-    auto * ctx3 = llama_new_context_with_model(model,
+    auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));
 
     llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
     llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
 
     printf("\nsingle seq run: %s", params.prompt.c_str());
@@ -216,13 +228,17 @@ int main(int argc, char ** argv) {
     // third run with seq 1 instead of 0
     for (auto i = 0; i < params.n_predict; i++) {
         auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
-        auto next_token_str =
+        auto next_token_str = common_token_to_piece(ctx3, next_token);
 
         printf("%s", next_token_str.c_str());
         result2 += next_token_str;
 
-
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {1}, true);
+
+        if (llama_decode(ctx3, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_batch_free(batch);
             llama_free(ctx3);
             llama_free_model(model);
             return 1;
@@ -236,6 +252,7 @@ int main(int argc, char ** argv) {
     llama_sampler_free(smpl2);
     llama_sampler_free(smpl3);
 
+    llama_batch_free(batch);
     llama_free(ctx3);
     llama_free_model(model);
 
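The save-load-state example also drops `llama_sampler_init_softmax` from its sampler chains and switches to an explicitly managed `llama_batch`: one `llama_batch_init` up front, `common_batch_clear`/`common_batch_add` per step, and a matching `llama_batch_free` on every exit path. The per-step pattern, as an isolated sketch (the `ctx`, `smpl`, `batch`, and `n_past` arguments stand in for state set up earlier in the example):

```cpp
// Sketch: one generation step with an explicitly managed llama_batch, mirroring
// the updated save-load-state flow. The caller owns ctx/smpl/batch and cleans up
// (llama_batch_free, llama_free, ...) when this returns false.
#include "common.h"
#include "llama.h"

#include <string>

static bool generate_one(llama_context * ctx, llama_sampler * smpl, llama_batch & batch,
                         int & n_past, std::string & result) {
    const llama_token next_token = llama_sampler_sample(smpl, ctx, -1);
    result += common_token_to_piece(ctx, next_token);

    common_batch_clear(batch);                                 // reuse the same batch each step
    common_batch_add(batch, next_token, n_past, { 0 }, true);  // request logits for this token

    if (llama_decode(ctx, batch)) {
        return false; // decode failed; caller frees batch/ctx/model as the example does
    }
    n_past += batch.n_tokens;
    return true;
}
```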
package/src/llama.cpp/examples/server/CMakeLists.txt

@@ -15,22 +15,13 @@ set(TARGET_SRCS
     httplib.h
 )
 set(PUBLIC_ASSETS
-    colorthemes.css
-    style.css
-    theme-beeninorder.css
-    theme-ketivah.css
-    theme-mangotango.css
-    theme-playground.css
-    theme-polarnight.css
-    theme-snowstorm.css
     index.html
-    index-new.html
-    index.js
     completion.js
-    system-prompts.js
-    prompt-formats.js
-    json-schema-to-grammar.mjs
     loading.html
+    deps_daisyui.min.css
+    deps_markdown-it.js
+    deps_tailwindcss.js
+    deps_vue.esm-browser.js
 )
 
 foreach(asset ${PUBLIC_ASSETS})