@fugood/llama.node 0.0.1-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +85 -0
- package/README.md +56 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/lib/binding.js +13 -0
- package/lib/binding.ts +57 -0
- package/lib/index.js +24 -0
- package/lib/index.ts +13 -0
- package/package.json +65 -0
- package/src/addons.cpp +506 -0
- package/src/llama.cpp/CMakeLists.txt +1320 -0
- package/src/llama.cpp/build.zig +172 -0
- package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
- package/src/llama.cpp/common/CMakeLists.txt +87 -0
- package/src/llama.cpp/common/base64.hpp +392 -0
- package/src/llama.cpp/common/common.cpp +2949 -0
- package/src/llama.cpp/common/common.h +324 -0
- package/src/llama.cpp/common/console.cpp +501 -0
- package/src/llama.cpp/common/console.h +19 -0
- package/src/llama.cpp/common/grammar-parser.cpp +440 -0
- package/src/llama.cpp/common/grammar-parser.h +29 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/json.hpp +24766 -0
- package/src/llama.cpp/common/log.h +724 -0
- package/src/llama.cpp/common/ngram-cache.cpp +282 -0
- package/src/llama.cpp/common/ngram-cache.h +94 -0
- package/src/llama.cpp/common/sampling.cpp +353 -0
- package/src/llama.cpp/common/sampling.h +147 -0
- package/src/llama.cpp/common/stb_image.h +8396 -0
- package/src/llama.cpp/common/train.cpp +1513 -0
- package/src/llama.cpp/common/train.h +233 -0
- package/src/llama.cpp/examples/CMakeLists.txt +52 -0
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
- package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/batched/batched.cpp +262 -0
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
- package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
- package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/infill/infill.cpp +767 -0
- package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
- package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
- package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
- package/src/llama.cpp/examples/llava/clip.h +85 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
- package/src/llama.cpp/examples/llava/llava.cpp +426 -0
- package/src/llama.cpp/examples/llava/llava.h +50 -0
- package/src/llama.cpp/examples/llava/requirements.txt +3 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
- package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
- package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/main/main.cpp +957 -0
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
- package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
- package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
- package/src/llama.cpp/examples/server/httplib.h +9465 -0
- package/src/llama.cpp/examples/server/server.cpp +3826 -0
- package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
- package/src/llama.cpp/examples/server/utils.hpp +653 -0
- package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple/simple.cpp +183 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
- package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
- package/src/llama.cpp/ggml-alloc.c +985 -0
- package/src/llama.cpp/ggml-alloc.h +76 -0
- package/src/llama.cpp/ggml-backend-impl.h +141 -0
- package/src/llama.cpp/ggml-backend.c +2099 -0
- package/src/llama.cpp/ggml-backend.h +233 -0
- package/src/llama.cpp/ggml-common.h +1853 -0
- package/src/llama.cpp/ggml-cuda.h +43 -0
- package/src/llama.cpp/ggml-impl.h +265 -0
- package/src/llama.cpp/ggml-kompute.cpp +2006 -0
- package/src/llama.cpp/ggml-kompute.h +46 -0
- package/src/llama.cpp/ggml-metal.h +66 -0
- package/src/llama.cpp/ggml-mpi.c +216 -0
- package/src/llama.cpp/ggml-mpi.h +39 -0
- package/src/llama.cpp/ggml-opencl.cpp +2301 -0
- package/src/llama.cpp/ggml-opencl.h +36 -0
- package/src/llama.cpp/ggml-quants.c +12678 -0
- package/src/llama.cpp/ggml-quants.h +133 -0
- package/src/llama.cpp/ggml-sycl.cpp +17882 -0
- package/src/llama.cpp/ggml-sycl.h +49 -0
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
- package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
- package/src/llama.cpp/ggml-vulkan.h +29 -0
- package/src/llama.cpp/ggml.c +21819 -0
- package/src/llama.cpp/ggml.h +2403 -0
- package/src/llama.cpp/llama.cpp +17468 -0
- package/src/llama.cpp/llama.h +1117 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
- package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
- package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
- package/src/llama.cpp/prompts/alpaca.txt +1 -0
- package/src/llama.cpp/prompts/assistant.txt +31 -0
- package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
- package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
- package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
- package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
- package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
- package/src/llama.cpp/prompts/chat.txt +28 -0
- package/src/llama.cpp/prompts/dan-modified.txt +1 -0
- package/src/llama.cpp/prompts/dan.txt +1 -0
- package/src/llama.cpp/prompts/mnemonics.txt +93 -0
- package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
- package/src/llama.cpp/prompts/reason-act.txt +18 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
- package/src/llama.cpp/requirements.txt +12 -0
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
- package/src/llama.cpp/scripts/xxd.cmake +16 -0
- package/src/llama.cpp/sgemm.cpp +999 -0
- package/src/llama.cpp/sgemm.h +12 -0
- package/src/llama.cpp/tests/CMakeLists.txt +78 -0
- package/src/llama.cpp/tests/get-model.cpp +21 -0
- package/src/llama.cpp/tests/get-model.h +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
- package/src/llama.cpp/tests/test-c.c +7 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
- package/src/llama.cpp/tests/test-double-float.cpp +57 -0
- package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
- package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
- package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
- package/src/llama.cpp/tests/test-opt.cpp +181 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
- package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
- package/src/llama.cpp/tests/test-rope.cpp +221 -0
- package/src/llama.cpp/tests/test-sampling.cpp +301 -0
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
- package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
- package/src/llama.cpp/unicode-data.cpp +1651 -0
- package/src/llama.cpp/unicode-data.h +16 -0
- package/src/llama.cpp/unicode.cpp +277 -0
- package/src/llama.cpp/unicode.h +28 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
#include "common.h"
|
|
2
|
+
#include "llama.h"
|
|
3
|
+
|
|
4
|
+
#include <cassert>
|
|
5
|
+
#include <cinttypes>
|
|
6
|
+
#include <cmath>
|
|
7
|
+
#include <cstdio>
|
|
8
|
+
#include <cstring>
|
|
9
|
+
#include <ctime>
|
|
10
|
+
#include <fstream>
|
|
11
|
+
#include <iostream>
|
|
12
|
+
#include <string>
|
|
13
|
+
#include <vector>
|
|
14
|
+
|
|
15
|
+
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
|
16
|
+
#include <signal.h>
|
|
17
|
+
#include <unistd.h>
|
|
18
|
+
#elif defined (_WIN32)
|
|
19
|
+
#define WIN32_LEAN_AND_MEAN
|
|
20
|
+
#ifndef NOMINMAX
|
|
21
|
+
# define NOMINMAX
|
|
22
|
+
#endif
|
|
23
|
+
#include <windows.h>
|
|
24
|
+
#include <signal.h>
|
|
25
|
+
#endif
|
|
26
|
+
|
|
27
|
+
// Used for debugging to print out beam tokens.
|
|
28
|
+
struct ostream_beam_view {
|
|
29
|
+
llama_context * ctx;
|
|
30
|
+
llama_beam_view beam_view;
|
|
31
|
+
};
|
|
32
|
+
|
|
33
|
+
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
|
34
|
+
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
|
35
|
+
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
|
36
|
+
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
|
37
|
+
}
|
|
38
|
+
return os << ')';
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Put here anything you want back in beam_search_callback().
|
|
42
|
+
struct beam_search_callback_data {
|
|
43
|
+
llama_context * ctx;
|
|
44
|
+
std::vector<llama_token> response;
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
|
48
|
+
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
|
49
|
+
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
|
50
|
+
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
// Function matching type llama_beam_search_callback_fn_t.
|
|
54
|
+
// Custom callback example is called each time the beams lengths increase:
|
|
55
|
+
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
|
56
|
+
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
|
57
|
+
// This is also called when the stop condition is met.
|
|
58
|
+
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
|
59
|
+
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
|
60
|
+
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
|
61
|
+
// Mark beams as EOS as needed.
|
|
62
|
+
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
|
63
|
+
llama_beam_view& beam_view = beams_state.beam_views[i];
|
|
64
|
+
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
|
|
65
|
+
beam_view.eob = true;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
printf(","); // Show progress
|
|
69
|
+
if (const size_t n = beams_state.common_prefix_length) {
|
|
70
|
+
callback_data.response.resize(callback_data.response.size() + n);
|
|
71
|
+
assert(0u < beams_state.n_beams);
|
|
72
|
+
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
|
73
|
+
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
|
74
|
+
printf("%zu", n);
|
|
75
|
+
}
|
|
76
|
+
fflush(stdout);
|
|
77
|
+
#if 1 // DEBUG: print current beams for this iteration
|
|
78
|
+
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
|
|
79
|
+
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
|
80
|
+
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
|
|
81
|
+
}
|
|
82
|
+
#endif
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
int main(int argc, char ** argv)
|
|
86
|
+
{
|
|
87
|
+
gpt_params params;
|
|
88
|
+
//params.n_gpu_layers = 200;
|
|
89
|
+
|
|
90
|
+
//---------------------------------
|
|
91
|
+
// Print help :
|
|
92
|
+
//---------------------------------
|
|
93
|
+
|
|
94
|
+
if ( argc < 2 || argv[1][0] == '-' )
|
|
95
|
+
{
|
|
96
|
+
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
|
|
97
|
+
return 1 ;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
//---------------------------------
|
|
101
|
+
// Load parameters :
|
|
102
|
+
//---------------------------------
|
|
103
|
+
|
|
104
|
+
params.model = argv[1];
|
|
105
|
+
|
|
106
|
+
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
|
|
107
|
+
|
|
108
|
+
if ( argc > 3 )
|
|
109
|
+
{
|
|
110
|
+
params.prompt = argv[3];
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
if ( params.prompt.empty() )
|
|
114
|
+
{
|
|
115
|
+
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
//---------------------------------
|
|
119
|
+
// Init LLM :
|
|
120
|
+
//---------------------------------
|
|
121
|
+
|
|
122
|
+
llama_backend_init();
|
|
123
|
+
llama_numa_init(params.numa);
|
|
124
|
+
|
|
125
|
+
llama_model * model;
|
|
126
|
+
llama_context * ctx;
|
|
127
|
+
|
|
128
|
+
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
|
129
|
+
|
|
130
|
+
if ( model == NULL )
|
|
131
|
+
{
|
|
132
|
+
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
|
133
|
+
return 1;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
//---------------------------------
|
|
137
|
+
// Tokenize the prompt :
|
|
138
|
+
//---------------------------------
|
|
139
|
+
|
|
140
|
+
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
|
|
141
|
+
|
|
142
|
+
const size_t max_context_size = llama_n_ctx( ctx );
|
|
143
|
+
const size_t max_tokens_list_size = max_context_size - 4 ;
|
|
144
|
+
|
|
145
|
+
if (tokens_list.size() > max_tokens_list_size)
|
|
146
|
+
{
|
|
147
|
+
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
|
|
148
|
+
__func__ , tokens_list.size() , max_tokens_list_size );
|
|
149
|
+
return 1;
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
fprintf( stderr, "\n\n" );
|
|
153
|
+
|
|
154
|
+
// Print the tokens from the prompt :
|
|
155
|
+
|
|
156
|
+
for( auto id : tokens_list )
|
|
157
|
+
{
|
|
158
|
+
std::cout << llama_token_to_piece(ctx, id);
|
|
159
|
+
}
|
|
160
|
+
std::cout << std::flush;
|
|
161
|
+
|
|
162
|
+
int n_past = 0;
|
|
163
|
+
|
|
164
|
+
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
|
|
165
|
+
{
|
|
166
|
+
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
|
167
|
+
return 1;
|
|
168
|
+
}
|
|
169
|
+
n_past += tokens_list.size();
|
|
170
|
+
|
|
171
|
+
beam_search_callback_data callback_data{ctx, {}};
|
|
172
|
+
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
|
173
|
+
int const n_predict = 256;
|
|
174
|
+
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
|
|
175
|
+
|
|
176
|
+
std::cout << "\n\n";
|
|
177
|
+
for (llama_token const token_id : callback_data.response) {
|
|
178
|
+
std::cout << llama_token_to_piece(ctx,token_id);
|
|
179
|
+
}
|
|
180
|
+
std::cout << std::endl;
|
|
181
|
+
|
|
182
|
+
llama_free( ctx );
|
|
183
|
+
llama_free_model( model );
|
|
184
|
+
|
|
185
|
+
llama_backend_free();
|
|
186
|
+
|
|
187
|
+
return 0;
|
|
188
|
+
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
# Matrix-multiplication benchmark example.
# Builds the `benchmark` executable from benchmark-matmult.cpp.
set(TARGET benchmark)

add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)

target_link_libraries(${TARGET}
    PRIVATE
        llama
        build_info
        ${CMAKE_THREAD_LIBS_INIT}
)

# common.h lives two directories up, in the shared `common` folder.
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
#include "common.h"
|
|
2
|
+
#include "ggml.h"
|
|
3
|
+
|
|
4
|
+
#include <locale.h>
|
|
5
|
+
#include <assert.h>
|
|
6
|
+
#include <math.h>
|
|
7
|
+
#include <cstring>
|
|
8
|
+
#include <cstdio>
|
|
9
|
+
#include <cinttypes>
|
|
10
|
+
#include <unordered_map>
|
|
11
|
+
#include <queue>
|
|
12
|
+
#include <string.h>
|
|
13
|
+
#include <cassert>
|
|
14
|
+
#include <fstream>
|
|
15
|
+
#include <string>
|
|
16
|
+
#include <iterator>
|
|
17
|
+
#include <algorithm>
|
|
18
|
+
|
|
19
|
+
#if defined(_MSC_VER)
|
|
20
|
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
21
|
+
#endif
|
|
22
|
+
|
|
23
|
+
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
|
24
|
+
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
|
25
|
+
|
|
26
|
+
if (plan.work_size > 0) {
|
|
27
|
+
buf.resize(plan.work_size);
|
|
28
|
+
plan.work_data = buf.data();
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
ggml_graph_compute(graph, &plan);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
static float tensor_sum_elements(const ggml_tensor * tensor) {
|
|
35
|
+
double sum = 0;
|
|
36
|
+
if (tensor->type == GGML_TYPE_F32) {
|
|
37
|
+
for (int j = 0; j < tensor->ne[1]; j++) {
|
|
38
|
+
for (int k = 0; k < tensor->ne[0]; k++) {
|
|
39
|
+
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
return sum;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
|
47
|
+
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
|
|
48
|
+
tensor->type, ggml_type_name(tensor->type),
|
|
49
|
+
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
|
|
50
|
+
float sum = tensor_sum_elements(tensor);
|
|
51
|
+
printf("Sum of tensor %s is %6.2f\n", name, sum);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Dumps a tensor via tensor_dump(), labelling it with the source text of the argument expression.
#define TENSOR_DUMP(t) tensor_dump((t), #t)
|
|
55
|
+
|
|
56
|
+
// Command-line configurable settings of the benchmark, with their defaults.
struct benchmark_params_struct {
    int32_t n_threads{1};      // set with -t / --threads
    int32_t n_iterations{10};  // set with -i / --iter
};
|
|
60
|
+
|
|
61
|
+
// Writes the command-line help to stderr.
// argv[0] is shown as the program name; the values currently held in
// `params` are shown as the defaults for --threads and --iter.
static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
    const char * const program = argv[0];

    fprintf(stderr, "usage: %s [options]\n", program);
    fprintf(stderr,
            "\n"
            "options:\n"
            " -h, --help show this help message and exit\n");
    fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations);
    fprintf(stderr, "\n");
}
|
|
70
|
+
|
|
71
|
+
int main(int argc, char ** argv) {
|
|
72
|
+
struct benchmark_params_struct benchmark_params;
|
|
73
|
+
|
|
74
|
+
bool invalid_param = false;
|
|
75
|
+
std::string arg;
|
|
76
|
+
for (int i = 1; i < argc; i++) {
|
|
77
|
+
arg = argv[i];
|
|
78
|
+
|
|
79
|
+
if (arg == "-t" || arg == "--threads") {
|
|
80
|
+
if (++i >= argc) {
|
|
81
|
+
invalid_param = true;
|
|
82
|
+
break;
|
|
83
|
+
}
|
|
84
|
+
benchmark_params.n_threads = std::stoi(argv[i]);
|
|
85
|
+
} else if (arg == "-i" || arg == "--iter") {
|
|
86
|
+
if (++i >= argc) {
|
|
87
|
+
invalid_param = true;
|
|
88
|
+
break;
|
|
89
|
+
}
|
|
90
|
+
benchmark_params.n_iterations = std::stoi(argv[i]);
|
|
91
|
+
} else if (arg == "-h" || arg == "--help") {
|
|
92
|
+
print_usage(argc, argv, benchmark_params);
|
|
93
|
+
exit(0);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
if (invalid_param) {
|
|
97
|
+
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
|
98
|
+
print_usage(argc, argv, benchmark_params);
|
|
99
|
+
exit(1);
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
print_build_info();
|
|
103
|
+
printf("Starting Test\n");
|
|
104
|
+
|
|
105
|
+
// create the ggml context
|
|
106
|
+
struct ggml_context * ctx;
|
|
107
|
+
//const int sizex = 4096;
|
|
108
|
+
//const int sizey = 11008;
|
|
109
|
+
|
|
110
|
+
#undef VERBOSE_DEBUGGING
|
|
111
|
+
#ifndef VERBOSE_DEBUGGING
|
|
112
|
+
const int sizey = 4096;
|
|
113
|
+
const int sizex = 11008;
|
|
114
|
+
const int sizez = 128;
|
|
115
|
+
#else
|
|
116
|
+
/* Working - let's increase size */
|
|
117
|
+
const int sizey = 1;
|
|
118
|
+
const int sizex = (8*32);
|
|
119
|
+
const int sizez = 1;
|
|
120
|
+
|
|
121
|
+
/*const int sizey = 1;
|
|
122
|
+
const int sizex = 3*(8*32);
|
|
123
|
+
const int sizez = 1;*/
|
|
124
|
+
#endif
|
|
125
|
+
|
|
126
|
+
//printf("Memsize required = %i\n", sizex*sizex);
|
|
127
|
+
|
|
128
|
+
// TODO: perform the bench for all types or for a user specified type
|
|
129
|
+
const ggml_type qtype = GGML_TYPE_Q4_1;
|
|
130
|
+
|
|
131
|
+
size_t ctx_size = 0;
|
|
132
|
+
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
|
|
133
|
+
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
|
|
134
|
+
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
|
|
135
|
+
ctx_size += ggml_row_size(qtype, sizex*sizey);
|
|
136
|
+
ctx_size += ggml_row_size(qtype, sizex*sizey);
|
|
137
|
+
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
|
|
138
|
+
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
|
|
139
|
+
ctx_size += 1024*1024*16;
|
|
140
|
+
|
|
141
|
+
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
|
|
142
|
+
|
|
143
|
+
struct ggml_init_params params = {
|
|
144
|
+
/*.mem_size =*/ ctx_size,
|
|
145
|
+
/*.mem_buffer =*/ NULL,
|
|
146
|
+
/* no_alloc =*/ 0
|
|
147
|
+
};
|
|
148
|
+
|
|
149
|
+
ctx = ggml_init(params);
|
|
150
|
+
if (!ctx) {
|
|
151
|
+
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
|
152
|
+
return 1;
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
printf("Creating new tensors\n");
|
|
157
|
+
// printf("Creating new tensor m1\n");
|
|
158
|
+
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
|
|
159
|
+
ggml_set_f32(m11, 1.0f);
|
|
160
|
+
|
|
161
|
+
// printf("Creating new tensor m1\n");
|
|
162
|
+
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
|
|
163
|
+
ggml_set_f32(m12, 1.5f);
|
|
164
|
+
|
|
165
|
+
// printf("Creating new tensor m2\n");
|
|
166
|
+
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
|
|
167
|
+
ggml_set_f32(m2, 2.0f);
|
|
168
|
+
|
|
169
|
+
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
|
|
170
|
+
// printf("Creating new tensor m11xm2\n");
|
|
171
|
+
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
|
172
|
+
|
|
173
|
+
// printf("Creating compute graph\n");
|
|
174
|
+
struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
|
175
|
+
ggml_build_forward_expand(gf, m11xm2);
|
|
176
|
+
|
|
177
|
+
printf("n_threads=%i\n", benchmark_params.n_threads);
|
|
178
|
+
|
|
179
|
+
TENSOR_DUMP(m11);
|
|
180
|
+
TENSOR_DUMP(m2);
|
|
181
|
+
|
|
182
|
+
std::vector<uint8_t> work_buffer;
|
|
183
|
+
|
|
184
|
+
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
|
|
185
|
+
|
|
186
|
+
TENSOR_DUMP(gf->nodes[0]);
|
|
187
|
+
|
|
188
|
+
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
|
189
|
+
|
|
190
|
+
int32_t nelements = sizex*sizey;
|
|
191
|
+
|
|
192
|
+
// Set up a the benchmark matrices
|
|
193
|
+
// printf("Creating new tensor q11 & Running quantize\n");
|
|
194
|
+
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
|
195
|
+
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
|
|
196
|
+
|
|
197
|
+
// Set up a the compute graph
|
|
198
|
+
// printf("Creating new tensor q31\n");
|
|
199
|
+
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
|
|
200
|
+
|
|
201
|
+
// printf("Creating compute graph\n");
|
|
202
|
+
struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
|
|
203
|
+
ggml_build_forward_expand(gf31, q31);
|
|
204
|
+
|
|
205
|
+
// Set up a second graph computation to make sure we override the CPU cache lines
|
|
206
|
+
// printf("Creating new tensor q12 & Running quantize\n");
|
|
207
|
+
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
|
208
|
+
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
|
|
209
|
+
|
|
210
|
+
// printf("Creating new tensor q32\n");
|
|
211
|
+
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
|
212
|
+
|
|
213
|
+
//printf("Creating compute graph\n");
|
|
214
|
+
struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
|
|
215
|
+
ggml_build_forward_expand(gf32, q32);
|
|
216
|
+
printf("n_threads=%i\n", benchmark_params.n_threads);
|
|
217
|
+
|
|
218
|
+
const int dimx = sizex;
|
|
219
|
+
const int dimy = sizey;
|
|
220
|
+
const int dimz = sizez;
|
|
221
|
+
long long int flops_per_dot_product = dimy + dimy;
|
|
222
|
+
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
|
|
223
|
+
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
// Let's use the F32 result from above as a reference for the quantized multiplication
|
|
227
|
+
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
|
|
228
|
+
|
|
229
|
+
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
|
230
|
+
printf("=====================================================================================\n");
|
|
231
|
+
|
|
232
|
+
double gflops_sum = 0;
|
|
233
|
+
for (int i=0;i<benchmark_params.n_iterations ;i++) {
|
|
234
|
+
|
|
235
|
+
long long int start = ggml_time_us();
|
|
236
|
+
//printf("Running ggml_graph_compute\n");
|
|
237
|
+
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
|
|
238
|
+
|
|
239
|
+
long long int stop = ggml_time_us();
|
|
240
|
+
long long int usec = stop-start;
|
|
241
|
+
double gflops = (double)(flops_per_matrix)/usec/1000.0;
|
|
242
|
+
gflops_sum += gflops;
|
|
243
|
+
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
|
|
244
|
+
i,
|
|
245
|
+
benchmark_params.n_threads,
|
|
246
|
+
sizex, sizey, sizez, flops_per_matrix,
|
|
247
|
+
usec,gflops);
|
|
248
|
+
|
|
249
|
+
#ifdef VERBOSE_DEBUGGING
|
|
250
|
+
TENSOR_DUMP("res",gf31.nodes[0])
|
|
251
|
+
#endif
|
|
252
|
+
|
|
253
|
+
// Check that the matrix multiplication result is in the right ballpark
|
|
254
|
+
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
|
255
|
+
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
|
|
256
|
+
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
|
257
|
+
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
|
258
|
+
|
|
259
|
+
if (delta > allowed_delta) {
|
|
260
|
+
printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
|
|
261
|
+
sum_of_F32_reference,
|
|
262
|
+
sum_of_Q4_result,
|
|
263
|
+
delta,
|
|
264
|
+
allowed_delta
|
|
265
|
+
);
|
|
266
|
+
exit(0);
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
// Running a different graph computation to make sure we override the CPU cache lines
|
|
270
|
+
ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
|
|
271
|
+
}
|
|
272
|
+
printf("\n");
|
|
273
|
+
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
|
|
274
|
+
printf("=====================================================================================\n");
|
|
275
|
+
}
|