@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/examples/embedding/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET embedding)
+ add_executable(${TARGET} embedding.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -0,0 +1,211 @@
+ #include "common.h"
+ #include "llama.h"
+
+ #include <ctime>
+
+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif
+
+ static std::vector<std::string> split_lines(const std::string & s) {
+     std::string line;
+     std::vector<std::string> lines;
+     std::stringstream ss(s);
+     while (std::getline(ss, line)) {
+         lines.push_back(line);
+     }
+     return lines;
+ }
+
+ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+     for (size_t i = 0; i < tokens.size(); i++) {
+         llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+     }
+ }
+
+ static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+     // clear previous kv_cache values (irrelevant for embeddings)
+     llama_kv_cache_clear(ctx);
+
+     // run model
+     fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+     if (llama_decode(ctx, batch) < 0) {
+         fprintf(stderr, "%s : failed to decode\n", __func__);
+     }
+
+     for (int i = 0; i < batch.n_tokens; i++) {
+         if (!batch.logits[i]) {
+             continue;
+         }
+
+         // try to get sequence embeddings - supported only when pooling_type is not NONE
+         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+         if (embd == NULL) {
+             embd = llama_get_embeddings_ith(ctx, i);
+             if (embd == NULL) {
+                 fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+                 continue;
+             }
+         }
+
+         float * out = output + batch.seq_id[i][0] * n_embd;
+         llama_embd_normalize(embd, out, n_embd);
+     }
+ }
+
+ int main(int argc, char ** argv) {
+     gpt_params params;
+
+     if (!gpt_params_parse(argc, argv, params)) {
+         return 1;
+     }
+
+     params.embedding = true;
+     // For non-causal models, batch size must be equal to ubatch size
+     params.n_ubatch = params.n_batch;
+
+     print_build_info();
+
+     if (params.seed == LLAMA_DEFAULT_SEED) {
+         params.seed = time(NULL);
+     }
+
+     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
+
+     std::mt19937 rng(params.seed);
+     if (params.random_prompt) {
+         params.prompt = gpt_random_prompt(rng);
+     }
+
+     llama_backend_init();
+     llama_numa_init(params.numa);
+
+     llama_model * model;
+     llama_context * ctx;
+
+     // load the model
+     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+     if (model == NULL) {
+         fprintf(stderr, "%s: error: unable to load model\n", __func__);
+         return 1;
+     }
+
+     const int n_ctx_train = llama_n_ctx_train(model);
+     const int n_ctx = llama_n_ctx(ctx);
+
+     if (n_ctx > n_ctx_train) {
+         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                 __func__, n_ctx_train, n_ctx);
+     }
+
+     // print system information
+     {
+         fprintf(stderr, "\n");
+         fprintf(stderr, "%s\n", get_system_info(params).c_str());
+     }
+
+     // split the prompt into lines
+     std::vector<std::string> prompts = split_lines(params.prompt);
+
+     // max batch size
+     const uint64_t n_batch = params.n_batch;
+     GGML_ASSERT(params.n_batch >= params.n_ctx);
+
+     // tokenize the prompts and trim
+     std::vector<std::vector<int32_t>> inputs;
+     for (const auto & prompt : prompts) {
+         auto inp = ::llama_tokenize(ctx, prompt, true, false);
+         if (inp.size() > n_batch) {
+             fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+                     __func__, (long long int) inp.size(), (long long int) n_batch);
+             return 1;
+         }
+         inputs.push_back(inp);
+     }
+
+     // add SEP if not present
+     for (auto & inp : inputs) {
+         if (inp.empty() || inp.back() != llama_token_sep(model)) {
+             inp.push_back(llama_token_sep(model));
+         }
+     }
+
+     // tokenization stats
+     if (params.verbose_prompt) {
+         for (int i = 0; i < (int) inputs.size(); i++) {
+             fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+             fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+             for (int j = 0; j < (int) inputs[i].size(); j++) {
+                 fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+             }
+             fprintf(stderr, "\n\n");
+         }
+     }
+
+     // initialize batch
+     const int n_prompts = prompts.size();
+     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
+     // allocate output
+     const int n_embd = llama_n_embd(model);
+     std::vector<float> embeddings(n_prompts * n_embd, 0);
+     float * emb = embeddings.data();
+
+     // break into batches
+     int p = 0; // number of prompts processed already
+     int s = 0; // number of prompts in current batch
+     for (int k = 0; k < n_prompts; k++) {
+         // clamp to n_batch tokens
+         auto & inp = inputs[k];
+
+         const uint64_t n_toks = inp.size();
+
+         // encode if at capacity
+         if (batch.n_tokens + n_toks > n_batch) {
+             float * out = emb + p * n_embd;
+             batch_decode(ctx, batch, out, s, n_embd);
+             llama_batch_clear(batch);
+             p += s;
+             s = 0;
+         }
+
+         // add to batch
+         batch_add_seq(batch, inp, s);
+         s += 1;
+     }
+
+     // final batch
+     float * out = emb + p * n_embd;
+     batch_decode(ctx, batch, out, s, n_embd);
+
+     // print the first part of the embeddings or for a single prompt, the full embedding
+     fprintf(stdout, "\n");
+     for (int j = 0; j < n_prompts; j++) {
+         fprintf(stdout, "embedding %d: ", j);
+         for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+             fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+         }
+         fprintf(stdout, "\n");
+     }
+
+     // print cosine similarity matrix
+     if (n_prompts > 1) {
+         fprintf(stdout, "\n");
+         printf("cosine similarity matrix:\n\n");
+         for (int i = 0; i < n_prompts; i++) {
+             for (int j = 0; j < n_prompts; j++) {
+                 float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                 fprintf(stdout, "%6.2f ", sim);
+             }
+             fprintf(stdout, "\n");
+         }
+     }
+
+     // clean up
+     llama_print_timings(ctx);
+     llama_free(ctx);
+     llama_free_model(model);
+     llama_backend_free();
+
+     return 0;
+ }
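
Note on the similarity matrix above: llama_embd_normalize and llama_embd_similarity_cos are helpers provided by common/common.cpp in this tree; the matrix is plain cosine similarity over L2-normalized rows. The following is a minimal, self-contained sketch of that math only, with made-up 3-dimensional vectors, and is not the helpers' actual implementation:

#include <cmath>
#include <cstdio>
#include <vector>

// L2-normalize a vector in place, as the embedding example does before storing each row.
static void l2_normalize(std::vector<float> & v) {
    double norm = 0.0;
    for (float x : v) norm += (double) x * x;
    norm = std::sqrt(norm);
    if (norm > 0.0) {
        for (float & x : v) x = (float) (x / norm);
    }
}

// Cosine similarity between two vectors of equal length.
static float cosine_sim(const std::vector<float> & a, const std::vector<float> & b) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < a.size(); i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    if (na == 0.0 || nb == 0.0) return 0.0f;
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb)));
}

int main() {
    std::vector<float> a = {1.0f, 2.0f, 3.0f};
    std::vector<float> b = {2.0f, 4.0f, 6.0f};
    l2_normalize(a);
    l2_normalize(b);
    // Parallel vectors give 1.00, orthogonal vectors 0.00, opposite vectors -1.00.
    printf("cos(a, b) = %.2f\n", cosine_sim(a, b));
    return 0;
}
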
package/src/llama.cpp/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,9 @@
+ set(TARGET eval-callback)
+ add_executable(${TARGET} eval-callback.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+ set(TEST_TARGET test-eval-callback)
+ add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+ set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,195 @@
+ #include "common.h"
+ #include "llama.h"
+ #include "ggml.h"
+
+ #include <cstdio>
+ #include <random>
+ #include <string>
+ #include <tuple>
+ #include <vector>
+
+ /**
+  * This is the arbitrary data which will be passed to each callback.
+  * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
+  */
+ struct callback_data {
+     std::vector<uint8_t> data;
+ };
+
+ static std::string ggml_ne_string(const ggml_tensor * t) {
+     std::string str;
+     for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+         str += std::to_string(t->ne[i]);
+         if (i + 1 < GGML_MAX_DIMS) {
+             str += ", ";
+         }
+     }
+     return str;
+ }
+
+ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+     GGML_ASSERT(n > 0);
+     float sum = 0;
+     for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+         printf(" [\n");
+         for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+             if (i2 == n && ne[2] > 2*n) {
+                 printf(" ..., \n");
+                 i2 = ne[2] - n;
+             }
+             printf(" [\n");
+             for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                 if (i1 == n && ne[1] > 2*n) {
+                     printf(" ..., \n");
+                     i1 = ne[1] - n;
+                 }
+                 printf(" [");
+                 for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                     if (i0 == n && ne[0] > 2*n) {
+                         printf("..., ");
+                         i0 = ne[0] - n;
+                     }
+                     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+                     float v;
+                     if (type == GGML_TYPE_F16) {
+                         v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                     } else if (type == GGML_TYPE_F32) {
+                         v = *(float *) &data[i];
+                     } else if (type == GGML_TYPE_I32) {
+                         v = (float) *(int32_t *) &data[i];
+                     } else if (type == GGML_TYPE_I16) {
+                         v = (float) *(int16_t *) &data[i];
+                     } else if (type == GGML_TYPE_I8) {
+                         v = (float) *(int8_t *) &data[i];
+                     } else {
+                         GGML_ASSERT(false);
+                     }
+                     printf("%12.4f", v);
+                     sum += v;
+                     if (i0 < ne[0] - 1) printf(", ");
+                 }
+                 printf("],\n");
+             }
+             printf(" ],\n");
+         }
+         printf(" ]\n");
+         printf(" sum = %f\n", sum);
+     }
+ }
+
+ /**
+  * GGML operations callback during the graph execution.
+  *
+  * @param t current tensor
+  * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+  *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+  *            see ggml_backend_sched_eval_callback
+  * @param user_data user data to pass at each call back
+  * @return true to receive data or continue the graph, false otherwise
+  */
+ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+     auto * cb_data = (callback_data *) user_data;
+
+     const struct ggml_tensor * src0 = t->src[0];
+     const struct ggml_tensor * src1 = t->src[1];
+
+     if (ask) {
+         return true; // Always retrieve data
+     }
+
+     char src1_str[128] = {0};
+     if (src1) {
+         sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+     }
+
+     printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+            t->name, ggml_type_name(t->type), ggml_op_desc(t),
+            src0->name, ggml_ne_string(src0).c_str(),
+            src1 ? src1_str : "",
+            ggml_ne_string(t).c_str());
+
+
+     // copy the data from the GPU memory if needed
+     const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+     if (!is_host) {
+         auto n_bytes = ggml_nbytes(t);
+         cb_data->data.resize(n_bytes);
+         ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+     }
+
+     if (!ggml_is_quantized(t->type)) {
+         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+         ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+     }
+
+     return true;
+ }
+
+ static bool run(llama_context * ctx, const gpt_params & params) {
+     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+         fprintf(stderr, "%s : failed to eval\n", __func__);
+         return false;
+     }
+
+     return true;
+ }
+
+ int main(int argc, char ** argv) {
+
+     callback_data cb_data;
+
+     gpt_params params;
+     if (!gpt_params_parse(argc, argv, params)) {
+         return 1;
+     }
+
+     print_build_info();
+
+     std::mt19937 rng(params.seed);
+     if (params.random_prompt) {
+         params.prompt = gpt_random_prompt(rng);
+     }
+
+     llama_backend_init();
+     llama_numa_init(params.numa);
+
+     // pass the callback to the backend scheduler
+     // it will be executed for each node during the graph computation
+     params.cb_eval = ggml_debug;
+     params.cb_eval_user_data = &cb_data;
+     params.warmup = false;
+
+     // init
+     llama_model * model;
+     llama_context * ctx;
+     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+     if (model == nullptr || ctx == nullptr) {
+         fprintf(stderr, "%s : failed to init\n", __func__);
+         return 1;
+     }
+
+     // print system information
+     {
+         fprintf(stderr, "\n");
+         fprintf(stderr, "%s\n", get_system_info(params).c_str());
+     }
+
+     bool OK = run(ctx, params);
+     if (!OK) {
+         return 1;
+     }
+
+     llama_print_timings(ctx);
+
+     llama_free(ctx);
+     llama_free_model(model);
+
+     llama_backend_free();
+
+     return 0;
+ }
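
Note on the callback above: ggml_debug follows the two-phase ggml_backend_sched_eval_callback contract documented in its comment block. The scheduler first calls the callback with ask=true so it can opt in to a node, then calls it again with ask=false once that node's data is available. The sketch below shows a more selective variant of the same contract; my_mul_mat_observer is a hypothetical name, not part of this package, and would be wired up exactly like ggml_debug via params.cb_eval and params.cb_eval_user_data:

#include "ggml.h"

#include <cstdio>

// Sketch of a filtering eval callback: opt in only to MUL_MAT nodes during the
// ask phase, then observe them when the data-ready call arrives.
static bool my_mul_mat_observer(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        // ask phase: return true only for the ops we care about
        return t->op == GGML_OP_MUL_MAT;
    }
    // ask == false: the result of t is available (copy it off-device first if
    // needed, as ggml_debug does above with ggml_backend_tensor_get)
    fprintf(stderr, "observed %s (%s)\n", t->name, ggml_type_name(t->type));
    return true; // returning true lets the graph keep running
}
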
package/src/llama.cpp/examples/export-lora/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET export-lora)
+ add_executable(${TARGET} export-lora.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)