@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/examples/retrieval/retrieval.cpp
@@ -0,0 +1,350 @@
+ #include "common.h"
+ #include "llama.h"
+
+ #include <algorithm>
+ #include <fstream>
+
+ struct retrieval_params {
+     std::vector<std::string> context_files; // context files to embed
+     int32_t chunk_size = 64; // chunk size for context embedding
+     std::string chunk_separator = "\n"; // chunk separator for context embedding
+ };
+
+ static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
+     gpt_print_usage(argc, argv, gpt_params);
+     printf("retrieval options:\n");
+     printf("  --context-file FNAME  file containing context to embed.\n");
+     printf("                        specify multiple files by providing --context-file option multiple times.\n");
+     printf("  --chunk-size N        minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
+     printf("  --chunk-separator STRING\n");
+     printf("                        string to separate chunks (default: \"\\n\")\n");
+     printf("\n");
+ }
+
+ static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
+     int i = 1;
+     std::string arg;
+     while (i < argc) {
+         arg = argv[i];
+         bool invalid_gpt_param = false;
+         if (gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
+             if (invalid_gpt_param) {
+                 fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             // option was parsed by gpt_params_find_arg
+         } else if (arg == "--context-file") {
+             if (++i >= argc) {
+                 fprintf(stderr, "error: missing argument for --context-file\n");
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             std::ifstream file(argv[i]);
+             if (!file) {
+                 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             // store the external file name in params
+             retrieval_params.context_files.push_back(argv[i]);
+         } else if (arg == "--chunk-size") {
+             if (++i >= argc) {
+                 fprintf(stderr, "error: missing argument for --chunk-size\n");
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             retrieval_params.chunk_size = std::stoi(argv[i]);
+         } else if (arg == "--chunk-separator") {
+             if (++i >= argc) {
+                 fprintf(stderr, "error: missing argument for --chunk-separator\n");
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             retrieval_params.chunk_separator = argv[i];
+         } else {
+             // unknown argument
+             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+             retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+             exit(1);
+         }
+         i++;
+     }
+ }
+
+ struct chunk {
+     // filename
+     std::string filename;
+     // original file position
+     size_t filepos;
+     // original text data
+     std::string textdata = "";
+     // tokenized text data
+     std::vector<llama_token> tokens;
+     // embedding
+     std::vector<float> embedding;
+ };
+
+ // chunk file data to chunks of size >= chunk_size
+ // chunk_separator is the separator between chunks
+ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_size, const std::string & chunk_separator) {
+     std::vector<chunk> chunks;
+     std::ifstream f(filename.c_str());
+
+     if (!f.is_open()) {
+         fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
+         return chunks;
+     }
+
+     chunk current_chunk;
+     char buffer[1024];
+     int64_t filepos = 0;
+     std::string current = "";
+     while (f.read(buffer, 1024)) {
+         current += std::string(buffer, f.gcount());
+         size_t pos;
+         while ((pos = current.find(chunk_separator)) != std::string::npos) {
+             current_chunk.textdata += current.substr(0, pos + chunk_separator.size());
+             if ((int) current_chunk.textdata.size() > chunk_size) {
+                 // save chunk
+                 current_chunk.filepos = filepos;
+                 current_chunk.filename = filename;
+                 chunks.push_back(current_chunk);
+                 // update filepos
+                 filepos += (int) current_chunk.textdata.size();
+                 // reset current_chunk
+                 current_chunk = chunk();
+             }
+             current = current.substr(pos + chunk_separator.size());
+         }
+
+     }
+     // add leftover data to last chunk
+     if (current_chunk.textdata.size() > 0) {
+         if (chunks.empty()) {
+             current_chunk.filepos = filepos;
+             current_chunk.filename = filename;
+             chunks.push_back(current_chunk);
+         } else {
+             chunks.back().textdata += current_chunk.textdata;
+         }
+     }
+     f.close();
+     return chunks;
+ }
+
+ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+     for (size_t i = 0; i < tokens.size(); i++) {
+         llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+     }
+ }
+
+ static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+     // clear previous kv_cache values (irrelevant for embeddings)
+     llama_kv_cache_clear(ctx);
+
+     // run model
+     fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+     if (llama_decode(ctx, batch) < 0) {
+         fprintf(stderr, "%s : failed to decode\n", __func__);
+     }
+
+     for (int i = 0; i < batch.n_tokens; i++) {
+         if (!batch.logits[i]) {
+             continue;
+         }
+
+         // try to get sequence embeddings - supported only when pooling_type is not NONE
+         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+         if (embd == NULL) {
+             embd = llama_get_embeddings_ith(ctx, i);
+             if (embd == NULL) {
+                 fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+                 continue;
+             }
+         }
+
+         float * out = output + batch.seq_id[i][0] * n_embd;
+         llama_embd_normalize(embd, out, n_embd);
+     }
+ }
+
+ int main(int argc, char ** argv) {
+     gpt_params params;
+     retrieval_params retrieval_params;
+
+     retrieval_params_parse(argc, argv, params, retrieval_params);
+
+     // For BERT models, batch size must be equal to ubatch size
+     params.n_ubatch = params.n_batch;
+
+     if (retrieval_params.chunk_size <= 0) {
+         fprintf(stderr, "chunk_size must be positive\n");
+         return 1;
+     }
+     if (retrieval_params.context_files.empty()) {
+         fprintf(stderr, "context_files must be specified\n");
+         return 1;
+     }
+     params.embedding = true;
+
+     print_build_info();
+
+     printf("processing files:\n");
+     for (auto & context_file : retrieval_params.context_files) {
+         printf("%s\n", context_file.c_str());
+     }
+
+     std::vector<chunk> chunks;
+     for (auto & context_file : retrieval_params.context_files) {
+         std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
+         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
+     }
+     printf("Number of chunks: %ld\n", chunks.size());
+
+     llama_backend_init();
+     llama_numa_init(params.numa);
+
+     llama_model * model;
+     llama_context * ctx;
+
+     // load the model
+     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+     if (model == NULL) {
+         fprintf(stderr, "%s: error: unable to load model\n", __func__);
+         return 1;
+     }
+
+     const int n_ctx_train = llama_n_ctx_train(model);
+     const int n_ctx = llama_n_ctx(ctx);
+
+     if (n_ctx > n_ctx_train) {
+         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                 __func__, n_ctx_train, n_ctx);
+     }
+
+     // print system information
+     {
+         fprintf(stderr, "\n");
+         fprintf(stderr, "%s\n", get_system_info(params).c_str());
+     }
+
+     // max batch size
+     const uint64_t n_batch = params.n_batch;
+     GGML_ASSERT(params.n_batch >= params.n_ctx);
+
+     // tokenize the prompts and trim
+     for (auto & chunk : chunks) {
+         auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
+         if (inp.size() > n_batch) {
+             fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+                     __func__, (long long int) inp.size(), (long long int) n_batch);
+             return 1;
+         }
+         // add eos if not present
+         if (inp.empty() || inp.back() != llama_token_eos(model)) {
+             inp.push_back(llama_token_eos(model));
+         }
+         chunk.tokens = inp;
+     }
+
+     // tokenization stats
+     if (params.verbose_prompt) {
+         for (int i = 0; i < (int) chunks.size(); i++) {
+             fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+             fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
+                 fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+             }
+             fprintf(stderr, "\n\n");
+         }
+     }
+
+     // initialize batch
+     const int n_chunks = chunks.size();
+     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
+     // allocate output
+     const int n_embd = llama_n_embd(model);
+     std::vector<float> embeddings(n_chunks * n_embd, 0);
+     float * emb = embeddings.data();
+
+     // break into batches
+     int p = 0; // number of prompts processed already
+     int s = 0; // number of prompts in current batch
+     for (int k = 0; k < n_chunks; k++) {
+         // clamp to n_batch tokens
+         auto & inp = chunks[k].tokens;
+
+         const uint64_t n_toks = inp.size();
+
+         // encode if at capacity
+         if (batch.n_tokens + n_toks > n_batch) {
+             float * out = emb + p * n_embd;
+             batch_decode(ctx, batch, out, s, n_embd);
+             llama_batch_clear(batch);
+             p += s;
+             s = 0;
+         }
+
+         // add to batch
+         batch_add_seq(batch, inp, s);
+         s += 1;
+     }
+
+     // final batch
+     float * out = emb + p * n_embd;
+     batch_decode(ctx, batch, out, s, n_embd);
+
+     // save embeddings to chunks
+     for (int i = 0; i < n_chunks; i++) {
+         chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
+         // clear tokens as they are no longer needed
+         chunks[i].tokens.clear();
+     }
+
+     // start loop, receive query and return top k similar chunks based on cosine similarity
+     std::string query;
+     while (true) {
+         printf("Enter query: ");
+         std::getline(std::cin, query);
+         std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
+
+         struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+         batch_add_seq(query_batch, query_tokens, 0);
+
+         std::vector<float> query_emb(n_embd, 0);
+         batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
+
+         llama_batch_clear(query_batch);
+
+         // compute cosine similarities
+         {
+             std::vector<std::pair<int, float>> similarities;
+             for (int i = 0; i < n_chunks; i++) {
+                 float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+                 similarities.push_back(std::make_pair(i, sim));
+             }
+
+             // sort similarities
+             std::sort(similarities.begin(), similarities.end(), [](const std::pair<int, float> & a, const std::pair<int, float> & b) {
+                 return a.second > b.second;
+             });
+
+             printf("Top %d similar chunks:\n", params.sparams.top_k);
+             for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
+                 printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                 printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                 printf("similarity: %f\n", similarities[i].second);
+                 printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                 printf("--------------------\n");
+             }
+         }
+     }
+
+     // clean up
+     llama_print_timings(ctx);
+     llama_free(ctx);
+     llama_free_model(model);
+     llama_backend_free();
+ }
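
For context, the ranking step above delegates to llama_embd_similarity_cos from common.h. Below is a minimal standalone sketch of the same cosine-similarity ranking over toy data, with no llama.cpp dependency; the helper name cosine_sim and the toy embeddings are ours, not part of the package.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // cosine similarity of two vectors; this reduces to a plain dot product
    // when both inputs are unit-normalized, as llama_embd_normalize produces
    static float cosine_sim(const std::vector<float> & a, const std::vector<float> & b) {
        float dot = 0.0f, na = 0.0f, nb = 0.0f;
        for (size_t i = 0; i < a.size(); i++) {
            dot += a[i] * b[i];
            na  += a[i] * a[i];
            nb  += b[i] * b[i];
        }
        return dot / (std::sqrt(na) * std::sqrt(nb));
    }

    int main() {
        // toy chunk embeddings and a query embedding (3 dimensions for brevity)
        std::vector<std::vector<float>> chunk_embs = {{1, 0, 0}, {0.7f, 0.7f, 0}, {0, 1, 0}};
        std::vector<float> query_emb = {1, 0.1f, 0};

        std::vector<std::pair<int, float>> sims;
        for (int i = 0; i < (int) chunk_embs.size(); i++) {
            sims.push_back({i, cosine_sim(chunk_embs[i], query_emb)});
        }
        // rank by descending similarity, as the example's query loop does
        std::sort(sims.begin(), sims.end(),
                  [](const std::pair<int, float> & a, const std::pair<int, float> & b) {
                      return a.second > b.second;
                  });
        for (const auto & s : sims) {
            printf("chunk %d: similarity %f\n", s.first, s.second);
        }
        return 0;
    }
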
package/src/llama.cpp/examples/save-load-state/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET save-load-state)
+ add_executable(${TARGET} save-load-state.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp
@@ -0,0 +1,246 @@
+ #include "common.h"
+ #include "llama.h"
+
+ #include <vector>
+ #include <cstdio>
+ #include <chrono>
+
+ int main(int argc, char ** argv) {
+     gpt_params params;
+
+     params.prompt = "The quick brown fox";
+
+     if (!gpt_params_parse(argc, argv, params)) {
+         return 1;
+     }
+
+     print_build_info();
+
+     if (params.n_predict < 0) {
+         params.n_predict = 16;
+     }
+
+     auto n_past = 0;
+
+     std::string result0;
+     std::string result1;
+     std::string result2;
+
+     // init
+     llama_model * model;
+     llama_context * ctx;
+
+     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+     if (model == nullptr || ctx == nullptr) {
+         fprintf(stderr, "%s : failed to init\n", __func__);
+         return 1;
+     }
+
+     // tokenize prompt
+     auto tokens = llama_tokenize(ctx, params.prompt, true);
+
+     // evaluate prompt
+     llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
+     n_past += tokens.size();
+
+     // save state (rng, logits, embedding and kv_cache) to file
+     {
+         std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
+         const size_t written = llama_state_get_data(ctx, state_mem.data());
+
+         FILE * fp_write = fopen("dump_state.bin", "wb");
+         fwrite(state_mem.data(), 1, written, fp_write);
+         fclose(fp_write);
+
+         fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size());
+     }
+
+     // save state (last tokens)
+     const auto n_past_saved = n_past;
+
+     // first run
+     printf("\nfirst run: %s", params.prompt.c_str());
+
+     for (auto i = 0; i < params.n_predict; i++) {
+         auto * logits = llama_get_logits(ctx);
+         auto n_vocab = llama_n_vocab(model);
+
+         std::vector<llama_token_data> candidates;
+         candidates.reserve(n_vocab);
+         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+             candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+         }
+         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+         auto next_token = llama_sample_token(ctx, &candidates_p);
+         auto next_token_str = llama_token_to_piece(ctx, next_token);
+
+         printf("%s", next_token_str.c_str());
+         result0 += next_token_str;
+
+         if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+             llama_free(ctx);
+             llama_free_model(model);
+             return 1;
+         }
+         n_past += 1;
+     }
+
+     printf("\n\n");
+
+     // free old context
+     llama_free(ctx);
+
+     // make new context
+     auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+     printf("\nsecond run: %s", params.prompt.c_str());
+
+     // load state (rng, logits, embedding and kv_cache) from file
+     {
+         std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
+
+         FILE * fp_read = fopen("dump_state.bin", "rb");
+         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+         fclose(fp_read);
+
+         if (read != llama_state_set_data(ctx2, state_mem.data())) {
+             fprintf(stderr, "\n%s : failed to read state\n", __func__);
+             llama_free(ctx2);
+             llama_free_model(model);
+             return 1;
+         }
+
+         fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+     }
+
+     // restore state (last tokens)
+     n_past = n_past_saved;
+
+     // second run
+     for (auto i = 0; i < params.n_predict; i++) {
+         auto * logits = llama_get_logits(ctx2);
+         auto n_vocab = llama_n_vocab(model);
+         std::vector<llama_token_data> candidates;
+         candidates.reserve(n_vocab);
+         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+             candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+         }
+         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+         auto next_token = llama_sample_token(ctx2, &candidates_p);
+         auto next_token_str = llama_token_to_piece(ctx2, next_token);
+
+         printf("%s", next_token_str.c_str());
+         result1 += next_token_str;
+
+         if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+             llama_free(ctx2);
+             llama_free_model(model);
+             return 1;
+         }
+         n_past += 1;
+     }
+
+     printf("\n\n");
+
+     llama_free(ctx2);
+
+     if (result0 != result1) {
+         fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
+         return 1;
+     }
+
+     // make new context
+     auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+     printf("\nsingle seq run: %s", params.prompt.c_str());
+
+     // load state (rng, logits, embedding and kv_cache) from file
+     {
+         std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+
+         FILE * fp_read = fopen("dump_state.bin", "rb");
+         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+         fclose(fp_read);
+
+         if (read != llama_state_set_data(ctx3, state_mem.data())) {
+             fprintf(stderr, "\n%s : failed to read state\n", __func__);
+             llama_free(ctx3);
+             llama_free_model(model);
+             return 1;
+         }
+
+         fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+     }
+
+     // restore state (last tokens)
+     n_past = n_past_saved;
+
+     // save seq 0 and load into seq 1
+     {
+         // save kv of seq 0
+         std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
+         const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+         if (ncopy != seq_store.size()) {
+             fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
+             llama_free(ctx3);
+             llama_free_model(model);
+             return 1;
+         }
+         fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
+
+         // erase whole kv
+         llama_kv_cache_clear(ctx3);
+         fprintf(stderr, "%s : kv cache cleared\n", __func__);
+
+         // restore kv into seq 1
+         const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+         if (nset != seq_store.size()) {
+             fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
+             llama_free(ctx3);
+             llama_free_model(model);
+             return 1;
+         }
+         fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+     }
+
+     // third run with seq 1 instead of 0
+     for (auto i = 0; i < params.n_predict; i++) {
+         auto * logits = llama_get_logits(ctx3);
+         auto n_vocab = llama_n_vocab(model);
+         std::vector<llama_token_data> candidates;
+         candidates.reserve(n_vocab);
+         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+             candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+         }
+         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+         auto next_token = llama_sample_token(ctx3, &candidates_p);
+         auto next_token_str = llama_token_to_piece(ctx3, next_token);
+
+         printf("%s", next_token_str.c_str());
+         result2 += next_token_str;
+
+         if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+             llama_free(ctx3);
+             llama_free_model(model);
+             return 1;
+         }
+         n_past += 1;
+     }
+
+     printf("\n");
+
+     llama_free(ctx3);
+     llama_free_model(model);
+
+     if (result0 != result2) {
+         fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
+         return 1;
+     }
+
+     fprintf(stderr, "\n%s : success\n", __func__);
+
+     return 0;
+ }
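
The save/load pattern this test exercises is always the same three-step dance: query the state size, serialize into a caller-owned buffer, deserialize into another context. A hedged sketch condensing it into two reusable helpers, using only the llama.h state API called in the file above; the helper names save_state_to_file and load_state_from_file are ours, not the package's:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    // serialize the full context state (rng, logits, embeddings, kv_cache) to a file
    static bool save_state_to_file(llama_context * ctx, const char * path) {
        std::vector<uint8_t> mem(llama_state_get_size(ctx));
        const size_t written = llama_state_get_data(ctx, mem.data());
        FILE * fp = fopen(path, "wb");
        if (!fp) { return false; }
        const bool ok = fwrite(mem.data(), 1, written, fp) == written;
        fclose(fp);
        return ok;
    }

    // restore a previously saved state into a (possibly fresh) context
    static bool load_state_from_file(llama_context * ctx, const char * path) {
        std::vector<uint8_t> mem(llama_state_get_size(ctx));
        FILE * fp = fopen(path, "rb");
        if (!fp) { return false; }
        const size_t read = fread(mem.data(), 1, mem.size(), fp);
        fclose(fp);
        // llama_state_set_data returns the number of bytes consumed,
        // which the test above checks against the bytes read
        return llama_state_set_data(ctx, mem.data()) == read;
    }
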
package/src/llama.cpp/examples/server/CMakeLists.txt
@@ -0,0 +1,40 @@
+ set(TARGET server)
+ option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+ set(TARGET_SRCS
+     server.cpp
+     utils.hpp
+     httplib.h
+ )
+ set(PUBLIC_ASSETS
+     index.html
+     index.js
+     completion.js
+     json-schema-to-grammar.mjs
+ )
+ foreach(asset ${PUBLIC_ASSETS})
+     set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
+     set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
+     list(APPEND TARGET_SRCS ${output})
+     add_custom_command(
+         DEPENDS "${input}"
+         OUTPUT "${output}"
+         COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
+     )
+ endforeach()
+ add_executable(${TARGET} ${TARGET_SRCS})
+ install(TARGETS ${TARGET} RUNTIME)
+ target_compile_definitions(${TARGET} PRIVATE
+     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+ )
+ target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+ if (LLAMA_SERVER_SSL)
+     find_package(OpenSSL REQUIRED)
+     target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+     target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
+ endif()
+ if (WIN32)
+     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+ endif()
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
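
The foreach loop above runs each file in public/ through scripts/xxd.cmake, generating a header per asset so server.cpp can serve its web UI from memory with no files on disk. A toy illustration of what such an xxd -i style header plus a consumer looks like; the symbol names index_html and index_html_len are our assumption, not taken from the script:

    #include <cstdio>

    // bytes of a trivial public/index.html, as the generated header would carry
    unsigned char index_html[] = { 0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e }; // "<html>"
    unsigned int index_html_len = sizeof(index_html);

    int main() {
        // the server can answer requests from the embedded asset directly
        fwrite(index_html, 1, index_html_len, stdout);
        return 0;
    }
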
package/src/llama.cpp/examples/server/bench/requirements.txt
@@ -0,0 +1,2 @@
+ matplotlib
+ requests