@fugood/llama.node 0.3.0 → 0.3.2

This diff shows the content changes between these publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
--- /dev/null
+++ b/package/src/llama.cpp/examples/llava/minicpmv-cli.cpp
@@ -0,0 +1,323 @@
+#include "arg.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <iostream> // TODO: remove me
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static struct llama_model * llava_init(gpt_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    if (params->n_ctx < 2048) {
+        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
+        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        ctx_params.n_ctx = 2048;
+    } else {
+        ctx_params.n_ctx = params->n_ctx;
+    }
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+}
+
+static struct clip_ctx * clip_init_context(gpt_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    return ctx_clip;
+}
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string str2 = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+}
+
+static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
+    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+
+    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    slice_embed->embed = image_embed;
+    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
+    llava_image_embed_free(slice_embed);
+}
+
+static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
+    std::string system_prompt;
+    int idx = 0;
+    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (has_minicpmv_projector == 2) {
+        system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+    }
+    else if (has_minicpmv_projector == 3) {
+        system_prompt = "<|im_start|>user\n";
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
+    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+    if (num_image_embeds > 1) {
+        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+            for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                if (j == num_image_embeds_col - 1) {
+                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                }
+            }
+        }
+        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+}
+
+static const char * sample(struct gpt_sampler * smpl,
+                           struct llama_context * ctx_llama,
+                           int * n_past) {
+    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
+    gpt_sampler_accept(smpl, id, true);
+    static std::string ret;
+    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
+
+static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
+    auto * ctx_clip = clip_init_context(params);
+    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    if (!embeds) {
+        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
+        return NULL;
+    }
+
+    // process the prompt
+    if (params->prompt.empty() && params->interactive == false) {
+        LOG_ERR("prompt should be given or interactive mode should be on");
+        return NULL;
+    }
+
+    auto * model = llava_init(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
+        return NULL;
+    }
+    const int64_t t_llava_init_start_us = ggml_time_us();
+    auto * ctx_llava = llava_init_context(params, model);
+    ctx_llava->ctx_clip = ctx_clip;
+    const int64_t t_llava_init_end_us = ggml_time_us();
+    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
+    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+
+    const int64_t t_process_image_start_us = ggml_time_us();
+    process_image(ctx_llava, embeds, params, n_past);
+    const int64_t t_process_image_end_us = ggml_time_us();
+    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
+    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+
+    llava_image_embed_free(embeds);
+    return ctx_llava;
+}
+
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+    std::string user_prompt = prompt;
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (!is_first) {
+        if (has_minicpmv_projector == 2) {
+            user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
+        }
+        else if (has_minicpmv_projector == 3) {
+            user_prompt = "<|im_start|>user\n" + prompt;
+        }
+    }
+
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+    if (has_minicpmv_projector == 2) {
+        eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+    }
+    else if (has_minicpmv_projector == 3) {
+        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+    }
+
+    // generate the response
+
+    LOG_INF("\n");
+
+    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    return smpl;
+}
+
+static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
+
+    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
+    return tmp;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+        return 1;
+    }
+
+    gpt_init();
+
+    if (params.mmproj.empty() || (params.image.empty())) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    for (auto & image : params.image) {
+        int n_past = 0;
+        auto * ctx_llava = minicpmv_init(&params, image, n_past);
+
+        if (!params.prompt.empty()) {
+            LOG("<user>%s\n", params.prompt.c_str());
+            LOG("<assistant>");
+            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
+            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+            std::string response;
+            bool have_tmp = false;
+            for (int i = 0; i < max_tgt_len; i++) {
+                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                response += tmp;
+                if (strcmp(tmp, "</s>") == 0){
+                    if (!have_tmp) {
+                        continue;
+                    }
+                    break;
+                }
+                if (strstr(tmp, "###")) break; // Yi-VL behavior
+                have_tmp = true;
+                printf("%s", tmp);
+                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+
+                fflush(stdout);
+            }
+            gpt_sampler_free(smpl);
+        }else {
+            while (true) {
+                LOG("<user>");
+                std::string prompt;
+                std::getline(std::cin, prompt);
+                LOG("<assistant>");
+                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+                std::string response;
+                for (int i = 0; i < max_tgt_len; i++) {
+                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                    response += tmp;
+                    if (strcmp(tmp, "</s>") == 0) break;
+                    if (strstr(tmp, "###")) break; // Yi-VL behavior
+                    printf("%s", tmp);// mistral llava-1.6
+                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                    fflush(stdout);
+                }
+                gpt_sampler_free(smpl);
+            }
+        }
+        printf("\n");
+        llama_perf_context_print(ctx_llava->ctx_llama);
+
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+
+    return 0;
+}
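Note on the example added above: minicpmv-cli drives generation one token at a time through the gpt_sampler API that this release adopts (sample, accept, detokenize, feed the token back with llama_decode). A condensed sketch of that loop, built only from calls that appear in the hunk above and assuming ctx, model, smpl and n_past were set up as in minicpmv_init()/llama_init(), might look like this:

// Hypothetical helper, not part of the diff: collects a response using the
// same sample/accept/decode cycle as sample() + llama_loop() above.
static std::string generate_response(llama_context * ctx, llama_model * model,
                                     gpt_sampler * smpl, int & n_past, int max_tokens) {
    std::string response;
    for (int i = 0; i < max_tokens; i++) {
        // pick the next token and let the sampler record it
        llama_token id = gpt_sampler_sample(smpl, ctx, -1);
        gpt_sampler_accept(smpl, id, true);

        if (llama_token_is_eog(model, id)) {
            break; // end-of-generation token, nothing more to emit
        }
        response += llama_token_to_piece(ctx, id);

        // feed the sampled token back so the next logits condition on it
        if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_past, 0))) {
            break; // decode failed (e.g. context/KV cache exhausted)
        }
        n_past += 1;
    }
    return response;
}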
--- a/package/src/llama.cpp/examples/llava/requirements.txt
+++ b/package/src/llama.cpp/examples/llava/requirements.txt
@@ -2,3 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch~=2.2.1
+torchvision~=0.17.1
--- a/package/src/llama.cpp/examples/lookahead/lookahead.cpp
+++ b/package/src/llama.cpp/examples/lookahead/lookahead.cpp
@@ -1,7 +1,9 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -37,32 +39,27 @@ struct ngram_container {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
+    gpt_init();
+
     const int W = 15; // lookahead window
     const int N = 5; // n-gram size
     const int G = 15; // max verification n-grams
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookahead", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
    llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the target model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // Tokenize the prompt
     std::vector<llama_token> inp;
@@ -75,14 +72,14 @@ int main(int argc, char ** argv) {
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -118,7 +115,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
 
     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);
@@ -159,14 +156,14 @@ int main(int argc, char ** argv) {
 
     // sample first token
     {
-        id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
+        id = gpt_sampler_sample(smpl, ctx, 0);
 
-        llama_sampling_accept(ctx_sampling, ctx, id, true);
+        gpt_sampler_accept(smpl, id, true);
 
        {
            const std::string token_str = llama_token_to_piece(ctx, id);
 
-            printf("%s", token_str.c_str());
+            LOG("%s", token_str.c_str());
            fflush(stdout);
        }
    }
@@ -256,7 +253,7 @@ int main(int argc, char ** argv) {
        }
 
        if (llama_decode(ctx, batch) != 0) {
-            fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
+            LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
            return 1;
        }
 
@@ -284,19 +281,19 @@ int main(int argc, char ** argv) {
            }
 
            // sample the next token
-            id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
+            id = gpt_sampler_sample(smpl, ctx, i_batch);
 
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            gpt_sampler_accept(smpl, id, true);
 
            // print
            {
                const std::string token_str = llama_token_to_piece(ctx, id);
 
                if (v == 0) {
-                    printf("%s", token_str.c_str());
+                    LOG("%s", token_str.c_str());
                } else {
                    // print light cyan
-                    printf("\033[0;96m%s\033[0m", token_str.c_str());
+                    LOG("\033[0;96m%s\033[0m", token_str.c_str());
                }
                fflush(stdout);
 
@@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
            // print known n-grams starting with token id (debug)
            if (0 && v == 0) {
                if (ngrams_observed.cnt[id] > 0) {
-                    printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
                }
 
                for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
-                    printf(" - ngram %2d: ", i);
+                    LOG(" - ngram %2d: ", i);
 
                    const int idx = id*(N - 1)*G + i*(N - 1);
 
                    for (int j = 0; j < N - 1; j++) {
                        const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
 
-                        printf("%s", token_str.c_str());
+                        LOG("%s", token_str.c_str());
                    }
 
-                    printf("\n");
+                    LOG("\n");
                }
            }
 
@@ -361,7 +358,7 @@ int main(int argc, char ** argv) {
            if (v == 0) {
                // sample from the last level
                for (int i = 0; i < W; i++) {
-                    tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                    tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                }
            } else {
                for (int i = 0; i < W; i++) {
@@ -455,23 +452,25 @@ int main(int argc, char ** argv) {
 
     auto t_dec_end = ggml_time_us();
 
-    LOG_TEE("\n\n");
+    LOG("\n\n");
+
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
 
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_INF("\n");
+    LOG_INF("W = %2d\n", W);
+    LOG_INF("N = %2d\n", N);
+    LOG_INF("G = %2d\n", G);
+    LOG_INF("\n");
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_accept = %d\n", n_accept);
 
-    LOG_TEE("\n");
-    LOG_TEE("W = %2d\n", W);
-    LOG_TEE("N = %2d\n", N);
-    LOG_TEE("G = %2d\n", G);
-    LOG_TEE("\n");
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_accept = %d\n", n_accept);
+    LOG_INF("\n");
+    gpt_perf_print(ctx, smpl);
 
-    llama_print_timings(ctx);
+    gpt_sampler_free(smpl);
 
     llama_kv_cache_view_free(&kvc_view);
-    llama_sampling_free(ctx_sampling);
 
     llama_batch_free(batch);
 
@@ -480,7 +479,7 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
--- a/package/src/llama.cpp/examples/lookup/lookup-create.cpp
+++ b/package/src/llama.cpp/examples/lookup/lookup-create.cpp
@@ -1,7 +1,8 @@
-#include "ggml.h"
-#include "llama.h"
+#include "arg.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "ggml.h"
+#include "llama.h"
 
 #include <cstdint>
 #include <fstream>
@@ -13,8 +14,7 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
@@ -22,11 +22,11 @@ int main(int argc, char ** argv){
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     GGML_ASSERT(model != nullptr);
 
     // tokenize the prompt
@@ -40,4 +40,6 @@ int main(int argc, char ** argv){
     fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
 
     llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+    return 0;
 }
--- a/package/src/llama.cpp/examples/lookup/lookup-stats.cpp
+++ b/package/src/llama.cpp/examples/lookup/lookup-stats.cpp
@@ -1,36 +1,37 @@
-#include "ggml.h"
+#include "arg.h"
 #include "common.h"
-#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"
 
-#include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>
 
 int main(int argc, char ** argv){
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
+    gpt_init();
+
     const int n_draft = params.n_draft;
 
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
@@ -49,7 +50,7 @@ int main(int argc, char ** argv){
         try {
             ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
         } catch (std::ifstream::failure const &) {
-            fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+            LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
            exit(1);
        }
    }
@@ -128,7 +129,7 @@ int main(int argc, char ** argv){
            const int64_t eta_min = eta_ms / (60*1000);
            const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
 
-            LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+            LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
        }
 
        // After each chunk, update the dynamic ngram cache with the context ngram cache:
@@ -136,24 +137,24 @@ int main(int argc, char ** argv){
        ngram_cache_context.clear();
    }
 
-    LOG_TEE("\n");
+    LOG("\n");
 
-    LOG_TEE("\n");
-    LOG_TEE("n_draft = %d\n", n_draft);
-    LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
-    LOG_TEE("n_drafted = %d\n", n_drafted);
-    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_INF("\n");
+    LOG_INF("n_draft = %d\n", n_draft);
+    LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
+    LOG_INF("n_drafted = %d\n", n_drafted);
+    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_TEE("n_accept = %d\n", n_accept);
-    LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_INF("n_accept = %d\n", n_accept);
+    LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
     llama_free(ctx);
     llama_free_model(model);
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
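Across the lookahead and lookup hunks above, the same initialization migration repeats: gpt_params_parse now takes a LLAMA_EXAMPLE_* id, gpt_init() replaces the per-example log setup, and llama_init_from_gpt_params returns a llama_init_result that carries both the model and the context. A condensed sketch of that pattern, assembled from calls shown in this diff rather than copied from any single file, looks roughly like this:

// Hedged sketch of the initialization pattern the updated examples converge on
// (llama.cpp "common" API at this revision); the example id and error handling
// are illustrative, not taken verbatim from any one file.
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;

    // the example id selects which CLI flags get registered for this tool
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    gpt_init(); // sets up the common logging used by LOG/LOG_INF/LOG_ERR

    llama_backend_init();
    llama_numa_init(params.numa);

    // model and context now come back together in one struct
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL || ctx == NULL) {
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
        return 1;
    }

    // ... per-example work goes here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}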