@fugood/llama.node 0.3.17 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
|
@@ -1,590 +0,0 @@
|
|
|
1
|
-
#include "arg.h"
|
|
2
|
-
#include "common.h"
|
|
3
|
-
#include "console.h"
|
|
4
|
-
#include "sampling.h"
|
|
5
|
-
#include "log.h"
|
|
6
|
-
#include "llama.h"
|
|
7
|
-
|
|
8
|
-
#include <cassert>
|
|
9
|
-
#include <cinttypes>
|
|
10
|
-
#include <cmath>
|
|
11
|
-
#include <cstdio>
|
|
12
|
-
#include <cstring>
|
|
13
|
-
#include <ctime>
|
|
14
|
-
#include <fstream>
|
|
15
|
-
#include <iostream>
|
|
16
|
-
#include <sstream>
|
|
17
|
-
#include <string>
|
|
18
|
-
#include <vector>
|
|
19
|
-
|
|
20
|
-
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
|
21
|
-
#include <signal.h>
|
|
22
|
-
#include <unistd.h>
|
|
23
|
-
#elif defined (_WIN32)
|
|
24
|
-
#define WIN32_LEAN_AND_MEAN
|
|
25
|
-
#ifndef NOMINMAX
|
|
26
|
-
#define NOMINMAX
|
|
27
|
-
#endif
|
|
28
|
-
#include <windows.h>
|
|
29
|
-
#include <signal.h>
|
|
30
|
-
#endif
|
|
31
|
-
|
|
32
|
-
#if defined(_MSC_VER)
|
|
33
|
-
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
34
|
-
#endif
|
|
35
|
-
|
|
36
|
-
static llama_context ** g_ctx;
|
|
37
|
-
static llama_model ** g_model;
|
|
38
|
-
static common_sampler ** g_smpl;
|
|
39
|
-
static common_params * g_params;
|
|
40
|
-
static std::vector<llama_token> * g_input_tokens;
|
|
41
|
-
static std::ostringstream * g_output_ss;
|
|
42
|
-
static std::vector<llama_token> * g_output_tokens;
|
|
43
|
-
|
|
44
|
-
static bool is_interacting = false;
|
|
45
|
-
|
|
46
|
-
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
|
47
|
-
static void sigint_handler(int signo) {
|
|
48
|
-
if (signo == SIGINT) {
|
|
49
|
-
if (!is_interacting) {
|
|
50
|
-
is_interacting = true;
|
|
51
|
-
} else {
|
|
52
|
-
console::cleanup();
|
|
53
|
-
LOG("\n");
|
|
54
|
-
common_perf_print(*g_ctx, *g_smpl);
|
|
55
|
-
|
|
56
|
-
// make sure all logs are flushed
|
|
57
|
-
LOG("Interrupted by user\n");
|
|
58
|
-
common_log_pause(common_log_main());
|
|
59
|
-
|
|
60
|
-
_exit(130);
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
#endif
|
|
65
|
-
|
|
66
|
-
int main(int argc, char ** argv) {
|
|
67
|
-
common_params params;
|
|
68
|
-
g_params = &params;
|
|
69
|
-
|
|
70
|
-
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
|
71
|
-
return 1;
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
common_init();
|
|
75
|
-
|
|
76
|
-
auto & sparams = params.sampling;
|
|
77
|
-
|
|
78
|
-
console::init(params.simple_io, params.use_color);
|
|
79
|
-
atexit([]() { console::cleanup(); });
|
|
80
|
-
|
|
81
|
-
if (params.logits_all) {
|
|
82
|
-
LOG_ERR("\n************\n");
|
|
83
|
-
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
|
84
|
-
LOG_ERR("************\n\n");
|
|
85
|
-
|
|
86
|
-
return 0;
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
if (params.embedding) {
|
|
90
|
-
LOG_ERR("\n************\n");
|
|
91
|
-
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
|
92
|
-
LOG_ERR("************\n\n");
|
|
93
|
-
|
|
94
|
-
return 0;
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
|
98
|
-
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
|
99
|
-
params.n_ctx = 8;
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
|
103
|
-
LOG_ERR("\n************\n");
|
|
104
|
-
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
|
105
|
-
LOG_ERR("************\n\n");
|
|
106
|
-
|
|
107
|
-
return 0;
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
if (params.rope_freq_base != 0.0) {
|
|
111
|
-
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
if (params.rope_freq_scale != 0.0) {
|
|
115
|
-
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
LOG_INF("%s: llama backend init\n", __func__);
|
|
119
|
-
llama_backend_init();
|
|
120
|
-
llama_numa_init(params.numa);
|
|
121
|
-
|
|
122
|
-
llama_model * model = nullptr;
|
|
123
|
-
llama_context * ctx = nullptr;
|
|
124
|
-
common_sampler * smpl = nullptr;
|
|
125
|
-
|
|
126
|
-
g_model = &model;
|
|
127
|
-
g_ctx = &ctx;
|
|
128
|
-
g_smpl = &smpl;
|
|
129
|
-
|
|
130
|
-
// load the model and apply lora adapter, if any
|
|
131
|
-
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
|
132
|
-
common_init_result llama_init = common_init_from_params(params);
|
|
133
|
-
|
|
134
|
-
model = llama_init.model.get();
|
|
135
|
-
ctx = llama_init.context.get();
|
|
136
|
-
|
|
137
|
-
if (model == NULL) {
|
|
138
|
-
LOG_ERR("%s: unable to load model\n", __func__);
|
|
139
|
-
return 1;
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
143
|
-
|
|
144
|
-
const int n_ctx_train = llama_model_n_ctx_train(model);
|
|
145
|
-
const int n_ctx = llama_n_ctx(ctx);
|
|
146
|
-
LOG_DBG("n_ctx: %d\n", n_ctx);
|
|
147
|
-
|
|
148
|
-
if (n_ctx > n_ctx_train) {
|
|
149
|
-
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
// print system information
|
|
153
|
-
{
|
|
154
|
-
LOG_INF("\n");
|
|
155
|
-
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
|
156
|
-
}
|
|
157
|
-
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
|
158
|
-
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
|
159
|
-
|
|
160
|
-
std::vector<llama_token> embd_inp;
|
|
161
|
-
std::vector<llama_token> embd_end;
|
|
162
|
-
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
|
163
|
-
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
|
164
|
-
|
|
165
|
-
GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
|
|
166
|
-
GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
|
|
167
|
-
|
|
168
|
-
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
|
169
|
-
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
|
170
|
-
|
|
171
|
-
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
|
172
|
-
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
|
173
|
-
if (add_bos) {
|
|
174
|
-
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
|
175
|
-
}
|
|
176
|
-
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
|
177
|
-
|
|
178
|
-
const llama_token middle_token = llama_vocab_fim_mid(vocab);
|
|
179
|
-
if (middle_token >= 0) {
|
|
180
|
-
embd_inp.push_back(middle_token);
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
LOG_DBG("add_bos: %d\n", add_bos);
|
|
184
|
-
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
|
185
|
-
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
|
186
|
-
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
|
187
|
-
|
|
188
|
-
// Should not run without any tokens
|
|
189
|
-
if (embd_inp.empty()) {
|
|
190
|
-
embd_inp.push_back(llama_vocab_bos(vocab));
|
|
191
|
-
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
if ((int) embd_inp.size() > n_ctx - 4) {
|
|
195
|
-
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
|
196
|
-
return 1;
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
// number of tokens to keep when resetting context
|
|
200
|
-
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
|
201
|
-
params.n_keep = (int)embd_inp.size();
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
|
205
|
-
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
|
206
|
-
|
|
207
|
-
// enable interactive mode if interactive start is specified
|
|
208
|
-
if (params.interactive_first) {
|
|
209
|
-
params.interactive = true;
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
if (params.verbose_prompt) {
|
|
213
|
-
LOG_INF("\n");
|
|
214
|
-
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
|
215
|
-
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
|
216
|
-
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
|
217
|
-
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
if (params.n_keep > 0) {
|
|
221
|
-
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
|
222
|
-
for (int i = 0; i < params.n_keep; i++) {
|
|
223
|
-
LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
|
|
224
|
-
}
|
|
225
|
-
LOG_CNT("'\n");
|
|
226
|
-
}
|
|
227
|
-
LOG_INF("\n");
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
if (params.interactive) {
|
|
231
|
-
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
|
232
|
-
struct sigaction sigint_action;
|
|
233
|
-
sigint_action.sa_handler = sigint_handler;
|
|
234
|
-
sigemptyset (&sigint_action.sa_mask);
|
|
235
|
-
sigint_action.sa_flags = 0;
|
|
236
|
-
sigaction(SIGINT, &sigint_action, NULL);
|
|
237
|
-
#elif defined (_WIN32)
|
|
238
|
-
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
|
239
|
-
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
|
240
|
-
};
|
|
241
|
-
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
|
242
|
-
#endif
|
|
243
|
-
|
|
244
|
-
LOG_INF("%s: interactive mode on.\n", __func__);
|
|
245
|
-
|
|
246
|
-
if (params.input_prefix_bos) {
|
|
247
|
-
LOG_INF("Input prefix with BOS\n");
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
if (!params.input_prefix.empty()) {
|
|
251
|
-
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
if (!params.input_suffix.empty()) {
|
|
255
|
-
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
|
256
|
-
}
|
|
257
|
-
}
|
|
258
|
-
smpl = common_sampler_init(model, sparams);
|
|
259
|
-
|
|
260
|
-
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
|
|
261
|
-
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
|
262
|
-
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
|
|
263
|
-
|
|
264
|
-
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
|
265
|
-
|
|
266
|
-
LOG_INF("\n");
|
|
267
|
-
LOG_INF("\n##### Infill mode #####\n\n");
|
|
268
|
-
if (params.interactive) {
|
|
269
|
-
const char *control_message;
|
|
270
|
-
if (params.multiline_input) {
|
|
271
|
-
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
|
|
272
|
-
" - To return control without starting a new line, end your input with '/'.\n";
|
|
273
|
-
} else {
|
|
274
|
-
control_message = " - Press Return to return control to LLaMA.\n"
|
|
275
|
-
" - To return control without starting a new line, end your input with '/'.\n"
|
|
276
|
-
" - If you want to submit another line, end your input with '\\'.\n";
|
|
277
|
-
}
|
|
278
|
-
LOG_INF("== Running in interactive mode. ==\n");
|
|
279
|
-
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
|
280
|
-
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
|
|
281
|
-
#endif
|
|
282
|
-
LOG_INF( "%s\n", control_message);
|
|
283
|
-
|
|
284
|
-
is_interacting = params.interactive_first;
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
bool input_echo = true;
|
|
288
|
-
|
|
289
|
-
int n_past = 0;
|
|
290
|
-
int n_remain = params.n_predict;
|
|
291
|
-
int n_consumed = 0;
|
|
292
|
-
|
|
293
|
-
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
|
294
|
-
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
|
295
|
-
std::ostringstream output_ss; g_output_ss = &output_ss;
|
|
296
|
-
|
|
297
|
-
// the first thing we will do is to output the prompt, so set color accordingly
|
|
298
|
-
console::set_display(console::prompt);
|
|
299
|
-
|
|
300
|
-
std::vector<llama_token> embd;
|
|
301
|
-
|
|
302
|
-
while (n_remain != 0 || params.interactive) {
|
|
303
|
-
// predict
|
|
304
|
-
if (!embd.empty()) {
|
|
305
|
-
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
|
306
|
-
// --prompt or --file which uses the same value.
|
|
307
|
-
int max_embd_size = n_ctx - 4;
|
|
308
|
-
|
|
309
|
-
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
|
310
|
-
if ((int) embd.size() > max_embd_size) {
|
|
311
|
-
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
|
312
|
-
embd.resize(max_embd_size);
|
|
313
|
-
|
|
314
|
-
console::set_display(console::error);
|
|
315
|
-
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
|
316
|
-
console::set_display(console::reset);
|
|
317
|
-
}
|
|
318
|
-
|
|
319
|
-
// infinite text generation via context swapping
|
|
320
|
-
// if we run out of context:
|
|
321
|
-
// - take the n_keep first tokens from the original prompt (via n_past)
|
|
322
|
-
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
|
323
|
-
if (n_past + (int) embd.size() > n_ctx) {
|
|
324
|
-
if (params.n_predict == -2) {
|
|
325
|
-
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
|
326
|
-
break;
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
const int n_left = n_past - params.n_keep - 1;
|
|
330
|
-
const int n_discard = n_left/2;
|
|
331
|
-
|
|
332
|
-
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
|
333
|
-
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
|
334
|
-
|
|
335
|
-
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
|
336
|
-
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
|
337
|
-
|
|
338
|
-
n_past -= n_discard;
|
|
339
|
-
|
|
340
|
-
LOG_DBG("after swap: n_past = %d\n", n_past);
|
|
341
|
-
|
|
342
|
-
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
|
343
|
-
|
|
344
|
-
}
|
|
345
|
-
|
|
346
|
-
// evaluate tokens in batches
|
|
347
|
-
// embd is typically prepared beforehand to fit within a batch, but not always
|
|
348
|
-
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
|
349
|
-
int n_eval = (int) embd.size() - i;
|
|
350
|
-
if (n_eval > params.n_batch) {
|
|
351
|
-
n_eval = params.n_batch;
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
|
355
|
-
|
|
356
|
-
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
|
357
|
-
LOG_ERR("%s : failed to eval\n", __func__);
|
|
358
|
-
return 1;
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
n_past += n_eval;
|
|
362
|
-
|
|
363
|
-
LOG_DBG("n_past = %d\n", n_past);
|
|
364
|
-
}
|
|
365
|
-
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
embd.clear();
|
|
369
|
-
|
|
370
|
-
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
|
371
|
-
const llama_token id = common_sampler_sample(smpl, ctx, -1);
|
|
372
|
-
|
|
373
|
-
common_sampler_accept(smpl, id, true);
|
|
374
|
-
|
|
375
|
-
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
|
376
|
-
|
|
377
|
-
embd.push_back(id);
|
|
378
|
-
|
|
379
|
-
// echo this to console
|
|
380
|
-
input_echo = true;
|
|
381
|
-
|
|
382
|
-
// decrement remaining sampling budget
|
|
383
|
-
--n_remain;
|
|
384
|
-
|
|
385
|
-
LOG_DBG("n_remain: %d\n", n_remain);
|
|
386
|
-
} else {
|
|
387
|
-
// some user input remains from prompt or interaction, forward it to processing
|
|
388
|
-
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
|
389
|
-
while ((int) embd_inp.size() > n_consumed) {
|
|
390
|
-
embd.push_back(embd_inp[n_consumed]);
|
|
391
|
-
|
|
392
|
-
// push the prompt in the sampling context in order to apply repetition penalties later
|
|
393
|
-
// for the prompt, we don't apply grammar rules
|
|
394
|
-
common_sampler_accept(smpl, embd_inp[n_consumed], false);
|
|
395
|
-
|
|
396
|
-
++n_consumed;
|
|
397
|
-
if ((int) embd.size() >= params.n_batch) {
|
|
398
|
-
break;
|
|
399
|
-
}
|
|
400
|
-
}
|
|
401
|
-
}
|
|
402
|
-
|
|
403
|
-
// display text
|
|
404
|
-
if (input_echo) {
|
|
405
|
-
for (auto id : embd) {
|
|
406
|
-
const std::string token_str = common_token_to_piece(ctx, id);
|
|
407
|
-
LOG("%s", token_str.c_str());
|
|
408
|
-
|
|
409
|
-
if (embd.size() > 1) {
|
|
410
|
-
input_tokens.push_back(id);
|
|
411
|
-
} else {
|
|
412
|
-
output_tokens.push_back(id);
|
|
413
|
-
output_ss << token_str;
|
|
414
|
-
}
|
|
415
|
-
}
|
|
416
|
-
}
|
|
417
|
-
// reset color to default if we there is no pending user input
|
|
418
|
-
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
|
419
|
-
console::set_display(console::reset);
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
// if not currently processing queued inputs;
|
|
423
|
-
if ((int) embd_inp.size() <= n_consumed) {
|
|
424
|
-
// deal with eot token in infill mode
|
|
425
|
-
if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
|
|
426
|
-
if (is_interacting && !params.interactive_first) {
|
|
427
|
-
// print an eot token
|
|
428
|
-
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
|
429
|
-
}
|
|
430
|
-
LOG("\n");
|
|
431
|
-
console::set_display(console::user_input);
|
|
432
|
-
std::string buffer;
|
|
433
|
-
std::string line;
|
|
434
|
-
bool another_line=true;
|
|
435
|
-
// set a new prefix via stdin
|
|
436
|
-
do {
|
|
437
|
-
another_line = console::readline(line, params.multiline_input);
|
|
438
|
-
buffer += line;
|
|
439
|
-
} while (another_line);
|
|
440
|
-
// check if we got an empty line, if so we use the old input
|
|
441
|
-
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
|
442
|
-
params.input_prefix = buffer;
|
|
443
|
-
}
|
|
444
|
-
buffer.clear();
|
|
445
|
-
// set a new suffix via stdin
|
|
446
|
-
do {
|
|
447
|
-
another_line = console::readline(line, params.multiline_input);
|
|
448
|
-
buffer += line;
|
|
449
|
-
} while (another_line);
|
|
450
|
-
// check if we got an empty line
|
|
451
|
-
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
|
452
|
-
params.input_suffix = buffer;
|
|
453
|
-
}
|
|
454
|
-
buffer.clear();
|
|
455
|
-
// done taking input, reset color
|
|
456
|
-
console::set_display(console::reset);
|
|
457
|
-
|
|
458
|
-
if (params.escape) {
|
|
459
|
-
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
|
460
|
-
string_process_escapes(params.input_prefix);
|
|
461
|
-
string_process_escapes(params.input_suffix);
|
|
462
|
-
}
|
|
463
|
-
|
|
464
|
-
// tokenize new prefix and suffix
|
|
465
|
-
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
|
466
|
-
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
|
467
|
-
|
|
468
|
-
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
|
469
|
-
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
|
470
|
-
|
|
471
|
-
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
|
472
|
-
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
|
473
|
-
if (add_bos) {
|
|
474
|
-
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
|
475
|
-
}
|
|
476
|
-
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
|
477
|
-
|
|
478
|
-
if (middle_token >= 0) {
|
|
479
|
-
embd_inp.push_back(middle_token);
|
|
480
|
-
}
|
|
481
|
-
|
|
482
|
-
embd.clear();
|
|
483
|
-
n_remain = params.n_predict;
|
|
484
|
-
n_past = 0;
|
|
485
|
-
n_consumed = 0;
|
|
486
|
-
is_interacting = false;
|
|
487
|
-
}
|
|
488
|
-
// deal with end of generation tokens in interactive mode
|
|
489
|
-
else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
|
490
|
-
LOG_DBG("found EOS token\n");
|
|
491
|
-
|
|
492
|
-
if (params.interactive) {
|
|
493
|
-
|
|
494
|
-
is_interacting = true;
|
|
495
|
-
LOG("\n");
|
|
496
|
-
console::set_display(console::user_input);
|
|
497
|
-
}
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
if (n_past > 0 && is_interacting && !params.interactive) {
|
|
501
|
-
LOG_DBG("waiting for user input\n");
|
|
502
|
-
|
|
503
|
-
if (params.input_prefix_bos) {
|
|
504
|
-
LOG_DBG("adding input prefix BOS token\n");
|
|
505
|
-
embd_inp.push_back(llama_vocab_bos(vocab));
|
|
506
|
-
}
|
|
507
|
-
|
|
508
|
-
std::string buffer;
|
|
509
|
-
if (!params.input_prefix.empty()) {
|
|
510
|
-
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
|
511
|
-
buffer += params.input_prefix;
|
|
512
|
-
LOG("%s", buffer.c_str());
|
|
513
|
-
}
|
|
514
|
-
|
|
515
|
-
std::string line;
|
|
516
|
-
bool another_line = true;
|
|
517
|
-
do {
|
|
518
|
-
another_line = console::readline(line, params.multiline_input);
|
|
519
|
-
buffer += line;
|
|
520
|
-
} while (another_line);
|
|
521
|
-
|
|
522
|
-
// done taking input, reset color
|
|
523
|
-
console::set_display(console::reset);
|
|
524
|
-
|
|
525
|
-
// Add tokens to embd only if the input buffer is non-empty
|
|
526
|
-
// Entering a empty line lets the user pass control back
|
|
527
|
-
if (buffer.length() > 1) {
|
|
528
|
-
// append input suffix if any
|
|
529
|
-
if (!params.input_suffix.empty()) {
|
|
530
|
-
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
|
531
|
-
buffer += params.input_suffix;
|
|
532
|
-
LOG("%s", params.input_suffix.c_str());
|
|
533
|
-
}
|
|
534
|
-
|
|
535
|
-
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
|
536
|
-
|
|
537
|
-
const size_t original_size = embd_inp.size();
|
|
538
|
-
|
|
539
|
-
const auto line_inp = common_tokenize(ctx, buffer, false);
|
|
540
|
-
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
|
541
|
-
|
|
542
|
-
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
|
543
|
-
|
|
544
|
-
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
|
545
|
-
const llama_token token = embd_inp[i];
|
|
546
|
-
output_tokens.push_back(token);
|
|
547
|
-
output_ss << common_token_to_piece(ctx, token);
|
|
548
|
-
}
|
|
549
|
-
|
|
550
|
-
n_remain -= line_inp.size();
|
|
551
|
-
LOG_DBG("n_remain: %d\n", n_remain);
|
|
552
|
-
} else {
|
|
553
|
-
LOG_DBG("empty line, passing control back\n");
|
|
554
|
-
}
|
|
555
|
-
|
|
556
|
-
input_echo = false; // do not echo this again
|
|
557
|
-
}
|
|
558
|
-
|
|
559
|
-
if (n_past > 0) {
|
|
560
|
-
if (is_interacting) {
|
|
561
|
-
common_sampler_reset(smpl);
|
|
562
|
-
}
|
|
563
|
-
is_interacting = false;
|
|
564
|
-
}
|
|
565
|
-
}
|
|
566
|
-
|
|
567
|
-
// end of generation
|
|
568
|
-
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
|
|
569
|
-
break;
|
|
570
|
-
}
|
|
571
|
-
|
|
572
|
-
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
|
573
|
-
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
|
574
|
-
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
|
575
|
-
n_remain = params.n_predict;
|
|
576
|
-
is_interacting = true;
|
|
577
|
-
}
|
|
578
|
-
}
|
|
579
|
-
if (!params.interactive && n_remain <= 0) {
|
|
580
|
-
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
|
581
|
-
}
|
|
582
|
-
|
|
583
|
-
LOG("\n");
|
|
584
|
-
common_perf_print(ctx, smpl);
|
|
585
|
-
|
|
586
|
-
common_sampler_free(smpl);
|
|
587
|
-
llama_backend_free();
|
|
588
|
-
|
|
589
|
-
return 0;
|
|
590
|
-
}
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
#include "arg.h"
|
|
2
|
-
#include "base64.hpp"
|
|
3
|
-
#include "log.h"
|
|
4
|
-
#include "common.h"
|
|
5
|
-
#include "sampling.h"
|
|
6
|
-
#include "clip.h"
|
|
7
|
-
#include "llava.h"
|
|
8
|
-
#include "llama.h"
|
|
9
|
-
#include "ggml.h"
|
|
10
|
-
|
|
11
|
-
// Write the command-line synopsis, followed by the list of accepted
// quantization type ids, to stderr.
//
//   argc - unused; present so the signature mirrors main()
//   argv - argv[0] is echoed as the program name in the synopsis
static void print_usage(int argc, char ** argv) {
    (void) argc;

    // One entry per supported "type" value, printed verbatim after the synopsis.
    static const char * const type_help[] = {
        " type = 2 - q4_0\n",
        " type = 3 - q4_1\n",
        " type = 6 - q5_0\n",
        " type = 7 - q5_1\n",
        " type = 8 - q8_0\n",
    };

    fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);

    for (size_t i = 0; i < sizeof(type_help) / sizeof(type_help[0]); ++i) {
        fputs(type_help[i], stderr);
    }
}
|
|
21
|
-
|
|
22
|
-
int main(int argc, char ** argv) {
|
|
23
|
-
if (argc != 4) {
|
|
24
|
-
print_usage(argc, argv);
|
|
25
|
-
return 1;
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
const std::string fname_inp = argv[1];
|
|
29
|
-
const std::string fname_out = argv[2];
|
|
30
|
-
|
|
31
|
-
const int itype = atoi(argv[3]);
|
|
32
|
-
|
|
33
|
-
const int64_t t_main_start_us = ggml_time_us();
|
|
34
|
-
|
|
35
|
-
int64_t t_quantize_us = 0;
|
|
36
|
-
|
|
37
|
-
// load the model
|
|
38
|
-
{
|
|
39
|
-
const int64_t t_start_us = ggml_time_us();
|
|
40
|
-
|
|
41
|
-
if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
|
|
42
|
-
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
|
43
|
-
return 1;
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
t_quantize_us = ggml_time_us() - t_start_us;
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
// report timing
|
|
50
|
-
{
|
|
51
|
-
const int64_t t_main_end_us = ggml_time_us();
|
|
52
|
-
|
|
53
|
-
printf("\n");
|
|
54
|
-
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
|
|
55
|
-
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
return 0;
|
|
59
|
-
}
|