@fugood/llama.node 0.2.0 → 0.2.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +14 -12
- package/src/llama.cpp/common/common.cpp +19 -5
- package/src/llama.cpp/common/common.h +2 -0
- package/src/llama.cpp/common/grammar-parser.cpp +9 -0
- package/src/llama.cpp/common/sampling.cpp +3 -3
- package/src/llama.cpp/common/sampling.h +1 -1
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
- package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
- package/src/llama.cpp/examples/main/main.cpp +5 -1
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
- package/src/llama.cpp/examples/server/server.cpp +12 -16
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/ggml-backend.c +2 -2
- package/src/llama.cpp/ggml-kompute.cpp +9 -3
- package/src/llama.cpp/ggml-quants.c +6 -0
- package/src/llama.cpp/ggml-rpc.cpp +1023 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +20 -143
- package/src/llama.cpp/ggml-vulkan.cpp +4 -2
- package/src/llama.cpp/ggml.c +116 -271
- package/src/llama.cpp/ggml.h +12 -15
- package/src/llama.cpp/llama.cpp +451 -265
- package/src/llama.cpp/llama.h +3 -0
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2

package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp

@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
 
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
     llama_model_params model_params = llama_model_default_params();
 
     auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
+Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
     llama_free_model(reinterpret_cast<llama_model *>(model));
 }
 
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
     auto model = reinterpret_cast<llama_model *>(jmodel);
 
     if (!model) {
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
     llama_free(reinterpret_cast<llama_context *>(context));
 }
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
     llama_backend_free();
 }
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
    llama_log_set(log_callback, NULL);
 }
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_bench_1model(
+Java_android_llama_cpp_LLamaAndroid_bench_1model(
         JNIEnv *env,
         jobject,
         jlong context_pointer,
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
     llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
 }
 
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
 
     // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
 
@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
     llama_backend_init();
 }
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
+Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
     return env->NewStringUTF(llama_print_system_info());
 }
 
 extern "C"
 JNIEXPORT jint JNICALL
-Java_com_example_llama_Llm_completion_1init(
+Java_android_llama_cpp_LLamaAndroid_completion_1init(
         JNIEnv *env,
         jobject,
         jlong context_pointer,
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_completion_1loop(
+Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
     llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
 }

package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     LOG_TEE("\n");
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
     std::string response = "";
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
@@ -295,14 +300,10 @@ int main(int argc, char ** argv) {
         return 1;
    }
 
-
+    if (prompt_contains_image(params.prompt)) {
        auto ctx_llava = llava_init_context(&params, model);
 
-        auto image_embed = load_image(ctx_llava, &params,
-        if (!image_embed) {
-            std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
-            return 1;
-        }
+        auto image_embed = load_image(ctx_llava, &params, "");
 
        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);
@@ -311,7 +312,26 @@ int main(int argc, char ** argv) {
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
+    } else {
+        for (auto & image : params.image) {
+            auto ctx_llava = llava_init_context(&params, model);
+
+            auto image_embed = load_image(ctx_llava, &params, image);
+            if (!image_embed) {
+                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+                return 1;
+            }
+
+            // process the prompt
+            process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+            llama_print_timings(ctx_llava->ctx_llama);
+            llava_image_embed_free(image_embed);
+            ctx_llava->model = NULL;
+            llava_free(ctx_llava);
+        }
    }
+
    llama_free_model(model);
 
    return 0;

package/src/llama.cpp/examples/main/main.cpp

@@ -523,6 +523,10 @@ int main(int argc, char ** argv) {
    }
 
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
 
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
@@ -879,7 +883,7 @@ int main(int argc, char ** argv) {
                }
 
                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false,
+                const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

package/src/llama.cpp/examples/rpc/rpc-server.cpp

@@ -0,0 +1,70 @@
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "ggml-rpc.h"
+#include <string>
+#include <stdio.h>
+
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#elif GGML_USE_METAL
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // if there aren't GPU Backends fallback to CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
+#ifdef GGML_USE_CUDA
+    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#else
+    // TODO: implement for other backends
+    *free_mem = 1;
+    *total_mem = 1;
+#endif
+}
+
+int main(int argc, char * argv[]) {
+    if (argc < 3) {
+        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
+        return 1;
+    }
+    const char * host = argv[1];
+    int port = std::stoi(argv[2]);
+    if (port <= 0 || port > 65535) {
+        fprintf(stderr, "Invalid port number: %d\n", port);
+        return 1;
+    }
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+    printf("Starting RPC server on %s:%d\n", host, port);
+    size_t free_mem, total_mem;
+    get_backend_memory(&free_mem, &total_mem);
+    std::string endpoint = std::string(host) + ":" + std::to_string(port);
+    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_free(backend);
+    return 0;
+}
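
The new rpc-server example serves a local ggml backend (CUDA, Metal, or CPU) over a plain "<host>:<port>" endpoint via start_rpc_server(). For illustration only (not part of this package), a client process would typically attach to that endpoint through the RPC backend API that the new ggml-rpc.h is expected to declare; a minimal sketch, assuming a ggml_backend_rpc_init(endpoint) entry point:

    // Hypothetical client-side sketch (not from this diff): connect to a running
    // rpc-server instance and treat it like any other ggml backend.
    #include "ggml-rpc.h"
    #include "ggml-backend.h"
    #include <stdio.h>

    int main(void) {
        // The endpoint format matches what rpc-server.cpp builds: "<host>:<port>".
        ggml_backend_t backend = ggml_backend_rpc_init("127.0.0.1:50052");
        if (backend == NULL) {
            fprintf(stderr, "failed to connect to RPC server\n");
            return 1;
        }
        // ... allocate buffers and evaluate ggml graphs on the remote backend ...
        ggml_backend_free(backend);
        return 0;
    }

The endpoint string is the only coupling between the two processes; everything else goes through the regular ggml-backend interface.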

package/src/llama.cpp/examples/server/server.cpp

@@ -651,9 +651,6 @@ struct server_context {
    std::string system_prompt;
    std::vector<llama_token> system_tokens;
 
-    std::string name_user; // this should be the antiprompt
-    std::string name_assistant;
-
    // slots / clients
    std::vector<server_slot> slots;
    json default_generation_settings_for_props;
@@ -673,6 +670,8 @@ struct server_context {
            llama_free_model(model);
            model = nullptr;
        }
+
+        llama_batch_free(batch);
    }
 
    bool load_model(const gpt_params & params_) {
@@ -1098,15 +1097,11 @@ struct server_context {
        system_need_update = false;
    }
 
-
-        system_prompt
-        name_user = sys_props.value("anti_prompt", "");
-        name_assistant = sys_props.value("assistant_name", "");
+    bool system_prompt_set(const std::string & sys_prompt) {
+        system_prompt = sys_prompt;
 
        LOG_VERBOSE("system prompt process", {
            {"system_prompt", system_prompt},
-            {"name_user", name_user},
-            {"name_assistant", name_assistant},
        });
 
        // release all slots
@@ -1115,6 +1110,7 @@ struct server_context {
        }
 
        system_need_update = true;
+        return true;
    }
 
    bool process_token(completion_token_output & result, server_slot & slot) {
@@ -1534,7 +1530,8 @@ struct server_context {
                }
 
                if (task.data.contains("system_prompt")) {
-
+                    std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
+                    system_prompt_set(sys_prompt);
 
                    for (server_slot & slot : slots) {
                        slot.n_past = 0;
@@ -2270,10 +2267,10 @@ struct server_context {
 
            const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
            if (n_probs > 0) {
-                const size_t
+                const size_t n_valid = slot.ctx_sampling->n_valid;
 
                // Make sure at least n_probs top tokens are at the front of the vector:
-                if (slot.sparams.temp == 0.0f && n_probs >
+                if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
                    llama_sample_top_k(ctx, &cur_p, n_probs, 0);
                }
 
@@ -2289,7 +2286,7 @@ struct server_context {
                for (size_t i = 0; i < n_probs; ++i) {
                    result.probs.push_back({
                        cur_p.data[i].id,
-                        i >=
+                        i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
                    });
                }
            }
@@ -2918,7 +2915,7 @@ int main(int argc, char ** argv) {
    server_params_parse(argc, argv, sparams, params);
 
    if (!sparams.system_prompt.empty()) {
-        ctx_server.system_prompt_set(
+        ctx_server.system_prompt_set(sparams.system_prompt);
    }
 
    if (params.model_alias == "unknown") {
@@ -3407,8 +3404,7 @@ int main(int argc, char ** argv) {
    const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        json data = {
-            { "
-            { "assistant_name", ctx_server.name_assistant.c_str() },
+            { "system_prompt", ctx_server.system_prompt.c_str() },
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params.n_parallel }
        };

package/src/llama.cpp/examples/server/utils.hpp

@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
    llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"] = json_value(body, "stream", false);
-    llama_params["temperature"] = json_value(body, "temperature",
+    llama_params["temperature"] = json_value(body, "temperature", 1.0);
    llama_params["top_p"] = json_value(body, "top_p", 1.0);
 
    // Apply chat template to the list of messages

package/src/llama.cpp/ggml-backend.c

@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 static char * fmt_size(size_t size) {
    static char buffer[128];
    if (size >= 1024*1024) {
-
+        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
    } else {
-
+        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
    }
    return buffer;
 }

package/src/llama.cpp/ggml-kompute.cpp

@@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
            case GGML_OP_SOFT_MAX:
                {
                    float scale;
-
+                    float max_bias;
 
-
+                    memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
+                    memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                    GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
-
+
+#pragma message("TODO: add ALiBi support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
+                    GGML_ASSERT(max_bias == 0.0f);
 
                    ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                } break;
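
For orientation (an illustration, not part of the diff): the two floats that the Kompute hunk above copies out of dst->op_params are the ones stored on a soft-max node when the compute graph is built. A minimal sketch of the producing side, assuming ggml's ggml_soft_max_ext() helper with a (ctx, a, mask, scale, max_bias) signature:

    // Hypothetical sketch (not from this diff): build a scaled soft-max node whose
    // op_params carry {scale, max_bias}; backends such as the Kompute path above
    // read them back with memcpy.
    #include "ggml.h"

    static struct ggml_tensor * build_soft_max(struct ggml_context * ctx,
                                               struct ggml_tensor * logits,
                                               struct ggml_tensor * mask) {
        const float scale    = 0.125f; // e.g. 1/sqrt(head_dim) for attention logits
        const float max_bias = 0.0f;   // ALiBi disabled; the Kompute backend asserts this
        return ggml_soft_max_ext(ctx, logits, mask, scale, max_bias);
    }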

package/src/llama.cpp/ggml-quants.c

@@ -14,6 +14,12 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT
 
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid warnings for hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
 #define UNUSED GGML_UNUSED
 
 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7