@fugood/llama.node 0.4.7 → 0.5.0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +20 -6
- package/lib/index.js +41 -17
- package/lib/index.ts +50 -23
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +9 -9
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +37 -18
- package/src/LlamaContext.h +1 -0
- package/src/TokenizeWorker.cpp +16 -12
- package/src/TokenizeWorker.h +2 -2
- package/src/common.hpp +54 -50
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
package/src/llama.cpp/ggml/src/ggml.c

@@ -64,12 +64,17 @@
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
-#if …
-(…
+#if defined(__linux__) || \
+    defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+    (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
+
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
+#if defined(__linux__)
+#include <sys/prctl.h>
+#endif
 
 #if defined(__ANDROID__)
 #include <unwind.h>
@@ -133,10 +138,36 @@ static void ggml_print_backtrace(void) {
     if (GGML_NO_BACKTRACE) {
         return;
     }
-    …
-    …
-    …
-    …
+#if defined(__linux__)
+    FILE * f = fopen("/proc/self/status", "r");
+    size_t size = 0;
+    char * line = NULL;
+    ssize_t length = 0;
+    while ((length = getline(&line, &size, f)) > 0) {
+        if (!strncmp(line, "TracerPid:", sizeof("TracerPid:") - 1) &&
+            (length != sizeof("TracerPid:\t0\n") - 1 || line[length - 2] != '0')) {
+            // Already being debugged, and the breakpoint is the later abort()
+            free(line);
+            fclose(f);
+            return;
+        }
+    }
+    free(line);
+    fclose(f);
+    int lock[2] = { -1, -1 };
+    (void) !pipe(lock); // Don't start gdb until after PR_SET_PTRACER
+#endif
+    const int parent_pid = getpid();
+    const int child_pid = fork();
+    if (child_pid < 0) { // error
+        return;
+    } else if (child_pid == 0) { // child
+        char attach[32];
+        snprintf(attach, sizeof(attach), "attach %d", parent_pid);
+#if defined(__linux__)
+        close(lock[1]);
+        (void) !read(lock[0], lock, 1);
+#endif
         // try gdb
         execlp("gdb", "gdb", "--batch",
             "-ex", "set style enabled on",
@@ -149,18 +180,18 @@ static void ggml_print_backtrace(void) {
         execlp("lldb", "lldb", "--batch",
             "-o", "bt",
             "-o", "quit",
-            "-p", attach,
+            "-p", &attach[sizeof("attach ") - 1],
             (char *) NULL);
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        // gdb failed, fallback to backtrace_symbols
+        ggml_print_backtrace_symbols();
+        _Exit(0);
+    } else { // parent
+#if defined(__linux__)
+        prctl(PR_SET_PTRACER, child_pid);
+        close(lock[1]);
+        close(lock[0]);
+#endif
+        waitpid(child_pid, NULL, 0);
     }
 }
 #else
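The rewritten Linux path only spawns the debugger when nobody is attached yet: it scans /proc/self/status for a non-zero TracerPid, and it grants the forked child permission to ptrace-attach via PR_SET_PTRACER (needed when Yama ptrace_scope is 1), using a pipe so gdb does not start before the permission is set. A standalone sketch of the TracerPid idiom, with a helper name of our choosing:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    // Returns 1 if some process is ptrace-attached to us (TracerPid != 0).
    static int is_being_traced(void) {
        FILE * f = fopen("/proc/self/status", "r");
        if (!f) {
            return 0; // not Linux, or /proc unavailable: assume not traced
        }
        char line[256];
        int traced = 0;
        while (fgets(line, sizeof(line), f)) {
            if (strncmp(line, "TracerPid:", 10) == 0) {
                traced = atoi(line + 10) != 0; // line is "TracerPid:\t<pid>"
                break;
            }
        }
        fclose(f);
        return traced;
    }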
@@ -1068,9 +1099,10 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "HARDSWISH",
     "HARDSIGMOID",
     "EXP",
+    "GELU_ERF",
 };
 
-static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
+static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");
 
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2470,6 +2502,20 @@ struct ggml_tensor * ggml_gelu_inplace(
     return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
 }
 
+// ggml_gelu_erf
+
+struct ggml_tensor * ggml_gelu_erf(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_ERF);
+}
+
+struct ggml_tensor * ggml_gelu_erf_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_ERF);
+}
+
 // ggml_gelu_quick
 
 struct ggml_tensor * ggml_gelu_quick(
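GELU_ERF is the exact GELU, 0.5 * x * (1 + erf(x / sqrt(2))), in contrast to the tanh approximation behind GGML_UNARY_OP_GELU. A minimal CPU sketch of the new public helpers, assuming a standard ggml build (in recent trees ggml_graph_compute_with_ctx is declared in ggml-cpu.h; older layouts keep it in ggml.h):

    #include "ggml.h"
    #include "ggml-cpu.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        float * xd = (float *) x->data;
        xd[0] = -2.0f; xd[1] = -0.5f; xd[2] = 0.5f; xd[3] = 2.0f;

        struct ggml_tensor * y = ggml_gelu_erf(ctx, x); // exact, erf-based GELU

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

        for (int i = 0; i < 4; i++) {
            printf("gelu_erf(% .2f) = % f\n", xd[i], ((float *) y->data)[i]);
        }

        ggml_free(ctx);
        return 0;
    }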
package/src/llama.cpp/include/llama.h

@@ -361,10 +361,11 @@ extern "C" {
 
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // …
-        bool flash_attn;  // …
-        bool no_perf;     // …
-        bool op_offload;  // …
+        bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // use flash attention [EXPERIMENTAL]
+        bool no_perf;     // measure performance timings
+        bool op_offload;  // offload host tensor operations to device
+        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     };
 
     // model quantization parameters
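The new swa_full flag controls whether models with sliding-window attention keep a full-size KV cache (the default) or only the window. A sketch of opting out, assuming the usual model-loading flow and the llama_init_from_model constructor from this llama.h:

    #include "llama.h"

    static struct llama_context * make_ctx(struct llama_model * model) {
        struct llama_context_params cp = llama_context_default_params();
        cp.n_ctx    = 8192;
        cp.swa_full = false; // keep only the sliding window in cache: less memory,
                             // but positions below the window are evicted (see
                             // llama_kv_self_seq_pos_min below)
        return llama_init_from_model(model, cp);
    }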
@@ -607,71 +608,14 @@ extern "C" {
     // KV cache
     //
 
-    // TODO: start using struct llama_kv_cache
-
-    // Information associated with an individual cell in the KV cache view.
-    struct llama_kv_cache_view_cell {
-        // The position for this cell. Takes KV cache shifts into account.
-        // May be negative if the cell is not populated.
-        llama_pos pos;
-    };
-
-    // An updateable view of the KV cache.
-    struct llama_kv_cache_view {
-        // Number of KV cache cells. This will be the same as the context size.
-        int32_t n_cells;
-
-        // Maximum number of sequences that can exist in a cell. It's not an error
-        // if there are more sequences in a cell than this value, however they will
-        // not be visible in the view cells_sequences.
-        int32_t n_seq_max;
-
-        // Number of tokens in the cache. For example, if there are two populated
-        // cells, the first with 1 sequence id in it and the second with 2 sequence
-        // ids then you'll have 3 tokens.
-        int32_t token_count;
-
-        // Number of populated cache cells.
-        int32_t used_cells;
-
-        // Maximum contiguous empty slots in the cache.
-        int32_t max_contiguous;
-
-        // Index to the start of the max_contiguous slot range. Can be negative
-        // when cache is full.
-        int32_t max_contiguous_idx;
-
-        // Information for an individual cell.
-        struct llama_kv_cache_view_cell * cells;
-
-        // The sequences for each cell. There will be n_seq_max items per cell.
-        llama_seq_id * cells_sequences;
-    };
-
-    // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-
-    // Free a KV cache view. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
-    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
-    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
-    ///
-
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
-
-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "use llama_kv_self_n_tokens instead");
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+            "Use llama_kv_self_seq_pos_max() instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
-
-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-            "use llama_kv_self_used_cells instead");
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+            "Use llama_kv_self_seq_pos_max() instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_self_clear(
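Callers of the now-deprecated counters can reproduce them with the per-sequence position API; this sketch mirrors what llama_kv_self_n_tokens() does internally in this release (n_seq_max is whatever was configured in the context params):

    static int32_t cached_tokens(struct llama_context * ctx, int32_t n_seq_max) {
        int32_t res = 0;
        for (llama_seq_id s = 0; s < n_seq_max; s++) {
            const llama_pos p0 = llama_kv_self_seq_pos_min(ctx, s);
            const llama_pos p1 = llama_kv_self_seq_pos_max(ctx, s);
            if (p0 >= 0) {            // p0 < 0 means the sequence is empty
                res += (p1 - p0) + 1; // inclusive position span
            }
        }
        return res;
    }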
@@ -730,10 +674,18 @@ extern "C" {
             llama_pos p1,
             int d);
 
+    // Returns the smallest position present in the KV cache for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+            struct llama_context * ctx,
+            llama_seq_id seq_id);
+
     // Returns the largest position present in the KV cache for the specified sequence
+    // Return -1 if the sequence is empty
     LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
-            …
+            llama_seq_id seq_id);
 
     // Defragment the KV cache
     // This will be applied:
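With swa_full == false, seq_pos_min can be greater than zero because positions that fell out of the sliding window are evicted. A sketch of a cache-reuse check that accounts for this (helper name ours):

    #include <stdbool.h>

    // True if positions [0, n_past) for sequence s are still in the cache.
    static bool prefix_cached(struct llama_context * ctx, llama_seq_id s, llama_pos n_past) {
        const llama_pos p0 = llama_kv_self_seq_pos_min(ctx, s);
        const llama_pos p1 = llama_kv_self_seq_pos_max(ctx, s);
        return p0 == 0 && p1 >= n_past - 1;
    }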
@@ -747,61 +699,6 @@ extern "C" {
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
     LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
 
-    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx),
-        "use llama_kv_self_clear instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1),
-        "use llama_kv_self_seq_rm instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1),
-        "use llama_kv_self_seq_cp instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "use llama_kv_self_seq_keep instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta),
-        "use llama_kv_self_seq_add instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d),
-        "use llama_kv_self_seq_div instead");
-
-    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "use llama_kv_self_seq_pos_max instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
-        "use llama_kv_self_defrag instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
-        "use llama_kv_self_can_shift instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
-        "use llama_kv_self_update instead");
-
-
     //
     // State / sessions
     //
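The removed wrappers had been deprecated since the llama_kv_self_* names were introduced; migration is a pure rename with identical arguments, for example:

    // before 0.5.0:  llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
    // from   0.5.0:
    llama_kv_self_seq_rm(ctx, seq_id, /*p0=*/-1, /*p1=*/-1); // negative bounds = entire range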
@@ -943,9 +840,12 @@ extern "C" {
     // Requires KV cache.
     // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
-    // …
-    // …
-    // …
+    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    //    0 - success
+    //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    //    2 - aborted
+    //   -1 - invalid input batch
+    // < -1 - error
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
             struct llama_batch batch);
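A sketch of handling the now-documented return codes; since the KV cache is rolled back on any non-zero return, retrying with a smaller batch is safe:

    #include <stdbool.h>

    static bool decode_checked(struct llama_context * ctx, struct llama_batch batch) {
        const int32_t ret = llama_decode(ctx, batch);
        switch (ret) {
            case 0:  return true;  // success
            case 1:  return false; // no KV slot: shrink the batch or grow n_ctx, then retry
            case 2:  return false; // aborted via the abort callback
            default: return false; // -1 invalid batch, < -1 internal error
        }
    }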
package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt

@@ -1,3 +1,7 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.2.1
+torch~=2.2.1; platform_machine != "s390x"
+
+# torch s390x packages can only be found from nightly builds
+--extra-index-url https://download.pytorch.org/whl/nightly
+torch>=0.0.0.dev0; platform_machine == "s390x"
package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt

@@ -1,3 +1,7 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.2.1
+torch~=2.2.1; platform_machine != "s390x"
+
+# torch s390x packages can only be found from nightly builds
+--extra-index-url https://download.pytorch.org/whl/nightly
+torch>=0.0.0.dev0; platform_machine == "s390x"
package/src/llama.cpp/src/llama-batch.cpp

@@ -1,5 +1,6 @@
 #include "llama-batch.h"
 
+#include <cassert>
 #include <cstring>
 #include <algorithm>
 
@@ -281,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
     batch = in_batch;
     GGML_ASSERT(batch.n_tokens > 0);
     if (!batch.pos) {
+        assert(p0 >= 0);
         pos.resize(batch.n_tokens);
         for (int32_t i = 0; i < batch.n_tokens; i++) {
-            pos[i] = …
+            pos[i] = p0 + i;
         }
         batch.pos = pos.data();
     }
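The assert guards the one case where positions must be synthesized: a batch with pos == NULL, as produced by llama_batch_get_one(). A sketch of that path from the caller's side:

    // batch.pos == NULL here; on decode, positions are filled in as
    // seq_pos_max(0) + 1, seq_pos_max(0) + 2, ... (see llama-context.cpp below)
    static int32_t decode_tail(struct llama_context * ctx, llama_token * tokens, int32_t n) {
        struct llama_batch batch = llama_batch_get_one(tokens, n);
        return llama_decode(ctx, batch);
    }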
package/src/llama.cpp/src/llama-context.cpp

@@ -93,6 +93,7 @@ llama_context::llama_context(
     }
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
     cparams.op_offload = params.op_offload;
 
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -176,8 +177,9 @@ llama_context::llama_context(
     // init the memory module
     if (!hparams.vocab_only) {
         llama_memory_params params_mem = {
-            /*.type_k   =*/ …
-            /*.type_v   =*/ …
+            /*.type_k   =*/ params.type_k,
+            /*.type_v   =*/ params.type_v,
+            /*.swa_full =*/ params.swa_full,
         };
 
         memory.reset(model.create_memory(params_mem, cparams));
@@ -855,11 +857,17 @@ int llama_context::decode(llama_batch & inp_batch) {
         return -1;
     }
 
+    if (!inp_batch.pos) {
+        if (inp_batch.seq_id) {
+            LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
+            return -1;
+        }
+    }
+
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
     // temporary allocate memory for the input batch if needed
-    …
-    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
+    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0) + 1);
 
     const llama_batch & batch = batch_allocr.batch;
 
@@ -947,8 +955,6 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
             return 1;
         }
 
@@ -2093,6 +2099,7 @@ llama_context_params llama_context_default_params() {
         /*.flash_attn =*/ false,
         /*.no_perf    =*/ true,
         /*.op_offload =*/ true,
+        /*.swa_full   =*/ true,
     };
 
     return result;
@@ -2287,65 +2294,51 @@ int32_t llama_apply_adapter_cvec(
     return res ? 0 : -1;
 }
 
-//
-// kv cache view
-//
-
-llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) {
-    const auto * kv = ctx->get_kv_self();
-    if (kv == nullptr) {
-        LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
-        return {};
-    }
-
-    return llama_kv_cache_view_init(*kv, n_seq_max);
-}
-
-void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) {
-    const auto * kv = ctx->get_kv_self();
-    if (kv == nullptr) {
-        LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
-        return;
-    }
-
-    llama_kv_cache_view_update(view, kv);
-}
-
 //
 // kv cache
 //
 
 // deprecated
-int32_t llama_get_kv_cache_token_count(const llama_context * ctx) {
-    return llama_kv_self_n_tokens(ctx);
-}
-
 int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
     const auto * kv = ctx->get_kv_self();
     if (!kv) {
        return 0;
     }
 
-    …
-}
+    int32_t res = 0;
 
-    …
-    …
-    …
+    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+        const llama_pos p0 = kv->seq_pos_min(s);
+        const llama_pos p1 = kv->seq_pos_max(s);
+
+        if (p0 >= 0) {
+            res += (p1 - p0) + 1;
+        }
+    }
+
+    return res;
 }
 
+// deprecated
+// note: this is the same as above - will be removed anyway, so it's ok
 int32_t llama_kv_self_used_cells(const llama_context * ctx) {
     const auto * kv = ctx->get_kv_self();
     if (!kv) {
        return 0;
     }
 
-    …
-}
+    int32_t res = 0;
 
-    …
-    …
-    …
+    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+        const llama_pos p0 = kv->seq_pos_min(s);
+        const llama_pos p1 = kv->seq_pos_max(s);
+
+        if (p0 >= 0) {
+            res += (p1 - p0) + 1;
+        }
+    }
+
+    return res;
 }
 
 void llama_kv_self_clear(llama_context * ctx) {
@@ -2357,15 +2350,6 @@ void llama_kv_self_clear(llama_context * ctx) {
     kv->clear();
 }
 
-// deprecated
-bool llama_kv_cache_seq_rm(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1) {
-    return llama_kv_self_seq_rm(ctx, seq_id, p0, p1);
-}
-
 bool llama_kv_self_seq_rm(
         llama_context * ctx,
         llama_seq_id seq_id,
@@ -2379,16 +2363,6 @@ bool llama_kv_self_seq_rm(
     return kv->seq_rm(seq_id, p0, p1);
 }
 
-// deprecated
-void llama_kv_cache_seq_cp(
-        llama_context * ctx,
-        llama_seq_id seq_id_src,
-        llama_seq_id seq_id_dst,
-        llama_pos p0,
-        llama_pos p1) {
-    llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
-}
-
 void llama_kv_self_seq_cp(
         llama_context * ctx,
         llama_seq_id seq_id_src,
@@ -2403,13 +2377,6 @@ void llama_kv_self_seq_cp(
     kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
 }
 
-// deprecated
-void llama_kv_cache_seq_keep(
-        llama_context * ctx,
-        llama_seq_id seq_id) {
-    llama_kv_self_seq_keep(ctx, seq_id);
-}
-
 void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
     auto * kv = ctx->get_kv_self();
     if (!kv) {
@@ -2419,16 +2386,6 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
     kv->seq_keep(seq_id);
 }
 
-// deprecated
-void llama_kv_cache_seq_add(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1,
-        llama_pos delta) {
-    llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
-}
-
 void llama_kv_self_seq_add(
         llama_context * ctx,
        llama_seq_id seq_id,
@@ -2443,16 +2400,6 @@ void llama_kv_self_seq_add(
     kv->seq_add(seq_id, p0, p1, delta);
 }
 
-// deprecated
-void llama_kv_cache_seq_div(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1,
-        int d) {
-    llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
-}
-
 void llama_kv_self_seq_div(
         llama_context * ctx,
         llama_seq_id seq_id,
@@ -2467,25 +2414,24 @@ void llama_kv_self_seq_div(
     kv->seq_div(seq_id, p0, p1, d);
 }
 
-…
-…
-…
+llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
+    const auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return -1;
+    }
+
+    return kv->seq_pos_min(seq_id);
 }
 
 llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
     const auto * kv = ctx->get_kv_self();
     if (!kv) {
-        return …
+        return -1;
     }
 
     return kv->seq_pos_max(seq_id);
 }
 
-// deprecated
-void llama_kv_cache_defrag(llama_context * ctx) {
-    llama_kv_self_defrag(ctx);
-}
-
 void llama_kv_self_defrag(llama_context * ctx) {
     auto * kv = ctx->get_kv_self();
     if (!kv) {

@@ -2496,11 +2442,6 @@ void llama_kv_self_defrag(llama_context * ctx) {
     kv->defrag_sched(-1.0f);
 }
 
-// deprecated
-bool llama_kv_cache_can_shift(const llama_context * ctx) {
-    return llama_kv_self_can_shift(ctx);
-}
-
 bool llama_kv_self_can_shift(const llama_context * ctx) {
     const auto * kv = ctx->get_kv_self();
     if (!kv) {

@@ -2510,11 +2451,6 @@ bool llama_kv_self_can_shift(const llama_context * ctx) {
     return kv->get_can_shift();
 }
 
-// deprecated
-void llama_kv_cache_update(llama_context * ctx) {
-    llama_kv_self_update(ctx);
-}
-
 // llama state API
 
 // deprecated
@@ -2637,7 +2573,21 @@ int32_t llama_encode(
 int32_t llama_decode(
         llama_context * ctx,
         llama_batch batch) {
-    …
+    int ret = ctx->decode(batch);
+
+    // defrag and try again
+    // TODO: distinguish return code when we are sure that even after defrag there is no space available
+    if (ret == 1) {
+        llama_kv_self_defrag(ctx);
+        ret = ctx->decode(batch);
+
+        if (ret == 1) {
+            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+            return ret;
+        }
+    }
+
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }