@fugood/llama.node 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +20 -6
  18. package/lib/index.js +41 -17
  19. package/lib/index.ts +50 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +9 -9
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +37 -18
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +16 -12
  26. package/src/TokenizeWorker.h +2 -2
  27. package/src/common.hpp +54 -50
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
@@ -64,12 +64,17 @@
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
  float ggml_table_f32_f16[1 << 16];

- #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
- (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
+ #if defined(__linux__) || \
+ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+ (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
+
  #include <unistd.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
+ #if defined(__linux__)
+ #include <sys/prctl.h>
+ #endif

  #if defined(__ANDROID__)
  #include <unwind.h>
@@ -133,10 +138,36 @@ static void ggml_print_backtrace(void) {
  if (GGML_NO_BACKTRACE) {
  return;
  }
- char attach[32];
- snprintf(attach, sizeof(attach), "attach %d", getpid());
- int pid = fork();
- if (pid == 0) {
+ #if defined(__linux__)
+ FILE * f = fopen("/proc/self/status", "r");
+ size_t size = 0;
+ char * line = NULL;
+ ssize_t length = 0;
+ while ((length = getline(&line, &size, f)) > 0) {
+ if (!strncmp(line, "TracerPid:", sizeof("TracerPid:") - 1) &&
+ (length != sizeof("TracerPid:\t0\n") - 1 || line[length - 2] != '0')) {
+ // Already being debugged, and the breakpoint is the later abort()
+ free(line);
+ fclose(f);
+ return;
+ }
+ }
+ free(line);
+ fclose(f);
+ int lock[2] = { -1, -1 };
+ (void) !pipe(lock); // Don't start gdb until after PR_SET_PTRACER
+ #endif
+ const int parent_pid = getpid();
+ const int child_pid = fork();
+ if (child_pid < 0) { // error
+ return;
+ } else if (child_pid == 0) { // child
+ char attach[32];
+ snprintf(attach, sizeof(attach), "attach %d", parent_pid);
+ #if defined(__linux__)
+ close(lock[1]);
+ (void) !read(lock[0], lock, 1);
+ #endif
  // try gdb
  execlp("gdb", "gdb", "--batch",
  "-ex", "set style enabled on",
@@ -149,18 +180,18 @@ static void ggml_print_backtrace(void) {
  execlp("lldb", "lldb", "--batch",
  "-o", "bt",
  "-o", "quit",
- "-p", attach,
+ "-p", &attach[sizeof("attach ") - 1],
  (char *) NULL);
- exit(EXIT_FAILURE);
- } else {
- int wstatus;
- waitpid(pid, &wstatus, 0);
- if (WIFEXITED(wstatus)) {
- if (WEXITSTATUS(wstatus) == EXIT_FAILURE) {
- // gdb failed, fallback to backtrace_symbols
- ggml_print_backtrace_symbols();
- }
- }
+ // gdb failed, fallback to backtrace_symbols
+ ggml_print_backtrace_symbols();
+ _Exit(0);
+ } else { // parent
+ #if defined(__linux__)
+ prctl(PR_SET_PTRACER, child_pid);
+ close(lock[1]);
+ close(lock[0]);
+ #endif
+ waitpid(child_pid, NULL, 0);
  }
  }
  #else
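
Note: the rewritten ggml_print_backtrace above now skips spawning a debugger when one is already attached (it reads the TracerPid field from /proc/self/status), and it uses prctl(PR_SET_PTRACER, child_pid) plus a pipe so the gdb/lldb child only attaches after the parent has granted ptrace permission. A minimal standalone sketch of the TracerPid check, for illustration only (this is not the patch code, and debugger_attached is a hypothetical helper name):

    // Linux-only sketch: returns true if another process is ptrace-attached to us.
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    static bool debugger_attached(void) {
        FILE * f = fopen("/proc/self/status", "r");
        if (!f) {
            return false;
        }
        char * line = NULL;
        size_t size = 0;
        bool traced = false;
        while (getline(&line, &size, f) > 0) {
            if (strncmp(line, "TracerPid:", 10) == 0) {
                // a non-zero pid after "TracerPid:" means a tracer is attached
                traced = strtol(line + 10, NULL, 10) != 0;
                break;
            }
        }
        free(line);
        fclose(f);
        return traced;
    }
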
@@ -1068,9 +1099,10 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
  "HARDSWISH",
  "HARDSIGMOID",
  "EXP",
+ "GELU_ERF",
  };

- static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
+ static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");


  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2470,6 +2502,20 @@ struct ggml_tensor * ggml_gelu_inplace(
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
  }

+ // ggml_gelu_erf
+
+ struct ggml_tensor * ggml_gelu_erf(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_ERF);
+ }
+
+ struct ggml_tensor * ggml_gelu_erf_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_ERF);
+ }
+
  // ggml_gelu_quick

  struct ggml_tensor * ggml_gelu_quick(
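
Note: the new GGML_UNARY_OP_GELU_ERF operator added above is the exact, erf-based form of GELU rather than the tanh approximation used by the existing GELU op. A scalar reference form, shown here only for orientation (gelu_erf_ref is a hypothetical helper, not part of the diff):

    #include <cmath>

    // exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
    static float gelu_erf_ref(float x) {
        return 0.5f * x * (1.0f + erff(x * 0.70710678118654752440f)); // x * 1/sqrt(2)
    }
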
@@ -361,10 +361,11 @@ extern "C" {
361
361
 
362
362
  // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
363
363
  bool embeddings; // if true, extract embeddings (together with logits)
364
- bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
365
- bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
366
- bool no_perf; // whether to measure performance timings
367
- bool op_offload; // whether to offload host tensor operations to device
364
+ bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
365
+ bool flash_attn; // use flash attention [EXPERIMENTAL]
366
+ bool no_perf; // measure performance timings
367
+ bool op_offload; // offload host tensor operations to device
368
+ bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
368
369
  };
369
370
 
370
371
  // model quantization parameters
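
Note: the new swa_full field defaults to true (see the llama_context_default_params change further down), so existing callers keep the previous behavior. A minimal sketch of opting into the smaller sliding-window-attention KV cache; make_context is a hypothetical helper and assumes an already loaded model:

    #include "llama.h"

    llama_context * make_context(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx    = 8192;
        cparams.swa_full = false; // allow an SWA-sized KV cache instead of a full-size one
        return llama_init_from_model(model, cparams);
    }
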
@@ -607,71 +608,14 @@ extern "C" {
  // KV cache
  //

- // TODO: start using struct llama_kv_cache
-
- // Information associated with an individual cell in the KV cache view.
- struct llama_kv_cache_view_cell {
- // The position for this cell. Takes KV cache shifts into account.
- // May be negative if the cell is not populated.
- llama_pos pos;
- };
-
- // An updateable view of the KV cache.
- struct llama_kv_cache_view {
- // Number of KV cache cells. This will be the same as the context size.
- int32_t n_cells;
-
- // Maximum number of sequences that can exist in a cell. It's not an error
- // if there are more sequences in a cell than this value, however they will
- // not be visible in the view cells_sequences.
- int32_t n_seq_max;
-
- // Number of tokens in the cache. For example, if there are two populated
- // cells, the first with 1 sequence id in it and the second with 2 sequence
- // ids then you'll have 3 tokens.
- int32_t token_count;
-
- // Number of populated cache cells.
- int32_t used_cells;
-
- // Maximum contiguous empty slots in the cache.
- int32_t max_contiguous;
-
- // Index to the start of the max_contiguous slot range. Can be negative
- // when cache is full.
- int32_t max_contiguous_idx;
-
- // Information for an individual cell.
- struct llama_kv_cache_view_cell * cells;
-
- // The sequences for each cell. There will be n_seq_max items per cell.
- llama_seq_id * cells_sequences;
- };
-
- // Create an empty KV cache view. (use only for debugging purposes)
- LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-
- // Free a KV cache view. (use only for debugging purposes)
- LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
- // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
- // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
- LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
- ///
-
  // Returns the number of tokens in the KV cache (slow, use only for debug)
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
- LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
-
- DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
- "use llama_kv_self_n_tokens instead");
+ DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+ "Use llama_kv_self_seq_pos_max() instead");

  // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
- LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
-
- DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
- "use llama_kv_self_used_cells instead");
+ DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+ "Use llama_kv_self_seq_pos_max() instead");

  // Clear the KV cache - both cell info is erased and KV data is zeroed
  LLAMA_API void llama_kv_self_clear(
@@ -730,10 +674,18 @@ extern "C" {
730
674
  llama_pos p1,
731
675
  int d);
732
676
 
677
+ // Returns the smallest position present in the KV cache for the specified sequence
678
+ // This is typically non-zero only for SWA caches
679
+ // Return -1 if the sequence is empty
680
+ LLAMA_API llama_pos llama_kv_self_seq_pos_min(
681
+ struct llama_context * ctx,
682
+ llama_seq_id seq_id);
683
+
733
684
  // Returns the largest position present in the KV cache for the specified sequence
685
+ // Return -1 if the sequence is empty
734
686
  LLAMA_API llama_pos llama_kv_self_seq_pos_max(
735
687
  struct llama_context * ctx,
736
- llama_seq_id seq_id);
688
+ llama_seq_id seq_id);
737
689
 
738
690
  // Defragment the KV cache
739
691
  // This will be applied:
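
Note: with llama_kv_self_n_tokens and llama_kv_self_used_cells deprecated above, cache occupancy is now queried per sequence through llama_kv_self_seq_pos_min and llama_kv_self_seq_pos_max, both of which return -1 for an empty sequence. A caller-side sketch (count_cached_positions is a hypothetical helper; it mirrors what the deprecated functions now compute internally, see the llama-context.cpp change below):

    #include "llama.h"

    // sums the per-sequence position ranges currently held in the KV cache
    int32_t count_cached_positions(llama_context * ctx, int32_t n_seq_max) {
        int32_t total = 0;
        for (llama_seq_id s = 0; s < n_seq_max; ++s) {
            const llama_pos p0 = llama_kv_self_seq_pos_min(ctx, s);
            const llama_pos p1 = llama_kv_self_seq_pos_max(ctx, s);
            if (p0 >= 0) {
                total += (p1 - p0) + 1;
            }
        }
        return total;
    }
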
@@ -747,61 +699,6 @@ extern "C" {
747
699
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
748
700
  LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
749
701
 
750
- DEPRECATED(LLAMA_API void llama_kv_cache_clear(
751
- struct llama_context * ctx),
752
- "use llama_kv_self_clear instead");
753
-
754
- DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
755
- struct llama_context * ctx,
756
- llama_seq_id seq_id,
757
- llama_pos p0,
758
- llama_pos p1),
759
- "use llama_kv_self_seq_rm instead");
760
-
761
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
762
- struct llama_context * ctx,
763
- llama_seq_id seq_id_src,
764
- llama_seq_id seq_id_dst,
765
- llama_pos p0,
766
- llama_pos p1),
767
- "use llama_kv_self_seq_cp instead");
768
-
769
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
770
- struct llama_context * ctx,
771
- llama_seq_id seq_id),
772
- "use llama_kv_self_seq_keep instead");
773
-
774
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
775
- struct llama_context * ctx,
776
- llama_seq_id seq_id,
777
- llama_pos p0,
778
- llama_pos p1,
779
- llama_pos delta),
780
- "use llama_kv_self_seq_add instead");
781
-
782
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
783
- struct llama_context * ctx,
784
- llama_seq_id seq_id,
785
- llama_pos p0,
786
- llama_pos p1,
787
- int d),
788
- "use llama_kv_self_seq_div instead");
789
-
790
- DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
791
- struct llama_context * ctx,
792
- llama_seq_id seq_id),
793
- "use llama_kv_self_seq_pos_max instead");
794
-
795
- DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
796
- "use llama_kv_self_defrag instead");
797
-
798
- DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
799
- "use llama_kv_self_can_shift instead");
800
-
801
- DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
802
- "use llama_kv_self_update instead");
803
-
804
-
805
702
  //
806
703
  // State / sessions
807
704
  //
@@ -943,9 +840,12 @@ extern "C" {
943
840
  // Requires KV cache.
944
841
  // For encode-decoder contexts, processes the batch using the decoder.
945
842
  // Positive return values does not mean a fatal error, but rather a warning.
946
- // 0 - success
947
- // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
948
- // < 0 - error. the KV cache state is restored to the state before this call
843
+ // Upon non-zero return values, the KV cache state is restored to the state before this call
844
+ // 0 - success
845
+ // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
846
+ // 2 - aborted
847
+ // -1 - invalid input batch
848
+ // < -1 - error
949
849
  LLAMA_API int32_t llama_decode(
950
850
  struct llama_context * ctx,
951
851
  struct llama_batch batch);
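
Note: the return-value contract documented above is finer-grained than before (2 for an aborted decode, -1 for an invalid input batch), and any non-zero return now restores the KV cache state. A sketch of how a caller might branch on it; decode_checked is a hypothetical helper, shown only as an illustration:

    #include "llama.h"
    #include <cstdio>

    // returns true if the batch was decoded, false otherwise
    static bool decode_checked(llama_context * ctx, llama_batch batch) {
        const int32_t ret = llama_decode(ctx, batch);
        if (ret == 0) {
            return true; // success
        }
        if (ret == 1) {
            fprintf(stderr, "no KV slot: shrink the batch or free cache space\n");
        } else if (ret == 2) {
            fprintf(stderr, "decode was aborted\n");
        } else {
            fprintf(stderr, "decode error %d (invalid batch or internal error)\n", ret);
        }
        return false;
    }
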
@@ -1,3 +1,7 @@
  -r ./requirements-convert_legacy_llama.txt
  --extra-index-url https://download.pytorch.org/whl/cpu
- torch~=2.2.1
+ torch~=2.2.1; platform_machine != "s390x"
+
+ # torch s390x packages can only be found from nightly builds
+ --extra-index-url https://download.pytorch.org/whl/nightly
+ torch>=0.0.0.dev0; platform_machine == "s390x"
@@ -1,3 +1,7 @@
  -r ./requirements-convert_legacy_llama.txt
  --extra-index-url https://download.pytorch.org/whl/cpu
- torch~=2.2.1
+ torch~=2.2.1; platform_machine != "s390x"
+
+ # torch s390x packages can only be found from nightly builds
+ --extra-index-url https://download.pytorch.org/whl/nightly
+ torch>=0.0.0.dev0; platform_machine == "s390x"
@@ -1,2 +1,4 @@
  -r ./requirements-convert_hf_to_gguf.txt
  --extra-index-url https://download.pytorch.org/whl/cpu
+ # torch s390x packages can only be found from nightly builds
+ --extra-index-url https://download.pytorch.org/whl/nightly
@@ -1,5 +1,6 @@
  #include "llama-batch.h"

+ #include <cassert>
  #include <cstring>
  #include <algorithm>

@@ -281,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
  batch = in_batch;
  GGML_ASSERT(batch.n_tokens > 0);
  if (!batch.pos) {
+ assert(p0 >= 0);
  pos.resize(batch.n_tokens);
  for (int32_t i = 0; i < batch.n_tokens; i++) {
- pos[i] = i + p0;
+ pos[i] = p0 + i;
  }
  batch.pos = pos.data();
  }
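
Note: per the llama_batch_allocr change above (and the decode() change further below), a batch passed without explicit positions now gets positions assigned sequentially from p0, which decode() derives from seq_pos_max(0) + 1, and p0 is asserted to be non-negative. A sketch of the common case this covers, where llama_batch_get_one supplies neither pos nor seq_id (decode_tokens is a hypothetical helper):

    #include "llama.h"
    #include <vector>

    // feed tokens with no explicit positions; they continue from the
    // last cached position of sequence 0
    static int32_t decode_tokens(llama_context * ctx, std::vector<llama_token> & toks) {
        llama_batch batch = llama_batch_get_one(toks.data(), (int32_t) toks.size());
        return llama_decode(ctx, batch);
    }
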
@@ -93,6 +93,7 @@ llama_context::llama_context(
  }

  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
  cparams.op_offload = params.op_offload;

  const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -176,8 +177,9 @@ llama_context::llama_context(
  // init the memory module
  if (!hparams.vocab_only) {
  llama_memory_params params_mem = {
- /*.type_k =*/ params.type_k,
- /*.type_v =*/ params.type_v,
+ /*.type_k =*/ params.type_k,
+ /*.type_v =*/ params.type_v,
+ /*.swa_full =*/ params.swa_full,
  };

  memory.reset(model.create_memory(params_mem, cparams));
@@ -855,11 +857,17 @@ int llama_context::decode(llama_batch & inp_batch) {
855
857
  return -1;
856
858
  }
857
859
 
860
+ if (!inp_batch.pos) {
861
+ if (inp_batch.seq_id) {
862
+ LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
863
+ return -1;
864
+ }
865
+ }
866
+
858
867
  llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
859
868
 
860
869
  // temporary allocate memory for the input batch if needed
861
- // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
862
- llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
870
+ llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0) + 1);
863
871
 
864
872
  const llama_batch & batch = batch_allocr.batch;
865
873
 
@@ -947,8 +955,6 @@

  // find KV slot
  if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
  return 1;
  }

@@ -2093,6 +2099,7 @@ llama_context_params llama_context_default_params() {
  /*.flash_attn =*/ false,
  /*.no_perf =*/ true,
  /*.op_offload =*/ true,
+ /*.swa_full =*/ true,
  };

  return result;
@@ -2287,65 +2294,51 @@ int32_t llama_apply_adapter_cvec(
  return res ? 0 : -1;
  }

- //
- // kv cache view
- //
-
- llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) {
- const auto * kv = ctx->get_kv_self();
- if (kv == nullptr) {
- LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
- return {};
- }
-
- return llama_kv_cache_view_init(*kv, n_seq_max);
- }
-
- void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) {
- const auto * kv = ctx->get_kv_self();
- if (kv == nullptr) {
- LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
- return;
- }
-
- llama_kv_cache_view_update(view, kv);
- }
-
  //
  // kv cache
  //

  // deprecated
- int32_t llama_get_kv_cache_token_count(const llama_context * ctx) {
- return llama_kv_self_n_tokens(ctx);
- }
-
  int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
  const auto * kv = ctx->get_kv_self();
  if (!kv) {
  return 0;
  }

- return kv->get_n_tokens();
- }
+ int32_t res = 0;

- // deprecated
- int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) {
- return llama_kv_self_used_cells(ctx);
+ for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+ const llama_pos p0 = kv->seq_pos_min(s);
+ const llama_pos p1 = kv->seq_pos_max(s);
+
+ if (p0 >= 0) {
+ res += (p1 - p0) + 1;
+ }
+ }
+
+ return res;
  }

+ // deprecated
+ // note: this is the same as above - will be removed anyway, so it's ok
  int32_t llama_kv_self_used_cells(const llama_context * ctx) {
  const auto * kv = ctx->get_kv_self();
  if (!kv) {
  return 0;
  }

- return kv->get_used_cells();
- }
+ int32_t res = 0;

- // deprecated
- void llama_kv_cache_clear(llama_context * ctx) {
- llama_kv_self_clear(ctx);
+ for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+ const llama_pos p0 = kv->seq_pos_min(s);
+ const llama_pos p1 = kv->seq_pos_max(s);
+
+ if (p0 >= 0) {
+ res += (p1 - p0) + 1;
+ }
+ }
+
+ return res;
  }

  void llama_kv_self_clear(llama_context * ctx) {
@@ -2357,15 +2350,6 @@ void llama_kv_self_clear(llama_context * ctx) {
  kv->clear();
  }

- // deprecated
- bool llama_kv_cache_seq_rm(
- llama_context * ctx,
- llama_seq_id seq_id,
- llama_pos p0,
- llama_pos p1) {
- return llama_kv_self_seq_rm(ctx, seq_id, p0, p1);
- }
-
  bool llama_kv_self_seq_rm(
  llama_context * ctx,
  llama_seq_id seq_id,
@@ -2379,16 +2363,6 @@ bool llama_kv_self_seq_rm(
  return kv->seq_rm(seq_id, p0, p1);
  }

- // deprecated
- void llama_kv_cache_seq_cp(
- llama_context * ctx,
- llama_seq_id seq_id_src,
- llama_seq_id seq_id_dst,
- llama_pos p0,
- llama_pos p1) {
- llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
- }
-
  void llama_kv_self_seq_cp(
  llama_context * ctx,
  llama_seq_id seq_id_src,
@@ -2403,13 +2377,6 @@ void llama_kv_self_seq_cp(
  kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
  }

- // deprecated
- void llama_kv_cache_seq_keep(
- llama_context * ctx,
- llama_seq_id seq_id) {
- llama_kv_self_seq_keep(ctx, seq_id);
- }
-
  void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
  auto * kv = ctx->get_kv_self();
  if (!kv) {
@@ -2419,16 +2386,6 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
  kv->seq_keep(seq_id);
  }

- // deprecated
- void llama_kv_cache_seq_add(
- llama_context * ctx,
- llama_seq_id seq_id,
- llama_pos p0,
- llama_pos p1,
- llama_pos delta) {
- llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
- }
-
  void llama_kv_self_seq_add(
  llama_context * ctx,
  llama_seq_id seq_id,
@@ -2443,16 +2400,6 @@ void llama_kv_self_seq_add(
  kv->seq_add(seq_id, p0, p1, delta);
  }

- // deprecated
- void llama_kv_cache_seq_div(
- llama_context * ctx,
- llama_seq_id seq_id,
- llama_pos p0,
- llama_pos p1,
- int d) {
- llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
- }
-
  void llama_kv_self_seq_div(
  llama_context * ctx,
  llama_seq_id seq_id,
@@ -2467,25 +2414,24 @@ void llama_kv_self_seq_div(
  kv->seq_div(seq_id, p0, p1, d);
  }

- // deprecated
- llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
- return llama_kv_self_seq_pos_max(ctx, seq_id);
+ llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
+ const auto * kv = ctx->get_kv_self();
+ if (!kv) {
+ return -1;
+ }
+
+ return kv->seq_pos_min(seq_id);
  }

  llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
  const auto * kv = ctx->get_kv_self();
  if (!kv) {
- return 0;
+ return -1;
  }

  return kv->seq_pos_max(seq_id);
  }

- // deprecated
- void llama_kv_cache_defrag(llama_context * ctx) {
- llama_kv_self_defrag(ctx);
- }
-
  void llama_kv_self_defrag(llama_context * ctx) {
  auto * kv = ctx->get_kv_self();
  if (!kv) {
@@ -2496,11 +2442,6 @@ void llama_kv_self_defrag(llama_context * ctx) {
  kv->defrag_sched(-1.0f);
  }

- // deprecated
- bool llama_kv_cache_can_shift(const llama_context * ctx) {
- return llama_kv_self_can_shift(ctx);
- }
-
  bool llama_kv_self_can_shift(const llama_context * ctx) {
  const auto * kv = ctx->get_kv_self();
  if (!kv) {
@@ -2510,11 +2451,6 @@ bool llama_kv_self_can_shift(const llama_context * ctx) {
  return kv->get_can_shift();
  }

- // deprecated
- void llama_kv_cache_update(llama_context * ctx) {
- llama_kv_self_update(ctx);
- }
-
  // llama state API

  // deprecated
@@ -2637,7 +2573,21 @@ int32_t llama_encode(
  int32_t llama_decode(
  llama_context * ctx,
  llama_batch batch) {
- const int ret = ctx->decode(batch);
+ int ret = ctx->decode(batch);
+
+ // defrag and try again
+ // TODO: distinguish return code when we are sure that even after defrag there is no space available
+ if (ret == 1) {
+ llama_kv_self_defrag(ctx);
+ ret = ctx->decode(batch);
+
+ if (ret == 1) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+ return ret;
+ }
+ }
+
  if (ret != 0) {
  LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }
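
Note: with the llama_decode change above, the wrapper itself now schedules a defrag pass and retries once when the first attempt returns 1, so a caller only sees 1 after that retry has also failed. At that point the remaining options are to shrink the batch or free cache space before trying again; one possible caller-side fallback, sketched here with the hypothetical helper decode_with_fallback and an application-chosen sequence to evict:

    #include "llama.h"

    static int32_t decode_with_fallback(llama_context * ctx, llama_batch batch,
                                        llama_seq_id evict_seq) {
        int32_t ret = llama_decode(ctx, batch); // already defrags + retries internally on ret == 1
        if (ret == 1) {
            // drop a sequence the application no longer needs, then retry once more
            llama_kv_self_seq_rm(ctx, evict_seq, -1, -1);
            ret = llama_decode(ctx, batch);
        }
        return ret;
    }
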