@novastera-oss/llamarn 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +12 -8
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +46 -65
  13. package/cpp/LlamaCppModel.h +5 -0
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/README.md +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
  17. package/cpp/llama.cpp/common/arg.cpp +8 -6
  18. package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
  19. package/cpp/llama.cpp/common/chat-parser.h +2 -1
  20. package/cpp/llama.cpp/common/chat.cpp +4 -4
  21. package/cpp/llama.cpp/common/common.cpp +2 -0
  22. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  23. package/cpp/llama.cpp/common/json-partial.h +2 -1
  24. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  25. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
  27. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  28. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  30. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
  32. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
  35. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  39. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
  41. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  42. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  43. package/cpp/llama.cpp/include/llama.h +12 -8
  44. package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
  45. package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
  46. package/cpp/llama.cpp/src/llama-batch.h +15 -10
  47. package/cpp/llama.cpp/src/llama-context.cpp +226 -151
  48. package/cpp/llama.cpp/src/llama-context.h +25 -8
  49. package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
  50. package/cpp/llama.cpp/src/llama-graph.h +25 -24
  51. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
  52. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
  53. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
  54. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
  55. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
  56. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
  57. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
  58. package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
  59. package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
  60. package/cpp/llama.cpp/src/llama-memory.h +44 -0
  61. package/cpp/llama.cpp/src/llama-model.cpp +23 -16
  62. package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
  63. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  64. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  65. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  66. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  67. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  68. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  69. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  70. package/cpp/rn-completion.cpp +63 -8
  71. package/cpp/rn-utils.hpp +8 -1
  72. package/ios/include/common/minja/chat-template.hpp +1 -1
  73. package/ios/include/common/minja/minja.hpp +1 -1
  74. package/ios/include/json-schema-to-grammar.h +4 -4
  75. package/ios/include/llama.h +12 -8
  76. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  77. package/ios/libs/llama.xcframework/Info.plist +22 -22
  78. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  79. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
  80. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  81. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
  82. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  83. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  84. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  85. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
  86. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  87. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  88. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  89. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  90. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  91. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
  92. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  93. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
  94. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  95. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
  96. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  97. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  98. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
  99. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  100. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  101. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  102. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
  103. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  104. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
  105. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  106. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  107. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
  108. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
  109. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  110. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  111. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  112. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  113. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
  114. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  115. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
  116. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  117. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  118. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
  119. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
  120. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  121. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  122. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  123. package/package.json +1 -1
@@ -196,6 +196,7 @@ add_library(ggml-base
196
196
  ../include/ggml-opt.h
197
197
  ../include/gguf.h
198
198
  ggml.c
199
+ ggml.cpp
199
200
  ggml-alloc.c
200
201
  ggml-backend.cpp
201
202
  ggml-opt.cpp
@@ -226,6 +227,7 @@ function(ggml_add_backend_library backend)
226
227
  set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
227
228
  target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
228
229
  add_dependencies(ggml ${backend})
230
+ install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
229
231
  else()
230
232
  add_library(${backend} ${ARGN})
231
233
  target_link_libraries(ggml PUBLIC ${backend})
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1340
1340
  // allocate graph
1341
1341
  if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1342
1342
  // the re-allocation may cause the split inputs to be moved to a different address
1343
- ggml_backend_sched_synchronize(sched);
1343
+ // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
1344
+ for (int i = 0; i < sched->n_backends; i++) {
1345
+ ggml_backend_synchronize(sched->backends[i]);
1346
+ }
1344
1347
  #ifndef NDEBUG
1345
1348
  GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1346
1349
  #endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
1564
1567
 
1565
1568
  ggml_backend_sched_split_graph(sched, graph);
1566
1569
 
1567
-
1568
1570
  if (!ggml_backend_sched_alloc_splits(sched)) {
1569
1571
  return false;
1570
1572
  }
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
1598
1600
  for (int i = 0; i < sched->n_backends; i++) {
1599
1601
  ggml_backend_synchronize(sched->backends[i]);
1600
1602
  }
1601
- // reset the current copy to 0 so that the graphs will be similar during generation
1602
- // necessary for CUDA graphs
1603
- sched->cur_copy = 0;
1603
+ if (!sched->is_alloc) {
1604
+ // if the graph is not already allocated, always use copy 0 after a synchronization
1605
+ // this ensures that during generation the same copy is used every time,
1606
+ // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
1607
+ sched->cur_copy = 0;
1608
+ }
1604
1609
  }
1605
1610
 
1606
1611
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
@@ -81,7 +81,7 @@ if (BLAS_FOUND)
81
81
  target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
82
82
  target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
83
83
  else()
84
- message(ERROR "BLAS not found, please refer to "
85
- "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
86
- " to set correct GGML_BLAS_VENDOR")
84
+ message(FATAL_ERROR "BLAS not found, please refer to "
85
+ "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
86
+ " to set correct GGML_BLAS_VENDOR")
87
87
  endif()
@@ -2418,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
2418
2418
  // This is up to the applications.
2419
2419
  DWORD p = THREAD_PRIORITY_NORMAL;
2420
2420
  switch (prio) {
2421
+ case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break;
2421
2422
  case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
2422
2423
  case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
2423
2424
  case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
2424
2425
  case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
2425
2426
  }
2426
2427
 
2428
+ if (prio != GGML_SCHED_PRIO_LOW) {
2429
+ // Tell Windows that this thread should not be throttled (needs its own CPU core).
2430
+ // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
2431
+ // all our threads onto the first 4 cores which results in terrible performance with
2432
+ // n_threads > 4
2433
+ #if _WIN32_WINNT >= 0x0602
2434
+ THREAD_POWER_THROTTLING_STATE t;
2435
+ ZeroMemory(&t, sizeof(t));
2436
+ t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
2437
+ t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
2438
+ t.StateMask = 0;
2439
+
2440
+ if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
2441
+ GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
2442
+ return false;
2443
+ }
2444
+ #endif
2445
+ }
2446
+
2427
2447
  if (prio == GGML_SCHED_PRIO_NORMAL) {
2428
2448
  // Keep inherited policy/priority
2429
2449
  return true;
@@ -2451,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
2451
2471
  struct sched_param p;
2452
2472
  int32_t policy = SCHED_OTHER;
2453
2473
  switch (prio) {
2474
+ // TODO: there seems to be no way to set lower prio on Apple platforms
2475
+ case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break;
2454
2476
  case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
2455
2477
  case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
2456
2478
  case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
@@ -2507,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
2507
2529
  struct sched_param p;
2508
2530
  int32_t policy = SCHED_OTHER;
2509
2531
  switch (prio) {
2532
+ case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break;
2510
2533
  case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
2511
2534
  case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
2512
2535
  case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
@@ -635,6 +635,7 @@ struct ggml_cuda_device_info {
635
635
  int nsm; // number of streaming multiprocessors
636
636
  size_t smpb; // max. shared memory per block
637
637
  size_t smpbo; // max. shared memory per block (with opt-in)
638
+ bool integrated; // Device is integrated as opposed to discrete
638
639
  bool vmm; // virtual memory support
639
640
  size_t vmm_granularity; // granularity of virtual memory
640
641
  size_t total_vram;
@@ -1246,7 +1246,7 @@ static __global__ void flash_attn_ext_f16(
1246
1246
  NO_DEVICE_CODE;
1247
1247
  return;
1248
1248
  }
1249
- #endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING
1249
+ #endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING
1250
1250
 
1251
1251
  static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");
1252
1252
 
@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
243
243
 
244
244
  info.default_tensor_split[id] = total_vram;
245
245
  total_vram += prop.totalGlobalMem;
246
-
247
- info.devices[id].nsm = prop.multiProcessorCount;
248
- info.devices[id].smpb = prop.sharedMemPerBlock;
249
- info.devices[id].warp_size = prop.warpSize;
246
+ info.devices[id].integrated = prop.integrated;
247
+ info.devices[id].nsm = prop.multiProcessorCount;
248
+ info.devices[id].smpb = prop.sharedMemPerBlock;
249
+ info.devices[id].warp_size = prop.warpSize;
250
250
  #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
251
251
  info.devices[id].smpbo = prop.sharedMemPerBlock;
252
252
 
@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
1065
1065
  GGML_UNUSED(buft);
1066
1066
  }
1067
1067
 
1068
+ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
1069
+ return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1070
+ }
1071
+
1068
1072
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1069
1073
  CUDA_CHECK(cudaFreeHost(buffer->context));
1070
1074
  }
@@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
2641
2645
 
2642
2646
  static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
2643
2647
  bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
2648
+ // flag used to determine whether it is an integrated_gpu
2649
+ const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
2644
2650
 
2645
2651
  while (!graph_evaluated_or_captured) {
2646
2652
  // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
2659
2665
  if (node->src[j] != nullptr) {
2660
2666
  assert(node->src[j]->buffer);
2661
2667
  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
2662
- ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
2668
+ ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
2663
2669
  }
2664
2670
  }
2665
2671
  #endif
@@ -2994,9 +3000,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
2994
3000
  {
2995
3001
  struct ggml_tensor * a = op->src[0];
2996
3002
  struct ggml_tensor * b = op->src[1];
2997
- // for small weight matrices the active device can end up without any rows, don't use row split in those cases
2998
- // this avoids some edge cases (and the performance would not be good anyways)
2999
3003
  if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
3004
+ if (a->ne[2] > 1 || a->ne[3] > 1) {
3005
+ return false;
3006
+ }
3007
+ // for small weight matrices the active device can end up without any rows, don't use row split in those cases
3008
+ // this avoids some edge cases (and the performance would not be good anyways)
3000
3009
  ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
3001
3010
  int64_t row_low;
3002
3011
  int64_t row_high;
@@ -3263,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
3263
3272
  }
3264
3273
 
3265
3274
  static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
3266
- return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
3275
+ ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
3276
+ const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
3277
+ return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
3267
3278
  }
3268
3279
 
3269
3280
  static int64_t get_op_batch_size(const ggml_tensor * op) {
@@ -32,6 +32,8 @@
32
32
  extern "C" {
33
33
  #endif
34
34
 
35
+ void ggml_print_backtrace(void);
36
+
35
37
  #ifndef MIN
36
38
  # define MIN(a, b) ((a) < (b) ? (a) : (b))
37
39
  #endif
@@ -13,7 +13,7 @@ elseif(SUPPORTS_SYCL)
13
13
  If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
14
14
  source /opt/intel/oneapi/setvars.sh")
15
15
  else()
16
- message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
16
+ message(FATAL_ERROR "C++ compiler lacks SYCL support.")
17
17
  endif()
18
18
  message(STATUS "SYCL found")
19
19
  #todo: AOT
@@ -170,7 +170,7 @@ else()
170
170
  target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
171
171
  elseif (GGML_SYCL_TARGET STREQUAL "AMD")
172
172
  if (NOT GGML_SYCL_DEVICE_ARCH)
173
- message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
173
+ message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
174
174
  endif()
175
175
  target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
176
176
  target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
@@ -4257,14 +4257,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4257
4257
  case GGML_OP_SOFT_MAX:
4258
4258
  return true;
4259
4259
  case GGML_OP_ROPE:
4260
- {
4261
- const int mode = ((const int32_t *) op->op_params)[2];
4262
- // mode is not used as a bitmask in practice, the various rope type modes are independent implementations
4263
- if (mode == GGML_ROPE_TYPE_MROPE) {
4264
- return false;
4265
- }
4266
- return true;
4267
- }
4268
4260
  case GGML_OP_IM2COL:
4269
4261
  return true;
4270
4262
  case GGML_OP_UPSCALE:
@@ -49,10 +49,7 @@ static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const
49
49
 
50
50
  if (i0 >= n_dims) {
51
51
  const int i = row * ne0 + i0;
52
-
53
- dst[i + 0] = x[i + 0];
54
- dst[i + 1] = x[i + 1];
55
-
52
+ *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
56
53
  return;
57
54
  }
58
55
 
@@ -93,10 +90,7 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const
93
90
 
94
91
  if (i0 >= n_dims) {
95
92
  const int i = row * ne0 + i0;
96
-
97
- dst[i + 0] = x[i + 0];
98
- dst[i + 1] = x[i + 1];
99
-
93
+ *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
100
94
  return;
101
95
  }
102
96
 
@@ -122,6 +116,63 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const
122
116
  dst[i + n_dims / 2] = x0 * sin_theta + x1 * cos_theta;
123
117
  }
124
118
 
119
+ template <typename T, bool has_ff>
120
+ static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
121
+ const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
122
+ const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
123
+ const float theta_scale, const float * freq_factors, const mrope_sections sections,
124
+ const sycl::nd_item<3> & item_ct1) {
125
+ // get index pos
126
+ const int i0 = 2 * (item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1));
127
+ if (i0 >= ne0) {
128
+ return;
129
+ }
130
+ const int row_dst = (item_ct1.get_group(2) * item_ct1.get_local_range(2)) + item_ct1.get_local_id(2);
131
+
132
+ if (i0 >= n_dims) {
133
+ const int i = row_dst*ne0 + i0;
134
+ *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
135
+ return;
136
+ }
137
+
138
+ const int row_x = row_dst % ne1;
139
+ const int channel_x = row_dst / ne1;
140
+ const int idst = (row_dst * ne0) + (i0 / 2);
141
+ const size_t ix = ((size_t) channel_x * s2) + ((size_t) row_x * s1) + (i0 / 2);
142
+
143
+ const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
144
+ const int sec_w = sections.v[1] + sections.v[0];
145
+ const int sector = (i0 / 2) % sect_dims;
146
+
147
+
148
+ float theta_base = 0.0;
149
+ if (sector < sections.v[0]) {
150
+ theta_base = pos[channel_x]*sycl::pow(theta_scale, i0/2.0f);
151
+ }
152
+ else if (sector >= sections.v[0] && sector < sec_w) {
153
+ theta_base = pos[channel_x + ne2 * 1]*sycl::pow(theta_scale, i0/2.0f);
154
+ }
155
+ else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
156
+ theta_base = pos[channel_x + ne2 * 2]*sycl::pow(theta_scale, i0/2.0f);
157
+ }
158
+ else if (sector >= sec_w + sections.v[2]) {
159
+ theta_base = pos[channel_x + ne2 * 3]*sycl::pow(theta_scale, i0/2.0f);
160
+ }
161
+
162
+ const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
163
+ float cos_theta;
164
+ float sin_theta;
165
+ rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
166
+ const float x0 = x[ix + 0];
167
+ const float x1 = x[ix + n_dims/2];
168
+
169
+ // store results in dst
170
+ dst[idst + 0] = x0 * cos_theta - x1 * sin_theta;
171
+ dst[idst + n_dims/2] = x0 * sin_theta + x1 * cos_theta;
172
+ }
173
+
174
+
175
+
125
176
  template <typename T, bool has_ff>
126
177
  static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
127
178
  const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
@@ -171,7 +222,7 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
171
222
  const float * freq_factors, queue_ptr stream) {
172
223
  GGML_ASSERT(ne0 % 2 == 0);
173
224
  const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
174
- const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
225
+ const int num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
175
226
  const sycl::range<3> block_nums(1, num_blocks_x, nr);
176
227
 
177
228
  const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -208,7 +259,7 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
208
259
  const rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
209
260
  GGML_ASSERT(ne0 % 2 == 0);
210
261
  const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
211
- const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
262
+ const int num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
212
263
  const sycl::range<3> block_nums(1, num_blocks_x, nr);
213
264
 
214
265
  const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -228,6 +279,40 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
228
279
  }
229
280
  }
230
281
 
282
+ template <typename T>
283
+ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
284
+ const size_t s2, const int n_dims, const int nr, const int32_t * pos,
285
+ const float freq_scale, const float freq_base, const float ext_factor,
286
+ const float attn_factor, const rope_corr_dims corr_dims, const float * freq_factors,
287
+ const mrope_sections sections, queue_ptr stream) {
288
+ GGML_ASSERT(ne0 % 2 == 0);
289
+ const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
290
+ const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
291
+ const sycl::range<3> grid_dims(1, n_blocks_y, nr);
292
+ const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);
293
+
294
+ const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
295
+ // Add FP16 capability check if T could be sycl::half
296
+ if constexpr (std::is_same_v<T, sycl::half>) {
297
+ dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
298
+ }
299
+ // launch kernel
300
+ if (freq_factors == nullptr) {
301
+ stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
302
+ rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
303
+ corr_dims, theta_scale, freq_factors, sections, item_ct1);
304
+ });
305
+ } else {
306
+ stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
307
+ rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
308
+ corr_dims, theta_scale, freq_factors, sections, item_ct1);
309
+ });
310
+ }
311
+ }
312
+
313
+
314
+
315
+
231
316
  // rope vision
232
317
  template <typename T>
233
318
  static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
@@ -237,7 +322,7 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
237
322
  const mrope_sections sections, queue_ptr stream) {
238
323
  GGML_ASSERT(ne0 % 2 == 0);
239
324
  const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
240
- const int n_blocks_y = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
325
+ const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
241
326
  const sycl::range<3> grid_dims(1, n_blocks_y, nr);
242
327
  const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);
243
328
 
@@ -298,8 +383,17 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
298
383
  memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
299
384
 
300
385
  const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
386
+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
301
387
  const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
302
388
 
389
+ if (is_mrope) {
390
+ GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
391
+ }
392
+
393
+ if (is_vision) {
394
+ GGML_ASSERT(n_dims == ne00/2);
395
+ }
396
+
303
397
  const int32_t * pos = (const int32_t *) dst->src[1]->data;
304
398
 
305
399
  const float * freq_factors = nullptr;
@@ -326,6 +420,19 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
326
420
  } else {
327
421
  GGML_ABORT("fatal error");
328
422
  }
423
+ } else if (is_mrope && !is_vision) {
424
+ GGML_SYCL_DEBUG("%s: mrope path\n", __func__);
425
+ if (dst->src[0]->type == GGML_TYPE_F16) {
426
+ rope_multi_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01,
427
+ s02, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
428
+ freq_factors, sections, main_stream);
429
+ } else if (dst->src[0]->type == GGML_TYPE_F32) {
430
+ rope_multi_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, ne02, s01, s02, n_dims,
431
+ nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections,
432
+ main_stream);
433
+ } else {
434
+ GGML_ABORT("Fatal error: Tensor type unsupported!");
435
+ }
329
436
  } else if (is_vision) {
330
437
  GGML_SYCL_DEBUG("%s: vision path\n", __func__);
331
438
  if (dst->src[0]->type == GGML_TYPE_F16) {
@@ -1652,7 +1652,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_
1652
1652
  return {64, 32};
1653
1653
  }
1654
1654
  return {64, 64};
1655
- };
1655
+ }
1656
1656
 
1657
1657
  static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {
1658
1658
 
@@ -133,7 +133,7 @@ static void ggml_print_backtrace_symbols(void) {
133
133
  }
134
134
  #endif
135
135
 
136
- static void ggml_print_backtrace(void) {
136
+ void ggml_print_backtrace(void) {
137
137
  const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
138
138
  if (GGML_NO_BACKTRACE) {
139
139
  return;
@@ -160,6 +160,10 @@ static void ggml_print_backtrace(void) {
160
160
  const int parent_pid = getpid();
161
161
  const int child_pid = fork();
162
162
  if (child_pid < 0) { // error
163
+ #if defined(__linux__)
164
+ close(lock[1]);
165
+ close(lock[0]);
166
+ #endif
163
167
  return;
164
168
  } else if (child_pid == 0) { // child
165
169
  char attach[32];
@@ -167,6 +171,7 @@ static void ggml_print_backtrace(void) {
167
171
  #if defined(__linux__)
168
172
  close(lock[1]);
169
173
  (void) !read(lock[0], lock, 1);
174
+ close(lock[0]);
170
175
  #endif
171
176
  // try gdb
172
177
  execlp("gdb", "gdb", "--batch",
@@ -195,7 +200,7 @@ static void ggml_print_backtrace(void) {
195
200
  }
196
201
  }
197
202
  #else
198
- static void ggml_print_backtrace(void) {
203
+ void ggml_print_backtrace(void) {
199
204
  // platform not supported
200
205
  }
201
206
  #endif
@@ -216,6 +221,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
216
221
  abort();
217
222
  }
218
223
 
224
+ // ggml_print_backtrace is registered with std::set_terminate by ggml.cpp
225
+
219
226
  //
220
227
  // logging
221
228
  //
@@ -0,0 +1,26 @@
1
+ #include "ggml-impl.h"
2
+
3
+ #include <cstdlib>
4
+ #include <exception>
5
+
6
+ static std::terminate_handler previous_terminate_handler;
7
+
8
+ GGML_NORETURN static void ggml_uncaught_exception() {
9
+ ggml_print_backtrace();
10
+ if (previous_terminate_handler) {
11
+ previous_terminate_handler();
12
+ }
13
+ abort(); // unreachable unless previous_terminate_handler was nullptr
14
+ }
15
+
16
+ static bool ggml_uncaught_exception_init = []{
17
+ const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
18
+ if (GGML_NO_BACKTRACE) {
19
+ return false;
20
+ }
21
+ const auto prev{std::get_terminate()};
22
+ GGML_ASSERT(prev != ggml_uncaught_exception);
23
+ previous_terminate_handler = prev;
24
+ std::set_terminate(ggml_uncaught_exception);
25
+ return true;
26
+ }();
@@ -347,11 +347,28 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
347
347
  int64_t n_tensors = 0;
348
348
 
349
349
  if (ok && gr.read(ctx->version)) {
350
- if (ctx->version == 1) {
350
+ if (ok && ctx->version == 0) {
351
+ GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version);
352
+ ok = false;
353
+ }
354
+
355
+ /*
356
+ * bit layout is different when reading non-native endian models.
357
+ * assuming that the GGUF version is 3, the non-native endian model
358
+ * would read it as 0x30000000. we can use the AND operation against
359
+ * the last 4 hexadecimal digits to check if the model is the same
360
+ * endianness as the host system.
361
+ */
362
+ if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) {
363
+ GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version);
364
+ ok = false;
365
+ }
366
+
367
+ if (ok && ctx->version == 1) {
351
368
  GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
352
369
  ok = false;
353
370
  }
354
- if (ctx->version > GGUF_VERSION) {
371
+ if (ok && ctx->version > GGUF_VERSION) {
355
372
  GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
356
373
  __func__, ctx->version, GGUF_VERSION);
357
374
  ok = false;
@@ -259,9 +259,9 @@ extern "C" {
259
259
  llama_token * token;
260
260
  float * embd;
261
261
  llama_pos * pos;
262
- int32_t * n_seq_id;
263
- llama_seq_id ** seq_id;
264
- int8_t * logits; // TODO: rename this to "output"
262
+ int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
263
+ llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
264
+ int8_t * logits; // TODO: rename this to "output"
265
265
  } llama_batch;
266
266
 
267
267
  enum llama_model_kv_override_type {
@@ -366,6 +366,8 @@ extern "C" {
366
366
  bool no_perf; // measure performance timings
367
367
  bool op_offload; // offload host tensor operations to device
368
368
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
369
+ // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
370
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
369
371
  };
370
372
 
371
373
  // model quantization parameters
@@ -502,6 +504,7 @@ extern "C" {
502
504
  LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
503
505
  LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
504
506
  LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
507
+ LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
505
508
 
506
509
  // Get the model's RoPE frequency scaling factor
507
510
  LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -652,7 +655,6 @@ extern "C" {
652
655
  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
653
656
  // If the KV cache is RoPEd, the KV data is updated accordingly:
654
657
  // - lazily on next llama_decode()
655
- // - explicitly with llama_kv_self_update()
656
658
  // p0 < 0 : [0, p1]
657
659
  // p1 < 0 : [p0, inf)
658
660
  LLAMA_API void llama_kv_self_seq_add(
@@ -665,7 +667,6 @@ extern "C" {
665
667
  // Integer division of the positions by factor of `d > 1`
666
668
  // If the KV cache is RoPEd, the KV data is updated accordingly:
667
669
  // - lazily on next llama_decode()
668
- // - explicitly with llama_kv_self_update()
669
670
  // p0 < 0 : [0, p1]
670
671
  // p1 < 0 : [p0, inf)
671
672
  LLAMA_API void llama_kv_self_seq_div(
@@ -677,12 +678,14 @@ extern "C" {
677
678
 
678
679
  // Returns the smallest position present in the KV cache for the specified sequence
679
680
  // This is typically non-zero only for SWA caches
681
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
680
682
  // Return -1 if the sequence is empty
681
683
  LLAMA_API llama_pos llama_kv_self_seq_pos_min(
682
684
  struct llama_context * ctx,
683
685
  llama_seq_id seq_id);
684
686
 
685
687
  // Returns the largest position present in the KV cache for the specified sequence
688
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
686
689
  // Return -1 if the sequence is empty
687
690
  LLAMA_API llama_pos llama_kv_self_seq_pos_max(
688
691
  struct llama_context * ctx,
@@ -691,14 +694,15 @@ extern "C" {
691
694
  // Defragment the KV cache
692
695
  // This will be applied:
693
696
  // - lazily on next llama_decode()
694
- // - explicitly with llama_kv_self_update()
695
- LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
697
+ LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
698
+ "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
696
699
 
697
700
  // Check if the context supports KV cache shifting
698
701
  LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
699
702
 
700
703
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
701
- LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
704
+ LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
705
+ "simply remove this call, updates are applied lazily on the next llama_decode()");
702
706
 
703
707
  //
704
708
  // State / sessions
@@ -21,6 +21,9 @@ add_library(llama
21
21
  llama-impl.cpp
22
22
  llama-io.cpp
23
23
  llama-kv-cache.cpp
24
+ llama-kv-cache-unified.cpp
25
+ llama-kv-cache-unified-iswa.cpp
26
+ llama-kv-cache-recurrent.cpp
24
27
  llama-memory.cpp
25
28
  llama-mmap.cpp
26
29
  llama-model-loader.cpp