@novastera-oss/llamarn 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +12 -8
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +46 -65
- package/cpp/LlamaCppModel.h +5 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
- package/cpp/llama.cpp/common/arg.cpp +8 -6
- package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
- package/cpp/llama.cpp/common/chat-parser.h +2 -1
- package/cpp/llama.cpp/common/chat.cpp +4 -4
- package/cpp/llama.cpp/common/common.cpp +2 -0
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/include/llama.h +12 -8
- package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
- package/cpp/llama.cpp/src/llama-batch.h +15 -10
- package/cpp/llama.cpp/src/llama-context.cpp +226 -151
- package/cpp/llama.cpp/src/llama-context.h +25 -8
- package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
- package/cpp/llama.cpp/src/llama-graph.h +25 -24
- package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
- package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
- package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
- package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
- package/cpp/llama.cpp/src/llama-memory.h +44 -0
- package/cpp/llama.cpp/src/llama-model.cpp +23 -16
- package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +63 -8
- package/cpp/rn-utils.hpp +8 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +12 -8
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +22 -22
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -196,6 +196,7 @@ add_library(ggml-base
|
|
|
196
196
|
../include/ggml-opt.h
|
|
197
197
|
../include/gguf.h
|
|
198
198
|
ggml.c
|
|
199
|
+
ggml.cpp
|
|
199
200
|
ggml-alloc.c
|
|
200
201
|
ggml-backend.cpp
|
|
201
202
|
ggml-opt.cpp
|
|
@@ -226,6 +227,7 @@ function(ggml_add_backend_library backend)
|
|
|
226
227
|
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
|
227
228
|
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
|
228
229
|
add_dependencies(ggml ${backend})
|
|
230
|
+
install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
|
|
229
231
|
else()
|
|
230
232
|
add_library(${backend} ${ARGN})
|
|
231
233
|
target_link_libraries(ggml PUBLIC ${backend})
|
|
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
|
1340
1340
|
// allocate graph
|
|
1341
1341
|
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
|
1342
1342
|
// the re-allocation may cause the split inputs to be moved to a different address
|
|
1343
|
-
ggml_backend_sched_synchronize(sched);
|
|
1343
|
+
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
|
|
1344
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
|
1345
|
+
ggml_backend_synchronize(sched->backends[i]);
|
|
1346
|
+
}
|
|
1344
1347
|
#ifndef NDEBUG
|
|
1345
1348
|
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
|
1346
1349
|
#endif
|
|
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
|
|
1564
1567
|
|
|
1565
1568
|
ggml_backend_sched_split_graph(sched, graph);
|
|
1566
1569
|
|
|
1567
|
-
|
|
1568
1570
|
if (!ggml_backend_sched_alloc_splits(sched)) {
|
|
1569
1571
|
return false;
|
|
1570
1572
|
}
|
|
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
|
|
1598
1600
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
1599
1601
|
ggml_backend_synchronize(sched->backends[i]);
|
|
1600
1602
|
}
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
|
|
1603
|
+
if (!sched->is_alloc) {
|
|
1604
|
+
// if the graph is not already allocated, always use copy 0 after a synchronization
|
|
1605
|
+
// this ensures that during generation the same copy is used every time,
|
|
1606
|
+
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
|
|
1607
|
+
sched->cur_copy = 0;
|
|
1608
|
+
}
|
|
1604
1609
|
}
|
|
1605
1610
|
|
|
1606
1611
|
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
|
@@ -81,7 +81,7 @@ if (BLAS_FOUND)
|
|
|
81
81
|
target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
|
|
82
82
|
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
|
|
83
83
|
else()
|
|
84
|
-
message(
|
|
85
|
-
|
|
86
|
-
|
|
84
|
+
message(FATAL_ERROR "BLAS not found, please refer to "
|
|
85
|
+
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
|
|
86
|
+
" to set correct GGML_BLAS_VENDOR")
|
|
87
87
|
endif()
|
|
@@ -2418,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
|
|
2418
2418
|
// This is up to the applications.
|
|
2419
2419
|
DWORD p = THREAD_PRIORITY_NORMAL;
|
|
2420
2420
|
switch (prio) {
|
|
2421
|
+
case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break;
|
|
2421
2422
|
case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
|
|
2422
2423
|
case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
|
|
2423
2424
|
case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
|
|
2424
2425
|
case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
|
|
2425
2426
|
}
|
|
2426
2427
|
|
|
2428
|
+
if (prio != GGML_SCHED_PRIO_LOW) {
|
|
2429
|
+
// Tell Windows that this thread should not be throttled (needs its own CPU core).
|
|
2430
|
+
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
|
|
2431
|
+
// all our threads onto the first 4 cores which results in terrible performance with
|
|
2432
|
+
// n_threads > 4
|
|
2433
|
+
#if _WIN32_WINNT >= 0x0602
|
|
2434
|
+
THREAD_POWER_THROTTLING_STATE t;
|
|
2435
|
+
ZeroMemory(&t, sizeof(t));
|
|
2436
|
+
t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
|
|
2437
|
+
t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
|
|
2438
|
+
t.StateMask = 0;
|
|
2439
|
+
|
|
2440
|
+
if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
|
|
2441
|
+
GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
|
|
2442
|
+
return false;
|
|
2443
|
+
}
|
|
2444
|
+
#endif
|
|
2445
|
+
}
|
|
2446
|
+
|
|
2427
2447
|
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
|
2428
2448
|
// Keep inherited policy/priority
|
|
2429
2449
|
return true;
|
|
@@ -2451,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
|
|
2451
2471
|
struct sched_param p;
|
|
2452
2472
|
int32_t policy = SCHED_OTHER;
|
|
2453
2473
|
switch (prio) {
|
|
2474
|
+
// TODO: there seems to be no way to set lower prio on Apple platforms
|
|
2475
|
+
case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break;
|
|
2454
2476
|
case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
|
|
2455
2477
|
case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
|
|
2456
2478
|
case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
|
|
@@ -2507,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
|
|
2507
2529
|
struct sched_param p;
|
|
2508
2530
|
int32_t policy = SCHED_OTHER;
|
|
2509
2531
|
switch (prio) {
|
|
2532
|
+
case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break;
|
|
2510
2533
|
case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
|
|
2511
2534
|
case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
|
|
2512
2535
|
case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
|
|
@@ -635,6 +635,7 @@ struct ggml_cuda_device_info {
|
|
|
635
635
|
int nsm; // number of streaming multiprocessors
|
|
636
636
|
size_t smpb; // max. shared memory per block
|
|
637
637
|
size_t smpbo; // max. shared memory per block (with opt-in)
|
|
638
|
+
bool integrated; // Device is integrated as opposed to discrete
|
|
638
639
|
bool vmm; // virtual memory support
|
|
639
640
|
size_t vmm_granularity; // granularity of virtual memory
|
|
640
641
|
size_t total_vram;
|
|
@@ -1246,7 +1246,7 @@ static __global__ void flash_attn_ext_f16(
|
|
|
1246
1246
|
NO_DEVICE_CODE;
|
|
1247
1247
|
return;
|
|
1248
1248
|
}
|
|
1249
|
-
#endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING
|
|
1249
|
+
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING
|
|
1250
1250
|
|
|
1251
1251
|
static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");
|
|
1252
1252
|
|
|
@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|
|
243
243
|
|
|
244
244
|
info.default_tensor_split[id] = total_vram;
|
|
245
245
|
total_vram += prop.totalGlobalMem;
|
|
246
|
-
|
|
247
|
-
info.devices[id].nsm
|
|
248
|
-
info.devices[id].smpb
|
|
249
|
-
info.devices[id].warp_size
|
|
246
|
+
info.devices[id].integrated = prop.integrated;
|
|
247
|
+
info.devices[id].nsm = prop.multiProcessorCount;
|
|
248
|
+
info.devices[id].smpb = prop.sharedMemPerBlock;
|
|
249
|
+
info.devices[id].warp_size = prop.warpSize;
|
|
250
250
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
|
251
251
|
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
|
252
252
|
|
|
@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
|
|
|
1065
1065
|
GGML_UNUSED(buft);
|
|
1066
1066
|
}
|
|
1067
1067
|
|
|
1068
|
+
static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
|
|
1069
|
+
return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
|
|
1070
|
+
}
|
|
1071
|
+
|
|
1068
1072
|
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
1069
1073
|
CUDA_CHECK(cudaFreeHost(buffer->context));
|
|
1070
1074
|
}
|
|
@@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
|
|
|
2641
2645
|
|
|
2642
2646
|
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
|
2643
2647
|
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
|
|
2648
|
+
// flag used to determine whether it is an integrated_gpu
|
|
2649
|
+
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
|
|
2644
2650
|
|
|
2645
2651
|
while (!graph_evaluated_or_captured) {
|
|
2646
2652
|
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
|
|
@@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
|
|
2659
2665
|
if (node->src[j] != nullptr) {
|
|
2660
2666
|
assert(node->src[j]->buffer);
|
|
2661
2667
|
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
|
|
2662
|
-
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
|
|
2668
|
+
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
|
|
2663
2669
|
}
|
|
2664
2670
|
}
|
|
2665
2671
|
#endif
|
|
@@ -2994,9 +3000,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
2994
3000
|
{
|
|
2995
3001
|
struct ggml_tensor * a = op->src[0];
|
|
2996
3002
|
struct ggml_tensor * b = op->src[1];
|
|
2997
|
-
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
|
|
2998
|
-
// this avoids some edge cases (and the performance would not be good anyways)
|
|
2999
3003
|
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
|
|
3004
|
+
if (a->ne[2] > 1 || a->ne[3] > 1) {
|
|
3005
|
+
return false;
|
|
3006
|
+
}
|
|
3007
|
+
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
|
|
3008
|
+
// this avoids some edge cases (and the performance would not be good anyways)
|
|
3000
3009
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
|
|
3001
3010
|
int64_t row_low;
|
|
3002
3011
|
int64_t row_high;
|
|
@@ -3263,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3263
3272
|
}
|
|
3264
3273
|
|
|
3265
3274
|
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
|
3266
|
-
|
|
3275
|
+
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
|
|
3276
|
+
const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
|
|
3277
|
+
return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
|
|
3267
3278
|
}
|
|
3268
3279
|
|
|
3269
3280
|
static int64_t get_op_batch_size(const ggml_tensor * op) {
|
|
@@ -13,7 +13,7 @@ elseif(SUPPORTS_SYCL)
|
|
|
13
13
|
If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
|
|
14
14
|
source /opt/intel/oneapi/setvars.sh")
|
|
15
15
|
else()
|
|
16
|
-
message(FATAL_ERROR
|
|
16
|
+
message(FATAL_ERROR "C++ compiler lacks SYCL support.")
|
|
17
17
|
endif()
|
|
18
18
|
message(STATUS "SYCL found")
|
|
19
19
|
#todo: AOT
|
|
@@ -170,7 +170,7 @@ else()
|
|
|
170
170
|
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
|
|
171
171
|
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
|
172
172
|
if (NOT GGML_SYCL_DEVICE_ARCH)
|
|
173
|
-
message(
|
|
173
|
+
message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
|
|
174
174
|
endif()
|
|
175
175
|
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
|
|
176
176
|
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
|
@@ -4257,14 +4257,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4257
4257
|
case GGML_OP_SOFT_MAX:
|
|
4258
4258
|
return true;
|
|
4259
4259
|
case GGML_OP_ROPE:
|
|
4260
|
-
{
|
|
4261
|
-
const int mode = ((const int32_t *) op->op_params)[2];
|
|
4262
|
-
// mode is not used as a bitmask in practice, the various rope type modes are independent implementations
|
|
4263
|
-
if (mode == GGML_ROPE_TYPE_MROPE) {
|
|
4264
|
-
return false;
|
|
4265
|
-
}
|
|
4266
|
-
return true;
|
|
4267
|
-
}
|
|
4268
4260
|
case GGML_OP_IM2COL:
|
|
4269
4261
|
return true;
|
|
4270
4262
|
case GGML_OP_UPSCALE:
|
|
@@ -49,10 +49,7 @@ static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const
|
|
|
49
49
|
|
|
50
50
|
if (i0 >= n_dims) {
|
|
51
51
|
const int i = row * ne0 + i0;
|
|
52
|
-
|
|
53
|
-
dst[i + 0] = x[i + 0];
|
|
54
|
-
dst[i + 1] = x[i + 1];
|
|
55
|
-
|
|
52
|
+
*reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
|
|
56
53
|
return;
|
|
57
54
|
}
|
|
58
55
|
|
|
@@ -93,10 +90,7 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const
|
|
|
93
90
|
|
|
94
91
|
if (i0 >= n_dims) {
|
|
95
92
|
const int i = row * ne0 + i0;
|
|
96
|
-
|
|
97
|
-
dst[i + 0] = x[i + 0];
|
|
98
|
-
dst[i + 1] = x[i + 1];
|
|
99
|
-
|
|
93
|
+
*reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
|
|
100
94
|
return;
|
|
101
95
|
}
|
|
102
96
|
|
|
@@ -122,6 +116,63 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const
|
|
|
122
116
|
dst[i + n_dims / 2] = x0 * sin_theta + x1 * cos_theta;
|
|
123
117
|
}
|
|
124
118
|
|
|
119
|
+
template <typename T, bool has_ff>
|
|
120
|
+
static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
|
|
121
|
+
const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
|
|
122
|
+
const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
|
|
123
|
+
const float theta_scale, const float * freq_factors, const mrope_sections sections,
|
|
124
|
+
const sycl::nd_item<3> & item_ct1) {
|
|
125
|
+
// get index pos
|
|
126
|
+
const int i0 = 2 * (item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1));
|
|
127
|
+
if (i0 >= ne0) {
|
|
128
|
+
return;
|
|
129
|
+
}
|
|
130
|
+
const int row_dst = (item_ct1.get_group(2) * item_ct1.get_local_range(2)) + item_ct1.get_local_id(2);
|
|
131
|
+
|
|
132
|
+
if (i0 >= n_dims) {
|
|
133
|
+
const int i = row_dst*ne0 + i0;
|
|
134
|
+
*reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
|
|
135
|
+
return;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
const int row_x = row_dst % ne1;
|
|
139
|
+
const int channel_x = row_dst / ne1;
|
|
140
|
+
const int idst = (row_dst * ne0) + (i0 / 2);
|
|
141
|
+
const size_t ix = ((size_t) channel_x * s2) + ((size_t) row_x * s1) + (i0 / 2);
|
|
142
|
+
|
|
143
|
+
const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
|
|
144
|
+
const int sec_w = sections.v[1] + sections.v[0];
|
|
145
|
+
const int sector = (i0 / 2) % sect_dims;
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
float theta_base = 0.0;
|
|
149
|
+
if (sector < sections.v[0]) {
|
|
150
|
+
theta_base = pos[channel_x]*sycl::pow(theta_scale, i0/2.0f);
|
|
151
|
+
}
|
|
152
|
+
else if (sector >= sections.v[0] && sector < sec_w) {
|
|
153
|
+
theta_base = pos[channel_x + ne2 * 1]*sycl::pow(theta_scale, i0/2.0f);
|
|
154
|
+
}
|
|
155
|
+
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
|
|
156
|
+
theta_base = pos[channel_x + ne2 * 2]*sycl::pow(theta_scale, i0/2.0f);
|
|
157
|
+
}
|
|
158
|
+
else if (sector >= sec_w + sections.v[2]) {
|
|
159
|
+
theta_base = pos[channel_x + ne2 * 3]*sycl::pow(theta_scale, i0/2.0f);
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
|
|
163
|
+
float cos_theta;
|
|
164
|
+
float sin_theta;
|
|
165
|
+
rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
|
166
|
+
const float x0 = x[ix + 0];
|
|
167
|
+
const float x1 = x[ix + n_dims/2];
|
|
168
|
+
|
|
169
|
+
// store results in dst
|
|
170
|
+
dst[idst + 0] = x0 * cos_theta - x1 * sin_theta;
|
|
171
|
+
dst[idst + n_dims/2] = x0 * sin_theta + x1 * cos_theta;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
|
|
125
176
|
template <typename T, bool has_ff>
|
|
126
177
|
static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
|
|
127
178
|
const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
|
|
@@ -171,7 +222,7 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
|
|
|
171
222
|
const float * freq_factors, queue_ptr stream) {
|
|
172
223
|
GGML_ASSERT(ne0 % 2 == 0);
|
|
173
224
|
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
|
174
|
-
const int num_blocks_x = (ne0
|
|
225
|
+
const int num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
|
|
175
226
|
const sycl::range<3> block_nums(1, num_blocks_x, nr);
|
|
176
227
|
|
|
177
228
|
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
|
@@ -208,7 +259,7 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
|
|
|
208
259
|
const rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
|
|
209
260
|
GGML_ASSERT(ne0 % 2 == 0);
|
|
210
261
|
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
|
211
|
-
const int num_blocks_x = (ne0
|
|
262
|
+
const int num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
|
|
212
263
|
const sycl::range<3> block_nums(1, num_blocks_x, nr);
|
|
213
264
|
|
|
214
265
|
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
|
@@ -228,6 +279,40 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
|
|
|
228
279
|
}
|
|
229
280
|
}
|
|
230
281
|
|
|
282
|
+
template <typename T>
|
|
283
|
+
static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
|
|
284
|
+
const size_t s2, const int n_dims, const int nr, const int32_t * pos,
|
|
285
|
+
const float freq_scale, const float freq_base, const float ext_factor,
|
|
286
|
+
const float attn_factor, const rope_corr_dims corr_dims, const float * freq_factors,
|
|
287
|
+
const mrope_sections sections, queue_ptr stream) {
|
|
288
|
+
GGML_ASSERT(ne0 % 2 == 0);
|
|
289
|
+
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
|
290
|
+
const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
|
|
291
|
+
const sycl::range<3> grid_dims(1, n_blocks_y, nr);
|
|
292
|
+
const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);
|
|
293
|
+
|
|
294
|
+
const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
|
|
295
|
+
// Add FP16 capability check if T could be sycl::half
|
|
296
|
+
if constexpr (std::is_same_v<T, sycl::half>) {
|
|
297
|
+
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
298
|
+
}
|
|
299
|
+
// launch kernel
|
|
300
|
+
if (freq_factors == nullptr) {
|
|
301
|
+
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
|
|
302
|
+
rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
|
303
|
+
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
|
304
|
+
});
|
|
305
|
+
} else {
|
|
306
|
+
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
|
|
307
|
+
rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
|
308
|
+
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
|
309
|
+
});
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
|
|
231
316
|
// rope vision
|
|
232
317
|
template <typename T>
|
|
233
318
|
static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
|
|
@@ -237,7 +322,7 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
|
|
|
237
322
|
const mrope_sections sections, queue_ptr stream) {
|
|
238
323
|
GGML_ASSERT(ne0 % 2 == 0);
|
|
239
324
|
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
|
|
240
|
-
const int n_blocks_y = (ne0
|
|
325
|
+
const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
|
|
241
326
|
const sycl::range<3> grid_dims(1, n_blocks_y, nr);
|
|
242
327
|
const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);
|
|
243
328
|
|
|
@@ -298,8 +383,17 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
|
|
298
383
|
memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
|
|
299
384
|
|
|
300
385
|
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
|
386
|
+
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
|
301
387
|
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
|
302
388
|
|
|
389
|
+
if (is_mrope) {
|
|
390
|
+
GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
if (is_vision) {
|
|
394
|
+
GGML_ASSERT(n_dims == ne00/2);
|
|
395
|
+
}
|
|
396
|
+
|
|
303
397
|
const int32_t * pos = (const int32_t *) dst->src[1]->data;
|
|
304
398
|
|
|
305
399
|
const float * freq_factors = nullptr;
|
|
@@ -326,6 +420,19 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
|
|
326
420
|
} else {
|
|
327
421
|
GGML_ABORT("fatal error");
|
|
328
422
|
}
|
|
423
|
+
} else if (is_mrope && !is_vision) {
|
|
424
|
+
GGML_SYCL_DEBUG("%s: mrope path\n", __func__);
|
|
425
|
+
if (dst->src[0]->type == GGML_TYPE_F16) {
|
|
426
|
+
rope_multi_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01,
|
|
427
|
+
s02, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
|
|
428
|
+
freq_factors, sections, main_stream);
|
|
429
|
+
} else if (dst->src[0]->type == GGML_TYPE_F32) {
|
|
430
|
+
rope_multi_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, ne02, s01, s02, n_dims,
|
|
431
|
+
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections,
|
|
432
|
+
main_stream);
|
|
433
|
+
} else {
|
|
434
|
+
GGML_ABORT("Fatal error: Tensor type unsupported!");
|
|
435
|
+
}
|
|
329
436
|
} else if (is_vision) {
|
|
330
437
|
GGML_SYCL_DEBUG("%s: vision path\n", __func__);
|
|
331
438
|
if (dst->src[0]->type == GGML_TYPE_F16) {
|
|
@@ -1652,7 +1652,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_
|
|
|
1652
1652
|
return {64, 32};
|
|
1653
1653
|
}
|
|
1654
1654
|
return {64, 64};
|
|
1655
|
-
}
|
|
1655
|
+
}
|
|
1656
1656
|
|
|
1657
1657
|
static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {
|
|
1658
1658
|
|
|
@@ -133,7 +133,7 @@ static void ggml_print_backtrace_symbols(void) {
|
|
|
133
133
|
}
|
|
134
134
|
#endif
|
|
135
135
|
|
|
136
|
-
|
|
136
|
+
void ggml_print_backtrace(void) {
|
|
137
137
|
const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
|
|
138
138
|
if (GGML_NO_BACKTRACE) {
|
|
139
139
|
return;
|
|
@@ -160,6 +160,10 @@ static void ggml_print_backtrace(void) {
|
|
|
160
160
|
const int parent_pid = getpid();
|
|
161
161
|
const int child_pid = fork();
|
|
162
162
|
if (child_pid < 0) { // error
|
|
163
|
+
#if defined(__linux__)
|
|
164
|
+
close(lock[1]);
|
|
165
|
+
close(lock[0]);
|
|
166
|
+
#endif
|
|
163
167
|
return;
|
|
164
168
|
} else if (child_pid == 0) { // child
|
|
165
169
|
char attach[32];
|
|
@@ -167,6 +171,7 @@ static void ggml_print_backtrace(void) {
|
|
|
167
171
|
#if defined(__linux__)
|
|
168
172
|
close(lock[1]);
|
|
169
173
|
(void) !read(lock[0], lock, 1);
|
|
174
|
+
close(lock[0]);
|
|
170
175
|
#endif
|
|
171
176
|
// try gdb
|
|
172
177
|
execlp("gdb", "gdb", "--batch",
|
|
@@ -195,7 +200,7 @@ static void ggml_print_backtrace(void) {
|
|
|
195
200
|
}
|
|
196
201
|
}
|
|
197
202
|
#else
|
|
198
|
-
|
|
203
|
+
void ggml_print_backtrace(void) {
|
|
199
204
|
// platform not supported
|
|
200
205
|
}
|
|
201
206
|
#endif
|
|
@@ -216,6 +221,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
|
|
216
221
|
abort();
|
|
217
222
|
}
|
|
218
223
|
|
|
224
|
+
// ggml_print_backtrace is registered with std::set_terminate by ggml.cpp
|
|
225
|
+
|
|
219
226
|
//
|
|
220
227
|
// logging
|
|
221
228
|
//
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#include "ggml-impl.h"
|
|
2
|
+
|
|
3
|
+
#include <cstdlib>
|
|
4
|
+
#include <exception>
|
|
5
|
+
|
|
6
|
+
static std::terminate_handler previous_terminate_handler;
|
|
7
|
+
|
|
8
|
+
GGML_NORETURN static void ggml_uncaught_exception() {
|
|
9
|
+
ggml_print_backtrace();
|
|
10
|
+
if (previous_terminate_handler) {
|
|
11
|
+
previous_terminate_handler();
|
|
12
|
+
}
|
|
13
|
+
abort(); // unreachable unless previous_terminate_handler was nullptr
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
static bool ggml_uncaught_exception_init = []{
|
|
17
|
+
const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
|
|
18
|
+
if (GGML_NO_BACKTRACE) {
|
|
19
|
+
return false;
|
|
20
|
+
}
|
|
21
|
+
const auto prev{std::get_terminate()};
|
|
22
|
+
GGML_ASSERT(prev != ggml_uncaught_exception);
|
|
23
|
+
previous_terminate_handler = prev;
|
|
24
|
+
std::set_terminate(ggml_uncaught_exception);
|
|
25
|
+
return true;
|
|
26
|
+
}();
|
|
@@ -347,11 +347,28 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
|
|
|
347
347
|
int64_t n_tensors = 0;
|
|
348
348
|
|
|
349
349
|
if (ok && gr.read(ctx->version)) {
|
|
350
|
-
if (ctx->version == 1) {
|
|
350
|
+
if (ok && ctx->version == 0) {
|
|
351
|
+
GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version);
|
|
352
|
+
ok = false;
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
/*
|
|
356
|
+
* bit layout is different when reading non-native endian models.
|
|
357
|
+
* assuming that the GGUF version is 3, the non-native endian model
|
|
358
|
+
* would read it as 0x30000000. we can use the AND operation against
|
|
359
|
+
* the last 4 hexadecimal digits to check if the model is the same
|
|
360
|
+
* endianness as the host system.
|
|
361
|
+
*/
|
|
362
|
+
if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) {
|
|
363
|
+
GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version);
|
|
364
|
+
ok = false;
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
if (ok && ctx->version == 1) {
|
|
351
368
|
GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
|
|
352
369
|
ok = false;
|
|
353
370
|
}
|
|
354
|
-
if (ctx->version > GGUF_VERSION) {
|
|
371
|
+
if (ok && ctx->version > GGUF_VERSION) {
|
|
355
372
|
GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
|
|
356
373
|
__func__, ctx->version, GGUF_VERSION);
|
|
357
374
|
ok = false;
|
|
@@ -259,9 +259,9 @@ extern "C" {
|
|
|
259
259
|
llama_token * token;
|
|
260
260
|
float * embd;
|
|
261
261
|
llama_pos * pos;
|
|
262
|
-
int32_t * n_seq_id;
|
|
263
|
-
llama_seq_id ** seq_id;
|
|
264
|
-
int8_t * logits;
|
|
262
|
+
int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
|
|
263
|
+
llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
|
|
264
|
+
int8_t * logits; // TODO: rename this to "output"
|
|
265
265
|
} llama_batch;
|
|
266
266
|
|
|
267
267
|
enum llama_model_kv_override_type {
|
|
@@ -366,6 +366,8 @@ extern "C" {
|
|
|
366
366
|
bool no_perf; // measure performance timings
|
|
367
367
|
bool op_offload; // offload host tensor operations to device
|
|
368
368
|
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
369
|
+
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
|
|
370
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
|
|
369
371
|
};
|
|
370
372
|
|
|
371
373
|
// model quantization parameters
|
|
@@ -502,6 +504,7 @@ extern "C" {
|
|
|
502
504
|
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
|
503
505
|
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
|
504
506
|
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
|
507
|
+
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
|
|
505
508
|
|
|
506
509
|
// Get the model's RoPE frequency scaling factor
|
|
507
510
|
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
|
@@ -652,7 +655,6 @@ extern "C" {
|
|
|
652
655
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
653
656
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
654
657
|
// - lazily on next llama_decode()
|
|
655
|
-
// - explicitly with llama_kv_self_update()
|
|
656
658
|
// p0 < 0 : [0, p1]
|
|
657
659
|
// p1 < 0 : [p0, inf)
|
|
658
660
|
LLAMA_API void llama_kv_self_seq_add(
|
|
@@ -665,7 +667,6 @@ extern "C" {
|
|
|
665
667
|
// Integer division of the positions by factor of `d > 1`
|
|
666
668
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
667
669
|
// - lazily on next llama_decode()
|
|
668
|
-
// - explicitly with llama_kv_self_update()
|
|
669
670
|
// p0 < 0 : [0, p1]
|
|
670
671
|
// p1 < 0 : [p0, inf)
|
|
671
672
|
LLAMA_API void llama_kv_self_seq_div(
|
|
@@ -677,12 +678,14 @@ extern "C" {
|
|
|
677
678
|
|
|
678
679
|
// Returns the smallest position present in the KV cache for the specified sequence
|
|
679
680
|
// This is typically non-zero only for SWA caches
|
|
681
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
680
682
|
// Return -1 if the sequence is empty
|
|
681
683
|
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
682
684
|
struct llama_context * ctx,
|
|
683
685
|
llama_seq_id seq_id);
|
|
684
686
|
|
|
685
687
|
// Returns the largest position present in the KV cache for the specified sequence
|
|
688
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
686
689
|
// Return -1 if the sequence is empty
|
|
687
690
|
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
688
691
|
struct llama_context * ctx,
|
|
@@ -691,14 +694,15 @@ extern "C" {
|
|
|
691
694
|
// Defragment the KV cache
|
|
692
695
|
// This will be applied:
|
|
693
696
|
// - lazily on next llama_decode()
|
|
694
|
-
|
|
695
|
-
|
|
697
|
+
LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
|
|
698
|
+
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
|
|
696
699
|
|
|
697
700
|
// Check if the context supports KV cache shifting
|
|
698
701
|
LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
|
|
699
702
|
|
|
700
703
|
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
701
|
-
LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
|
|
704
|
+
LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
|
|
705
|
+
"simply remove this call, updates are applied lazily on the next llama_decode()");
|
|
702
706
|
|
|
703
707
|
//
|
|
704
708
|
// State / sessions
|