@fugood/llama.node 1.3.0-rc.4 → 1.3.0-rc.6
- package/lib/binding.ts +1 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -1
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +5 -5
- package/src/LlamaContext.cpp +59 -1
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -21
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-batch.h +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +35 -2
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -1
- package/src/llama.cpp/src/llama-graph.cpp +30 -0
- package/src/llama.cpp/src/llama-hparams.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +270 -7
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/lib/binding.ts
CHANGED

@@ -375,7 +375,7 @@ export type ToolCall = {
   }
 
 export interface LlamaContext {
-  new (options: LlamaModelOptions): LlamaContext
+  new (options: LlamaModelOptions, onProgress?: (progress: number) => void): LlamaContext
   getSystemInfo(): string
   getModelInfo(): ModelInfo
   getFormattedChat(
package/lib/index.js
CHANGED

@@ -193,12 +193,12 @@ class LlamaContextWrapper {
         return this.ctx.decodeAudioTokens(tokens);
     }
 }
-const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
+const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
     const variant = (_a = options.lib_variant) !== null && _a !== void 0 ? _a : 'default';
     (_b = mods[variant]) !== null && _b !== void 0 ? _b : (mods[variant] = yield (0, binding_1.loadModule)(options.lib_variant));
     refreshNativeLogSetup();
-    const nativeCtx = new mods[variant].LlamaContext(options);
+    const nativeCtx = new mods[variant].LlamaContext(options, onProgress);
     return new LlamaContextWrapper(nativeCtx);
 });
 exports.loadModel = loadModel;
package/lib/index.ts
CHANGED

@@ -299,12 +299,13 @@ class LlamaContextWrapper {
 
 export const loadModel = async (
   options: LlamaModelOptionsExtended,
+  onProgress?: (progress: number) => void,
 ): Promise<LlamaContextWrapper> => {
   const variant = options.lib_variant ?? 'default'
   mods[variant] ??= await loadModule(options.lib_variant)
   refreshNativeLogSetup()
 
-  const nativeCtx = new mods[variant].LlamaContext(options)
+  const nativeCtx = new mods[variant].LlamaContext(options, onProgress)
   return new LlamaContextWrapper(nativeCtx)
 }
 
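The lib changes above add an optional progress callback to loadModel; the native side reports integer percentages and only invokes the callback when the value increases. A minimal usage sketch in TypeScript (the model path is a placeholder, not part of this diff):

import { loadModel } from '@fugood/llama.node'

const run = async () => {
  const ctx = await loadModel(
    { model: './model.gguf' }, // placeholder path
    (progress) => {
      // called on the JS thread with an integer 0-100
      console.log(`loading model: ${progress}%`)
    },
  )
  console.log('model loaded', ctx)
}

run()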
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.0-rc.
+  "version": "1.3.0-rc.6",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.0-rc.
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.
-    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.
-    "@fugood/node-llama-linux-arm64": "1.3.0-rc.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.
-    "@fugood/node-llama-win32-x64": "1.3.0-rc.
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.
-    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.
-    "@fugood/node-llama-win32-arm64": "1.3.0-rc.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.
-    "@fugood/node-llama-darwin-x64": "1.3.0-rc.
-    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.
+    "@fugood/node-llama-linux-x64": "1.3.0-rc.6",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.6",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.6",
+    "@fugood/node-llama-linux-arm64": "1.3.0-rc.6",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.6",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.6",
+    "@fugood/node-llama-win32-x64": "1.3.0-rc.6",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.6",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.6",
+    "@fugood/node-llama-win32-arm64": "1.3.0-rc.6",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.6",
+    "@fugood/node-llama-darwin-x64": "1.3.0-rc.6",
+    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.6"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED

@@ -98,7 +98,7 @@ index b0591e84b..93759f884 100644
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 
+index a8cb630ea..0919ec5d3 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -274,6 +274,7 @@ struct lr_opt {
@@ -110,7 +110,7 @@ index 040a44ebd..37ad69173 100644
     int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index 
+index 34323afa0..1a6924db0 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -123,10 +123,10 @@ index 42041b717..371752718 100644
     check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
     if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-index 
+index de01336cd..29b1a043d 100644
 --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-@@ -
+@@ -121,7 +121,7 @@ if (Vulkan_FOUND)
  endif()
 
  # Set up toolchain for host compilation whether cross-compiling or not
@@ -135,7 +135,7 @@ index 83a83887b..8ae962b29 100644
     if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
         set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
     else()
-@@ -
+@@ -141,7 +141,7 @@ if (Vulkan_FOUND)
 
  include(ExternalProject)
 
package/src/LlamaContext.cpp
CHANGED

@@ -221,7 +221,7 @@ static int32_t pooling_type_from_str(const std::string &s) {
 }
 
 // construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
-//   use_mlock, use_mmap }): LlamaContext throws error
+//   use_mlock, use_mmap }, onProgress?: (progress: number) => void): LlamaContext throws error
 LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
     : Napi::ObjectWrap<LlamaContext>(info) {
   Napi::Env env = info.Env();
@@ -230,6 +230,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   }
   auto options = info[0].As<Napi::Object>();
 
+  // Check if progress callback is provided
+  bool has_progress_callback = info.Length() >= 2 && info[1].IsFunction();
+  if (has_progress_callback) {
+    _progress_tsfn = Napi::ThreadSafeFunction::New(
+        env, info[1].As<Napi::Function>(), "Model Loading Progress", 0, 1,
+        [](Napi::Env) {
+          // Finalizer callback
+        });
+  }
+
   common_params params;
   params.model.path = get_option<std::string>(options, "model", "");
   if (params.model.path.empty()) {
@@ -323,12 +333,55 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 
   // Use rn-llama context instead of direct session
   _rn_ctx = new llama_rn_context();
+  _rn_ctx->is_load_interrupted = false;
+  _rn_ctx->loading_progress = 0;
+
+  // Set up progress callback if provided
+  if (has_progress_callback) {
+    params.load_progress_callback = [](float progress, void *user_data) {
+      LlamaContext *self = static_cast<LlamaContext *>(user_data);
+      unsigned int percentage = static_cast<unsigned int>(100 * progress);
+
+      // Only call callback if progress increased
+      if (percentage > self->_rn_ctx->loading_progress) {
+        self->_rn_ctx->loading_progress = percentage;
+
+        // Create a heap-allocated copy of the percentage
+        auto *data = new unsigned int(percentage);
+
+        // Queue callback to be executed on the JavaScript thread
+        auto status = self->_progress_tsfn.NonBlockingCall(
+            data, [](Napi::Env env, Napi::Function jsCallback, unsigned int *data) {
+              jsCallback.Call({Napi::Number::New(env, *data)});
+              delete data;
+            });
+
+        // If the call failed, clean up the data
+        if (status != napi_ok) {
+          delete data;
+        }
+      }
+
+      // Return true to continue loading, false to interrupt
+      return !self->_rn_ctx->is_load_interrupted;
+    };
+    params.load_progress_callback_user_data = this;
+  }
+
   if (!_rn_ctx->loadModel(params)) {
+    if (has_progress_callback) {
+      _progress_tsfn.Release();
+    }
     delete _rn_ctx;
     _rn_ctx = nullptr;
     Napi::TypeError::New(env, "Failed to load model").ThrowAsJavaScriptException();
   }
 
+  // Release progress callback after model is loaded
+  if (has_progress_callback) {
+    _progress_tsfn.Release();
+  }
+
   // Handle LoRA adapters through rn-llama
   if (!lora.empty()) {
     _rn_ctx->applyLoraAdapters(lora);
@@ -343,6 +396,11 @@ LlamaContext::~LlamaContext() {
     _context_valid->store(false);
   }
 
+  // Interrupt model loading if in progress
+  if (_rn_ctx) {
+    _rn_ctx->is_load_interrupted = true;
+  }
+
   // The DisposeWorker is responsible for cleanup of _rn_ctx
   // If _rn_ctx is still not null here, it means disposal was not properly initiated
   if (_rn_ctx) {
package/src/LlamaContext.h
CHANGED

@@ -78,4 +78,7 @@ private:
   // Validity flag for async callbacks to prevent use-after-free
   // Shared pointer ensures callbacks can safely check if context is still alive
   std::shared_ptr<std::atomic<bool>> _context_valid;
+
+  // Progress callback support for model loading
+  Napi::ThreadSafeFunction _progress_tsfn;
 };
package/src/llama.cpp/ggml/src/CMakeLists.txt
CHANGED

@@ -307,6 +307,10 @@ function(ggml_add_cpu_backend_variant tag_name)
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
     endif()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -371,6 +375,14 @@ if (GGML_CPU_ALL_VARIANTS)
         else()
             message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
         endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
+            # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
+            # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+        else()
+            message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -466,29 +466,45 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         message(STATUS "s390x detected")
-        list(APPEND GGML_CPU_SOURCES
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/s390/quants.c)
+
+        # for native compilation
+        if (GGML_NATIVE)
+            # check machine level to determine target
+            file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
+            string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
+
+            # TODO: Separation to determine activation of VX/VXE/VXE2
+            if (${S390X_M} MATCHES "8561|8562")
+                message(STATUS "z15 target")
+                list(APPEND ARCH_FLAGS -march=z15)
+            elseif (${S390X_M} MATCHES "3931")
+                message(STATUS "z16 target")
+                list(APPEND ARCH_FLAGS -march=z16)
+            elseif (${S390X_M} MATCHES "9175|9176")
+                # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+                # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
+                message(STATUS "z17 target")
+                list(APPEND ARCH_FLAGS -march=arch15)
+            else()
+                message(STATUS "Unknown target")
+                message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+                list(APPEND ARCH_FLAGS -march=native -mtune=native)
+            endif()
+        # for cross-compilation
+        elseif(GGML_CPU_ALL_VARIANTS)
+            # range through IBM z15 to z17
+            # NOTE: update when a new hardware level is released
+            foreach (ZHW RANGE 15 17)
+                if(DEFINED GGML_INTERNAL_Z${ZHW})
+                    message(STATUS "z${ZHW} cross-compile target")
+                    list(APPEND ARCH_FLAGS -march=z${ZHW})
+                endif()
+            endforeach()
         endif()
 
-        if (GGML_VXE)
+        if (GGML_VXE OR GGML_INTERNAL_VXE)
             message(STATUS "VX/VXE/VXE2 enabled")
             list(APPEND ARCH_FLAGS -mvx -mzvector)
             list(APPEND ARCH_DEFINITIONS GGML_VXE)
package/src/llama.cpp/src/llama-arch.cpp
CHANGED

@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM, "plm" },
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
+    { LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
     { LLM_ARCH_DOTS1, "dots1" },
     { LLM_ARCH_ARCEE, "arcee" },
     { LLM_ARCH_ERNIE4_5, "ernie4_5" },
@@ -135,6 +136,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+    { LLM_KV_EXPERT_GROUP_COUNT, "%s.expert_group_count" },
+    { LLM_KV_EXPERT_GROUP_USED_COUNT, "%s.expert_group_used_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
@@ -1946,6 +1949,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
     },
 },
+{
+    LLM_ARCH_BAILINGMOE2,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+        { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+        { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+        { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+        { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
+        { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
+        { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
+        { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
+        { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+        { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+        { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+    },
+},
 {
     LLM_ARCH_DOTS1,
     {
package/src/llama.cpp/src/llama-arch.h
CHANGED

@@ -89,6 +89,7 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_BAILINGMOE2,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
@@ -139,6 +140,8 @@ enum llm_kv {
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_GROUP_COUNT,
+    LLM_KV_EXPERT_GROUP_USED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
package/src/llama.cpp/src/llama-batch.h
CHANGED

@@ -123,7 +123,7 @@ private:
     uint32_t n_seq_max;
     uint32_t n_outputs;
 
-    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+    std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id
 
     std::vector<llama_pos> pos;
     std::vector<int32_t> n_seq_id;
package/src/llama.cpp/src/llama-chat.cpp
CHANGED

@@ -63,6 +63,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
     { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
     { "bailing", LLM_CHAT_TEMPLATE_BAILING },
+    { "bailing-think", LLM_CHAT_TEMPLATE_BAILING_THINK },
+    { "bailing2", LLM_CHAT_TEMPLATE_BAILING2 },
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
@@ -191,6 +193,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_YANDEX;
     } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
         return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("\"HUMAN\"") && tmpl_contains("<think>")) {
+        return LLM_CHAT_TEMPLATE_BAILING_THINK;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("<role>HUMAN</role>") && tmpl_contains("<|role_end|>")) {
+        return LLM_CHAT_TEMPLATE_BAILING2;
     } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
@@ -644,8 +650,8 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << " Ассистент:[SEP]";
         }
-    }
-        // Bailing (Ling) template
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+        // Bailing (Ling/Ring) template
         for (auto message : chat) {
             std::string role(message->role);
 
@@ -658,6 +664,33 @@ int32_t llm_chat_apply_template(
             ss << "<role>" << role << "</role>" << message->content;
         }
 
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+
+            if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+                ss << "<think>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
+        // Bailing2 (Ling 2.0) template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        if (!has_system) {
+            ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
+        }
+
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
+        }
+
         if (add_ass) {
             ss << "<role>ASSISTANT</role>";
         }
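For orientation, the added LLM_CHAT_TEMPLATE_BAILING2 branch renders conversations roughly as sketched below. This is an illustrative TypeScript restatement of the C++ logic in the hunk above, not code that ships with the package:

type Msg = { role: string; content: string }

// Mirrors the BAILING2 branch: inject a default system turn when none is
// present, map user -> HUMAN and upper-case other roles, close each turn with
// <|role_end|>, and open the assistant turn when addAssistant is set.
const renderBailing2 = (chat: Msg[], addAssistant = true): string => {
  let ss = ''
  const hasSystem = chat.length > 0 && chat[0].role === 'system'
  if (!hasSystem) ss += '<role>SYSTEM</role>detailed thinking off<|role_end|>'
  for (const m of chat) {
    const role = m.role === 'user' ? 'HUMAN' : m.role.toUpperCase()
    ss += `<role>${role}</role>${m.content}<|role_end|>`
  }
  if (addAssistant) ss += '<role>ASSISTANT</role>'
  return ss
}

console.log(renderBailing2([{ role: 'user', content: 'Hello' }]))
// <role>SYSTEM</role>detailed thinking off<|role_end|><role>HUMAN</role>Hello<|role_end|><role>ASSISTANT</role>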
package/src/llama.cpp/src/llama-chat.h
CHANGED

@@ -42,6 +42,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MEGREZ,
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_BAILING_THINK,
+    LLM_CHAT_TEMPLATE_BAILING2,
     LLM_CHAT_TEMPLATE_LLAMA4,
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
package/src/llama.cpp/src/llama-context.cpp
CHANGED

@@ -2346,7 +2346,8 @@ llama_context * llama_init_from_model(
         return nullptr;
     }
 
-    if (params.pooling_type !=
+    if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
+        params.pooling_type != model->hparams.pooling_type) {
         //user-specified pooling-type is different from the model default
         LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
             model->hparams.pooling_type, params.pooling_type);
package/src/llama.cpp/src/llama-graph.cpp
CHANGED

@@ -950,6 +950,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(selection_probs, "ffn_moe_probs_biased", il);
     }
 
+    // select top n_group_used expert groups
+    // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
+    if (hparams.n_expert_groups > 1 && n_tokens > 0) {
+        const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
+
+        // organize experts into n_expert_groups
+        ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
+
+        ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+        group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
+
+        // get top n_group_used expert groups
+        group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
+        group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
+
+        ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+        cb(expert_groups, "ffn_moe_group_topk", il);
+
+        // mask out the other groups
+        selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
+        selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+        selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_masked", il);
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
@@ -981,6 +1006,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
 
+        if (arch == LLM_ARCH_BAILINGMOE2) {
+            weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
+            cb(weights_sum, "ffn_moe_weights_sum_biased", il);
+        }
+
         weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
 
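The first hunk above adds DeepSeek-V3-style group-limited routing: experts are partitioned into n_expert_groups, each group is scored by the sum of its two largest expert probabilities, only the top n_group_used groups are kept, and experts in the remaining groups are masked to -Infinity before the usual per-expert top-k. A plain-array TypeScript sketch of the same selection for a single token (illustrative only; the real code operates on ggml tensors):

// probs: per-expert selection probabilities for one token
const maskNonSelectedGroups = (
  probs: number[],
  nExpertGroups: number,
  nGroupUsed: number,
): number[] => {
  const perGroup = probs.length / nExpertGroups
  // score each group by the sum of its two largest probabilities
  const groupScores = Array.from({ length: nExpertGroups }, (_, g) => {
    const top2 = probs
      .slice(g * perGroup, (g + 1) * perGroup)
      .sort((a, b) => b - a)
      .slice(0, 2)
    return top2.reduce((s, x) => s + x, 0)
  })
  // keep the nGroupUsed best-scoring groups
  const keep = new Set(
    groupScores
      .map((score, g) => ({ score, g }))
      .sort((a, b) => b.score - a.score)
      .slice(0, nGroupUsed)
      .map((e) => e.g),
  )
  // experts in discarded groups can never be picked by the later top-k
  return probs.map((p, i) => (keep.has(Math.floor(i / perGroup)) ? p : -Infinity))
}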
package/src/llama.cpp/src/llama-model.cpp
CHANGED

@@ -114,9 +114,12 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
         case LLM_TYPE_A13B: return "A13B";
+        case LLM_TYPE_7B_A1B: return "7B.A1B";
         case LLM_TYPE_8B_A1B: return "8B.A1B";
+        case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
@@ -480,11 +483,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         return;
     }
 
-    ml.get_key(LLM_KV_CONTEXT_LENGTH,
-    ml.get_key(LLM_KV_EMBEDDING_LENGTH,
-    ml.get_key(LLM_KV_BLOCK_COUNT,
-    ml.get_key(LLM_KV_EXPERT_COUNT,
-    ml.get_key(LLM_KV_EXPERT_USED_COUNT,
+    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
+    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
 
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -500,8 +505,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
     if (hparams.n_expert > 0) {
         GGML_ASSERT(hparams.n_expert_used > 0);
+        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
+        if (hparams.n_expert_groups > 1) {
+            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
+            GGML_ASSERT(hparams.n_group_used > 0);
+            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
+        }
     } else {
         GGML_ASSERT(hparams.n_expert_used == 0);
+        GGML_ASSERT(hparams.n_expert_groups == 0);
     }
 
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
@@ -1843,8 +1855,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-                switch (hparams.
+                switch (hparams.n_embd) {
+                    case 1536: type = LLM_TYPE_7B_A1B; break;
+                    case 2048: case 2560: type = LLM_TYPE_3B; break;
+                    case 4096: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
 
@@ -1885,6 +1899,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BAILINGMOE2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                // TODO: when MTP is implemented, this should probably be updated if needed
+                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+                switch (hparams.n_layer) {
+                    case 20: type = LLM_TYPE_16B_A1B; break;
+                    case 21: type = LLM_TYPE_16B_A1B; break;
+                    case 32: type = LLM_TYPE_100B_A6B; break;
+                    case 33: type = LLM_TYPE_100B_A6B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DOTS1:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5495,6 +5532,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                     }
                 } break;
+            case LLM_ARCH_BAILINGMOE2:
+                {
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            flags |= TENSOR_SKIP;
+                        }
+
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
+
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
+                            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
+
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+                        } else { // Dense layers
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
+                        }
+
+                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+                            layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+                            layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
+                            layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
+                        }
+                    }
+                } break;
             case LLM_ARCH_DOTS1:
                 {
                     const int64_t n_ff_exp = hparams.n_ff_exp;
@@ -6350,6 +6451,19 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
     }
 
+    if (arch == LLM_ARCH_BAILINGMOE2) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+        LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
+    }
+
     if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
@@ -17039,6 +17153,150 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };
 
+struct llm_build_bailingmoe2 : public llm_graph_context {
+    llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+        for (int il = 0; il < n_transformer_layers; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                cur = build_lora_mm(model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
+            cb(sa_out, "sa_out", il);
+
+            // MoE branch
+            cur = build_norm(sa_out,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                ggml_tensor * moe_out =
+                    build_moe_ffn(cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            model.layers[il].ffn_exp_probs_b,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, hparams.expert_weights_norm,
+                            true, hparams.expert_weights_scale,
+                            (llama_expert_gating_func_type) hparams.expert_gating_func,
+                            il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_dots1 : public llm_graph_context {
     llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -19835,6 +20093,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_bailingmoe>(*this, params);
             } break;
+        case LLM_ARCH_BAILINGMOE2:
+            {
+                llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
+            } break;
         case LLM_ARCH_SEED_OSS:
             {
                 llm = std::make_unique<llm_build_seed_oss>(*this, params);
@@ -20101,6 +20363,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_EXAONE4:
         case LLM_ARCH_MINICPM3:
+        case LLM_ARCH_BAILINGMOE2:
         case LLM_ARCH_DOTS1:
         case LLM_ARCH_HUNYUAN_MOE:
         case LLM_ARCH_OPENAI_MOE:
package/src/llama.cpp/src/llama-model.h
CHANGED

@@ -107,9 +107,12 @@ enum llm_type {
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
     LLM_TYPE_A13B,
+    LLM_TYPE_7B_A1B,
     LLM_TYPE_8B_A1B, // lfm2moe
+    LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
package/src/llama.cpp/src/llama-vocab.cpp
CHANGED

@@ -1968,6 +1968,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 clean_spaces = false;
             } else if (
                 tokenizer_pre == "bailingmoe" ||
+                tokenizer_pre == "bailingmoe2" ||
                 tokenizer_pre == "llada-moe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;