@fugood/llama.node 1.3.0-rc.4 → 1.3.0-rc.6

package/lib/binding.ts CHANGED
@@ -375,7 +375,7 @@ export type ToolCall = {
375
375
  }
376
376
 
377
377
  export interface LlamaContext {
378
- new (options: LlamaModelOptions): LlamaContext
378
+ new (options: LlamaModelOptions, onProgress?: (progress: number) => void): LlamaContext
379
379
  getSystemInfo(): string
380
380
  getModelInfo(): ModelInfo
381
381
  getFormattedChat(
package/lib/index.js CHANGED
@@ -193,12 +193,12 @@ class LlamaContextWrapper {
193
193
  return this.ctx.decodeAudioTokens(tokens);
194
194
  }
195
195
  }
196
- const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
196
+ const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
197
197
  var _a, _b;
198
198
  const variant = (_a = options.lib_variant) !== null && _a !== void 0 ? _a : 'default';
199
199
  (_b = mods[variant]) !== null && _b !== void 0 ? _b : (mods[variant] = yield (0, binding_1.loadModule)(options.lib_variant));
200
200
  refreshNativeLogSetup();
201
- const nativeCtx = new mods[variant].LlamaContext(options);
201
+ const nativeCtx = new mods[variant].LlamaContext(options, onProgress);
202
202
  return new LlamaContextWrapper(nativeCtx);
203
203
  });
204
204
  exports.loadModel = loadModel;
package/lib/index.ts CHANGED
@@ -299,12 +299,13 @@ class LlamaContextWrapper {
299
299
 
300
300
  export const loadModel = async (
301
301
  options: LlamaModelOptionsExtended,
302
+ onProgress?: (progress: number) => void,
302
303
  ): Promise<LlamaContextWrapper> => {
303
304
  const variant = options.lib_variant ?? 'default'
304
305
  mods[variant] ??= await loadModule(options.lib_variant)
305
306
  refreshNativeLogSetup()
306
307
 
307
- const nativeCtx = new mods[variant].LlamaContext(options)
308
+ const nativeCtx = new mods[variant].LlamaContext(options, onProgress)
308
309
  return new LlamaContextWrapper(nativeCtx)
309
310
  }
310
311
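Taken together, the `binding.ts`, `index.js`, and `index.ts` changes above thread an optional `onProgress` callback from `loadModel` through to the native `LlamaContext` constructor. A minimal usage sketch in TypeScript; the `model` and `lib_variant` option names come from this diff, while the model path and the logging are placeholders:

```ts
import { loadModel } from '@fugood/llama.node'

const main = async () => {
  const ctx = await loadModel(
    { model: './models/example.gguf', lib_variant: 'default' }, // path is a placeholder
    (progress) => {
      // The native layer reports an increasing integer percentage
      // (see the LlamaContext constructor changes further down in this diff).
      console.log(`Loading model: ${progress}%`)
    },
  )
  // ... use ctx here
}

main()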
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@fugood/llama.node",
3
3
  "access": "public",
4
- "version": "1.3.0-rc.4",
4
+ "version": "1.3.0-rc.6",
5
5
  "description": "An another Node binding of llama.cpp",
6
6
  "main": "lib/index.js",
7
7
  "scripts": {
@@ -72,19 +72,19 @@
72
72
  "CMakeLists.txt"
73
73
  ],
74
74
  "optionalDependencies": {
75
- "@fugood/node-llama-linux-x64": "1.3.0-rc.4",
76
- "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.4",
77
- "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.4",
78
- "@fugood/node-llama-linux-arm64": "1.3.0-rc.4",
79
- "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.4",
80
- "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.4",
81
- "@fugood/node-llama-win32-x64": "1.3.0-rc.4",
82
- "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.4",
83
- "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.4",
84
- "@fugood/node-llama-win32-arm64": "1.3.0-rc.4",
85
- "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.4",
86
- "@fugood/node-llama-darwin-x64": "1.3.0-rc.4",
87
- "@fugood/node-llama-darwin-arm64": "1.3.0-rc.4"
75
+ "@fugood/node-llama-linux-x64": "1.3.0-rc.6",
76
+ "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.6",
77
+ "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.6",
78
+ "@fugood/node-llama-linux-arm64": "1.3.0-rc.6",
79
+ "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.6",
80
+ "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.6",
81
+ "@fugood/node-llama-win32-x64": "1.3.0-rc.6",
82
+ "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.6",
83
+ "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.6",
84
+ "@fugood/node-llama-win32-arm64": "1.3.0-rc.6",
85
+ "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.6",
86
+ "@fugood/node-llama-darwin-x64": "1.3.0-rc.6",
87
+ "@fugood/node-llama-darwin-arm64": "1.3.0-rc.6"
88
88
  },
89
89
  "devDependencies": {
90
90
  "@babel/preset-env": "^7.24.4",
@@ -98,7 +98,7 @@ index b0591e84b..93759f884 100644
98
98
  mparams.split_mode = params.split_mode;
99
99
  mparams.tensor_split = params.tensor_split;
100
100
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
101
- index 040a44ebd..37ad69173 100644
101
+ index a8cb630ea..0919ec5d3 100644
102
102
  --- a/src/llama.cpp/common/common.h
103
103
  +++ b/src/llama.cpp/common/common.h
104
104
  @@ -274,6 +274,7 @@ struct lr_opt {
@@ -110,7 +110,7 @@ index 040a44ebd..37ad69173 100644
110
110
  int32_t n_ctx = 4096; // context size
111
111
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
112
112
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
113
- index 42041b717..371752718 100644
113
+ index 34323afa0..1a6924db0 100644
114
114
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
115
115
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
116
116
  @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -123,10 +123,10 @@ index 42041b717..371752718 100644
123
123
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
124
124
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
125
125
  diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
126
- index 83a83887b..8ae962b29 100644
126
+ index de01336cd..29b1a043d 100644
127
127
  --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
128
128
  +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
129
- @@ -112,7 +112,7 @@ if (Vulkan_FOUND)
129
+ @@ -121,7 +121,7 @@ if (Vulkan_FOUND)
130
130
  endif()
131
131
 
132
132
  # Set up toolchain for host compilation whether cross-compiling or not
@@ -135,7 +135,7 @@ index 83a83887b..8ae962b29 100644
135
135
  if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
136
136
  set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
137
137
  else()
138
- @@ -132,7 +132,7 @@ if (Vulkan_FOUND)
138
+ @@ -141,7 +141,7 @@ if (Vulkan_FOUND)
139
139
 
140
140
  include(ExternalProject)
141
141
 
@@ -221,7 +221,7 @@ static int32_t pooling_type_from_str(const std::string &s) {
221
221
  }
222
222
 
223
223
  // construct({ model, embedding, n_ctx, n_batch, n_threads, n_gpu_layers,
224
- // use_mlock, use_mmap }): LlamaContext throws error
224
+ // use_mlock, use_mmap }, onProgress?: (progress: number) => void): LlamaContext throws error
225
225
  LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
226
226
  : Napi::ObjectWrap<LlamaContext>(info) {
227
227
  Napi::Env env = info.Env();
@@ -230,6 +230,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
230
230
  }
231
231
  auto options = info[0].As<Napi::Object>();
232
232
 
233
+ // Check if progress callback is provided
234
+ bool has_progress_callback = info.Length() >= 2 && info[1].IsFunction();
235
+ if (has_progress_callback) {
236
+ _progress_tsfn = Napi::ThreadSafeFunction::New(
237
+ env, info[1].As<Napi::Function>(), "Model Loading Progress", 0, 1,
238
+ [](Napi::Env) {
239
+ // Finalizer callback
240
+ });
241
+ }
242
+
233
243
  common_params params;
234
244
  params.model.path = get_option<std::string>(options, "model", "");
235
245
  if (params.model.path.empty()) {
@@ -323,12 +333,55 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
323
333
 
324
334
  // Use rn-llama context instead of direct session
325
335
  _rn_ctx = new llama_rn_context();
336
+ _rn_ctx->is_load_interrupted = false;
337
+ _rn_ctx->loading_progress = 0;
338
+
339
+ // Set up progress callback if provided
340
+ if (has_progress_callback) {
341
+ params.load_progress_callback = [](float progress, void *user_data) {
342
+ LlamaContext *self = static_cast<LlamaContext *>(user_data);
343
+ unsigned int percentage = static_cast<unsigned int>(100 * progress);
344
+
345
+ // Only call callback if progress increased
346
+ if (percentage > self->_rn_ctx->loading_progress) {
347
+ self->_rn_ctx->loading_progress = percentage;
348
+
349
+ // Create a heap-allocated copy of the percentage
350
+ auto *data = new unsigned int(percentage);
351
+
352
+ // Queue callback to be executed on the JavaScript thread
353
+ auto status = self->_progress_tsfn.NonBlockingCall(
354
+ data, [](Napi::Env env, Napi::Function jsCallback, unsigned int *data) {
355
+ jsCallback.Call({Napi::Number::New(env, *data)});
356
+ delete data;
357
+ });
358
+
359
+ // If the call failed, clean up the data
360
+ if (status != napi_ok) {
361
+ delete data;
362
+ }
363
+ }
364
+
365
+ // Return true to continue loading, false to interrupt
366
+ return !self->_rn_ctx->is_load_interrupted;
367
+ };
368
+ params.load_progress_callback_user_data = this;
369
+ }
370
+
326
371
  if (!_rn_ctx->loadModel(params)) {
372
+ if (has_progress_callback) {
373
+ _progress_tsfn.Release();
374
+ }
327
375
  delete _rn_ctx;
328
376
  _rn_ctx = nullptr;
329
377
  Napi::TypeError::New(env, "Failed to load model").ThrowAsJavaScriptException();
330
378
  }
331
379
 
380
+ // Release progress callback after model is loaded
381
+ if (has_progress_callback) {
382
+ _progress_tsfn.Release();
383
+ }
384
+
332
385
  // Handle LoRA adapters through rn-llama
333
386
  if (!lora.empty()) {
334
387
  _rn_ctx->applyLoraAdapters(lora);
@@ -343,6 +396,11 @@ LlamaContext::~LlamaContext() {
343
396
  _context_valid->store(false);
344
397
  }
345
398
 
399
+ // Interrupt model loading if in progress
400
+ if (_rn_ctx) {
401
+ _rn_ctx->is_load_interrupted = true;
402
+ }
403
+
346
404
  // The DisposeWorker is responsible for cleanup of _rn_ctx
347
405
  // If _rn_ctx is still not null here, it means disposal was not properly initiated
348
406
  if (_rn_ctx) {
@@ -78,4 +78,7 @@ private:
78
78
  // Validity flag for async callbacks to prevent use-after-free
79
79
  // Shared pointer ensures callbacks can safely check if context is still alive
80
80
  std::shared_ptr<std::atomic<bool>> _context_valid;
81
+
82
+ // Progress callback support for model loading
83
+ Napi::ThreadSafeFunction _progress_tsfn;
81
84
  };
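On the native side, the constructor wraps the JavaScript callback in a `Napi::ThreadSafeFunction` and forwards llama.cpp's `load_progress_callback` to it as an integer percentage, invoking it only when the value increases and releasing the thread-safe function once loading succeeds or fails. A small sketch of what that contract looks like to a caller; the progress-bar rendering is purely illustrative:

```ts
// Progress values arrive as strictly increasing integers in the range 0-100.
const onProgress = (progress: number): void => {
  const filled = Math.round(progress / 5)
  process.stdout.write(`\r[${'#'.repeat(filled).padEnd(20, ' ')}] ${progress}%`)
  if (progress >= 100) process.stdout.write('\n')
}
```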
@@ -307,6 +307,10 @@ function(ggml_add_cpu_backend_variant tag_name)
307
307
  foreach (feat ${ARGN})
308
308
  set(GGML_INTERNAL_${feat} ON)
309
309
  endforeach()
310
+ elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
311
+ foreach (feat ${ARGN})
312
+ set(GGML_INTERNAL_${feat} ON)
313
+ endforeach()
310
314
  endif()
311
315
 
312
316
  ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -371,6 +375,14 @@ if (GGML_CPU_ALL_VARIANTS)
371
375
  else()
372
376
  message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
373
377
  endif()
378
+ elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
379
+ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
380
+ ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
381
+ # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
382
+ # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
383
+ else()
384
+ message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
385
+ endif()
374
386
  else()
375
387
  message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
376
388
  endif()
@@ -466,29 +466,45 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
466
466
  list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
467
467
  elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
468
468
  message(STATUS "s390x detected")
469
- list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
470
- file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
471
- string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
472
-
473
- # TODO: Separation to determine activation of VX/VXE/VXE2
474
- if (${S390X_M} MATCHES "8561|8562")
475
- message(STATUS "z15 target")
476
- list(APPEND ARCH_FLAGS -march=z15)
477
- elseif (${S390X_M} MATCHES "3931")
478
- message(STATUS "z16 target")
479
- list(APPEND ARCH_FLAGS -march=z16)
480
- elseif (${S390X_M} MATCHES "9175|9176")
481
- # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
482
- # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
483
- message(STATUS "z17 target")
484
- list(APPEND ARCH_FLAGS -march=arch15)
485
- else()
486
- message(STATUS "Unknown target")
487
- message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
488
- list(APPEND ARCH_FLAGS -march=native -mtune=native)
469
+ list(APPEND GGML_CPU_SOURCES
470
+ ggml-cpu/arch/s390/quants.c)
471
+
472
+ # for native compilation
473
+ if (GGML_NATIVE)
474
+ # check machine level to determine target
475
+ file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
476
+ string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
477
+
478
+ # TODO: Separation to determine activation of VX/VXE/VXE2
479
+ if (${S390X_M} MATCHES "8561|8562")
480
+ message(STATUS "z15 target")
481
+ list(APPEND ARCH_FLAGS -march=z15)
482
+ elseif (${S390X_M} MATCHES "3931")
483
+ message(STATUS "z16 target")
484
+ list(APPEND ARCH_FLAGS -march=z16)
485
+ elseif (${S390X_M} MATCHES "9175|9176")
486
+ # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
487
+ # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
488
+ message(STATUS "z17 target")
489
+ list(APPEND ARCH_FLAGS -march=arch15)
490
+ else()
491
+ message(STATUS "Unknown target")
492
+ message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
493
+ list(APPEND ARCH_FLAGS -march=native -mtune=native)
494
+ endif()
495
+ # for cross-compilation
496
+ elseif(GGML_CPU_ALL_VARIANTS)
497
+ # range through IBM z15 to z17
498
+ # NOTE: update when a new hardware level is released
499
+ foreach (ZHW RANGE 15 17)
500
+ if(DEFINED GGML_INTERNAL_Z${ZHW})
501
+ message(STATUS "z${ZHW} cross-compile target")
502
+ list(APPEND ARCH_FLAGS -march=z${ZHW})
503
+ endif()
504
+ endforeach()
489
505
  endif()
490
506
 
491
- if (GGML_VXE)
507
+ if (GGML_VXE OR GGML_INTERNAL_VXE)
492
508
  message(STATUS "VX/VXE/VXE2 enabled")
493
509
  list(APPEND ARCH_FLAGS -mvx -mzvector)
494
510
  list(APPEND ARCH_DEFINITIONS GGML_VXE)
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
85
85
  { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
86
86
  { LLM_ARCH_PLM, "plm" },
87
87
  { LLM_ARCH_BAILINGMOE, "bailingmoe" },
88
+ { LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
88
89
  { LLM_ARCH_DOTS1, "dots1" },
89
90
  { LLM_ARCH_ARCEE, "arcee" },
90
91
  { LLM_ARCH_ERNIE4_5, "ernie4_5" },
@@ -135,6 +136,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
135
136
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
136
137
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
137
138
  { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
139
+ { LLM_KV_EXPERT_GROUP_COUNT, "%s.expert_group_count" },
140
+ { LLM_KV_EXPERT_GROUP_USED_COUNT, "%s.expert_group_used_count" },
138
141
  { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
139
142
  { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
140
143
  { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
@@ -1946,6 +1949,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1946
1949
  { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
1947
1950
  },
1948
1951
  },
1952
+ {
1953
+ LLM_ARCH_BAILINGMOE2,
1954
+ {
1955
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1956
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1957
+ { LLM_TENSOR_OUTPUT, "output" },
1958
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1959
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
1960
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
1961
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
1962
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1963
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1964
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
1965
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1966
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1967
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1968
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1969
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1970
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1971
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1972
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
1973
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
1974
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
1975
+ { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
1976
+ { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
1977
+ { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
1978
+ { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
1979
+ { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
1980
+ { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
1981
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
1982
+ },
1983
+ },
1949
1984
  {
1950
1985
  LLM_ARCH_DOTS1,
1951
1986
  {
@@ -89,6 +89,7 @@ enum llm_arch {
89
89
  LLM_ARCH_WAVTOKENIZER_DEC,
90
90
  LLM_ARCH_PLM,
91
91
  LLM_ARCH_BAILINGMOE,
92
+ LLM_ARCH_BAILINGMOE2,
92
93
  LLM_ARCH_DOTS1,
93
94
  LLM_ARCH_ARCEE,
94
95
  LLM_ARCH_ERNIE4_5,
@@ -139,6 +140,8 @@ enum llm_kv {
139
140
  LLM_KV_EXPERT_COUNT,
140
141
  LLM_KV_EXPERT_USED_COUNT,
141
142
  LLM_KV_EXPERT_SHARED_COUNT,
143
+ LLM_KV_EXPERT_GROUP_COUNT,
144
+ LLM_KV_EXPERT_GROUP_USED_COUNT,
142
145
  LLM_KV_EXPERT_WEIGHTS_SCALE,
143
146
  LLM_KV_EXPERT_WEIGHTS_NORM,
144
147
  LLM_KV_EXPERT_GATING_FUNC,
@@ -123,7 +123,7 @@ private:
123
123
  uint32_t n_seq_max;
124
124
  uint32_t n_outputs;
125
125
 
126
- std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
126
+ std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id
127
127
 
128
128
  std::vector<llama_pos> pos;
129
129
  std::vector<int32_t> n_seq_id;
@@ -63,6 +63,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
63
63
  { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
64
64
  { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
65
65
  { "bailing", LLM_CHAT_TEMPLATE_BAILING },
66
+ { "bailing-think", LLM_CHAT_TEMPLATE_BAILING_THINK },
67
+ { "bailing2", LLM_CHAT_TEMPLATE_BAILING2 },
66
68
  { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
67
69
  { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
68
70
  { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
@@ -191,6 +193,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
191
193
  return LLM_CHAT_TEMPLATE_YANDEX;
192
194
  } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
193
195
  return LLM_CHAT_TEMPLATE_BAILING;
196
+ } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("\"HUMAN\"") && tmpl_contains("<think>")) {
197
+ return LLM_CHAT_TEMPLATE_BAILING_THINK;
198
+ } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("<role>HUMAN</role>") && tmpl_contains("<|role_end|>")) {
199
+ return LLM_CHAT_TEMPLATE_BAILING2;
194
200
  } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
195
201
  return LLM_CHAT_TEMPLATE_LLAMA4;
196
202
  } else if (tmpl_contains("<|endofuserprompt|>")) {
@@ -644,8 +650,8 @@ int32_t llm_chat_apply_template(
644
650
  if (add_ass) {
645
651
  ss << " Ассистент:[SEP]";
646
652
  }
647
- } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
648
- // Bailing (Ling) template
653
+ } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
654
+ // Bailing (Ling/Ring) template
649
655
  for (auto message : chat) {
650
656
  std::string role(message->role);
651
657
 
@@ -658,6 +664,33 @@ int32_t llm_chat_apply_template(
658
664
  ss << "<role>" << role << "</role>" << message->content;
659
665
  }
660
666
 
667
+ if (add_ass) {
668
+ ss << "<role>ASSISTANT</role>";
669
+
670
+ if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
671
+ ss << "<think>";
672
+ }
673
+ }
674
+ } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
675
+ // Bailing2 (Ling 2.0) template
676
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
677
+
678
+ if (!has_system) {
679
+ ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
680
+ }
681
+
682
+ for (auto message : chat) {
683
+ std::string role(message->role);
684
+
685
+ if (role == "user") {
686
+ role = "HUMAN";
687
+ } else {
688
+ std::transform(role.begin(), role.end(), role.begin(), ::toupper);
689
+ }
690
+
691
+ ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
692
+ }
693
+
661
694
  if (add_ass) {
662
695
  ss << "<role>ASSISTANT</role>";
663
696
  }
@@ -42,6 +42,8 @@ enum llm_chat_template {
42
42
  LLM_CHAT_TEMPLATE_MEGREZ,
43
43
  LLM_CHAT_TEMPLATE_YANDEX,
44
44
  LLM_CHAT_TEMPLATE_BAILING,
45
+ LLM_CHAT_TEMPLATE_BAILING_THINK,
46
+ LLM_CHAT_TEMPLATE_BAILING2,
45
47
  LLM_CHAT_TEMPLATE_LLAMA4,
46
48
  LLM_CHAT_TEMPLATE_SMOLVLM,
47
49
  LLM_CHAT_TEMPLATE_DOTS1,
@@ -2346,7 +2346,8 @@ llama_context * llama_init_from_model(
2346
2346
  return nullptr;
2347
2347
  }
2348
2348
 
2349
- if (params.pooling_type != model->hparams.pooling_type) {
2349
+ if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
2350
+ params.pooling_type != model->hparams.pooling_type) {
2350
2351
  //user-specified pooling-type is different from the model default
2351
2352
  LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
2352
2353
  model->hparams.pooling_type, params.pooling_type);
@@ -950,6 +950,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
950
950
  cb(selection_probs, "ffn_moe_probs_biased", il);
951
951
  }
952
952
 
953
+ // select top n_group_used expert groups
954
+ // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
955
+ if (hparams.n_expert_groups > 1 && n_tokens > 0) {
956
+ const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
957
+
958
+ // organize experts into n_expert_groups
959
+ ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
960
+
961
+ ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
962
+ group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
963
+
964
+ // get top n_group_used expert groups
965
+ group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
966
+ group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
967
+
968
+ ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
969
+ cb(expert_groups, "ffn_moe_group_topk", il);
970
+
971
+ // mask out the other groups
972
+ selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
973
+ selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
974
+ selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
975
+ cb(selection_probs, "ffn_moe_probs_masked", il);
976
+ }
977
+
953
978
  // select experts
954
979
  ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
955
980
  cb(selected_experts->src[0], "ffn_moe_argsort", il);
@@ -981,6 +1006,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
981
1006
  ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
982
1007
  cb(weights_sum, "ffn_moe_weights_sum", il);
983
1008
 
1009
+ if (arch == LLM_ARCH_BAILINGMOE2) {
1010
+ weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
1011
+ cb(weights_sum, "ffn_moe_weights_sum_biased", il);
1012
+ }
1013
+
984
1014
  weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
985
1015
  cb(weights, "ffn_moe_weights_norm", il);
986
1016
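The new block in `build_moe_ffn` adds DeepSeek-V3-style group-limited routing for models that define more than one expert group (here `bailingmoe2`): each group of experts is scored by the sum of its two largest selection probabilities, only the top `n_group_used` groups are kept, and experts in the remaining groups are masked to negative infinity before the usual top-k expert selection; the later `1e-20` bias on `weights_sum` simply guards against a zero denominator when normalizing the selected weights. A hedged summary of the group-selection step in math form (the notation is mine, not from the source):

```latex
% p_{e,t}: selection probability of expert e for token t,
% G = n_expert_groups, each group containing m = n_expert / G experts.
\[
s_{g,t} \;=\; \sum_{e \,\in\, \mathrm{top2}\{\, p_{e,t} \,:\, e \in \text{group } g \,\}} p_{e,t},
\qquad
\mathcal{G}_t \;=\; \operatorname{top}_{n\_group\_used}\bigl(\, s_{1,t}, \dots, s_{G,t} \,\bigr)
\]
\[
\tilde{p}_{e,t} \;=\;
\begin{cases}
p_{e,t} & \text{if } \operatorname{group}(e) \in \mathcal{G}_t,\\[2pt]
-\infty & \text{otherwise,}
\end{cases}
\qquad
\text{selected experts} \;=\; \operatorname{top}_{n\_expert\_used}\bigl(\tilde{p}_{\cdot,t}\bigr).
\]
```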
 
@@ -72,6 +72,8 @@ struct llama_hparams {
72
72
  uint32_t n_ff_chexp = 0;
73
73
  uint32_t n_expert_shared = 0;
74
74
  uint32_t n_norm_groups = 0;
75
+ uint32_t n_expert_groups = 0;
76
+ uint32_t n_group_used = 0;
75
77
  uint32_t n_group_experts = 0;
76
78
 
77
79
  float expert_group_scale = 0.05f;
@@ -114,9 +114,12 @@ const char * llm_type_name(llm_type type) {
114
114
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
115
115
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
116
116
  case LLM_TYPE_A13B: return "A13B";
117
+ case LLM_TYPE_7B_A1B: return "7B.A1B";
117
118
  case LLM_TYPE_8B_A1B: return "8B.A1B";
119
+ case LLM_TYPE_16B_A1B: return "16B.A1B";
118
120
  case LLM_TYPE_21B_A3B: return "21B.A3B";
119
121
  case LLM_TYPE_30B_A3B: return "30B.A3B";
122
+ case LLM_TYPE_100B_A6B: return "100B.A6B";
120
123
  case LLM_TYPE_106B_A12B: return "106B.A12B";
121
124
  case LLM_TYPE_235B_A22B: return "235B.A22B";
122
125
  case LLM_TYPE_300B_A47B: return "300B.A47B";
@@ -480,11 +483,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
480
483
  return;
481
484
  }
482
485
 
483
- ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
484
- ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
485
- ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
486
- ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
487
- ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
486
+ ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
487
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
488
+ ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
489
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
490
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
491
+ ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
492
+ ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
488
493
 
489
494
  if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
490
495
  ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -500,8 +505,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
500
505
  GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
501
506
  if (hparams.n_expert > 0) {
502
507
  GGML_ASSERT(hparams.n_expert_used > 0);
508
+ GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
509
+ if (hparams.n_expert_groups > 1) {
510
+ GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
511
+ GGML_ASSERT(hparams.n_group_used > 0);
512
+ GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
513
+ }
503
514
  } else {
504
515
  GGML_ASSERT(hparams.n_expert_used == 0);
516
+ GGML_ASSERT(hparams.n_expert_groups == 0);
505
517
  }
506
518
 
507
519
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
@@ -1843,8 +1855,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1843
1855
 
1844
1856
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1845
1857
 
1846
- switch (hparams.n_layer) {
1847
- // TODO: Add llm type label (not sure this is useful)
1858
+ switch (hparams.n_embd) {
1859
+ case 1536: type = LLM_TYPE_7B_A1B; break;
1860
+ case 2048: case 2560: type = LLM_TYPE_3B; break;
1861
+ case 4096: type = LLM_TYPE_32B; break;
1848
1862
  default: type = LLM_TYPE_UNKNOWN;
1849
1863
  }
1850
1864
 
@@ -1885,6 +1899,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1885
1899
  default: type = LLM_TYPE_UNKNOWN;
1886
1900
  }
1887
1901
  } break;
1902
+ case LLM_ARCH_BAILINGMOE2:
1903
+ {
1904
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1905
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1906
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1907
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
1908
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1909
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1910
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1911
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
1912
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1913
+
1914
+ // TODO: when MTP is implemented, this should probably be updated if needed
1915
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1916
+
1917
+ switch (hparams.n_layer) {
1918
+ case 20: type = LLM_TYPE_16B_A1B; break;
1919
+ case 21: type = LLM_TYPE_16B_A1B; break;
1920
+ case 32: type = LLM_TYPE_100B_A6B; break;
1921
+ case 33: type = LLM_TYPE_100B_A6B; break;
1922
+ default: type = LLM_TYPE_UNKNOWN;
1923
+ }
1924
+ } break;
1888
1925
  case LLM_ARCH_DOTS1:
1889
1926
  {
1890
1927
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5495,6 +5532,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5495
5532
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
5496
5533
  }
5497
5534
  } break;
5535
+ case LLM_ARCH_BAILINGMOE2:
5536
+ {
5537
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5538
+ const int64_t n_expert_shared = hparams.n_expert_shared;
5539
+
5540
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5541
+
5542
+ // output
5543
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5544
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5545
+
5546
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
5547
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
5548
+
5549
+ for (int i = 0; i < n_layer; ++i) {
5550
+ int flags = 0;
5551
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5552
+ // skip all tensors in the NextN layers
5553
+ flags |= TENSOR_SKIP;
5554
+ }
5555
+
5556
+ auto & layer = layers[i];
5557
+
5558
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5559
+
5560
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
5561
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
5562
+
5563
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
5564
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
5565
+
5566
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5567
+
5568
+ if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
5569
+ const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
5570
+
5571
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
5572
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
5573
+
5574
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
5575
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
5576
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
5577
+
5578
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5579
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
5580
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5581
+ } else { // Dense layers
5582
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
5583
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
5584
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
5585
+ }
5586
+
5587
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5588
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5589
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5590
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
5591
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5592
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5593
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
5594
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
5595
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
5596
+ }
5597
+ }
5598
+ } break;
5498
5599
  case LLM_ARCH_DOTS1:
5499
5600
  {
5500
5601
  const int64_t n_ff_exp = hparams.n_ff_exp;
@@ -6350,6 +6451,19 @@ void llama_model::print_info() const {
6350
6451
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
6351
6452
  }
6352
6453
 
6454
+ if (arch == LLM_ARCH_BAILINGMOE2) {
6455
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
6456
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6457
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6458
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
6459
+ LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
6460
+ LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
6461
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
6462
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
6463
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6464
+ LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
6465
+ }
6466
+
6353
6467
  if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
6354
6468
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6355
6469
  LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
@@ -17039,6 +17153,150 @@ struct llm_build_bailingmoe : public llm_graph_context {
17039
17153
  }
17040
17154
  };
17041
17155
 
17156
+ struct llm_build_bailingmoe2 : public llm_graph_context {
17157
+ llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17158
+ const int64_t n_embd_head = hparams.n_embd_head_v;
17159
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
17160
+
17161
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
17162
+
17163
+ ggml_tensor * cur;
17164
+ ggml_tensor * inpL;
17165
+
17166
+ inpL = build_inp_embd(model.tok_embd);
17167
+
17168
+ // inp_pos - contains the positions
17169
+ ggml_tensor * inp_pos = build_inp_pos();
17170
+
17171
+ auto * inp_attn = build_attn_inp_kv();
17172
+
17173
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
17174
+
17175
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
17176
+ for (int il = 0; il < n_transformer_layers; ++il) {
17177
+ ggml_tensor * inpSA = inpL;
17178
+
17179
+ // norm
17180
+ cur = build_norm(inpL,
17181
+ model.layers[il].attn_norm, NULL,
17182
+ LLM_NORM_RMS, il);
17183
+ cb(cur, "attn_norm", il);
17184
+
17185
+ // self_attention
17186
+ {
17187
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
17188
+ cb(cur, "wqkv", il);
17189
+
17190
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
17191
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
17192
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
17193
+
17194
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
17195
+ cb(Qcur, "Qcur_normed", il);
17196
+
17197
+ Qcur = ggml_rope_ext(
17198
+ ctx0, Qcur, inp_pos, nullptr,
17199
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17200
+ ext_factor, attn_factor, beta_fast, beta_slow
17201
+ );
17202
+
17203
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
17204
+ cb(Kcur, "Kcur_normed", il);
17205
+
17206
+ Kcur = ggml_rope_ext(
17207
+ ctx0, Kcur, inp_pos, nullptr,
17208
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17209
+ ext_factor, attn_factor, beta_fast, beta_slow
17210
+ );
17211
+
17212
+ cb(Qcur, "Qcur", il);
17213
+ cb(Kcur, "Kcur", il);
17214
+ cb(Vcur, "Vcur", il);
17215
+
17216
+ cur = build_attn(inp_attn,
17217
+ model.layers[il].wo, model.layers[il].bo,
17218
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
17219
+ }
17220
+
17221
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
17222
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
17223
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
17224
+ }
17225
+
17226
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
17227
+ cb(sa_out, "sa_out", il);
17228
+
17229
+ // MoE branch
17230
+ cur = build_norm(sa_out,
17231
+ model.layers[il].ffn_norm, NULL,
17232
+ LLM_NORM_RMS, il);
17233
+ cb(cur, "ffn_norm", il);
17234
+
17235
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
17236
+ cur = build_ffn(cur,
17237
+ model.layers[il].ffn_up, NULL, NULL,
17238
+ model.layers[il].ffn_gate, NULL, NULL,
17239
+ model.layers[il].ffn_down, NULL, NULL,
17240
+ NULL,
17241
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
17242
+ cb(cur, "ffn_out", il);
17243
+ } else {
17244
+ ggml_tensor * moe_out =
17245
+ build_moe_ffn(cur,
17246
+ model.layers[il].ffn_gate_inp,
17247
+ model.layers[il].ffn_up_exps,
17248
+ model.layers[il].ffn_gate_exps,
17249
+ model.layers[il].ffn_down_exps,
17250
+ model.layers[il].ffn_exp_probs_b,
17251
+ n_expert, n_expert_used,
17252
+ LLM_FFN_SILU, hparams.expert_weights_norm,
17253
+ true, hparams.expert_weights_scale,
17254
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
17255
+ il);
17256
+ cb(moe_out, "ffn_moe_out", il);
17257
+
17258
+ {
17259
+ ggml_tensor * ffn_shexp = build_ffn(cur,
17260
+ model.layers[il].ffn_up_shexp, NULL, NULL,
17261
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
17262
+ model.layers[il].ffn_down_shexp, NULL, NULL,
17263
+ NULL,
17264
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
17265
+ cb(ffn_shexp, "ffn_shexp", il);
17266
+
17267
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
17268
+ cb(cur, "ffn_out", il);
17269
+ }
17270
+ }
17271
+
17272
+ cur = ggml_add(ctx0, cur, sa_out);
17273
+
17274
+ cur = build_cvec(cur, il);
17275
+ cb(cur, "l_out", il);
17276
+
17277
+ // input for next layer
17278
+ inpL = cur;
17279
+ }
17280
+
17281
+ cur = inpL;
17282
+
17283
+ cur = build_norm(cur,
17284
+ model.output_norm, NULL,
17285
+ LLM_NORM_RMS, -1);
17286
+
17287
+ cb(cur, "result_norm", -1);
17288
+ res->t_embd = cur;
17289
+
17290
+ // lm_head
17291
+ cur = build_lora_mm(model.output, cur);
17292
+
17293
+ cb(cur, "result_output", -1);
17294
+ res->t_logits = cur;
17295
+
17296
+ ggml_build_forward_expand(gf, cur);
17297
+ }
17298
+ };
17299
+
17042
17300
  struct llm_build_dots1 : public llm_graph_context {
17043
17301
  llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17044
17302
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -19835,6 +20093,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
19835
20093
  {
19836
20094
  llm = std::make_unique<llm_build_bailingmoe>(*this, params);
19837
20095
  } break;
20096
+ case LLM_ARCH_BAILINGMOE2:
20097
+ {
20098
+ llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
20099
+ } break;
19838
20100
  case LLM_ARCH_SEED_OSS:
19839
20101
  {
19840
20102
  llm = std::make_unique<llm_build_seed_oss>(*this, params);
@@ -20101,6 +20363,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
20101
20363
  case LLM_ARCH_EXAONE:
20102
20364
  case LLM_ARCH_EXAONE4:
20103
20365
  case LLM_ARCH_MINICPM3:
20366
+ case LLM_ARCH_BAILINGMOE2:
20104
20367
  case LLM_ARCH_DOTS1:
20105
20368
  case LLM_ARCH_HUNYUAN_MOE:
20106
20369
  case LLM_ARCH_OPENAI_MOE:
@@ -107,9 +107,12 @@ enum llm_type {
107
107
  LLM_TYPE_17B_16E, // llama4 Scout
108
108
  LLM_TYPE_17B_128E, // llama4 Maverick
109
109
  LLM_TYPE_A13B,
110
+ LLM_TYPE_7B_A1B,
110
111
  LLM_TYPE_8B_A1B, // lfm2moe
112
+ LLM_TYPE_16B_A1B,
111
113
  LLM_TYPE_21B_A3B, // Ernie MoE small
112
114
  LLM_TYPE_30B_A3B,
115
+ LLM_TYPE_100B_A6B,
113
116
  LLM_TYPE_106B_A12B, // GLM-4.5-Air
114
117
  LLM_TYPE_235B_A22B,
115
118
  LLM_TYPE_300B_A47B, // Ernie MoE big
@@ -1968,6 +1968,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1968
1968
  clean_spaces = false;
1969
1969
  } else if (
1970
1970
  tokenizer_pre == "bailingmoe" ||
1971
+ tokenizer_pre == "bailingmoe2" ||
1971
1972
  tokenizer_pre == "llada-moe") {
1972
1973
  pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
1973
1974
  clean_spaces = false;