@fugood/llama.node 1.3.7 → 1.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -397,119 +397,118 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 }
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
-#if defined(GGML_SIMD)
-#if defined(__ARM_FEATURE_SVE)
-    const int sve_register_length = svcntb() * 8;
-    const int ggml_f16_epr = sve_register_length / 16;
-    const int ggml_f16_step = 8 * ggml_f16_epr;
+#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr = sve_register_length / 16;
+    const int ggml_f16_step = 8 * ggml_f16_epr;
 
-    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
 
-    const int np= (n & ~(ggml_f16_step - 1));
+    int np = (n & ~(ggml_f16_step - 1));
 
-    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
-    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
-    for (int i = 0; i < np; i += ggml_f16_step) {
-        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
-        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
-        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
+    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
 
-        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
+        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
 
-        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
-        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
-        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
+        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
 
-        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
+        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
 
-        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
-        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
-        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
+        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
 
-        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
+        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
 
-        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
-        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
-        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
+        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
 
-        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
+        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
 
-        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
-        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
-        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
+        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
 
-        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
+        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
 
-        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
-        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
-        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
+        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
 
-        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
+        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
 
-        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
-        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
-        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
+        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
 
-        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
+        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
 
-        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
-        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
-        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
+        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
 
-        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
-    }
-    const int np2 = (n & ~(ggml_f16_epr - 1));
-    for (int k = np; k < np2; k += ggml_f16_epr) {
-        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
-        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
-        ry = GGML_F16x_VEC_FMA(ry, rx, vx);
-
-        GGML_F16x_VEC_STORE(y + k, ry, 0);
-    }
-
-    if (np2 < n) {
-        svbool_t pg = svwhilelt_b16(np2, n);
-        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
-        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
-        hy = svmad_f16_x(pg, hx, vx, hy);
-        svst1_f16(pg, (__fp16 *)(y + np2), hy);
-    }
+        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
+    }
+    const int np2 = (n & ~(ggml_f16_epr - 1));
+    for (int k = np; k < np2; k += ggml_f16_epr) {
+        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+        ry = GGML_F16x_VEC_FMA(ry, rx, vx);
 
-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
-    }
-#else
-    const int np = (n & ~(GGML_F16_STEP - 1));
+        GGML_F16x_VEC_STORE(y + k, ry, 0);
+    }
 
-    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+    if (np2 < n) {
+        svbool_t pg = svwhilelt_b16(np2, n);
+        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+        hy = svmad_f16_x(pg, hx, vx, hy);
+        svst1_f16(pg, (__fp16 *)(y + np2), hy);
+    }
+    np = n;
+#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
+    const int np = n;
+    _Float16 hv = (_Float16)v;
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e16m8(n - i);
+        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
+        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
+        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
+        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
+    }
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
 
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
 
-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-        }
-    }
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
 
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
     }
-#endif
+    }
 #else
-    // scalar
-    for (int i = 0; i < n; ++i) {
+    const int np = 0;
+#endif
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
-#endif
 }
 
 // xs and vs are byte strides of x and v
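Note: every branch in the hunk above implements the same contract, y[i] += x[i] * v over fp16 data; the refactor only changes how the bulk of the vector is handled (SVE, RISC-V zvfh, generic GGML SIMD, or scalar) before the now-shared leftover loop. A minimal scalar sketch of that contract, mirroring the leftover loop in the hunk (the helper name is illustrative, not part of ggml; it assumes the same ggml CPU fp16 conversion macros as the surrounding file):

// Reference semantics of ggml_vec_mad_f16: y := y + x*v, element-wise, in fp16.
static void vec_mad_f16_ref(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float v) {
    for (int i = 0; i < n; ++i) {
        // convert to fp32, fuse multiply-add, convert back to fp16
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
}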
@@ -246,6 +246,21 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_STR,
     };
 
+    enum llama_model_meta_key {
+        LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
+        LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
+    };
+
     struct llama_model_kv_override {
         enum llama_model_kv_override_type tag;
 
@@ -518,6 +533,9 @@ extern "C" {
     // Get the number of metadata key/value pairs
    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
 
+    // Get sampling metadata key name. Returns nullptr if the key is invalid
+    LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
+
     // Get metadata key name by index
     LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
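Note: the new llama_model_meta_key_str() only maps the enum to its GGUF key string; the value itself still comes through the existing metadata accessors such as llama_model_meta_val_str(). A minimal sketch of reading a model-suggested sampling default with it (read_default_temp is a hypothetical helper, not part of the API; error handling kept minimal):

#include <stdlib.h> // strtof
#include "llama.h"

// Fetch the model's suggested sampling temperature, if the GGUF file carries one.
static bool read_default_temp(const struct llama_model * model, float * out) {
    const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP);
    char buf[64];
    if (key == NULL || llama_model_meta_val_str(model, key, buf, sizeof(buf)) < 0) {
        return false; // invalid key, or the model has no such metadata entry
    }
    *out = strtof(buf, NULL);
    return true;
}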
@@ -115,6 +115,7 @@ add_library(llama
             models/qwen3vl-moe.cpp
             models/qwen3moe.cpp
             models/refact.cpp
+            models/rnd1.cpp
             models/rwkv6-base.cpp
             models/rwkv6.cpp
             models/rwkv6qwen2.cpp
@@ -108,24 +108,37 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_APERTUS, "apertus" },
     { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
     { LLM_ARCH_COGVLM, "cogvlm" },
+    { LLM_ARCH_RND1, "rnd1" },
     { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_TYPE, "general.type" },
-    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
-    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
-    { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
-    { LLM_KV_GENERAL_NAME, "general.name" },
-    { LLM_KV_GENERAL_AUTHOR, "general.author" },
-    { LLM_KV_GENERAL_VERSION, "general.version" },
-    { LLM_KV_GENERAL_URL, "general.url" },
-    { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
-    { LLM_KV_GENERAL_LICENSE, "general.license" },
-    { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
-    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
+    { LLM_KV_GENERAL_TYPE, "general.type" },
+    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+    { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
+    { LLM_KV_GENERAL_SAMPLING_SEQUENCE, "general.sampling.sequence" },
+    { LLM_KV_GENERAL_SAMPLING_TOP_K, "general.sampling.top_k" },
+    { LLM_KV_GENERAL_SAMPLING_TOP_P, "general.sampling.top_p" },
+    { LLM_KV_GENERAL_SAMPLING_MIN_P, "general.sampling.min_p" },
+    { LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability" },
+    { LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, "general.sampling.xtc_threshold" },
+    { LLM_KV_GENERAL_SAMPLING_TEMP, "general.sampling.temp" },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, "general.sampling.penalty_last_n" },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, "general.sampling.penalty_repeat" },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT, "general.sampling.mirostat" },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, "general.sampling.mirostat_tau" },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, "general.sampling.mirostat_eta" },
+    { LLM_KV_GENERAL_NAME, "general.name" },
+    { LLM_KV_GENERAL_AUTHOR, "general.author" },
+    { LLM_KV_GENERAL_VERSION, "general.version" },
+    { LLM_KV_GENERAL_URL, "general.url" },
+    { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+    { LLM_KV_GENERAL_LICENSE, "general.license" },
+    { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
     { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
@@ -2446,6 +2459,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
         },
     },
+    {
+        LLM_ARCH_RND1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2722,6 +2755,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             return true;
         default:
             return false;
@@ -112,6 +112,7 @@ enum llm_arch {
     LLM_ARCH_APERTUS,
     LLM_ARCH_MINIMAX_M2,
     LLM_ARCH_COGVLM,
+    LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_UNKNOWN,
 };
@@ -122,6 +123,18 @@ enum llm_kv {
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_FILE_TYPE,
+    LLM_KV_GENERAL_SAMPLING_SEQUENCE,
+    LLM_KV_GENERAL_SAMPLING_TOP_K,
+    LLM_KV_GENERAL_SAMPLING_TOP_P,
+    LLM_KV_GENERAL_SAMPLING_MIN_P,
+    LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
+    LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
+    LLM_KV_GENERAL_SAMPLING_TEMP,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,
@@ -1248,7 +1248,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // make the outputs have the same order they had in the user-provided batch
     // note: this is mostly relevant for recurrent models atm
-    if (!sorted_output) {
+    if (!sorted_output && n_outputs > 1) {
         GGML_ASSERT((size_t) n_outputs == out_ids.size());
 
         // TODO: is there something more efficient which also minimizes swaps?
@@ -961,14 +961,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         // organize experts into n_expert_groups
         ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
 
-        ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+        ggml_tensor * group_scores = ggml_argsort_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
         group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
 
         // get top n_group_used expert groups
         group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
         group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
 
-        ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+        ggml_tensor * expert_groups = ggml_argsort_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
         cb(expert_groups, "ffn_moe_group_topk", il);
 
         // mask out the other groups
@@ -979,7 +979,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 
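Note: in all three call sites the intent is the same: take the indices of the k highest-scoring experts (or expert groups) per token, which build_moe_ffn then feeds to ggml_get_rows. An illustrative CPU sketch of that argsort-then-truncate selection (the helper is hypothetical and is not the ggml kernel):

#include <algorithm>
#include <numeric>
#include <vector>

// Return indices of the k largest scores, via a full descending argsort.
static std::vector<int> argsort_top_k(const std::vector<float> & scores, int k) {
    std::vector<int> idx(scores.size());
    std::iota(idx.begin(), idx.end(), 0); // 0, 1, ..., n-1
    std::sort(idx.begin(), idx.end(),
              [&](int a, int b) { return scores[a] > scores[b]; });
    idx.resize(std::min<size_t>((size_t) k, idx.size()));
    return idx;
}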
@@ -1036,6 +1036,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_RND1:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+                // Set non-causal attention for diffusion models
+                hparams.causal_attn = false;
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -3402,6 +3414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_QWEN3VLMOE:
+        case LLM_ARCH_RND1:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6720,7 +6733,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
-    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
+    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
@@ -6882,6 +6895,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             {
                 res = nullptr;
             } break;
@@ -7075,6 +7089,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_llada_moe>(*this, params);
             }
             break;
+        case LLM_ARCH_RND1:
+            {
+                llm = std::make_unique<llm_build_rnd1>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -7595,6 +7614,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
@@ -7667,6 +7687,24 @@ int32_t llama_model_meta_count(const llama_model * model) {
     return (int)model->gguf_kv.size();
 }
 
+const char * llama_model_meta_key_str(llama_model_meta_key key) {
+    switch (key) {
+        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta";
+        default: return nullptr;
+    }
+}
+
 int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
@@ -431,6 +431,10 @@ struct llm_build_refact : public llm_graph_context {
     llm_build_refact(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_rnd1 : public llm_graph_context {
+    llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_rwkv6 : public llm_build_rwkv6_base {
     llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
 };
@@ -0,0 +1,126 @@
+#include "models.h"
+
+// RND1 is a Qwen3Moe AR model converted to diffusion model.
+llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // Non-causal attention for diffusion
+    auto * inp_attn = build_attn_inp_no_cache();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // MoE branch
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        ggml_tensor * moe_out =
+            build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+        cb(moe_out, "ffn_moe_out", il);
+        cur = moe_out;
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}