@fugood/llama.node 1.4.7 → 1.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +23 -24
- package/src/LlamaContext.cpp +4 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +470 -223
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +44 -17
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +67 -54
- package/src/llama.cpp/common/sampling.h +8 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +110 -49
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +665 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/src/llama-model-loader.cpp:

@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;
@@ -503,7 +504,7 @@ llama_model_loader::llama_model_loader(
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));

-    files.emplace_back(new llama_file(fname.c_str(), "rb"));
+    files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
     contexts.emplace_back(ctx);

     // Save tensors data offset of the main file.
@@ -571,7 +572,7 @@ llama_model_loader::llama_model_loader(
             }
         }

-        files.emplace_back(new llama_file(fname_split, "rb"));
+        files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
         contexts.emplace_back(ctx);

         // Save tensors data offset info of the shard.
@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(

     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
+    this->no_alloc = no_alloc;
 }

 std::string llama_model_loader::get_arch_name() const {
@@ -933,7 +935,15 @@ bool llama_model_loader::load_all_data(
     // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
     // NVMe raid configurations might require more / larger buffers.
     constexpr size_t n_buffers = 4;
-
+
+    size_t alignment = 1;
+    for (const auto & file : files) {
+        alignment = std::max(file->read_alignment(), alignment);
+    }
+
+    // Buffer size: balance between memory usage and I/O efficiency
+    // 64MB works well for NVMe drives
+    const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;

     std::vector<ggml_backend_buffer_t> host_buffers;
     std::vector<ggml_backend_event_t> events;
@@ -983,6 +993,7 @@ bool llama_model_loader::load_all_data(
         // If the backend is supported, create pinned memory buffers and events for synchronisation.
         for (size_t idx = 0; idx < n_buffers; ++idx) {
             auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+
             if (!buf) {
                 LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                     ggml_backend_dev_name(dev));
@@ -1064,9 +1075,9 @@ bool llama_model_loader::load_all_data(
             }
         } else {
             const auto & file = files.at(weight->idx);
+
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->
-                file->read_raw(cur->data, n_size);
+                file->read_raw_at(cur->data, n_size, weight->offs);
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1075,26 +1086,60 @@ bool llama_model_loader::load_all_data(
             } else {
                 // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                 if (upload_backend) {
-
+                    auto offset = (off_t) weight->offs;
+                    alignment = file->read_alignment();
+                    off_t aligned_offset = offset & ~(alignment - 1);
+                    off_t offset_from_alignment = offset - aligned_offset;
+                    file->seek(aligned_offset, SEEK_SET);
+
+                    // Calculate aligned read boundaries
+                    size_t read_start = aligned_offset;
+                    size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);

                     size_t bytes_read = 0;
+                    size_t data_read = 0; // Actual tensor data copied (excluding padding)
+
+                    while (bytes_read < read_end - read_start) {
+                        size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);

-
-
+                        // Align the destination pointer within the pinned buffer
+                        uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);

+                        // Wait for previous upload to complete before reusing buffer
                         ggml_backend_event_synchronize(events[buffer_idx]);
-
-
+
+                        // Read aligned chunk from file
+                        file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+
+                        // Calculate actual data portion (excluding alignment padding)
+                        uintptr_t ptr_data = ptr_dest_aligned;
+                        size_t data_to_copy = read_size;
+
+                        // Skip alignment padding at start of first chunk
+                        if (bytes_read == 0) {
+                            ptr_data += offset_from_alignment;
+                            data_to_copy -= offset_from_alignment;
+                        }
+
+                        // Trim alignment padding at end of last chunk
+                        if (aligned_offset + bytes_read + read_size > offset + n_size) {
+                            data_to_copy -= (read_end - (offset + n_size));
+                        }
+
+                        // Async upload actual data to GPU
+                        ggml_backend_tensor_set_async(upload_backend, cur,
+                            reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
                         ggml_backend_event_record(events[buffer_idx], upload_backend);

-
+                        data_read += data_to_copy;
+                        bytes_read += read_size;
+
                         ++buffer_idx;
                         buffer_idx %= n_buffers;
                     }
                 } else {
                     read_buf.resize(n_size);
-                    file->
-                    file->read_raw(read_buf.data(), n_size);
+                    file->read_raw_at(read_buf.data(), n_size, weight->offs);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
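
Note on the chunked upload path above: every read is widened to the file's I/O alignment (as needed for direct, unbuffered reads) and the alignment padding is trimmed off again before the bytes are forwarded to the backend. A self-contained sketch of just that arithmetic, with invented values standing in for the loader's real state (alignment, offsets, sizes and the staging-buffer capacity here are hypothetical):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main() {
        // Hypothetical stand-ins for file->read_alignment(), weight->offs,
        // the tensor byte size and the staging buffer capacity.
        const size_t alignment   = 4096;
        const size_t offset      = 10000;   // tensor start within the file
        const size_t n_size      = 50000;   // tensor size in bytes
        const size_t buffer_size = 16 * 1024;

        // Round the tensor start down and its end up to the alignment boundary.
        const size_t aligned_offset        = offset & ~(alignment - 1);
        const size_t offset_from_alignment = offset - aligned_offset;
        const size_t read_start            = aligned_offset;
        const size_t read_end              = (offset + n_size + alignment - 1) & ~(alignment - 1);

        size_t bytes_read = 0; // aligned bytes consumed from the file
        size_t data_read  = 0; // actual tensor bytes forwarded (padding excluded)

        while (bytes_read < read_end - read_start) {
            const size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read);

            size_t data_to_copy = read_size;
            size_t skip_front   = 0;
            if (bytes_read == 0) {
                // first chunk: skip the padding before the tensor start
                skip_front    = offset_from_alignment;
                data_to_copy -= offset_from_alignment;
            }
            if (aligned_offset + bytes_read + read_size > offset + n_size) {
                // last chunk: trim the padding after the tensor end
                data_to_copy -= read_end - (offset + n_size);
            }

            std::printf("read %zu aligned bytes at %zu, forward %zu (skipping %zu)\n",
                        read_size, aligned_offset + bytes_read, data_to_copy, skip_front);

            data_read  += data_to_copy;
            bytes_read += read_size;
        }

        std::printf("tensor bytes forwarded: %zu (expected %zu)\n", data_read, n_size);
        return 0;
    }

Summing data_to_copy over the chunks recovers exactly n_size, which is the invariant the real loader relies on when it passes data_read as the destination offset of each async upload.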

package/src/llama.cpp/src/llama-model-loader.h:

@@ -71,6 +71,7 @@ struct llama_model_loader {

     bool use_mmap = false;
     bool check_tensors;
+    bool no_alloc;

     llama_files files;
     llama_ftype ftype;
@@ -97,6 +98,7 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

package/src/llama.cpp/src/llama-model.cpp:

@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+        case LLM_TYPE_80B_A3B: return "80B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -667,6 +669,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.n_swa = 8192;
                 hparams.n_attn_temp_floor_scale = 8192;
                 hparams.f_attn_temp_scale = 0.1f;
+                hparams.f_attn_temp_offset = 1.0f;
                 hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
             }

@@ -1634,12 +1637,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // that have no expert_gating_func model parameter set
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                 }
-
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the factor from the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }

                 // (optional) temperature tuning - used by mistral-large
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);

+                hparams.f_attn_temp_offset = 0.0f;
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
@@ -1679,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
                 switch (hparams.n_layer) {
                     case 40: type = LLM_TYPE_9B; break;
                     case 61: type = LLM_TYPE_32B; break;
@@ -1688,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GLM4_MOE:
             {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

                 // MoE parameters
                 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@@ -1788,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@@ -1803,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
                 switch (hparams.n_layer) {
+                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                     case 56: type = LLM_TYPE_9B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
@@ -2257,7 +2277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }

                 switch (hparams.n_layer) {
-                    case
+                    case 48: type = LLM_TYPE_80B_A3B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2266,9 +2286,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+
+                hparams.f_attn_temp_offset = 0.0f;

                 // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                 if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2278,18 +2300,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     }
                 }

-                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
-                // but may need further verification with other values
-                if (hparams.rope_yarn_log_mul != 0.0f) {
-                    float factor = 1.0f / hparams.rope_freq_scale_train;
-                    float mscale = 1.0f;
-                    float mscale_all_dims = hparams.rope_yarn_log_mul;
-                    static auto get_mscale = [](float scale, float mscale) {
-                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-                    };
-                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-                }
-
                 switch (hparams.n_layer) {
                     case 26: type = LLM_TYPE_3B; break;
                     case 34: type = LLM_TYPE_8B; break;
@@ -2368,10 +2378,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     if (cpu_dev == nullptr) {
         throw std::runtime_error(format("%s: no CPU backend found", __func__));
     }
-    const int i_gpu_start = std::max((
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (
+    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < (
+        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
             LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
@@ -3389,9 +3399,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                     // optional bias tensors
-                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -5160,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 // mamba2 Mixer SSM params
                 // NOTE: int64_t for tensor dimensions
@@ -5170,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 const int64_t n_group = hparams.ssm_n_group;
                 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

+                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
                 // embeddings
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5219,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                         layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                         layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                    }
-
-
-
-
-
+                    } else {
+                        if (n_expert != 0) {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
+
+                            // MoE branch
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+
+                        } else {
+                            // mlp layers
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                        }
                     }
                 }
             } break;
@@ -6208,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

-                output_norm = create_tensor(tn(
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT,
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                 if (output == NULL) {
                     output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6607,9 +6635,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

         std::vector<ggml_backend_buffer_ptr> bufs;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+            GGML_ASSERT(!ml.no_alloc);
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
-                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                // then we could just use metal for all layers
                 // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                 void * addr = nullptr;
                 size_t first, last; // NOLINT
@@ -6625,9 +6655,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
-        }
-
-
+        } else {
+            ggml_backend_buffer_t buf;
+            if (ml.no_alloc) {
+                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+                }
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+            }
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
@@ -6656,10 +6693,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     if (llama_supports_gpu_offload()) {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-
-        if (
+        int n_repeating = n_gpu;
+        if (n_repeating > 0) {
             LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+            n_repeating--;
         }
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);

         const int max_backend_supported_layers = hparams.n_layer + 1;
         const int max_offloadable_layers = hparams.n_layer + 1;
@@ -6682,6 +6721,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
     }

+    if (ml.no_alloc) {
+        return true;
+    }
+
     // load tensor data
     for (auto & [ctx, buf_map] : ctx_buf_maps) {
         if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
@@ -6724,9 +6767,18 @@ size_t llama_model::n_devices() const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [
-
-
+    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+        if (hparams.no_alloc) {
+            GGML_ASSERT(bufs.size() == 1);
+            ggml_backend_buffer_t buf = bufs[0].get();
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            for (const auto & buf : bufs) {
+                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+            }
         }
     }
     return ret;
@@ -6771,6 +6823,7 @@ void llama_model::print_info() const {
     // hparams
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
     LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+    LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);

     if (!hparams.vocab_only) {
         LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -6805,6 +6858,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+    LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     // MRoPE (Multi-axis Rotary Position Embedding) sections
     if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6827,7 +6881,8 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
         arch == LLM_ARCH_QWEN3NEXT ||
-        arch == LLM_ARCH_NEMOTRON_H
+        arch == LLM_ARCH_NEMOTRON_H ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -6868,7 +6923,6 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
         LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }

     if (arch == LLM_ARCH_QWEN2MOE) {
@@ -6883,7 +6937,8 @@ void llama_model::print_info() const {
     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
         arch == LLM_ARCH_GRANITE_MOE ||
-        arch == LLM_ARCH_GRANITE_HYBRID
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7064,7 +7119,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
     if (arch == LLM_ARCH_FALCON_H1) {
         filter_attn = [&](int32_t) { return true; };
         filter_recr = [&](int32_t) { return true; };
-    } else if (arch == LLM_ARCH_NEMOTRON_H) {
+    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
         filter_attn = [&](int32_t il) {
             return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
         };
@@ -7435,6 +7490,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             llm = std::make_unique<llm_build_nemotron>(*this, params);
         } break;
     case LLM_ARCH_NEMOTRON_H:
+    case LLM_ARCH_NEMOTRON_H_MOE:
         {
             llm = std::make_unique<llm_build_nemotron_h>(*this, params);
         } break;
@@ -7619,6 +7675,7 @@ llama_model_params llama_model_default_params() {
         /*.check_tensors =*/ false,
         /*.use_extra_bufts =*/ true,
         /*.no_host =*/ false,
+        /*.no_alloc =*/ false,
     };

     return result;
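
The `no_alloc` flag added above threads from `llama_model_params` through `llama_model_loader`: weight tensors get a zero-size dummy buffer, `load_tensors` returns before any tensor data is read, and `memory_breakdown()` falls back to `ggml_backend_alloc_ctx_tensors_from_buft_size()` to report what the real buffers would have needed. A hedged sketch of how a caller might use this to inspect a model without allocating its weights; it assumes the field is exposed on the public `llama_model_params` struct exactly as the default-params hunk suggests, while the surrounding calls are the existing public API:

    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        const char * path = argc > 1 ? argv[1] : "model.gguf";

        llama_model_params mparams = llama_model_default_params();
        mparams.no_alloc = true; // assumed new field: load metadata and tensor layout only, no weight data

        // With no_alloc set, the loader attaches dummy (zero-size) buffers to the weights
        // and skips load_all_data(), so this returns without reading the tensor data.
        llama_model * model = llama_model_load_from_file(path, mparams);
        if (model == nullptr) {
            std::fprintf(stderr, "failed to load %s\n", path);
            return 1;
        }

        // Hyperparameters and vocab are available; only the weight data is absent.
        std::printf("n_params = %llu\n", (unsigned long long) llama_model_n_params(model));

        llama_model_free(model);
        return 0;
    }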
@@ -7718,6 +7775,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             return LLAMA_ROPE_TYPE_NONE;

         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7738,7 +7796,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_GRANITE_HYBRID:
@@ -7800,7 +7857,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_SMALLTHINKER:
-        case LLM_ARCH_GLM4_MOE:
         case LLM_ARCH_SEED_OSS:
         case LLM_ARCH_GROVEMOE:
         case LLM_ARCH_APERTUS:
@@ -7817,6 +7873,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3VLMOE:
             return LLAMA_ROPE_TYPE_IMROPE;

+        case LLM_ARCH_GLM4:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+        case LLM_ARCH_GLM4_MOE:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");

package/src/llama.cpp/src/llama-quant.cpp:

@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }

     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching

     llama_model model(llama_model_default_params());
package/src/llama.cpp/src/llama-sampling.cpp:

@@ -362,23 +362,39 @@ const char * llama_sampler_name(const struct llama_sampler * smpl) {
 }

 void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
+    if (!smpl) {
+        return;
+    }
+
     if (smpl->iface->accept) {
         smpl->iface->accept(smpl, token);
     }
 }

 void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) {
+    if (!smpl) {
+        return;
+    }
+
     GGML_ASSERT(smpl->iface->apply);
     smpl->iface->apply(smpl, cur_p);
 }

 void llama_sampler_reset(struct llama_sampler * smpl) {
+    if (!smpl) {
+        return;
+    }
+
     if (smpl->iface->reset) {
         smpl->iface->reset(smpl);
     }
 }

 struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
+    if (!smpl) {
+        return nullptr;
+    }
+
     if (smpl->iface->clone) {
         return smpl->iface->clone(smpl);
     }
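
With the guards added above, the public sampler entry points tolerate a null sampler: accept, apply and reset become no-ops and clone returns nullptr, instead of dereferencing `smpl->iface`. A minimal illustration (hypothetical caller code, not part of the package):

    #include "llama.h"
    #include <cstdio>

    int main() {
        // A null sampler is now accepted by the entry points patched above.
        llama_sampler * smpl = nullptr;

        llama_sampler_accept(smpl, /*token =*/ 0);        // returns immediately
        llama_sampler_reset(smpl);                        // returns immediately
        llama_sampler * copy = llama_sampler_clone(smpl); // returns nullptr
        // llama_sampler_apply(smpl, &cur_p) likewise returns without touching cur_p.

        std::printf("clone of null sampler: %p\n", (void *) copy);
        return 0;
    }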
package/src/llama.cpp/src/llama-vocab.cpp:

@@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             clean_spaces = false;
         } else if (
             tokenizer_pre == "qwen2" ||
-            tokenizer_pre == "deepseek-r1-qwen"
+            tokenizer_pre == "deepseek-r1-qwen" ||
+            tokenizer_pre == "kormo") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             clean_spaces = false;
         } else if (
|