@fugood/llama.node 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +8 -8
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +44 -999
- package/src/llama.cpp/common/arg.h +2 -2
- package/src/llama.cpp/common/chat.cpp +17 -2
- package/src/llama.cpp/common/common.cpp +33 -0
- package/src/llama.cpp/common/common.h +15 -1
- package/src/llama.cpp/common/download.cpp +1054 -0
- package/src/llama.cpp/common/download.h +55 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +2 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
- package/src/llama.cpp/include/llama.h +7 -3
- package/src/llama.cpp/src/CMakeLists.txt +95 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -0
- package/src/llama.cpp/src/llama-arch.h +11 -0
- package/src/llama.cpp/src/llama-batch.cpp +63 -31
- package/src/llama.cpp/src/llama-batch.h +12 -1
- package/src/llama.cpp/src/llama-chat.cpp +32 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +36 -13
- package/src/llama.cpp/src/llama-context.h +5 -5
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-hparams.cpp +11 -1
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
- package/src/llama.cpp/src/llama-kv-cells.h +44 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
- package/src/llama.cpp/src/llama-model.cpp +320 -13171
- package/src/llama.cpp/src/llama-model.h +8 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/apertus.cpp +125 -0
- package/src/llama.cpp/src/models/arcee.cpp +135 -0
- package/src/llama.cpp/src/models/arctic.cpp +138 -0
- package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/src/llama.cpp/src/models/baichuan.cpp +122 -0
- package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/src/llama.cpp/src/models/bert.cpp +176 -0
- package/src/llama.cpp/src/models/bitnet.cpp +160 -0
- package/src/llama.cpp/src/models/bloom.cpp +101 -0
- package/src/llama.cpp/src/models/chameleon.cpp +178 -0
- package/src/llama.cpp/src/models/chatglm.cpp +132 -0
- package/src/llama.cpp/src/models/codeshell.cpp +111 -0
- package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/command-r.cpp +122 -0
- package/src/llama.cpp/src/models/dbrx.cpp +123 -0
- package/src/llama.cpp/src/models/deci.cpp +135 -0
- package/src/llama.cpp/src/models/deepseek.cpp +144 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
- package/src/llama.cpp/src/models/dots1.cpp +134 -0
- package/src/llama.cpp/src/models/dream.cpp +105 -0
- package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/src/llama.cpp/src/models/exaone.cpp +114 -0
- package/src/llama.cpp/src/models/exaone4.cpp +123 -0
- package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/src/llama.cpp/src/models/falcon.cpp +120 -0
- package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/src/llama.cpp/src/models/gemma.cpp +112 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/src/llama.cpp/src/models/glm4.cpp +127 -0
- package/src/llama.cpp/src/models/gpt2.cpp +105 -0
- package/src/llama.cpp/src/models/gptneox.cpp +144 -0
- package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/src/llama.cpp/src/models/granite.cpp +211 -0
- package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/src/llama.cpp/src/models/grok.cpp +159 -0
- package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/src/llama.cpp/src/models/internlm2.cpp +120 -0
- package/src/llama.cpp/src/models/jais.cpp +86 -0
- package/src/llama.cpp/src/models/jamba.cpp +106 -0
- package/src/llama.cpp/src/models/lfm2.cpp +173 -0
- package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/src/llama.cpp/src/models/llada.cpp +99 -0
- package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/src/llama.cpp/src/models/llama.cpp +155 -0
- package/src/llama.cpp/src/models/mamba.cpp +55 -0
- package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/src/llama.cpp/src/models/models.h +481 -0
- package/src/llama.cpp/src/models/mpt.cpp +126 -0
- package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/src/llama.cpp/src/models/nemotron.cpp +122 -0
- package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/src/llama.cpp/src/models/olmo.cpp +121 -0
- package/src/llama.cpp/src/models/olmo2.cpp +150 -0
- package/src/llama.cpp/src/models/olmoe.cpp +124 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/src/llama.cpp/src/models/openelm.cpp +124 -0
- package/src/llama.cpp/src/models/orion.cpp +123 -0
- package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/src/llama.cpp/src/models/phi2.cpp +121 -0
- package/src/llama.cpp/src/models/phi3.cpp +152 -0
- package/src/llama.cpp/src/models/plamo.cpp +110 -0
- package/src/llama.cpp/src/models/plamo2.cpp +316 -0
- package/src/llama.cpp/src/models/plm.cpp +168 -0
- package/src/llama.cpp/src/models/qwen.cpp +108 -0
- package/src/llama.cpp/src/models/qwen2.cpp +117 -0
- package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/src/llama.cpp/src/models/refact.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/src/llama.cpp/src/models/smollm3.cpp +128 -0
- package/src/llama.cpp/src/models/stablelm.cpp +146 -0
- package/src/llama.cpp/src/models/starcoder.cpp +100 -0
- package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp

@@ -4455,46 +4455,6 @@ void ggml_compute_forward_cont(
     ggml_compute_forward_dup(params, dst);
 }
 
-// ggml_compute_forward_reshape
-
-void ggml_compute_forward_reshape(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-    // NOP
-    GGML_UNUSED(params);
-    GGML_UNUSED(dst);
-}
-
-// ggml_compute_forward_view
-
-void ggml_compute_forward_view(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-    // NOP
-    GGML_UNUSED(params);
-    GGML_UNUSED(dst);
-}
-
-// ggml_compute_forward_permute
-
-void ggml_compute_forward_permute(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-    // NOP
-    GGML_UNUSED(params);
-    GGML_UNUSED(dst);
-}
-
-// ggml_compute_forward_transpose
-
-void ggml_compute_forward_transpose(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-    // NOP
-    GGML_UNUSED(params);
-    GGML_UNUSED(dst);
-}
-
 // ggml_compute_forward_get_rows
 
 static void ggml_compute_forward_get_rows_q(
@@ -5474,7 +5434,7 @@ static void ggml_rope_cache_init(
 }
 
 static void ggml_mrope_cache_init(
-    float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
+    float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool is_imrope, bool indep_sects,
     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
     float * cache, float sin_sign, float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -5509,14 +5469,26 @@ static void ggml_mrope_cache_init(
         }
 
         float theta = theta_t;
-        if (sector >= sections[0] && sector < sec_w) {
-            theta = theta_h;
-        }
-        else if (sector >= sec_w && sector < sec_w + sections[2]) {
-            theta = theta_w;
-        }
-        else if (sector >= sec_w + sections[2]) {
-            theta = theta_e;
+        if (is_imrope) { // qwen3vl apply interleaved mrope
+            if (sector % 3 == 1 && sector < 3 * sections[1]) {
+                theta = theta_h;
+            } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
+                theta = theta_w;
+            } else if (sector % 3 == 0 && sector < 3 * sections[0]) {
+                theta = theta_t;
+            } else {
+                theta = theta_e;
+            }
+        } else {
+            if (sector >= sections[0] && sector < sec_w) {
+                theta = theta_h;
+            }
+            else if (sector >= sec_w && sector < sec_w + sections[2]) {
+                theta = theta_w;
+            }
+            else if (sector >= sec_w + sections[2]) {
+                theta = theta_e;
+            }
         }
 
         rope_yarn(
@@ -5589,6 +5561,7 @@ static void ggml_compute_forward_rope_f32(
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
     const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
     const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 
     if (is_mrope) {
@@ -5627,7 +5600,7 @@ static void ggml_compute_forward_rope_f32(
             const int64_t p_w = pos[i2 + ne2 * 2];
             const int64_t p_e = pos[i2 + ne2 * 3];
             ggml_mrope_cache_init(
-                p_t, p_h, p_w, p_e, sections, is_vision,
+                p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
                 freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
         }
 
@@ -5775,6 +5748,7 @@ static void ggml_compute_forward_rope_f16(
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
     const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
     const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 
     if (is_mrope) {
@@ -5813,7 +5787,7 @@ static void ggml_compute_forward_rope_f16(
             const int64_t p_w = pos[i2 + ne2 * 2];
             const int64_t p_e = pos[i2 + ne2 * 3];
             ggml_mrope_cache_init(
-                p_t, p_h, p_w, p_e, sections, is_vision,
+                p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
                 freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
         }
 
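The rope hunks above add an interleaved multimodal RoPE ("imrope") mode for Qwen3-VL. In classic mrope the rotary sectors are assigned to the time/height/width/extra axes in contiguous blocks; with imrope the axes cycle every sector (t, h, w, t, h, w, ...), with out-of-range sectors falling back to the extra axis. A standalone C++ sketch of the two sector-to-axis layouts, not part of the diff; the section sizes are illustrative:

```cpp
#include <cstdio>

enum Axis { T, H, W, E };

// Interleaved mrope (imrope), as in the new branch above: axes cycle t/h/w
// every sector; sectors past a section's 3x bound fall through to the e axis.
static Axis imrope_axis(int sector, const int sections[4]) {
    if (sector % 3 == 1 && sector < 3 * sections[1]) return H;
    if (sector % 3 == 2 && sector < 3 * sections[2]) return W;
    if (sector % 3 == 0 && sector < 3 * sections[0]) return T;
    return E;
}

// Classic mrope, as in the else branch: contiguous t, h, w, e blocks.
static Axis mrope_axis(int sector, const int sections[4]) {
    const int sec_w = sections[0] + sections[1];
    if (sector < sections[0])         return T;
    if (sector < sec_w)               return H;
    if (sector < sec_w + sections[2]) return W;
    return E;
}

int main() {
    const int sections[4] = {4, 3, 3, 2}; // illustrative section sizes
    const int n = sections[0] + sections[1] + sections[2] + sections[3];
    for (int s = 0; s < n; ++s) {
        printf("sector %2d: mrope=%c imrope=%c\n",
               s, "THWE"[mrope_axis(s, sections)], "THWE"[imrope_axis(s, sections)]);
    }
    return 0;
}
```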
@@ -7070,7 +7044,11 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
     const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
 
 #ifdef GGML_SIMD
-    const int64_t pkg_size = GGML_F32_EPR;
+    #if defined(__ARM_FEATURE_SVE)
+        const int64_t pkg_size = svcntw();
+    #else
+        const int64_t pkg_size = GGML_F32_EPR;
+    #endif
     const int64_t pkg_count = c / pkg_size;
     const int64_t c_pkg_end = pkg_count * pkg_size;
 #else
@@ -7493,10 +7471,17 @@ static void ggml_compute_forward_upscale_f32(
     float sf1 = (float)ne1/src0->ne[1];
     float sf2 = (float)ne2/src0->ne[2];
     float sf3 = (float)ne3/src0->ne[3];
+    float pixel_offset = 0.5f;
 
     const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
     const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
 
+    if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+        pixel_offset = 0.0f;
+        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
+        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
+    }
+
     if (mode == GGML_SCALE_MODE_NEAREST) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
@@ -7516,13 +7501,6 @@ static void ggml_compute_forward_upscale_f32(
             }
         }
     } else if (mode == GGML_SCALE_MODE_BILINEAR) {
-        float pixel_offset = 0.5f;
-        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
-            pixel_offset = 0.0f;
-            sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
-            sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
-        }
-
         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
             for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
@@ -7557,6 +7535,51 @@ static void ggml_compute_forward_upscale_f32(
 
                     const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
 
+                    float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                    *y_dst = val;
+                }
+            }
+        }
+    }
+    } else if (mode == GGML_SCALE_MODE_BICUBIC) {
+        // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+        const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
+        auto weight1 = [a](float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
+        auto weight2 = [a](float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
+        auto bicubic = [=](float p0, float p1, float p2, float p3, float x) {
+            const float w0 = weight2(x + 1);
+            const float w1 = weight1(x + 0);
+            const float w2 = weight1(1 - x);
+            const float w3 = weight2(2 - x);
+            return p0*w0 + p1*w1 + p2*w2 + p3*w3;
+        };
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
+                    const int64_t y0 = (int64_t)floorf(y);
+                    const float dy = y - (float)y0;
+
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
+                        const int64_t x0 = (int64_t)floorf(x);
+                        const float dx = x - (float)x0;
+
+                        auto p = [=](int64_t x_off, int64_t y_off) -> float {
+                            int64_t i00 = std::max(int64_t(0), std::min(x0 + x_off, ne00 - 1));
+                            int64_t i01 = std::max(int64_t(0), std::min(y0 + y_off, ne01 - 1));
+                            return *(const float *)((const char *)src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        };
+
+                        const float val = bicubic(
+                            bicubic(p(-1,-1), p(0,-1), p(1,-1), p(2,-1), dx),
+                            bicubic(p(-1, 0), p(0, 0), p(1, 0), p(2, 0), dx),
+                            bicubic(p(-1, 1), p(0, 1), p(1, 1), p(2, 1), dx),
+                            bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy);
+
                     float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
                     *y_dst = val;
                 }
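The upscale hunks hoist pixel_offset and the GGML_SCALE_FLAG_ALIGN_CORNERS handling out of the bilinear branch (so the new branch shares them) and add a GGML_SCALE_MODE_BICUBIC path built on the bicubic convolution kernel with alpha = -0.75, the same constant PyTorch uses. A minimal 1-D C++ sketch of that kernel, not from the diff; the sample data is illustrative:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Cubic convolution weights for |x| <= 1 (weight1) and 1 < |x| <= 2 (weight2).
static float weight1(float x, float a) { return ((a + 2) * x - (a + 3)) * x * x + 1; }
static float weight2(float x, float a) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; }

// Interpolate a 1-D signal p[0..n-1] at position x, clamping reads to the
// edges the same way the diff's p() lambda clamps i00/i01.
static float bicubic1d(const float * p, int n, float x) {
    const float a  = -0.75f; // same alpha as the diff / PyTorch
    const int   x0 = (int) std::floor(x);
    const float dx = x - (float) x0;
    auto at = [&](int i) { return p[std::clamp(i, 0, n - 1)]; };
    return at(x0 - 1) * weight2(dx + 1, a) +
           at(x0    ) * weight1(dx + 0, a) +
           at(x0 + 1) * weight1(1 - dx, a) +
           at(x0 + 2) * weight2(2 - dx, a); // the four weights sum to 1
}

int main() {
    const float src[4] = {0.0f, 1.0f, 4.0f, 9.0f}; // illustrative samples
    for (float x = 0.0f; x <= 3.0f; x += 0.5f) {
        printf("f(%.1f) = %.4f\n", x, bicubic1d(src, 4, x));
    }
    return 0;
}
```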
@@ -7909,10 +7932,10 @@ void ggml_compute_forward_argsort(
 
 // ggml_compute_forward_flash_attn_ext
 
-static void ggml_compute_forward_flash_attn_ext_f16(
+static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
         const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
+        ggml_tensor * dst,
+        int ir0, int ir1) {
     const ggml_tensor * q = dst->src[0];
     const ggml_tensor * k = dst->src[1];
     const ggml_tensor * v = dst->src[2];
@@ -7928,9 +7951,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
     GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
 
-    const int ith = params->ith;
-    const int nth = params->nth;
-
     const int64_t DK = nek0;
     const int64_t DV = nev0;
     const int64_t N = neq1;
@@ -7964,16 +7984,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
 
     // parallelize by q rows using ggml_vec_dot_f32
 
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
     float scale = 1.0f;
     float max_bias = 0.0f;
     float logit_softcap = 0.0f;
@@ -8000,6 +8010,8 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_ASSERT(( q_to_vec_dot) && "fattn: unsupported K-type");
     GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float ) && "fattn: unsupported V-type");
 
+    int ith = params->ith;
+
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
         // q indices
@@ -8147,6 +8159,91 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     }
 }
 
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * q = dst->src[0];
+    const ggml_tensor * k = dst->src[1];
+    const ggml_tensor * v = dst->src[2];
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int64_t DK = nek0;
+    const int64_t DV = nev0;
+    const int64_t N  = neq1;
+
+    GGML_ASSERT(ne0 == DV);
+    GGML_ASSERT(ne2 == N);
+
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
+
+    GGML_ASSERT(neq0 == DK);
+    GGML_ASSERT(nek0 == DK);
+    GGML_ASSERT(nev0 == DV);
+
+    GGML_ASSERT(neq1 == N);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int64_t nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // disable for NUMA
+    const bool disable_chunking = ggml_is_numa();
+
+    // 4x chunks per thread
+    int nth_scaled = nth * 4;
+    int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
+    int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
+
+    if (nth == 1 || nchunk < nth || disable_chunking) {
+        nchunk = nth;
+    }
+
+    if (ith == 0) {
+        // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+        ggml_threadpool_chunk_set(params->threadpool, nth);
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // The number of elements in each chunk
+    const int64_t dr = (nr + nchunk - 1) / nchunk;
+
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;
+
+    while (current_chunk < nchunk) {
+        const int64_t ir0 = dr * current_chunk;
+        const int64_t ir1 = MIN(ir0 + dr, nr);
+
+        ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
+
+        current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+    }
+}
+
 void ggml_compute_forward_flash_attn_ext(
     const ggml_compute_params * params,
     ggml_tensor * dst) {
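This flash-attention change and the repack change further down share one scheme: the old fixed per-thread row split becomes work-stealing over roughly 4x-per-thread chunks. Each thread first takes the chunk equal to its thread id; a shared counter, seeded to nth via ggml_threadpool_chunk_set, then hands out the remaining chunks through ggml_threadpool_chunk_add (which, as the loop implies, returns the pre-increment value, like fetch_add). A minimal C++ sketch of the same pattern with std::atomic, not from the diff; thread and row counts are illustrative:

```cpp
#include <algorithm>
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int     nth = 4;    // worker threads (illustrative)
    const int64_t nr  = 1000; // total rows of work (illustrative)

    // 4x chunks per thread, as in the diff, so fast threads can steal more work.
    const int64_t nchunk = std::min<int64_t>(nth * 4, nr);
    const int64_t dr     = (nr + nchunk - 1) / nchunk; // rows per chunk

    // Seeded to nth: chunks 0..nth-1 are implicitly claimed by the thread ids,
    // mirroring ggml_threadpool_chunk_set(threadpool, nth).
    std::atomic<int64_t> next_chunk{nth};

    std::vector<std::thread> pool;
    for (int ith = 0; ith < nth; ++ith) {
        pool.emplace_back([&, ith] {
            int64_t chunk = ith; // the first chunk comes from the thread id
            while (chunk < nchunk) {
                const int64_t ir0 = dr * chunk;
                const int64_t ir1 = std::min(ir0 + dr, nr);
                printf("thread %d: rows [%lld, %lld)\n", ith, (long long) ir0, (long long) ir1);
                // fetch_add returns the pre-increment value, like ggml_threadpool_chunk_add.
                chunk = next_chunk.fetch_add(1);
            }
        });
    }
    for (auto & t : pool) {
        t.join();
    }
    return 0;
}
```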
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h

@@ -51,10 +51,6 @@ void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

@@ -1600,6 +1600,32 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
         return false;
     }
 
+    void forward_mul_mat_one_chunk(ggml_compute_params * params, ggml_tensor * op, int64_t src0_start, int64_t src0_end) {
+        const ggml_tensor * src0 = op->src[0];
+        const ggml_tensor * src1 = op->src[1];
+        ggml_tensor *       dst  = op;
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const void * src1_wdata      = params->wdata;
+        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
+
+        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        if (ne11 > 3) {
+            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
+                    (float *) ((char *) dst->data) + src0_start, ne01,
+                    (const char *) src0->data + src0_start * nb01,
+                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        }
+        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
+            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
+                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
+                    (const char *) src0->data + src0_start * nb01,
+                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                    src0_end - src0_start);
+        }
+    }
+
     void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
         const ggml_tensor * src0 = op->src[0];
         const ggml_tensor * src1 = op->src[1];
@@ -1643,31 +1669,62 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
             from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
         }
 
-        ggml_barrier(params->threadpool);
+        // disable for NUMA
+        const bool disable_chunking = ggml_is_numa();
 
-        const void * src1_wdata      = params->wdata;
-        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
-        int64_t src0_start = (ith * ne01) / nth;
-        int64_t src0_end   = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
-        src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
-        if (src0_start >= src0_end) {
-            return;
+        // 4x chunks per thread
+        int64_t nr = ggml_nrows(op->src[0]);
+        int nth_scaled = nth * 4;
+        int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
+        int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
+
+        // Ensure minimum chunk size to avoid alignment issues with high thread counts
+        // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
+        const int64_t min_chunk_size = NB_COLS;
+        if (nchunk > 0 && (nr / nchunk) < min_chunk_size && nr >= min_chunk_size) {
+            nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
         }
 
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
-        if (ne11 > 3) {
-            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-                    (float *) ((char *) dst->data) + src0_start, ne01,
-                    (const char *) src0->data + src0_start * nb01,
-                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        if (nth == 1 || nchunk < nth || disable_chunking) {
+            nchunk = nth;
         }
-        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
-            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
-                    (const char *) src0->data + src0_start * nb01,
-                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
-                    src0_end - src0_start);
+
+        // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
+        // This prevents creating too many tiny chunks that could overlap after alignment
+        const int64_t max_nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
+        if (nchunk > max_nchunk) {
+            nchunk = max_nchunk;
+        }
+
+        if (ith == 0) {
+            // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+            ggml_threadpool_chunk_set(params->threadpool, nth);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        // The first chunk comes from our thread_id, the rest will get auto-assigned.
+        int current_chunk = ith;
+
+        while (current_chunk < nchunk) {
+            int64_t src0_start = (current_chunk * ne01) / nchunk;
+            int64_t src0_end   = ((current_chunk + 1) * ne01) / nchunk;
+
+            // Align boundaries to NB_COLS - round up to ensure all data is included
+            // The chunk size limiting above ensures chunks are large enough to prevent overlaps
+            src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
+            src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
+            if (src0_end > ne01) {
+                src0_end = ne01;
+            }
+
+            if (src0_start >= src0_end) {
+                break;
+            }
+
+            forward_mul_mat_one_chunk(params, dst, src0_start, src0_end);
+
+            current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
     }
 
@@ -1772,8 +1829,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
         int64_t src0_cur_start = (ith * ne01) / nth;
         int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
 
+        // Align boundaries to NB_COLS - round up to ensure all data is included
         src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
         src0_cur_end   = (src0_cur_end   % NB_COLS) ? src0_cur_end   + NB_COLS - (src0_cur_end   % NB_COLS) : src0_cur_end;
+        if (src0_cur_end > ne01) {
+            src0_cur_end = ne01;
+        }
 
         if (src0_cur_start >= src0_cur_end) {
             return;
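In both repack hunks the chunk edges are rounded up to a multiple of NB_COLS so every chunk begins on a repacked block boundary, and the new clamp stops the rounded-up end from running past ne01. A small C++ sketch of that arithmetic, not from the diff, with illustrative values (the final chunk would end at 104 without the clamp):

```cpp
#include <cstdint>
#include <cstdio>

// Round v up to the next multiple of nb_cols (the expression used in the diff).
static int64_t align_up(int64_t v, int64_t nb_cols) {
    return (v % nb_cols) ? v + nb_cols - (v % nb_cols) : v;
}

int main() {
    const int64_t NB_COLS = 8;   // repacked block width (illustrative)
    const int64_t ne01    = 100; // rows in src0 (illustrative)
    const int64_t nchunk  = 6;

    for (int64_t c = 0; c < nchunk; ++c) {
        int64_t start = align_up((c * ne01) / nchunk, NB_COLS);
        int64_t end   = align_up(((c + 1) * ne01) / nchunk, NB_COLS);
        if (end > ne01) {
            end = ne01; // the new clamp: rounding up must not run past the tensor
        }
        if (start >= end) {
            continue;   // chunk fully absorbed by alignment
        }
        printf("chunk %lld: rows [%lld, %lld)\n", (long long) c, (long long) start, (long long) end);
    }
    return 0;
}
```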
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h

@@ -956,7 +956,7 @@ do { \
 
 #define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO     (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x)  (__m256)
+#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplfr2vr_s((x))
 
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     __m256i a;
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 
 #define GGML_F32x4          __m128
 #define GGML_F32x4_ZERO     (__m128)__lsx_vldi(0)
-#define GGML_F32x4_SET1(x)  (__m128)
+#define GGML_F32x4_SET1(x)  (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32x4_LOAD(x)  (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD __lsx_vfadd_s
 #define GGML_F32x4_MUL __lsx_vfmul_s
-[previous GGML_F32x4_REDUCE implementation (22 lines); its body is not recoverable from this rendering, only the final assignment to res]
+
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
+    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
+    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
+    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
+    res = (ggml_float) ((v4f32)t5)[0]; \
 }
 
 #define GGML_F32_VEC GGML_F32x4
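The rewritten GGML_F32x4_REDUCE folds the GGML_F32_ARR accumulator registers pairwise and then horizontally sums the four lanes of the survivor using even/odd shuffles (__lsx_vpickev_w/__lsx_vpickod_w). A scalar C++ sketch of the same reduction tree, not from the diff; the register count is illustrative:

```cpp
#include <cstdio>

int main() {
    const int LANES = 4; // lanes in one __m128 register
    const int ARR   = 8; // number of accumulator registers (illustrative)
    float x[ARR][LANES];
    for (int i = 0; i < ARR; ++i)
        for (int l = 0; l < LANES; ++l)
            x[i][l] = 0.25f * (i * LANES + l); // arbitrary test data

    // Pairwise folding: ARR -> ARR/2 -> ... -> 1 register (the macro unrolls
    // this as three fixed loops over `offset`).
    for (int offset = ARR >> 1; offset > 0; offset >>= 1)
        for (int i = 0; i < offset; ++i)
            for (int l = 0; l < LANES; ++l)
                x[i][l] += x[offset + i][l];

    // Horizontal add of the surviving register: the first vpickev/vpickod round
    // produces [x0+x1, x2+x3]; the second round adds those two partials.
    const float s01 = x[0][0] + x[0][1];
    const float s23 = x[0][2] + x[0][3];
    printf("sum = %f\n", s01 + s23);
    return 0;
}
```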
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 
 #define GGML_F32Cx4          __m128
 #define GGML_F32Cx4_ZERO     (__m128)__lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)  (__m128)
+#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32Cx4_LOAD(x)  (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA GGML_F32x4_FMA
package/src/llama.cpp/include/llama.h

@@ -83,6 +83,7 @@ extern "C" {
         LLAMA_ROPE_TYPE_NORM   = 0,
         LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
         LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+        LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE,
         LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
     };
 
@@ -460,7 +461,11 @@ extern "C" {
     LLAMA_API bool llama_supports_gpu_offload(void);
     LLAMA_API bool llama_supports_rpc        (void);
 
+    // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
+    // In some cases the requested values via llama_context_params may differ from the actual values used by the context
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ctx_seq  (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
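Per the new note, the values a context actually uses can differ from what llama_context_params requested, so they should be queried back after creation. A hypothetical usage sketch of the new llama_n_ctx_seq() accessor, using the current llama.cpp loading API; the model path is a placeholder:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 0; // 0 lets the context derive a value (e.g. from the model)
    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == NULL) {
        llama_model_free(model);
        return 1;
    }

    // Query the values the context actually uses; they may differ from the request.
    printf("n_ctx     = %u\n", llama_n_ctx(ctx));
    printf("n_ctx_seq = %u\n", llama_n_ctx_seq(ctx)); // new accessor from this diff
    printf("n_batch   = %u\n", llama_n_batch(ctx));
    printf("n_seq_max = %u\n", llama_n_seq_max(ctx));

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```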
@@ -481,6 +486,7 @@ extern "C" {
 
     LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
@@ -584,7 +590,7 @@ extern "C" {
     LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
 
     // Manually free a LoRA adapter
-    //
+    // NOTE: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
 
     // Get the invocation tokens if the current lora is an alora
@@ -1110,8 +1116,6 @@ extern "C" {
     //     // sample from the logits of the last token in the batch
     //     const llama_token id = llama_sampler_sample(smpl, ctx, -1);
     //
-    //     // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
-    //     llama_sampler_accept(smpl, id);
     //     ...
     // }
     //