@fugood/llama.node 1.1.5 → 1.1.7
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +19 -15
- package/src/LlamaCompletionWorker.cpp +73 -18
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +147 -46
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +350 -3
- package/src/llama.cpp/common/chat.h +11 -3
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +44 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +65 -3
- package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +26 -0
- package/src/llama.cpp/src/llama-arch.cpp +65 -0
- package/src/llama.cpp/src/llama-arch.h +10 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -4
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +37 -25
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-graph.cpp +118 -9
- package/src/llama.cpp/src/llama-graph.h +38 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +500 -4
- package/src/llama.cpp/src/llama-model.h +25 -4
- package/src/llama.cpp/src/llama-quant.cpp +37 -1
- package/src/llama.cpp/src/llama-vocab.cpp +43 -0
package/src/llama.cpp/common/common.h:

```diff
@@ -2,14 +2,17 @@
 
 #pragma once
 
-#include "llama-cpp.h"
-
 #include <set>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
 #include <sstream>
+#include <cmath>
+
+#include "ggml-opt.h"
+#include "llama-cpp.h"
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -82,6 +85,7 @@ enum llama_example {
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
+    LLAMA_EXAMPLE_FINETUNE,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -202,6 +206,7 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min   = 0.75f; // minimum speculative decoding probability (greedy)
     std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -236,10 +241,31 @@ struct common_params_diffusion {
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    COMMON_REASONING_FORMAT_GRANITE,         // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
+
+struct lr_opt {
+    float    lr0          = 1e-5; // learning rate at first epoch
+    float    lr_min       = -1;
+    float    decay_epochs = -1;   // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
+    float    scale_epoch  = 0;
+    float    wd           = 0;
+    unsigned epochs       = 2;
+
+    unsigned epoch; // set by optimizer outer (epochs) loop
+    // learning rate decay - constant LR per epoch only for now
+    float get_lr(float e) const;
+    float get_lr() const { return get_lr(epoch); }
+    // must call after arg parse, before get_lr
+    void init();
+};
+
+struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
+
 struct common_params {
     bool vocab_only = false;
     int32_t n_predict = -1; // new tokens to predict
```
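The new `lr_opt` struct only declares its schedule; `get_lr(float)` and `init()` are defined in common.cpp, which this diff view does not show. As a rough orientation, a decay consistent with the field comments could look like the sketch below. `example_get_lr` is a hypothetical stand-in, not the shipped implementation:

```cpp
#include <cmath>

// Hypothetical stand-in for lr_opt::get_lr(float e): decays geometrically
// from lr0 (epoch 0) to lr_min (epoch decay_epochs), constant afterwards.
// The real formula lives in common.cpp and may differ.
float example_get_lr(float lr0, float lr_min, float decay_epochs, float e) {
    if (decay_epochs <= 0 || lr_min <= 0 || lr_min >= lr0) {
        return lr0;    // no decay configured
    }
    if (e >= decay_epochs) {
        return lr_min; // fully decayed
    }
    return lr0 * std::pow(lr_min / lr0, e / decay_epochs);
}
```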
package/src/llama.cpp/common/common.h:

```diff
@@ -375,6 +401,11 @@ struct common_params {
     bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
+    // finetune
+    struct lr_opt lr;
+    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    float val_split = 0.05f; // fraction of the data used for the validation set
+
     // embedding
     bool embedding = false;    // get only sentence embedding
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
@@ -383,11 +414,12 @@
     std::string cls_sep = "\t"; // separator of classification sequences
 
     // server params
-    int32_t port
-    int32_t timeout_read
-    int32_t timeout_write
-    int32_t n_threads_http
-    int32_t n_cache_reuse
+    int32_t port           = 8080;         // server listens on this network port
+    int32_t timeout_read   = 600;          // http read timeout in seconds
+    int32_t timeout_write  = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
+    int32_t n_swa_checkpoints = 3;         // max number of SWA checkpoints per slot
 
     std::string hostname    = "127.0.0.1";
     std::string public_path = ""; // NOLINT
@@ -395,7 +427,7 @@
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format =
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
@@ -440,7 +472,7 @@
     int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0;  // save the imatrix every n_save_freq iterations
     int32_t i_chunk     = 0;  // start processing from this chunk
-
+    int8_t  imat_dat    = 0;  // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl    = true;  // whether to compute perplexity
@@ -702,3 +734,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 //
 
 ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
+
+// "adamw" or "sgd" (case insensitive)
+enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
```
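The declaration above documents the accepted names. A minimal sketch of a matching case-insensitive parser follows; `example_get_optimizer` is hypothetical (the real definition is in common.cpp), and the fallback for unknown names is an assumption:

```cpp
#include <cctype>
#include <string>

#include "ggml-opt.h"

// Hypothetical parser matching the declared contract: "adamw" or "sgd",
// case insensitive. Returning COUNT for unknown names is an assumption.
enum ggml_opt_optimizer_type example_get_optimizer(const char * name) {
    std::string s = name ? name : "";
    for (char & c : s) {
        c = (char) std::tolower((unsigned char) c);
    }
    if (s == "adamw") { return GGML_OPT_OPTIMIZER_TYPE_ADAMW; }
    if (s == "sgd")   { return GGML_OPT_OPTIMIZER_TYPE_SGD;   }
    return GGML_OPT_OPTIMIZER_TYPE_COUNT; // unknown
}
```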
package/src/llama.cpp/ggml/CMakeLists.txt:

```diff
@@ -39,8 +39,9 @@ if (WIN32)
     set(CMAKE_SHARED_MODULE_PREFIX "")
 endif()
 
-option(BUILD_SHARED_LIBS
-option(GGML_BACKEND_DL
+option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL")
 
 #
 # option list
@@ -175,6 +176,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM"
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
+option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
 option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
 option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
@@ -186,6 +188,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
 option(GGML_WEBGPU "ggml: use WebGPU" OFF)
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
+option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
```

package/src/llama.cpp/ggml/include/ggml-opt.h:

```diff
@@ -74,16 +74,26 @@ extern "C" {
         GGML_OPT_BUILD_TYPE_OPT = 30,
     };
 
+    enum ggml_opt_optimizer_type {
+        GGML_OPT_OPTIMIZER_TYPE_ADAMW,
+        GGML_OPT_OPTIMIZER_TYPE_SGD,
+
+        GGML_OPT_OPTIMIZER_TYPE_COUNT
+    };
+
     // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
     struct ggml_opt_optimizer_params {
-        // AdamW optimizer parameters
         struct {
             float alpha; // learning rate
-            float beta1;
-            float beta2;
+            float beta1; // first AdamW momentum
+            float beta2; // second AdamW momentum
             float eps;   // epsilon for numerical stability
-            float wd;    // weight decay
+            float wd;    // weight decay - 0.0f to disable
         } adamw;
+        struct {
+            float alpha; // learning rate
+            float wd;    // weight decay
+        } sgd;
     };
 
     // callback to calculate optimizer parameters prior to a backward pass
```
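With the new `sgd` sub-struct next to `adamw`, a `ggml_opt_get_optimizer_params` callback can fill both branches and let the context read whichever matches its configured optimizer type. A minimal sketch; the concrete values are placeholders:

```cpp
#include "ggml-opt.h"

// Sketch of a user callback for get_opt_pars: fill both branches; the
// optimizer type configured on the context decides which one is read.
// userdata is the pointer registered alongside the callback.
static struct ggml_opt_optimizer_params example_opt_pars(void * userdata) {
    (void) userdata;
    struct ggml_opt_optimizer_params p = {};
    p.adamw.alpha = 1e-3f;  // learning rate
    p.adamw.beta1 = 0.9f;   // first momentum
    p.adamw.beta2 = 0.999f; // second momentum
    p.adamw.eps   = 1e-8f;  // numerical stability
    p.adamw.wd    = 0.0f;   // weight decay disabled
    p.sgd.alpha   = 1e-3f;  // learning rate
    p.sgd.wd      = 0.0f;   // weight decay disabled
    return p;
}
```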
package/src/llama.cpp/ggml/include/ggml-opt.h:

```diff
@@ -112,8 +122,11 @@ extern "C" {
 
         int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
 
-        ggml_opt_get_optimizer_params get_opt_pars;
-        void *
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+        // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
+        enum ggml_opt_optimizer_type optimizer;
     };
 
     // get parameters for an optimization context with defaults set where possible
@@ -142,6 +155,10 @@ extern "C" {
     // get the gradient accumulator for a node from the forward graph
     GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
 
+    GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
+
+    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
+
     // ====== Optimization Result ======
 
     GGML_API ggml_opt_result_t ggml_opt_result_init(void);
@@ -226,12 +243,14 @@ extern "C" {
             struct ggml_tensor * outputs,      // output tensor, must have shape [ne_label, ndata_batch] if labels are used
             ggml_opt_dataset_t dataset,        // dataset with data and optionally also labels
             enum ggml_opt_loss_type loss_type, // loss to minimize
+            enum ggml_opt_optimizer_type optimizer, // sgd or adamw
             ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
             int64_t nepoch,                    // how many times the dataset should be iterated over
             int64_t nbatch_logical,            // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
             float val_split,                   // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
             bool silent);                      // whether or not info prints to stderr should be suppressed
 
+
 #ifdef __cplusplus
 }
 #endif
```
package/src/llama.cpp/ggml/include/ggml-zdnn.h (new file):

```diff
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
```
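The new header exposes the standard pair of ggml backend entry points. A hedged usage sketch, assuming a build configured with `-DGGML_ZDNN=ON` on IBM Z hardware:

```cpp
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-zdnn.h"

int main() {
    // ggml_backend_zdnn_init returns NULL when the backend is unavailable.
    ggml_backend_t backend = ggml_backend_zdnn_init();
    if (backend == NULL) {
        std::fprintf(stderr, "zDNN backend not available\n");
        return 1;
    }
    std::printf("initialized backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
```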
package/src/llama.cpp/ggml/include/ggml.h:

```diff
@@ -241,6 +241,8 @@
 #define GGML_ROPE_TYPE_MROPE  8
 #define GGML_ROPE_TYPE_VISION 24
 
+#define GGML_MROPE_SECTIONS   4
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -304,6 +306,16 @@
     GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
     GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 
+#define GGML_TENSOR_TERNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb2, src2, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
 #define GGML_TENSOR_BINARY_OP_LOCALS01 \
     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
     GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
@@ -395,7 +407,8 @@ extern "C" {
         // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-
+        GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
+        GGML_TYPE_COUNT = 40,
     };
 
     // precision
@@ -430,6 +443,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
+        GGML_FTYPE_MOSTLY_MXFP4  = 25, // except 1d tensors
     };
 
     // available tensor operations:
@@ -438,6 +452,7 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD_ID,
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,
@@ -527,6 +542,7 @@ extern "C" {
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         GGML_OP_OPT_STEP_ADAMW,
+        GGML_OP_OPT_STEP_SGD,
 
         GGML_OP_GLU,
 
@@ -557,6 +573,7 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_SWIGLU_OAI,
         GGML_GLU_OP_GEGLU_ERF,
         GGML_GLU_OP_GEGLU_QUICK,
 
@@ -831,6 +848,13 @@ extern "C" {
             struct ggml_tensor  * b,
             enum   ggml_type      type);
 
+    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
+    GGML_API struct ggml_tensor * ggml_add_id(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids);
+
     GGML_API struct ggml_tensor * ggml_add1(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
```
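The comment above fully specifies `ggml_add_id`. As a scalar reference over flat arrays (illustrative only; the real op works on ggml tensors, and the MoE per-expert-bias use is an assumption about intent):

```cpp
#include <cstdint>

// Scalar reference for the documented ggml_add_id semantics:
//   dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
// a/dst are [ne0, ne1, ne2], b holds rows of length ne0, and ids is
// [ne1, ne2] with row indices into b (e.g. expert ids selecting a bias).
void add_id_ref(float * dst, const float * a, const float * b,
                const int32_t * ids, int ne0, int ne1, int ne2) {
    for (int i2 = 0; i2 < ne2; ++i2) {
        for (int i1 = 0; i1 < ne1; ++i1) {
            const int32_t row = ids[i2*ne1 + i1];
            for (int i0 = 0; i0 < ne0; ++i0) {
                dst[(i2*ne1 + i1)*ne0 + i0] =
                    a[(i2*ne1 + i1)*ne0 + i0] + b[row*ne0 + i0];
            }
        }
    }
}
```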
package/src/llama.cpp/ggml/include/ggml.h:

```diff
@@ -1198,6 +1222,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_swiglu_oai(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            float                 alpha,
+            float                 limit);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
```
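The `alpha`/`limit` parameters point at the clamped SwiGLU variant used by gpt-oss style models. A per-element sketch of that variant; this reading is an assumption from the signature, and the authoritative definition is the GGML_GLU_OP_SWIGLU_OAI kernel, not this header:

```cpp
#include <algorithm>
#include <cmath>

// Assumed per-element form of SWIGLU_OAI: the gate input is clamped from
// above by limit, the linear input to [-limit, limit], then a
// sigmoid(alpha*x)-weighted product with a +1 shift on the linear branch.
float swiglu_oai_ref(float x, float g, float alpha, float limit) {
    x = std::min(x, limit);                   // gate branch
    g = std::max(std::min(g, limit), -limit); // linear branch
    const float sig = 1.0f / (1.0f + std::exp(-alpha * x));
    return (x * sig) * (g + 1.0f);
}
```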
package/src/llama.cpp/ggml/include/ggml.h:

```diff
@@ -1570,6 +1601,10 @@ extern "C" {
             float                 scale,
             float                 max_bias);
 
+    GGML_API void ggml_soft_max_add_sinks(
+            struct ggml_tensor * a,
+            struct ggml_tensor * sinks);
+
     GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1628,7 +1663,7 @@ extern "C" {
             struct ggml_tensor  * b,
             struct ggml_tensor  * c,
             int                   n_dims,
-            int                   sections[
+            int                   sections[GGML_MROPE_SECTIONS],
             int                   mode,
             int                   n_ctx_orig,
             float                 freq_base,
@@ -1654,6 +1689,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[GGML_MROPE_SECTIONS],
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
     GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2052,6 +2103,10 @@ extern "C" {
     GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
             const struct ggml_tensor * a);
 
+    GGML_API void ggml_flash_attn_ext_add_sinks(
+            struct ggml_tensor * a,
+            struct ggml_tensor * sinks);
+
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
@@ -2257,7 +2312,14 @@ extern "C" {
             struct ggml_tensor  * grad,
             struct ggml_tensor  * m,
             struct ggml_tensor  * v,
-            struct ggml_tensor  * adamw_params); // parameters such
+            struct ggml_tensor  * adamw_params); // parameters such as the learning rate
+
+    // stochastic gradient descent step (with weight decay)
+    GGML_API struct ggml_tensor * ggml_opt_step_sgd(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * grad,
+            struct ggml_tensor  * sgd_params); // alpha, weight decay
 
     //
     // automatic differentiation
```
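Unlike `ggml_opt_step_adamw`, the SGD step takes no `m`/`v` momenta tensors, which matches the note in ggml-opt.h that only AdamW needs per-parameter momenta. Per element, SGD with decoupled weight decay reduces to one fused update; a scalar sketch (the exact kernel formulation is an assumption):

```cpp
// Scalar model of the GGML_OP_OPT_STEP_SGD update, with alpha (learning
// rate) and wd (weight decay) coming from sgd_params:
//   w <- w*(1 - alpha*wd) - alpha*grad
void opt_step_sgd_ref(float * w, const float * grad, int n,
                      float alpha, float wd) {
    for (int i = 0; i < n; ++i) {
        w[i] = w[i]*(1.0f - alpha*wd) - alpha*grad[i];
    }
}
```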
package/src/llama.cpp/ggml/src/CMakeLists.txt:

```diff
@@ -214,6 +214,13 @@ add_library(ggml
             ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)
 
+if (GGML_BACKEND_DIR)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
+    endif()
+    target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
+endif()
+
 target_link_libraries(ggml PUBLIC ggml-base)
 
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -227,7 +234,11 @@ function(ggml_add_backend_library backend)
         set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
         add_dependencies(ggml ${backend})
-
+        if (GGML_BACKEND_DIR)
+            install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
+        else()
+            install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+        endif()
     else()
         add_library(${backend} ${ARGN})
         target_link_libraries(ggml PUBLIC ${backend})
@@ -371,6 +382,7 @@ ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
+ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 
 foreach (target ggml-base ggml)
```
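At runtime these CMake changes matter for builds with GGML_BACKEND_DL: dynamic backends are discovered through the registry, and the compiled-in GGML_BACKEND_DIR adds one more install/search location. A hedged enumeration sketch using the public registry API:

```cpp
#include <cstdio>

#include "ggml-backend.h"

int main() {
    // Load every dynamic backend the build can find; with GGML_BACKEND_DL
    // this is expected to include libraries installed in GGML_BACKEND_DIR.
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        std::printf("registered backend: %s\n", ggml_backend_reg_name(reg));
    }
    return 0;
}
```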
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt:

```diff
@@ -460,7 +460,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
             # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
             message(STATUS "z17 target")
-            list(APPEND ARCH_FLAGS -march=
+            list(APPEND ARCH_FLAGS -march=arch15)
         else()
             message(STATUS "Unknown target")
             message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
```

package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c:

```diff
@@ -589,6 +589,67 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0  * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int   ib   = 0;
+    float sumf = 0;
+
+#if defined __ARM_NEON
+    const int8x16_t  values = vld1q_s8(kvalues_mxfp4);
+    const uint8x16_t m4b    = vdupq_n_u8(0x0f);
+    uint8x16x2_t q4bits;
+    int8x16x4_t  q4b;
+    int8x16x4_t  q8b;
+    int32x4_t    prod_1;
+    int32x4_t    prod_2;
+
+    for (; ib + 1 < nb; ib += 2) {
+        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+        q8b.val[0]    = vld1q_s8(y[ib + 0].qs);
+        q8b.val[1]    = vld1q_s8(y[ib + 0].qs + 16);
+        q8b.val[2]    = vld1q_s8(y[ib + 1].qs);
+        q8b.val[3]    = vld1q_s8(y[ib + 1].qs + 16);
+
+        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+        sumf +=
+            GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+    }
+
+#endif
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+        int sumi1 = 0;
+        int sumi2 = 0;
+        for (int j = 0; j < QK_MXFP4/2; ++j) {
+            sumi1 += y[ib].qs[j + 0]          * kvalues_mxfp4[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
```
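The scalar tail above implies the MXFP4 block layout: one shared E8M0 scale byte `e` plus 16 bytes of packed 4-bit codes indexing `kvalues_mxfp4`, i.e. 32 weights per block. A dequantization sketch under those assumptions; the table contents and the exact meaning of the `_HALF` macro are inferred, not taken from this diff:

```cpp
#include <cmath>
#include <cstdint>

#define QK_MXFP4_REF 32

// Assumed block layout, matching the x[ib].e / x[ib].qs accesses above.
struct block_mxfp4_ref {
    uint8_t e;                  // shared E8M0 scale (power-of-two exponent)
    uint8_t qs[QK_MXFP4_REF/2]; // 32 packed 4-bit codes
};

// Assumed lookup table: FP4 magnitudes pre-scaled by 2, so the halved
// E8M0 scale (the "_HALF" in GGML_E8M0_TO_FP32_HALF) compensates.
static const int8_t kvalues_mxfp4_ref[16] = {
    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
};

void dequant_mxfp4_ref(const block_mxfp4_ref * x, float * out) {
    const float d = std::ldexp(1.0f, (int) x->e - 127 - 1); // 2^(e-127) / 2
    for (int j = 0; j < QK_MXFP4_REF/2; ++j) {
        out[j]                  = d * kvalues_mxfp4_ref[x->qs[j] & 0x0f];
        out[j + QK_MXFP4_REF/2] = d * kvalues_mxfp4_ref[x->qs[j] >> 4];
    }
}
```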
package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c:

```diff
@@ -66,6 +66,12 @@ static inline int hsum_i32_4(const __m128i a) {
 }
 
 #if defined(__AVX2__) || defined(__AVX512F__)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
```
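This helper moves up in the file (and now also covers AVX-512 builds) because the new MXFP4 kernel below needs it; the old AVX2-only copy further down is removed in the last hunk. The sign trick exists because `_mm256_maddubs_epi16` multiplies unsigned by signed bytes: passing `|x|` and `sign(x)*y` keeps every product equal to `x*y`. A scalar model of one 16-bit output lane:

```cpp
#include <cstdint>

// Scalar model of one output lane of mul_add_epi8: each 16-bit lane sums
// two adjacent byte products. _mm256_sign_epi8(x, x) yields |x| and
// _mm256_sign_epi8(y, x) yields sign(x)*y (0 where x == 0), so the
// unsigned*signed multiply-add still computes x0*y0 + x1*y1.
// (The intrinsic saturates the 16-bit sum; omitted here for clarity.)
int16_t mul_add_epi8_lane(int8_t x0, int8_t y0, int8_t x1, int8_t y1) {
    const int ax0 = x0 < 0 ? -x0 : x0;
    const int sy0 = x0 < 0 ? -y0 : (x0 != 0 ? y0 : 0);
    const int ax1 = x1 < 0 ? -x1 : x1;
    const int sy1 = x1 < 0 ? -y1 : (x1 != 0 ? y1 : 0);
    return (int16_t) (ax0*sy0 + ax1*sy1); // == x0*y0 + x1*y1
}
```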
package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c:

```diff
@@ -261,6 +267,11 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
     return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
                            _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
 }
+
+static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
+    return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
+}
 #endif
 #elif defined(__SSSE3__)
 // horizontally add 4x4 floats
@@ -746,6 +757,91 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0  * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int   ib   = 0;
+    float sumf = 0;
+
+#if defined __AVX2__
+
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
+    const __m128i m4b       = _mm_set1_epi8(0x0f);
+    const __m256i mone      = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
+        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
+        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
+        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
+        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
+        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
+                                 _mm256_cvtepi32_ps(p_1), accum1);
+        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
+                                 _mm256_cvtepi32_ps(p_2), accum2);
+    }
+
+    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+
+#elif defined __AVX__
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
+    const __m128i m4b       = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i q8b_1_0  = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+        const __m128i q8b_1_1  = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+        const __m128i q8b_2_0  = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i q8b_2_1  = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+
+        const __m256 p      = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_mx_delta_float(x[ib].e, y[ib].d, x[ib + 1].e, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
+
+#endif
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+        int sumi1 = 0;
+        int sumi2 = 0;
+        for (int j = 0; j < QK_MXFP4/2; ++j) {
+            sumi1 += y[ib].qs[j + 0]          * kvalues_mxfp4[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -3206,14 +3302,6 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif
 }
 
-#if defined(__AVX2__)
-static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return _mm256_maddubs_epi16(ax, sy);
-}
-#endif
-
 void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
```