@fugood/llama.node 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +21 -1
- package/lib/binding.js +1 -1
- package/lib/binding.ts +47 -15
- package/lib/index.js +26 -2
- package/lib/index.ts +42 -10
- package/package.json +15 -14
- package/scripts/llama.cpp.patch +31 -10
- package/src/LlamaContext.cpp +46 -0
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/src/llama.cpp/common/chat-parser.h +10 -0
- package/src/llama.cpp/common/chat.cpp +461 -87
- package/src/llama.cpp/common/chat.h +6 -0
- package/src/llama.cpp/common/common.cpp +8 -1
- package/src/llama.cpp/common/common.h +12 -5
- package/src/llama.cpp/common/json-partial.cpp +19 -2
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
- package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/src/llama.cpp/common/sampling.cpp +60 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
- package/src/llama.cpp/src/llama-grammar.cpp +17 -9
- package/src/llama.cpp/src/llama-impl.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +3 -6
- package/src/llama.cpp/src/llama-vocab.cpp +1 -0
--- package/src/llama.cpp/common/common.h
+++ package/src/llama.cpp/common/common.h
@@ -2,17 +2,15 @@
 
 #pragma once
 
+#include "ggml-opt.h"
+#include "llama-cpp.h"
+
 #include <set>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
-#include <sstream>
-#include <cmath>
-
-#include "ggml-opt.h"
-#include "llama-cpp.h"
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct common_time_meas {
+    common_time_meas(int64_t & t_acc, bool disable = false);
+    ~common_time_meas();
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
 struct common_adapter_lora_info {
     std::string path;
     float scale;
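Note: the new `common_time_meas` declared above is an RAII scope timer: it is constructed with a reference to an accumulator, records a start timestamp, and on destruction adds the elapsed microseconds to that accumulator (sampling.cpp below uses it through a `tm()` helper). A minimal self-contained sketch of the idea; the `scope_timer` name, the `time_us()` helper, and the constructor/destructor bodies are illustrative assumptions, since the diff only shows the declaration:

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for ggml_time_us().
static int64_t time_us() {
    using namespace std::chrono;
    return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

// Illustrative RAII scope timer mirroring the common_time_meas declaration:
// on destruction it adds the elapsed time to the accumulator it references.
struct scope_timer {
    scope_timer(int64_t & t_acc, bool disable = false)
        : t_start_us(disable ? -1 : time_us()), t_acc(t_acc) {}
    ~scope_timer() {
        if (t_start_us >= 0) {
            t_acc += time_us() - t_start_us;
        }
    }
    const int64_t t_start_us; // -1 when measurement is disabled
    int64_t &     t_acc;      // accumulated elapsed time, in microseconds
};

int main() {
    int64_t total_us = 0;
    {
        scope_timer tm(total_us); // starts timing here
        volatile double x = 0;
        for (int i = 0; i < 1000000; ++i) x += i;
    }                             // destructor adds the elapsed time
    printf("accumulated: %lld us\n", (long long) total_us);
}
```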
--- package/src/llama.cpp/common/json-partial.cpp
+++ package/src/llama.cpp/common/json-partial.cpp
@@ -297,8 +297,25 @@ bool common_json_parse(
             it = temptative_end;
             return true;
         }
-        //
-
+        // handle unclosed top-level primitive
+        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
+            std::string str(it, temptative_end);
+            const auto & magic_seed = out.healing_marker.marker = healing_marker;
+            if (can_parse(str + "\"")) {
+                // Was inside an string
+                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
+            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
+                // Was inside an string after an escape
+                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
+            } else {
+                // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+                // fprintf(stderr, "Closing: TODO\n");
+                return false;
+            }
+            out.json = json::parse(str);
+            it = temptative_end;
+            return true;
+        }
         return false;
     }
     out.json = json::parse(it, end);
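The added branch heals a stream that was cut off inside a top-level string: it appends the healing marker plus a closing quote (escaping the quote when the cut landed right after a backslash), re-parses, and records where the marker was injected so callers can strip it. A minimal sketch of the same healing idea, assuming nlohmann::json; the `$MAGIC$` marker and the brace-closing loop are illustrative, not the function's actual control flow:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

using json = nlohmann::json;

// Returns true if `s` is complete, parseable JSON.
static bool can_parse(const std::string & s) {
    return !json::parse(s, /* cb = */ nullptr, /* allow_exceptions = */ false).is_discarded();
}

int main() {
    // A streamed tool call cut off mid-string:
    std::string partial = "{\"name\": \"get_wea";
    const std::string marker = "$MAGIC$"; // healing marker appended before the closing quote

    // Close the dangling string, then close open objects until it parses.
    std::string healed = partial + marker + "\"";
    while (!can_parse(healed)) {
        healed += "}";
    }

    std::cout << json::parse(healed).dump() << "\n";
    // => {"name":"get_wea$MAGIC$"} ; the caller can truncate at the marker
}
```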
--- package/src/llama.cpp/common/json-schema-to-grammar.cpp
+++ package/src/llama.cpp/common/json-schema-to-grammar.cpp
@@ -303,6 +303,8 @@ static std::string format_literal(const std::string & literal) {
     return "\"" + escaped + "\"";
 }
 
+std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
+
 class SchemaConverter {
 private:
     friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
--- package/src/llama.cpp/common/json-schema-to-grammar.h
+++ package/src/llama.cpp/common/json-schema-to-grammar.h
@@ -18,4 +18,6 @@ struct common_grammar_options {
     bool dotall = false;
 };
 
+std::string gbnf_format_literal(const std::string & literal);
+
 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
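`gbnf_format_literal` exposes the converter's internal `format_literal` escaper, turning an arbitrary string into a quoted GBNF literal (plausibly for the new `chat-parser-xml-toolcall` grammar code listed above, though this diff does not show the caller). A rough sketch of what such an escaper does; the exact escape set is an assumption:

```cpp
#include <iostream>
#include <string>

// Illustrative GBNF literal escaper (assumption: gbnf_format_literal behaves
// roughly like this; the real escape set lives in json-schema-to-grammar.cpp).
static std::string format_literal_sketch(const std::string & literal) {
    std::string escaped;
    for (char c : literal) {
        switch (c) {
            case '"':  escaped += "\\\""; break;
            case '\\': escaped += "\\\\"; break;
            case '\n': escaped += "\\n";  break;
            case '\r': escaped += "\\r";  break;
            default:   escaped += c;      break;
        }
    }
    return "\"" + escaped + "\"";
}

int main() {
    // A grammar rule forcing the model to emit a fixed opening tag:
    std::cout << "tool-open ::= " << format_literal_sketch("<tool_call>\n") << "\n";
    // => tool-open ::= "<tool_call>\n"
}
```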
--- package/src/llama.cpp/common/sampling.cpp
+++ package/src/llama.cpp/common/sampling.cpp
@@ -3,9 +3,10 @@
 #include "common.h"
 #include "log.h"
 
+#include <algorithm>
 #include <cmath>
+#include <cstring>
 #include <unordered_map>
-#include <algorithm>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler
 
     llama_token_data_array cur_p;
 
+    void reset() {
+        prev.clear();
+
+        llama_sampler_reset(grmr);
+        llama_sampler_reset(chain);
+    }
+
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);
 
@@ -128,6 +136,12 @@ struct common_sampler
 
         cur_p = { cur.data(), cur.size(), -1, false };
     }
+
+    common_time_meas tm() {
+        return common_time_meas(t_total_us, params.no_perf);
+    }
+
+    mutable int64_t t_total_us = 0;
 };
 
 std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 }
 
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    const auto tm = gsmpl->tm();
+
     if (accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
 }
 
 void common_sampler_reset(struct common_sampler * gsmpl) {
-
-
-    llama_sampler_reset(gsmpl->chain);
+    gsmpl->reset();
 }
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
     // TODO: measure grammar performance
 
+    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+    llama_perf_sampler_data data_smpl;
+    llama_perf_context_data data_ctx;
+
+    memset(&data_smpl, 0, sizeof(data_smpl));
+    memset(&data_ctx, 0, sizeof(data_ctx));
+
     if (gsmpl) {
-
+        auto & data = data_smpl;
+
+        data = llama_perf_sampler(gsmpl->chain);
+
+        // note: the sampling time includes the samplers time + extra time spent in common/sampling
+        LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+        LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
     }
+
     if (ctx) {
-
+        auto & data = data_ctx;
+
+        data = llama_perf_context(ctx);
+
+        const double t_end_ms = 1e-3 * ggml_time_us();
+
+        const double t_total_ms = t_end_ms - data.t_start_ms;
+        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+        const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
+
+        LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+        LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+        LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+        LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
+
         llama_memory_breakdown_print(ctx);
     }
 }
 
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
     gsmpl->set_logits(ctx, idx);
 
     auto & grmr = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers
 
 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    const auto tm = gsmpl->tm();
+
     auto * res = &gsmpl->cur_p;
 
     if (do_sort && !res->sorted) {
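Taken together, the sampling.cpp changes thread the RAII timer through `common_sampler_sample`, `common_sampler_accept`, and `common_sampler_get_candidates`, accumulating into the new `t_total_us` so `common_perf_print` can report total sampling time next to the chain-only samplers time and the derived unaccounted time. A hedged sketch of a decode loop showing where those hooks now fire; it assumes an already-initialized context and sampler and omits the actual decoding:

```cpp
#include "sampling.h"

// Hedged sketch: where the new timing hooks fire during a decode loop.
// Assumes an already-initialized context and sampler; n_predict is arbitrary.
void generate_n(llama_context * ctx, common_sampler * smpl, int n_predict) {
    for (int i = 0; i < n_predict; ++i) {
        // common_sampler_sample() now calls llama_synchronize(ctx) first, then
        // starts its RAII timer, so pending async graph work is not billed to sampling.
        const llama_token tok = common_sampler_sample(smpl, ctx, /* idx = */ -1, /* grammar_first = */ false);

        common_sampler_accept(smpl, tok, /* accept_grammar = */ true); // timed as well

        // ... llama_decode(...) with tok would go here ...
    }

    // Reports "sampling time" (everything measured by the RAII timers) next to
    // "samplers time" (the llama_sampler chain alone), plus the new
    // "unaccounted time" = total - sampling - prompt eval - eval.
    common_perf_print(ctx, smpl);
}
```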
--- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
+++ package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
@@ -145,26 +145,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         include(CheckCXXSourceRuns)
 
-
+        macro(check_arm_feature tag feature code)
             set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
             set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+${tag}")
            check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
             if (GGML_MACHINE_SUPPORTS_${tag})
-                set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}"
+                set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}")
             else()
                 set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+no${tag}")
                 check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
                 if (GGML_MACHINE_SUPPORTS_no${tag})
-                    set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}"
+                    set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}")
+                    list(APPEND ARCH_FLAGS -U__ARM_FEATURE_${feature})
                 endif()
             endif()
             set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
-
+        endmacro()
 
-        check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
-        check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
-        check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
-        check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
+        check_arm_feature(dotprod DOTPROD "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
+        check_arm_feature(i8mm MATMUL_INT8 "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
+        check_arm_feature(sve SVE "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
+        check_arm_feature(sme SME "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
 
         list(APPEND ARCH_FLAGS "${ARM_NATIVE_FLAG}${ARM_NATIVE_FLAG_FIX}")
     else()
@@ -216,35 +217,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             endif()
         endif()
 
-
-
-
-
-            set(FEAT_INPUT_FILE "/dev/null")
-        endif()
+        message(STATUS "Checking for ARM features using flags:")
+        foreach(flag IN LISTS ARCH_FLAGS)
+            message(STATUS " ${flag}")
+        endforeach()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            else()
-                message(STATUS "ARM feature ${feature} enabled")
-            endif()
-        endif()
-    endforeach()
-endif()
+        include(CheckCXXSourceCompiles)
+        set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+        set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}")
+        foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
+            set(ARM_FEATURE "HAVE_${feature}")
+            check_cxx_source_compiles(
+                "
+                #if !defined(__ARM_FEATURE_${feature})
+                # error \"Feature ${feature} is not defined\"
+                #endif
+                int main() { return 0; }
+                "
+                ${ARM_FEATURE}
+            )
+        endforeach()
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
     endif()
 elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
     message(STATUS "x86 detected")
@@ -399,9 +392,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
 
     if (EXTRACTED_NUMBER GREATER_EQUAL 10)
-        list(APPEND ARCH_FLAGS -mcpu=power10
+        list(APPEND ARCH_FLAGS -mcpu=power10)
     elseif (EXTRACTED_NUMBER EQUAL 9)
-        list(APPEND ARCH_FLAGS -mcpu=power9
+        list(APPEND ARCH_FLAGS -mcpu=power9)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
         list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
     else()
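The rewritten ARM block replaces the old `/dev/null` feature probing with `check_cxx_source_compiles` over the final `ARCH_FLAGS`: for each feature it compiles a tiny program that fails unless the corresponding `__ARM_FEATURE_*` macro is defined, and `check_arm_feature` now passes `-U__ARM_FEATURE_${feature}` when the running machine lacks the instruction. A standalone equivalent of the probe source, using DOTPROD as the example feature:

```cpp
// Standalone version of the probe the new CMake foreach() compiles for each
// feature (DOTPROD shown here); compilation succeeds only when the active
// -march/-mcpu flags define the corresponding __ARM_FEATURE_* macro.
#if !defined(__ARM_FEATURE_DOTPROD)
# error "Feature DOTPROD is not defined"
#endif

int main() { return 0; }
```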
--- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -646,7 +646,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
     __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
     int64_t xstart = 0;
     int anr = nr - nr%16; // Used to align nr with boundary of 16
-#
+#if defined(__AVX512BW__) && defined(__AVX512DQ__)
     int anc = nc - nc%16; // Used to align nc with boundary of 16
     // Mask to mask out nibbles from packed bytes expanded to 512 bit length
     const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -1041,7 +1041,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
         xstart = anc/8;
         y = 0;
     }
-#endif //
+#endif // __AVX512BW__ && __AVX512DQ__
 
     // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
 
@@ -1989,7 +1989,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
     int64_t xstart = 0;
     int anr = nr - nr % 16;; // Used to align nr with boundary of 16
-#
+#if defined(__AVX512BW__) && defined(__AVX512DQ__)
     int anc = nc - nc % 16; // Used to align nc with boundary of 16
     // Mask to mask out nibbles from packed bytes expanded to 512 bit length
     const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -2727,7 +2727,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
         xstart = anc/8;
         y = 0;
     }
-#endif //
+#endif // __AVX512BW__ && __AVX512DQ__
 
     // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
     for (; y < anr / 4; y += 4) {
@@ -3467,7 +3467,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse);
     scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0);
 
-#
+#if defined(__AVX512BW__) && defined(__AVX512DQ__)
 
     int anc = nc - nc % 16; // Used to align nc with boundary of 16
 
@@ -4947,7 +4947,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
         y = 0;
     }
 
-#endif //
+#endif // __AVX512BW__ && __AVX512DQ__
 
     // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
     for (; y < anr / 4; y += 4) {
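Each repack.cpp hunk changes an AVX-512 preprocessor guard (the old conditions are truncated by the diff rendering to a bare `#` or `#endif //`); the new guards require both `__AVX512BW__` and `__AVX512DQ__` before the 512-bit kernels compile, falling back to the 256-bit paths otherwise. A small self-contained illustration of that compile-time dispatch pattern; the function and its body are illustrative, not code from repack.cpp:

```cpp
// Illustrative only: the 512-bit path compiles only when the build enables
// both AVX512BW and AVX512DQ (e.g. -march=skylake-avx512); otherwise the
// plain loop runs, mirroring how repack.cpp fences its 512-bit kernels.
#if defined(__AVX512BW__) && defined(__AVX512DQ__)
#include <immintrin.h>
#endif

void scale_f32(float * x, int n, float v) {
    int i = 0;
#if defined(__AVX512BW__) && defined(__AVX512DQ__)
    const __m512 vv = _mm512_set1_ps(v);
    for (; i + 16 <= n; i += 16) {
        _mm512_storeu_ps(x + i, _mm512_mul_ps(_mm512_loadu_ps(x + i), vv));
    }
#endif
    for (; i < n; ++i) { // scalar tail (or the whole loop without AVX-512)
        x[i] *= v;
    }
}
```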
--- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp
+++ package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp
@@ -39,7 +39,7 @@
 
 #include "kernels.h"
 
-#define NELEMS(x) sizeof(x) / sizeof(*x)
+#define NELEMS(x) (sizeof(x) / sizeof(*x))
 
 template<size_t(*Fn)(size_t,size_t,size_t)>
 static inline size_t kernel_offs_fn3(size_t a, size_t b, size_t c) {
@@ -635,6 +635,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
     },
 #endif
 #endif
+    { /* Sentinel */ }
 };
 
 static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
@@ -803,6 +804,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
         /* .op_type = */ GGML_TYPE_F32,
     },
 #endif
+    { /* Sentinel */ }
 };
 
 ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
@@ -810,7 +812,7 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
 
     if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
 #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
-        for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
+        for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
             if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
                 gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
                 gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
@@ -820,7 +822,7 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
             }
         }
         if (!kernel) {
-            for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8); ++i) {
+            for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
                 if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu &&
                     gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type &&
                     gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type &&
@@ -830,6 +832,10 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
             }
         }
     }
+#else
+    GGML_UNUSED(gemm_gemv_kernels);
+    GGML_UNUSED(gemm_gemv_kernels_q8);
+    GGML_UNUSED(cpu_features);
 #endif
     }
 
@@ -840,12 +846,14 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features)
     ggml_kleidiai_kernels * kernels = nullptr;
 
 #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
-    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
+    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
        if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) {
             kernels = &gemm_gemv_kernels[i];
             break;
         }
     }
+#else
+    GGML_UNUSED(features);
 #endif
 
     return kernels;
@@ -855,12 +863,14 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features)
     ggml_kleidiai_kernels * kernels = nullptr;
 
 #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
-    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8); ++i) {
+    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
         if ((features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu) {
             kernels = &gemm_gemv_kernels_q8[i];
             break;
         }
     }
+#else
+    GGML_UNUSED(features);
 #endif
 
     return kernels;
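Two related fixes here: `NELEMS` gains outer parentheses so it expands safely inside larger expressions, and each kernel table gets a trailing `{ /* Sentinel */ }` entry so the array stays non-empty even when every `#if`-guarded initializer is compiled out (an empty array would not compile); the search loops then stop at `NELEMS(...) - 1` to skip the sentinel. A self-contained model of the pattern with illustrative names:

```cpp
#include <cstddef>
#include <cstdio>

// Minimal model of the kleidiai kernel-table fix. Names are illustrative.
#define NELEMS(x) (sizeof(x) / sizeof(*x))

struct kernel_entry {
    unsigned required_cpu; // bitmask of required CPU features; 0 in the sentinel
    const char * name;
};

static kernel_entry kernels[] = {
#if defined(__ARM_FEATURE_SME)
    { 1u << 0, "sme_kernel" },
#endif
#if defined(__ARM_FEATURE_DOTPROD)
    { 1u << 1, "dotprod_kernel" },
#endif
    { /* Sentinel */ } // keeps the array non-empty even if all blocks above vanish
};

static const kernel_entry * select_kernel(unsigned cpu_features) {
    // NELEMS(kernels) - 1 skips the sentinel during the search
    for (size_t i = 0; i < NELEMS(kernels) - 1; ++i) {
        if ((cpu_features & kernels[i].required_cpu) == kernels[i].required_cpu) {
            return &kernels[i];
        }
    }
    return nullptr;
}

int main() {
    const kernel_entry * k = select_kernel(~0u);
    printf("%s\n", k ? k->name : "no kernel");
}
```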
--- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
+++ package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -9696,13 +9696,12 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params
             for (int64_t i00 = 0; i00 < n; ++i00) {
                 float sum = 0.0f;
                 for (int64_t t = 0; t < i00; ++t) {
-                    sum += A_batch[i00 * n + t] * X_batch[
+                    sum += A_batch[i00 * n + t] * X_batch[t * k + i01];
                 }
 
                 const float diag = A_batch[i00 * n + i00];
                 GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
-
-                X_batch[i01 * n + i00] = (B_batch[i00 * k + i01] - sum) / diag;
+                X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag;
             }
         }
     }
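The fix makes `X_batch` use the same row-major `row * k + col` layout that `B_batch` is read with; previously the final store indexed X as `[i01 * n + i00]`, inconsistent with how the inner dot product reads it back. A self-contained forward-substitution routine with the corrected indexing, plus a tiny 2x2 check; names are illustrative:

```cpp
#include <cassert>
#include <cstdio>

// Forward substitution solving L * X = B for a lower-triangular L (n x n)
// and k right-hand sides stored row-major (row * k + col), matching the
// fixed indexing in ggml_compute_forward_solve_tri_f32.
void solve_tri_lower(const float * L, const float * B, float * X, int n, int k) {
    for (int col = 0; col < k; ++col) {          // i01 in ops.cpp
        for (int row = 0; row < n; ++row) {      // i00 in ops.cpp
            float sum = 0.0f;
            for (int t = 0; t < row; ++t) {
                sum += L[row * n + t] * X[t * k + col];
            }
            const float diag = L[row * n + row];
            assert(diag != 0.0f && "Zero diagonal in triangular matrix");
            X[row * k + col] = (B[row * k + col] - sum) / diag;
        }
    }
}

int main() {
    // L = [[2,0],[1,3]], B = [[2],[7]]  =>  X = [[1],[2]]
    const float L[] = {2, 0, 1, 3};
    const float B[] = {2, 7};
    float X[2];
    solve_tri_lower(L, B, X, 2, 1);
    printf("X = [%g, %g]\n", X[0], X[1]);
}
```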
--- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
+++ package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -160,18 +160,18 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32xt svfloat32_t
 #define GGML_F32xt_ZERO svdup_n_f32(0.0f)
 #define GGML_F32xt_SET1(x) svdup_n_f32(x)
-#define GGML_F32xt_LOAD_IMPL(pg, a
-#define GGML_F32xt_LOAD(
-#define GGML_F32xt_STORE_IMPL(pg,a,b)
-#define GGML_F32xt_STORE(
+#define GGML_F32xt_LOAD_IMPL(pg, a) svld1_f32(pg, a)
+#define GGML_F32xt_LOAD(a) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
+#define GGML_F32xt_STORE_IMPL(pg, a, b) svst1_f32(pg, a, b)
+#define GGML_F32xt_STORE(a, b) GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
 #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
-#define GGML_F32xt_FMA(
+#define GGML_F32xt_FMA(a, b, c) GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
 #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
-#define GGML_F32xt_ADD(
+#define GGML_F32xt_ADD(a, b) GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
 #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
-#define GGML_F32xt_MUL(
+#define GGML_F32xt_MUL(a, b) GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
 #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
-#define GGML_F32xt_REDUCE_ONE(
+#define GGML_F32xt_REDUCE_ONE(a) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
 #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
 { \
     sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
@@ -183,7 +183,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
     sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
     (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
 }
-#define GGML_F32xt_REDUCE(
+#define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
+    GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)
 
 #define GGML_F32_VEC GGML_F32xt
 #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
@@ -206,11 +207,11 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
 
 #define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
-#define GGML_F32Cxt_FMA(
+#define GGML_F32Cxt_FMA(a, b, c) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
 #define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
-#define GGML_F32Cxt_ADD(
+#define GGML_F32Cxt_ADD(a, b) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
 #define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
-#define GGML_F32Cxt_MUL(
+#define GGML_F32Cxt_MUL(a, b) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
 #define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
 
 #define GGML_F16x_VEC GGML_F32Cxt
@@ -224,7 +225,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
 
 #define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
-#define GGML_F16xt_REDUCE_ONE(
+#define GGML_F16xt_REDUCE_ONE(a) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)
 
 #define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
 { \
@@ -234,7 +235,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
     __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
     (res) = (ggml_float) sum_f16; \
 }
-#define GGML_F16xt_REDUCE_MIXED(
+#define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4) \
+    GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)
 
 // F16 NEON
 
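Each completed macro pairs a predicated `*_IMPL` form with a short form that pins the header's default predicate (`DEFAULT_PG` / `DEFAULT_PG16`). A compact sketch of that wrapper pattern, compiled only under SVE; `DEFAULT_PG` is assumed here to be an all-lanes-true predicate (in ggml it is defined elsewhere in the header), and the short macro names are illustrative:

```cpp
// Sketch of the DEFAULT_PG wrapper pattern from simd-mappings.h: every SVE
// intrinsic takes a predicate, so the *_IMPL macro exposes it while the short
// form supplies a default. Only meaningful when compiled with SVE enabled.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

#define DEFAULT_PG             svptrue_b32()
#define F32_LOAD_IMPL(pg, a)   svld1_f32(pg, a)
#define F32_LOAD(a)            F32_LOAD_IMPL(DEFAULT_PG, a)
#define F32_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
#define F32_MUL(a, b)          F32_MUL_IMPL(DEFAULT_PG, a, b)

// Multiplies two float buffers lane-by-lane for one full vector register.
static inline void mul_one_vector(float * dst, const float * a, const float * b) {
    svst1_f32(DEFAULT_PG, dst, F32_MUL(F32_LOAD(a), F32_LOAD(b)));
}
#endif // __ARM_FEATURE_SVE
```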
--- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
+++ package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
@@ -698,60 +698,61 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 }
 
 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
-#if defined(GGML_SIMD)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr = sve_register_length / 16;
+    const int ggml_f16_step = 2 * ggml_f16_epr;
+
+    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+    const int np = (n & ~(ggml_f16_step - 1));
+    svfloat16_t ay1, ay2;
+
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
+        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
+        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+    }
+    // leftovers
+    // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
+    if (np < n) {
+        svbool_t pg = svwhilelt_b16(np, n);
+        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
+        svfloat16_t out = svmul_f16_m(pg, hy, vx);
+        svst1_f16(pg, (__fp16 *)(y + np), out);
+    }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    for (int i = 0, vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
+        vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
+        vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
+        vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
+        __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
+    }
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
 
-
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
-
+    GGML_F16_VEC ay[GGML_F16_ARR];
 
-
-
-
-
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
 
-
-        }
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
         }
+    }
 
-
-
-
-
-#endif
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
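The rewritten `ggml_vec_scale_f16` adds a dedicated SVE path whose tail is handled with a `svwhilelt_b16` predicate instead of a scalar cleanup loop, plus a new RISC-V vector path. A self-contained f32 analogue of the predicated-tail technique (f32 avoids the `__fp16` pointer casts); the function name is illustrative:

```cpp
// Self-contained f32 analogue of the new SVE tail handling in
// ggml_vec_scale_f16: the final partial vector is processed with a
// svwhilelt-generated predicate rather than a scalar cleanup loop.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

void vec_scale_f32_sve(int n, float * y, float v) {
    const int epr = (int) svcntw();          // f32 lanes per vector register
    const int np  = n - n % epr;             // elements covered by full vectors

    for (int i = 0; i < np; i += epr) {
        svfloat32_t ay = svld1_f32(svptrue_b32(), y + i);
        svst1_f32(svptrue_b32(), y + i, svmul_n_f32_m(svptrue_b32(), ay, v));
    }
    if (np < n) {
        // predicate is true only for the n - np remaining lanes
        svbool_t pg = svwhilelt_b32_s32(np, n);
        svfloat32_t ay = svld1_f32(pg, y + np);
        svst1_f32(pg, y + np, svmul_n_f32_m(pg, ay, v));
    }
}
#endif // __ARM_FEATURE_SVE
```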