llama_cpp 0.5.1 → 0.5.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +32 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -11
- data/ext/llama_cpp/src/ggml-cuda.cu +1108 -699
- data/ext/llama_cpp/src/ggml-metal.m +93 -24
- data/ext/llama_cpp/src/ggml-metal.metal +407 -174
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +75 -43
- data/ext/llama_cpp/src/ggml.h +42 -32
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +1040 -201
- data/ext/llama_cpp/src/llama.h +13 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
@@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
|
|
847
847
|
"mul_f32", "float"
|
848
848
|
};
|
849
849
|
|
850
|
-
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
850
|
+
static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
851
851
|
size_t pos = 0;
|
852
852
|
while ((pos = s.find(from, pos)) != std::string::npos) {
|
853
853
|
s.replace(pos, from.length(), to);
|
@@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
|
|
856
856
|
return s;
|
857
857
|
}
|
858
858
|
|
859
|
-
std::string generate_kernels() {
|
859
|
+
static std::string generate_kernels() {
|
860
860
|
std::stringstream src;
|
861
861
|
src << program_source << '\n';
|
862
862
|
src << k_quants_source << '\n';
|
@@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
|
1788
1788
|
return false;
|
1789
1789
|
}
|
1790
1790
|
|
1791
|
-
bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
|
1791
|
+
static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
|
1792
1792
|
// If device doesn't support FP16
|
1793
1793
|
if (!fp16_support) {
|
1794
1794
|
return false;
|
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
|
2
1
|
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
|
3
2
|
|
4
3
|
#include "ggml.h"
|
@@ -47,6 +46,10 @@
|
|
47
46
|
// disable "possible loss of data" to avoid hundreds of casts
|
48
47
|
// we should just be careful :)
|
49
48
|
#pragma warning(disable: 4244 4267)
|
49
|
+
|
50
|
+
// disable POSIX deprecation warnings
|
51
|
+
// these functions are never going away, anyway
|
52
|
+
#pragma warning(disable: 4996)
|
50
53
|
#endif
|
51
54
|
|
52
55
|
#if defined(_WIN32)
|
@@ -280,7 +283,7 @@ typedef double ggml_float;
|
|
280
283
|
// 16-bit float
|
281
284
|
// on Arm, we use __fp16
|
282
285
|
// on x86, we use uint16_t
|
283
|
-
#ifdef __ARM_NEON
|
286
|
+
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
284
287
|
|
285
288
|
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
286
289
|
//
|
@@ -307,12 +310,14 @@ typedef double ggml_float;
|
|
307
310
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
308
311
|
#include <intrin.h>
|
309
312
|
#else
|
313
|
+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
310
314
|
#if !defined(__riscv)
|
311
315
|
#include <immintrin.h>
|
312
316
|
#endif
|
313
317
|
#endif
|
314
318
|
#endif
|
315
319
|
#endif
|
320
|
+
#endif
|
316
321
|
|
317
322
|
#ifdef __riscv_v_intrinsic
|
318
323
|
#include <riscv_vector.h>
|
@@ -4298,10 +4303,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
|
4298
4303
|
}
|
4299
4304
|
|
4300
4305
|
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
4301
|
-
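size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
|
4302
|
-
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
4303
|
-
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];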
|
4306
|
+
size_t nbytes;
|
4307
|
+
size_t blck_size = ggml_blck_size(tensor->type);
|
4308
|
+
if (blck_size == 1) {
|
4309
|
+
nbytes = ggml_type_size(tensor->type);
|
4310
|
+
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
4311
|
+
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
|
4312
|
+
}
|
4304
4313
|
}
|
4314
|
+
else {
|
4315
|
+
nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
|
4316
|
+
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
4317
|
+
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
|
4318
|
+
}
|
4319
|
+
}
|
4320
|
+
|
4305
4321
|
return nbytes;
|
4306
4322
|
}
|
4307
4323
|
|
@@ -17278,10 +17294,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17278
17294
|
} else {
|
17279
17295
|
// wait for other threads to finish
|
17280
17296
|
const int last = node_n;
|
17281
|
-
do {
|
17282
|
-
//sched_yield();
|
17297
|
+
while (true) {
|
17298
|
+
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
17299
|
+
// depending on the workload and the operating system.
|
17300
|
+
// since it is not clear what is the best approach, it should potentially become user-configurable
|
17301
|
+
// ref: https://github.com/ggerganov/ggml/issues/291
|
17302
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
17303
|
+
sched_yield();
|
17304
|
+
#endif
|
17305
|
+
|
17283
17306
|
node_n = atomic_load(&state->shared->node_n);
|
17284
|
-
} while (node_n == last);
|
17307
|
+
if (node_n != last) break;
|
17308
|
+
};
|
17285
17309
|
}
|
17286
17310
|
|
17287
17311
|
// check if we should stop
|
@@ -18332,10 +18356,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|
18332
18356
|
for (int i = 0; i < cgraph->n_leafs; i++) {
|
18333
18357
|
struct ggml_tensor * node = cgraph->leafs[i];
|
18334
18358
|
|
18335
|
-
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
|
18359
|
+
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
|
18336
18360
|
i,
|
18337
18361
|
node->ne[0], node->ne[1],
|
18338
|
-
ggml_op_name(node->op));
|
18362
|
+
ggml_op_name(node->op),
|
18363
|
+
ggml_get_name(node));
|
18339
18364
|
}
|
18340
18365
|
|
18341
18366
|
for (int i = 0; i < GGML_OP_COUNT; i++) {
|
@@ -18872,7 +18897,6 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
18872
18897
|
// strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
|
18873
18898
|
return count;
|
18874
18899
|
}
|
18875
|
-
return count;
|
18876
18900
|
}
|
18877
18901
|
}
|
18878
18902
|
|
@@ -20095,27 +20119,27 @@ const char * gguf_type_name(enum gguf_type type) {
|
|
20095
20119
|
return GGUF_TYPE_NAME[type];
|
20096
20120
|
}
|
20097
20121
|
|
20098
|
-
int gguf_get_version(struct gguf_context * ctx) {
|
20122
|
+
int gguf_get_version(const struct gguf_context * ctx) {
|
20099
20123
|
return ctx->header.version;
|
20100
20124
|
}
|
20101
20125
|
|
20102
|
-
size_t gguf_get_alignment(struct gguf_context * ctx) {
|
20126
|
+
size_t gguf_get_alignment(const struct gguf_context * ctx) {
|
20103
20127
|
return ctx->alignment;
|
20104
20128
|
}
|
20105
20129
|
|
20106
|
-
size_t gguf_get_data_offset(struct gguf_context * ctx) {
|
20130
|
+
size_t gguf_get_data_offset(const struct gguf_context * ctx) {
|
20107
20131
|
return ctx->offset;
|
20108
20132
|
}
|
20109
20133
|
|
20110
|
-
void * gguf_get_data(struct gguf_context * ctx) {
|
20134
|
+
void * gguf_get_data(const struct gguf_context * ctx) {
|
20111
20135
|
return ctx->data;
|
20112
20136
|
}
|
20113
20137
|
|
20114
|
-
int gguf_get_n_kv(struct gguf_context * ctx) {
|
20138
|
+
int gguf_get_n_kv(const struct gguf_context * ctx) {
|
20115
20139
|
return ctx->header.n_kv;
|
20116
20140
|
}
|
20117
20141
|
|
20118
|
-
int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
20142
|
+
int gguf_find_key(const struct gguf_context * ctx, const char * key) {
|
20119
20143
|
// return -1 if key not found
|
20120
20144
|
int keyfound = -1;
|
20121
20145
|
|
@@ -20131,85 +20155,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
|
20131
20155
|
return keyfound;
|
20132
20156
|
}
|
20133
20157
|
|
20134
|
-
const char * gguf_get_key(struct gguf_context * ctx, int i) {
|
20158
|
+
const char * gguf_get_key(const struct gguf_context * ctx, int i) {
|
20135
20159
|
return ctx->kv[i].key.data;
|
20136
20160
|
}
|
20137
20161
|
|
20138
|
-
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
|
20162
|
+
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
|
20139
20163
|
return ctx->kv[i].type;
|
20140
20164
|
}
|
20141
20165
|
|
20142
|
-
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
|
20166
|
+
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
|
20143
20167
|
return ctx->kv[i].value.arr.type;
|
20144
20168
|
}
|
20145
20169
|
|
20146
|
-
const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
|
20170
|
+
const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
|
20147
20171
|
return ctx->kv[i].value.arr.data;
|
20148
20172
|
}
|
20149
20173
|
|
20150
|
-
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
|
20174
|
+
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
|
20151
20175
|
struct gguf_kv * kv = &ctx->kv[key_id];
|
20152
20176
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
20153
20177
|
return str->data;
|
20154
20178
|
}
|
20155
20179
|
|
20156
|
-
int gguf_get_arr_n(struct gguf_context * ctx, int i) {
|
20180
|
+
int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
|
20157
20181
|
return ctx->kv[i].value.arr.n;
|
20158
20182
|
}
|
20159
20183
|
|
20160
|
-
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
|
20184
|
+
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
|
20161
20185
|
return ctx->kv[i].value.uint8;
|
20162
20186
|
}
|
20163
20187
|
|
20164
|
-
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
|
20188
|
+
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
|
20165
20189
|
return ctx->kv[i].value.int8;
|
20166
20190
|
}
|
20167
20191
|
|
20168
|
-
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
|
20192
|
+
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
|
20169
20193
|
return ctx->kv[i].value.uint16;
|
20170
20194
|
}
|
20171
20195
|
|
20172
|
-
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
|
20196
|
+
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
|
20173
20197
|
return ctx->kv[i].value.int16;
|
20174
20198
|
}
|
20175
20199
|
|
20176
|
-
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
|
20200
|
+
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
|
20177
20201
|
return ctx->kv[i].value.uint32;
|
20178
20202
|
}
|
20179
20203
|
|
20180
|
-
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
|
20204
|
+
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
|
20181
20205
|
return ctx->kv[i].value.int32;
|
20182
20206
|
}
|
20183
20207
|
|
20184
|
-
float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
20208
|
+
float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
|
20185
20209
|
return ctx->kv[i].value.float32;
|
20186
20210
|
}
|
20187
20211
|
|
20188
|
-
uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
|
20212
|
+
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
|
20189
20213
|
return ctx->kv[i].value.uint64;
|
20190
20214
|
}
|
20191
20215
|
|
20192
|
-
int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
|
20216
|
+
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
|
20193
20217
|
return ctx->kv[i].value.int64;
|
20194
20218
|
}
|
20195
20219
|
|
20196
|
-
double gguf_get_val_f64(struct gguf_context * ctx, int i) {
|
20220
|
+
double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
|
20197
20221
|
return ctx->kv[i].value.float64;
|
20198
20222
|
}
|
20199
20223
|
|
20200
|
-
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
20224
|
+
bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
|
20201
20225
|
return ctx->kv[i].value.bool_;
|
20202
20226
|
}
|
20203
20227
|
|
20204
|
-
const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
|
20228
|
+
const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
|
20205
20229
|
return ctx->kv[i].value.str.data;
|
20206
20230
|
}
|
20207
20231
|
|
20208
|
-
int gguf_get_n_tensors(struct gguf_context * ctx) {
|
20232
|
+
int gguf_get_n_tensors(const struct gguf_context * ctx) {
|
20209
20233
|
return ctx->header.n_tensors;
|
20210
20234
|
}
|
20211
20235
|
|
20212
|
-
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
20236
|
+
int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
|
20213
20237
|
// return -1 if tensor not found
|
20214
20238
|
int tensorfound = -1;
|
20215
20239
|
|
@@ -20225,11 +20249,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
|
20225
20249
|
return tensorfound;
|
20226
20250
|
}
|
20227
20251
|
|
20228
|
-
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
|
20252
|
+
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
|
20229
20253
|
return ctx->infos[i].offset;
|
20230
20254
|
}
|
20231
20255
|
|
20232
|
-
char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
|
20256
|
+
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
|
20233
20257
|
return ctx->infos[i].name.data;
|
20234
20258
|
}
|
20235
20259
|
|
@@ -20512,7 +20536,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
|
|
20512
20536
|
buf->offset += el_size;
|
20513
20537
|
}
|
20514
20538
|
|
20515
|
-
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
20539
|
+
static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
20516
20540
|
// write header
|
20517
20541
|
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
|
20518
20542
|
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
|
@@ -20627,7 +20651,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
|
20627
20651
|
}
|
20628
20652
|
}
|
20629
20653
|
|
20630
|
-
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
|
20654
|
+
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
|
20631
20655
|
FILE * file = fopen(fname, "wb");
|
20632
20656
|
if (!file) {
|
20633
20657
|
GGML_ASSERT(false && "failed to open file for writing");
|
@@ -20644,7 +20668,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
|
|
20644
20668
|
fclose(file);
|
20645
20669
|
}
|
20646
20670
|
|
20647
|
-
size_t gguf_get_meta_size(struct gguf_context * ctx) {
|
20671
|
+
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
|
20648
20672
|
// no allocs - only compute size
|
20649
20673
|
struct gguf_buf buf = gguf_buf_init(0);
|
20650
20674
|
|
@@ -20653,7 +20677,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
|
|
20653
20677
|
return buf.offset;
|
20654
20678
|
}
|
20655
20679
|
|
20656
|
-
void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
|
20680
|
+
void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
|
20657
20681
|
struct gguf_buf buf = gguf_buf_init(16*1024);
|
20658
20682
|
|
20659
20683
|
gguf_write_to_buf(ctx, &buf, true);
|
@@ -20729,6 +20753,14 @@ int ggml_cpu_has_arm_fma(void) {
|
|
20729
20753
|
#endif
|
20730
20754
|
}
|
20731
20755
|
|
20756
|
+
int ggml_cpu_has_metal(void) {
|
20757
|
+
#if defined(GGML_USE_METAL)
|
20758
|
+
return 1;
|
20759
|
+
#else
|
20760
|
+
return 0;
|
20761
|
+
#endif
|
20762
|
+
}
|
20763
|
+
|
20732
20764
|
int ggml_cpu_has_f16c(void) {
|
20733
20765
|
#if defined(__F16C__)
|
20734
20766
|
return 1;
|
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -195,6 +195,14 @@
|
|
195
195
|
# define GGML_DEPRECATED(func, hint) func
|
196
196
|
#endif
|
197
197
|
|
198
|
+
#ifndef __GNUC__
|
199
|
+
# define GGML_ATTRIBUTE_FORMAT(...)
|
200
|
+
#elif defined(__MINGW32__)
|
201
|
+
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
202
|
+
#else
|
203
|
+
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
204
|
+
#endif
|
205
|
+
|
198
206
|
#include <stdint.h>
|
199
207
|
#include <stddef.h>
|
200
208
|
#include <stdbool.h>
|
@@ -685,6 +693,7 @@ extern "C" {
|
|
685
693
|
|
686
694
|
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
687
695
|
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
696
|
+
GGML_ATTRIBUTE_FORMAT(2, 3)
|
688
697
|
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
689
698
|
|
690
699
|
//
|
@@ -1866,39 +1875,39 @@ extern "C" {
|
|
1866
1875
|
|
1867
1876
|
GGML_API const char * gguf_type_name(enum gguf_type type);
|
1868
1877
|
|
1869
|
-
GGML_API int gguf_get_version (struct gguf_context * ctx);
|
1870
|
-
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
|
1871
|
-
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
|
1872
|
-
GGML_API void * gguf_get_data (struct gguf_context * ctx);
|
1878
|
+
GGML_API int gguf_get_version (const struct gguf_context * ctx);
|
1879
|
+
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
1880
|
+
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
1881
|
+
GGML_API void * gguf_get_data (const struct gguf_context * ctx);
|
1873
1882
|
|
1874
|
-
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
|
1875
|
-
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
|
1876
|
-
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
|
1883
|
+
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
1884
|
+
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
1885
|
+
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
1877
1886
|
|
1878
|
-
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
|
1879
|
-
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
|
1887
|
+
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
1888
|
+
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
1880
1889
|
|
1881
1890
|
// results are undefined if the wrong type is used for the key
|
1882
|
-
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
|
1883
|
-
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
|
1884
|
-
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
|
1885
|
-
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
|
1886
|
-
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
1887
|
-
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
1888
|
-
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
1889
|
-
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
1890
|
-
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
1891
|
-
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
1892
|
-
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
1893
|
-
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
1894
|
-
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
1895
|
-
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
|
1896
|
-
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
|
1897
|
-
|
1898
|
-
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
|
1899
|
-
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
|
1900
|
-
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
|
1901
|
-
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
|
1891
|
+
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
1892
|
+
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
1893
|
+
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
1894
|
+
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
1895
|
+
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
1896
|
+
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
1897
|
+
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
1898
|
+
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
1899
|
+
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
1900
|
+
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
1901
|
+
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
1902
|
+
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
1903
|
+
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
1904
|
+
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
1905
|
+
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
1906
|
+
|
1907
|
+
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
1908
|
+
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
1909
|
+
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
1910
|
+
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
1902
1911
|
|
1903
1912
|
// overrides existing values or adds a new one
|
1904
1913
|
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
@@ -1943,11 +1952,11 @@ extern "C" {
|
|
1943
1952
|
//
|
1944
1953
|
|
1945
1954
|
// write the entire context to a binary file
|
1946
|
-
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
|
1955
|
+
GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
1947
1956
|
|
1948
1957
|
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
1949
|
-
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
|
1950
|
-
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
|
1958
|
+
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
1959
|
+
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
1951
1960
|
|
1952
1961
|
//
|
1953
1962
|
// system info
|
@@ -1961,6 +1970,7 @@ extern "C" {
|
|
1961
1970
|
GGML_API int ggml_cpu_has_fma (void);
|
1962
1971
|
GGML_API int ggml_cpu_has_neon (void);
|
1963
1972
|
GGML_API int ggml_cpu_has_arm_fma (void);
|
1973
|
+
GGML_API int ggml_cpu_has_metal (void);
|
1964
1974
|
GGML_API int ggml_cpu_has_f16c (void);
|
1965
1975
|
GGML_API int ggml_cpu_has_fp16_va (void);
|
1966
1976
|
GGML_API int ggml_cpu_has_wasm_simd (void);
|
@@ -2609,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
2609
2609
|
|
2610
2610
|
memcpy(utmp, x[i].scales, 12);
|
2611
2611
|
|
2612
|
-
|
2612
|
+
uint32x2_t mins8 = { 0 };
|
2613
|
+
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
|
2614
|
+
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
|
2615
|
+
|
2613
2616
|
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
2614
2617
|
utmp[0] &= kmask1;
|
2615
2618
|
|