llama_cpp 0.0.3 → 0.0.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +58 -2
- data/ext/llama_cpp/src/ggml.c +735 -253
- data/ext/llama_cpp/src/ggml.h +74 -16
- data/ext/llama_cpp/src/llama.cpp +800 -718
- data/ext/llama_cpp/src/llama.h +25 -1
- data/ext/llama_cpp/src/llama_util.h +389 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -177,11 +177,12 @@ extern "C" {
 #include <stddef.h>
 #include <stdbool.h>
 
-#define GGML_MAX_DIMS     4
-#define GGML_MAX_NODES    4096
-#define GGML_MAX_PARAMS   16
-#define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_OPT      4
+#define GGML_MAX_DIMS          4
+#define GGML_MAX_NODES         4096
+#define GGML_MAX_PARAMS        16
+#define GGML_MAX_CONTEXTS      64
+#define GGML_MAX_OPT           4
+#define GGML_DEFAULT_N_THREADS 4
 
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
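The new GGML_DEFAULT_N_THREADS constant gives callers a fallback thread count for graph computation. A minimal sketch of how it can be used (not from the gem; the function name and the tensor `f` are hypothetical):

```c
#include "ggml.h"

// Sketch: compute an already-built graph with the default thread count.
// Assumes `f` is the final tensor of a graph allocated in `ctx`.
void compute_with_default_threads(struct ggml_context * ctx, struct ggml_tensor * f) {
    struct ggml_cgraph gf = ggml_build_forward(f);
    gf.n_threads = GGML_DEFAULT_N_THREADS; // fallback instead of a hard-coded number
    ggml_graph_compute(ctx, &gf);
}
```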
@@ -198,13 +199,14 @@ struct ggml_object;
 struct ggml_context;
 
 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
+    // explicitly numbered values are used in llama.cpp files
+    GGML_TYPE_F32 = 0,
+    GGML_TYPE_F16 = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
-    GGML_TYPE_F16,
-    GGML_TYPE_F32,
     GGML_TYPE_COUNT,
 };
 
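Pinning F32/F16/Q4_0/Q4_1 to explicit values keeps the integers used by llama.cpp model files stable even if the enum is reordered later. A hedged sketch of why that matters (the helper name and the assumption that the file stores the raw enum value are mine, not the gem's):

```c
#include <stdint.h>
#include "ggml.h"

// Sketch: map a tensor-type value read from a model file back to ggml_type.
// Assumption: the file stores the raw enum value as a 32-bit integer.
static enum ggml_type type_from_file(uint32_t ftype) {
    switch (ftype) {
        case 0:  return GGML_TYPE_F32;
        case 1:  return GGML_TYPE_F16;
        case 2:  return GGML_TYPE_Q4_0;
        case 3:  return GGML_TYPE_Q4_1;
        default: return GGML_TYPE_COUNT; // unknown / unsupported
    }
}
```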
@@ -236,6 +238,7 @@ enum ggml_op {
 
     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,
@@ -250,9 +253,25 @@ enum ggml_op {
     GGML_OP_FLASH_ATTN,
     GGML_OP_FLASH_FF,
 
+    GGML_OP_MAP_UNARY,
+    GGML_OP_MAP_BINARY,
+
     GGML_OP_COUNT,
 };
 
+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
@@ -335,6 +354,8 @@ int ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
+const char * ggml_type_name(enum ggml_type type);
+
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
 struct ggml_context * ggml_init(struct ggml_init_params params);
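A small sketch of the new ggml_type_name helper (the loop over GGML_TYPE_COUNT is illustrative only, not taken from the gem):

```c
#include <stdio.h>
#include "ggml.h"

// Sketch: print a human-readable name for every tensor type.
void print_type_names(void) {
    for (int t = 0; t < GGML_TYPE_COUNT; ++t) {
        printf("%d -> %s\n", t, ggml_type_name((enum ggml_type) t));
    }
}
```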
@@ -344,13 +365,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
@@ -519,6 +533,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
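ggml_cont (backed by the new GGML_OP_CONT op) copies a strided view, such as the result of ggml_permute, into contiguous memory, which some downstream ops require. A minimal sketch, assuming a 16 MB context is enough for these toy tensors:

```c
#include <stddef.h>
#include "ggml.h"

// Sketch: make a transposed (non-contiguous) view contiguous again.
void example_cont(void) {
    struct ggml_init_params params = {
        .mem_size   = 16 * 1024 * 1024, // assumption: plenty for this toy graph
        .mem_buffer = NULL,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * t = ggml_permute(ctx, a, 1, 0, 2, 3); // strided view (transpose)
    struct ggml_tensor * c = ggml_cont(ctx, t);                // GGML_OP_CONT: contiguous copy

    (void) c; // would normally feed further ops / a compute graph
    ggml_free(ctx);
}
```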
@@ -638,6 +657,21 @@ struct ggml_tensor * ggml_flash_ff(
         struct ggml_tensor * c0,
         struct ggml_tensor * c1);
 
+// Mapping operations
+typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun);
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun);
+
 //
 // automatic differentiation
 //
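The mapping hooks let callers graft an arbitrary element-wise f32 function into a graph without defining a new ggml op. A minimal sketch (my_square_f32 and build_square are hypothetical names, not from the gem):

```c
#include "ggml.h"

// Sketch: custom element-wise op via the new mapping hook.
// The callback follows ggml_unary_op_f32_t: (n, dst, src).
static void my_square_f32(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        dst[i] = src[i] * src[i];
    }
}

// Returns a lazy graph node; my_square_f32 runs when the graph is computed.
struct ggml_tensor * build_square(struct ggml_context * ctx, struct ggml_tensor * a) {
    return ggml_map_unary_f32(ctx, a, my_square_f32);
}
```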
@@ -783,6 +817,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
+
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef __cplusplus
 }
 #endif
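The quantize_fns_t table is exported for tests and benchmarks, and it makes a quick quantization round-trip easy to write. A sketch, assuming the table index is the ggml_type value itself and that k is a multiple of the Q4_0 block size:

```c
#include <stdlib.h>
#include "ggml.h"

// Sketch: quantize a row of floats to Q4_0 and dequantize it back.
// Assumption: the function-table index matches the ggml_type value.
void roundtrip_q4_0(const float * src, float * dst, int k) {
    quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);

    // one block covers ggml_blck_size() floats and occupies ggml_type_size() bytes
    size_t qsize = (size_t)(k / ggml_blck_size(GGML_TYPE_Q4_0)) * ggml_type_size(GGML_TYPE_Q4_0);
    void * qbuf  = malloc(qsize);

    fns.quantize_row_q  (src,  qbuf, k); // f32 -> packed Q4_0 blocks
    fns.dequantize_row_q(qbuf, dst,  k); // packed Q4_0 blocks -> f32
    free(qbuf);
}
```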