llama_cpp 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +5 -4
- data/ext/llama_cpp/extconf.rb +38 -0
- data/ext/llama_cpp/llama_cpp.cpp +118 -2
- data/ext/llama_cpp/src/ggml.c +1740 -658
- data/ext/llama_cpp/src/ggml.h +84 -16
- data/ext/llama_cpp/src/llama.cpp +1108 -756
- data/ext/llama_cpp/src/llama.h +37 -1
- data/ext/llama_cpp/src/llama_util.h +396 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +6 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -177,11 +177,12 @@ extern "C" {
|
|
177
177
|
#include <stddef.h>
|
178
178
|
#include <stdbool.h>
|
179
179
|
|
180
|
-
#define GGML_MAX_DIMS 4
|
181
|
-
#define GGML_MAX_NODES 4096
|
182
|
-
#define GGML_MAX_PARAMS 16
|
183
|
-
#define GGML_MAX_CONTEXTS 64
|
184
|
-
#define GGML_MAX_OPT 4
|
180
|
+
#define GGML_MAX_DIMS 4
|
181
|
+
#define GGML_MAX_NODES 4096
|
182
|
+
#define GGML_MAX_PARAMS 16
|
183
|
+
#define GGML_MAX_CONTEXTS 64
|
184
|
+
#define GGML_MAX_OPT 4
|
185
|
+
#define GGML_DEFAULT_N_THREADS 4
|
185
186
|
|
186
187
|
#ifdef __ARM_NEON
|
187
188
|
// we use the built-in 16-bit float type
|
@@ -198,13 +199,15 @@ struct ggml_object;
|
|
198
199
|
struct ggml_context;
|
199
200
|
|
200
201
|
enum ggml_type {
|
201
|
-
GGML_TYPE_Q4_0,
|
202
|
-
GGML_TYPE_Q4_1,
|
202
|
+
// explicitly numbered values are used in llama.cpp files
|
203
|
+
GGML_TYPE_F32 = 0,
|
204
|
+
GGML_TYPE_F16 = 1,
|
205
|
+
GGML_TYPE_Q4_0 = 2,
|
206
|
+
GGML_TYPE_Q4_1 = 3,
|
207
|
+
GGML_TYPE_Q8_0 = 4,
|
203
208
|
GGML_TYPE_I8,
|
204
209
|
GGML_TYPE_I16,
|
205
210
|
GGML_TYPE_I32,
|
206
|
-
GGML_TYPE_F16,
|
207
|
-
GGML_TYPE_F32,
|
208
211
|
GGML_TYPE_COUNT,
|
209
212
|
};
|
210
213
|
|
@@ -236,6 +239,7 @@ enum ggml_op {
|
|
236
239
|
|
237
240
|
GGML_OP_SCALE,
|
238
241
|
GGML_OP_CPY,
|
242
|
+
GGML_OP_CONT,
|
239
243
|
GGML_OP_RESHAPE,
|
240
244
|
GGML_OP_VIEW,
|
241
245
|
GGML_OP_PERMUTE,
|
@@ -250,9 +254,25 @@ enum ggml_op {
|
|
250
254
|
GGML_OP_FLASH_ATTN,
|
251
255
|
GGML_OP_FLASH_FF,
|
252
256
|
|
257
|
+
GGML_OP_MAP_UNARY,
|
258
|
+
GGML_OP_MAP_BINARY,
|
259
|
+
|
253
260
|
GGML_OP_COUNT,
|
254
261
|
};
|
255
262
|
|
263
|
+
|
264
|
+
// ggml object
|
265
|
+
struct ggml_object {
|
266
|
+
size_t offs;
|
267
|
+
size_t size;
|
268
|
+
|
269
|
+
struct ggml_object * next;
|
270
|
+
|
271
|
+
char padding[8];
|
272
|
+
};
|
273
|
+
|
274
|
+
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
275
|
+
|
256
276
|
// n-dimensional tensor
|
257
277
|
struct ggml_tensor {
|
258
278
|
enum ggml_type type;
|
@@ -335,6 +355,8 @@ int ggml_blck_size (enum ggml_type type);
|
|
335
355
|
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
336
356
|
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
337
357
|
|
358
|
+
const char * ggml_type_name(enum ggml_type type);
|
359
|
+
|
338
360
|
size_t ggml_element_size(const struct ggml_tensor * tensor);
|
339
361
|
|
340
362
|
struct ggml_context * ggml_init(struct ggml_init_params params);
|
@@ -344,13 +366,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
|
|
344
366
|
|
345
367
|
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
346
368
|
|
347
|
-
bool ggml_mlock_supported(void);
|
348
|
-
bool ggml_mlock(
|
349
|
-
struct ggml_context * ctx,
|
350
|
-
const void *opt_extra_addr,
|
351
|
-
size_t opt_extra_len,
|
352
|
-
char **err_p);
|
353
|
-
|
354
369
|
struct ggml_tensor * ggml_new_tensor(
|
355
370
|
struct ggml_context * ctx,
|
356
371
|
enum ggml_type type,
|
@@ -415,6 +430,12 @@ struct ggml_tensor * ggml_add(
|
|
415
430
|
struct ggml_tensor * a,
|
416
431
|
struct ggml_tensor * b);
|
417
432
|
|
433
|
+
|
434
|
+
struct ggml_tensor * ggml_add_inplace(
|
435
|
+
struct ggml_context * ctx,
|
436
|
+
struct ggml_tensor * a,
|
437
|
+
struct ggml_tensor * b);
|
438
|
+
|
418
439
|
struct ggml_tensor * ggml_sub(
|
419
440
|
struct ggml_context * ctx,
|
420
441
|
struct ggml_tensor * a,
|
@@ -519,6 +540,11 @@ struct ggml_tensor * ggml_cpy(
|
|
519
540
|
struct ggml_tensor * a,
|
520
541
|
struct ggml_tensor * b);
|
521
542
|
|
543
|
+
// make contiguous
|
544
|
+
struct ggml_tensor * ggml_cont(
|
545
|
+
struct ggml_context * ctx,
|
546
|
+
struct ggml_tensor * a);
|
547
|
+
|
522
548
|
// return view(a), b specifies the new shape
|
523
549
|
// TODO: when we start computing gradient, make a copy instead of view
|
524
550
|
struct ggml_tensor * ggml_reshape(
|
@@ -638,6 +664,21 @@ struct ggml_tensor * ggml_flash_ff(
|
|
638
664
|
struct ggml_tensor * c0,
|
639
665
|
struct ggml_tensor * c1);
|
640
666
|
|
667
|
+
// Mapping operations
|
668
|
+
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
669
|
+
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
670
|
+
|
671
|
+
struct ggml_tensor * ggml_map_unary_f32(
|
672
|
+
struct ggml_context * ctx,
|
673
|
+
struct ggml_tensor * a,
|
674
|
+
const ggml_unary_op_f32_t fun);
|
675
|
+
|
676
|
+
struct ggml_tensor * ggml_map_binary_f32(
|
677
|
+
struct ggml_context * ctx,
|
678
|
+
struct ggml_tensor * a,
|
679
|
+
struct ggml_tensor * b,
|
680
|
+
const ggml_binary_op_f32_t fun);
|
681
|
+
|
641
682
|
//
|
642
683
|
// automatic differentiation
|
643
684
|
//
|
@@ -773,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
|
|
773
814
|
int ggml_cpu_has_avx(void);
|
774
815
|
int ggml_cpu_has_avx2(void);
|
775
816
|
int ggml_cpu_has_avx512(void);
|
817
|
+
int ggml_cpu_has_avx512_vbmi(void);
|
818
|
+
int ggml_cpu_has_avx512_vnni(void);
|
776
819
|
int ggml_cpu_has_fma(void);
|
777
820
|
int ggml_cpu_has_neon(void);
|
778
821
|
int ggml_cpu_has_arm_fma(void);
|
@@ -783,6 +826,31 @@ int ggml_cpu_has_blas(void);
|
|
783
826
|
int ggml_cpu_has_sse3(void);
|
784
827
|
int ggml_cpu_has_vsx(void);
|
785
828
|
|
829
|
+
|
830
|
+
//
|
831
|
+
// Internal types and functions exposed for tests and benchmarks
|
832
|
+
//
|
833
|
+
|
834
|
+
#ifdef __cplusplus
|
835
|
+
// restrict not standard in C++
|
836
|
+
#define GGML_RESTRICT
|
837
|
+
#else
|
838
|
+
#define GGML_RESTRICT restrict
|
839
|
+
#endif
|
840
|
+
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
841
|
+
typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
842
|
+
typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
843
|
+
|
844
|
+
typedef struct {
|
845
|
+
dequantize_row_q_t dequantize_row_q;
|
846
|
+
quantize_row_q_t quantize_row_q;
|
847
|
+
quantize_row_q_t quantize_row_q_reference;
|
848
|
+
quantize_row_q_t quantize_row_q_dot;
|
849
|
+
vec_dot_q_t vec_dot_q;
|
850
|
+
} quantize_fns_t;
|
851
|
+
|
852
|
+
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
|
853
|
+
|
786
854
|
#ifdef __cplusplus
|
787
855
|
}
|
788
856
|
#endif
|