llama_cpp 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +5 -4
- data/ext/llama_cpp/extconf.rb +38 -0
- data/ext/llama_cpp/llama_cpp.cpp +118 -2
- data/ext/llama_cpp/src/ggml.c +1740 -658
- data/ext/llama_cpp/src/ggml.h +84 -16
- data/ext/llama_cpp/src/llama.cpp +1108 -756
- data/ext/llama_cpp/src/llama.h +37 -1
- data/ext/llama_cpp/src/llama_util.h +396 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +6 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -177,11 +177,12 @@ extern "C" {
|
|
177
177
|
#include <stddef.h>
|
178
178
|
#include <stdbool.h>
|
179
179
|
|
180
|
-
#define GGML_MAX_DIMS 4
|
181
|
-
#define GGML_MAX_NODES 4096
|
182
|
-
#define GGML_MAX_PARAMS 16
|
183
|
-
#define GGML_MAX_CONTEXTS 64
|
184
|
-
#define GGML_MAX_OPT 4
|
180
|
+
#define GGML_MAX_DIMS 4
|
181
|
+
#define GGML_MAX_NODES 4096
|
182
|
+
#define GGML_MAX_PARAMS 16
|
183
|
+
#define GGML_MAX_CONTEXTS 64
|
184
|
+
#define GGML_MAX_OPT 4
|
185
|
+
#define GGML_DEFAULT_N_THREADS 4
|
185
186
|
|
186
187
|
#ifdef __ARM_NEON
|
187
188
|
// we use the built-in 16-bit float type
|
@@ -198,13 +199,15 @@ struct ggml_object;
|
|
198
199
|
struct ggml_context;
|
199
200
|
|
200
201
|
enum ggml_type {
|
201
|
-
|
202
|
-
|
202
|
+
// explicitly numbered values are used in llama.cpp files
|
203
|
+
GGML_TYPE_F32 = 0,
|
204
|
+
GGML_TYPE_F16 = 1,
|
205
|
+
GGML_TYPE_Q4_0 = 2,
|
206
|
+
GGML_TYPE_Q4_1 = 3,
|
207
|
+
GGML_TYPE_Q8_0 = 4,
|
203
208
|
GGML_TYPE_I8,
|
204
209
|
GGML_TYPE_I16,
|
205
210
|
GGML_TYPE_I32,
|
206
|
-
GGML_TYPE_F16,
|
207
|
-
GGML_TYPE_F32,
|
208
211
|
GGML_TYPE_COUNT,
|
209
212
|
};
|
210
213
|
|
@@ -236,6 +239,7 @@ enum ggml_op {
|
|
236
239
|
|
237
240
|
GGML_OP_SCALE,
|
238
241
|
GGML_OP_CPY,
|
242
|
+
GGML_OP_CONT,
|
239
243
|
GGML_OP_RESHAPE,
|
240
244
|
GGML_OP_VIEW,
|
241
245
|
GGML_OP_PERMUTE,
|
@@ -250,9 +254,25 @@ enum ggml_op {
|
|
250
254
|
GGML_OP_FLASH_ATTN,
|
251
255
|
GGML_OP_FLASH_FF,
|
252
256
|
|
257
|
+
GGML_OP_MAP_UNARY,
|
258
|
+
GGML_OP_MAP_BINARY,
|
259
|
+
|
253
260
|
GGML_OP_COUNT,
|
254
261
|
};
|
255
262
|
|
263
|
+
|
264
|
+
// ggml object
|
265
|
+
struct ggml_object {
|
266
|
+
size_t offs;
|
267
|
+
size_t size;
|
268
|
+
|
269
|
+
struct ggml_object * next;
|
270
|
+
|
271
|
+
char padding[8];
|
272
|
+
};
|
273
|
+
|
274
|
+
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
275
|
+
|
256
276
|
// n-dimensional tensor
|
257
277
|
struct ggml_tensor {
|
258
278
|
enum ggml_type type;
|
@@ -335,6 +355,8 @@ int ggml_blck_size (enum ggml_type type);
|
|
335
355
|
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
336
356
|
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
337
357
|
|
358
|
+
const char * ggml_type_name(enum ggml_type type);
|
359
|
+
|
338
360
|
size_t ggml_element_size(const struct ggml_tensor * tensor);
|
339
361
|
|
340
362
|
struct ggml_context * ggml_init(struct ggml_init_params params);
|
@@ -344,13 +366,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
|
|
344
366
|
|
345
367
|
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
346
368
|
|
347
|
-
bool ggml_mlock_supported(void);
|
348
|
-
bool ggml_mlock(
|
349
|
-
struct ggml_context * ctx,
|
350
|
-
const void *opt_extra_addr,
|
351
|
-
size_t opt_extra_len,
|
352
|
-
char **err_p);
|
353
|
-
|
354
369
|
struct ggml_tensor * ggml_new_tensor(
|
355
370
|
struct ggml_context * ctx,
|
356
371
|
enum ggml_type type,
|
@@ -415,6 +430,12 @@ struct ggml_tensor * ggml_add(
|
|
415
430
|
struct ggml_tensor * a,
|
416
431
|
struct ggml_tensor * b);
|
417
432
|
|
433
|
+
|
434
|
+
struct ggml_tensor * ggml_add_inplace(
|
435
|
+
struct ggml_context * ctx,
|
436
|
+
struct ggml_tensor * a,
|
437
|
+
struct ggml_tensor * b);
|
438
|
+
|
418
439
|
struct ggml_tensor * ggml_sub(
|
419
440
|
struct ggml_context * ctx,
|
420
441
|
struct ggml_tensor * a,
|
@@ -519,6 +540,11 @@ struct ggml_tensor * ggml_cpy(
|
|
519
540
|
struct ggml_tensor * a,
|
520
541
|
struct ggml_tensor * b);
|
521
542
|
|
543
|
+
// make contiguous
|
544
|
+
struct ggml_tensor * ggml_cont(
|
545
|
+
struct ggml_context * ctx,
|
546
|
+
struct ggml_tensor * a);
|
547
|
+
|
522
548
|
// return view(a), b specifies the new shape
|
523
549
|
// TODO: when we start computing gradient, make a copy instead of view
|
524
550
|
struct ggml_tensor * ggml_reshape(
|
@@ -638,6 +664,21 @@ struct ggml_tensor * ggml_flash_ff(
|
|
638
664
|
struct ggml_tensor * c0,
|
639
665
|
struct ggml_tensor * c1);
|
640
666
|
|
667
|
+
// Mapping operations
|
668
|
+
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
669
|
+
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
670
|
+
|
671
|
+
struct ggml_tensor * ggml_map_unary_f32(
|
672
|
+
struct ggml_context * ctx,
|
673
|
+
struct ggml_tensor * a,
|
674
|
+
const ggml_unary_op_f32_t fun);
|
675
|
+
|
676
|
+
struct ggml_tensor * ggml_map_binary_f32(
|
677
|
+
struct ggml_context * ctx,
|
678
|
+
struct ggml_tensor * a,
|
679
|
+
struct ggml_tensor * b,
|
680
|
+
const ggml_binary_op_f32_t fun);
|
681
|
+
|
641
682
|
//
|
642
683
|
// automatic differentiation
|
643
684
|
//
|
@@ -773,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
|
|
773
814
|
int ggml_cpu_has_avx(void);
|
774
815
|
int ggml_cpu_has_avx2(void);
|
775
816
|
int ggml_cpu_has_avx512(void);
|
817
|
+
int ggml_cpu_has_avx512_vbmi(void);
|
818
|
+
int ggml_cpu_has_avx512_vnni(void);
|
776
819
|
int ggml_cpu_has_fma(void);
|
777
820
|
int ggml_cpu_has_neon(void);
|
778
821
|
int ggml_cpu_has_arm_fma(void);
|
@@ -783,6 +826,31 @@ int ggml_cpu_has_blas(void);
|
|
783
826
|
int ggml_cpu_has_sse3(void);
|
784
827
|
int ggml_cpu_has_vsx(void);
|
785
828
|
|
829
|
+
|
830
|
+
//
|
831
|
+
// Internal types and functions exposed for tests and benchmarks
|
832
|
+
//
|
833
|
+
|
834
|
+
#ifdef __cplusplus
|
835
|
+
// restrict not standard in C++
|
836
|
+
#define GGML_RESTRICT
|
837
|
+
#else
|
838
|
+
#define GGML_RESTRICT restrict
|
839
|
+
#endif
|
840
|
+
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
841
|
+
typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
842
|
+
typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
843
|
+
|
844
|
+
typedef struct {
|
845
|
+
dequantize_row_q_t dequantize_row_q;
|
846
|
+
quantize_row_q_t quantize_row_q;
|
847
|
+
quantize_row_q_t quantize_row_q_reference;
|
848
|
+
quantize_row_q_t quantize_row_q_dot;
|
849
|
+
vec_dot_q_t vec_dot_q;
|
850
|
+
} quantize_fns_t;
|
851
|
+
|
852
|
+
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
|
853
|
+
|
786
854
|
#ifdef __cplusplus
|
787
855
|
}
|
788
856
|
#endif
|