llama_cpp 0.0.2 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -2
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +97 -3
- data/ext/llama_cpp/src/ggml.c +1254 -670
- data/ext/llama_cpp/src/ggml.h +110 -42
- data/ext/llama_cpp/src/llama.cpp +878 -757
- data/ext/llama_cpp/src/llama.h +42 -1
- data/ext/llama_cpp/src/llama_util.h +389 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -1
- data/sig/llama_cpp.rbs +55 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -177,11 +177,12 @@ extern "C" {
|
|
177
177
|
#include <stddef.h>
|
178
178
|
#include <stdbool.h>
|
179
179
|
|
180
|
-
#define GGML_MAX_DIMS
|
181
|
-
#define GGML_MAX_NODES
|
182
|
-
#define GGML_MAX_PARAMS
|
183
|
-
#define GGML_MAX_CONTEXTS
|
184
|
-
#define GGML_MAX_OPT
|
180
|
+
#define GGML_MAX_DIMS 4
|
181
|
+
#define GGML_MAX_NODES 4096
|
182
|
+
#define GGML_MAX_PARAMS 16
|
183
|
+
#define GGML_MAX_CONTEXTS 64
|
184
|
+
#define GGML_MAX_OPT 4
|
185
|
+
#define GGML_DEFAULT_N_THREADS 4
|
185
186
|
|
186
187
|
#ifdef __ARM_NEON
|
187
188
|
// we use the built-in 16-bit float type
|
@@ -198,13 +199,14 @@ struct ggml_object;
|
|
198
199
|
struct ggml_context;
|
199
200
|
|
200
201
|
enum ggml_type {
|
201
|
-
|
202
|
-
|
202
|
+
// explicitly numbered values are used in llama.cpp files
|
203
|
+
GGML_TYPE_F32 = 0,
|
204
|
+
GGML_TYPE_F16 = 1,
|
205
|
+
GGML_TYPE_Q4_0 = 2,
|
206
|
+
GGML_TYPE_Q4_1 = 3,
|
203
207
|
GGML_TYPE_I8,
|
204
208
|
GGML_TYPE_I16,
|
205
209
|
GGML_TYPE_I32,
|
206
|
-
GGML_TYPE_F16,
|
207
|
-
GGML_TYPE_F32,
|
208
210
|
GGML_TYPE_COUNT,
|
209
211
|
};
|
210
212
|
|
@@ -236,6 +238,7 @@ enum ggml_op {
|
|
236
238
|
|
237
239
|
GGML_OP_SCALE,
|
238
240
|
GGML_OP_CPY,
|
241
|
+
GGML_OP_CONT,
|
239
242
|
GGML_OP_RESHAPE,
|
240
243
|
GGML_OP_VIEW,
|
241
244
|
GGML_OP_PERMUTE,
|
@@ -250,19 +253,35 @@ enum ggml_op {
|
|
250
253
|
GGML_OP_FLASH_ATTN,
|
251
254
|
GGML_OP_FLASH_FF,
|
252
255
|
|
256
|
+
GGML_OP_MAP_UNARY,
|
257
|
+
GGML_OP_MAP_BINARY,
|
258
|
+
|
253
259
|
GGML_OP_COUNT,
|
254
260
|
};
|
255
261
|
|
262
|
+
|
263
|
+
// ggml object
|
264
|
+
struct ggml_object {
|
265
|
+
size_t offs;
|
266
|
+
size_t size;
|
267
|
+
|
268
|
+
struct ggml_object * next;
|
269
|
+
|
270
|
+
char padding[8];
|
271
|
+
};
|
272
|
+
|
273
|
+
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
274
|
+
|
256
275
|
// n-dimensional tensor
|
257
276
|
struct ggml_tensor {
|
258
277
|
enum ggml_type type;
|
259
278
|
|
260
279
|
int n_dims;
|
261
|
-
|
262
|
-
size_t
|
263
|
-
|
264
|
-
|
265
|
-
|
280
|
+
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
281
|
+
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
282
|
+
// nb[0] = sizeof(type)
|
283
|
+
// nb[1] = nb[0] * ne[0] + padding
|
284
|
+
// nb[i] = nb[i-1] * ne[i-1]
|
266
285
|
|
267
286
|
// compute data
|
268
287
|
enum ggml_op op;
|
@@ -328,13 +347,15 @@ int64_t ggml_cycles_per_ms(void);
|
|
328
347
|
void ggml_print_object (const struct ggml_object * obj);
|
329
348
|
void ggml_print_objects(const struct ggml_context * ctx);
|
330
349
|
|
331
|
-
|
332
|
-
size_t
|
350
|
+
int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
351
|
+
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
333
352
|
|
334
353
|
int ggml_blck_size (enum ggml_type type);
|
335
354
|
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
336
355
|
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
337
356
|
|
357
|
+
const char * ggml_type_name(enum ggml_type type);
|
358
|
+
|
338
359
|
size_t ggml_element_size(const struct ggml_tensor * tensor);
|
339
360
|
|
340
361
|
struct ggml_context * ggml_init(struct ggml_init_params params);
|
@@ -344,44 +365,37 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
|
|
344
365
|
|
345
366
|
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
346
367
|
|
347
|
-
bool ggml_mlock_supported(void);
|
348
|
-
bool ggml_mlock(
|
349
|
-
struct ggml_context * ctx,
|
350
|
-
const void *opt_extra_addr,
|
351
|
-
size_t opt_extra_len,
|
352
|
-
char **err_p);
|
353
|
-
|
354
368
|
struct ggml_tensor * ggml_new_tensor(
|
355
369
|
struct ggml_context * ctx,
|
356
370
|
enum ggml_type type,
|
357
371
|
int n_dims,
|
358
|
-
const
|
372
|
+
const int64_t *ne);
|
359
373
|
|
360
374
|
struct ggml_tensor * ggml_new_tensor_1d(
|
361
375
|
struct ggml_context * ctx,
|
362
376
|
enum ggml_type type,
|
363
|
-
|
377
|
+
int64_t ne0);
|
364
378
|
|
365
379
|
struct ggml_tensor * ggml_new_tensor_2d(
|
366
380
|
struct ggml_context * ctx,
|
367
381
|
enum ggml_type type,
|
368
|
-
|
369
|
-
|
382
|
+
int64_t ne0,
|
383
|
+
int64_t ne1);
|
370
384
|
|
371
385
|
struct ggml_tensor * ggml_new_tensor_3d(
|
372
386
|
struct ggml_context * ctx,
|
373
387
|
enum ggml_type type,
|
374
|
-
|
375
|
-
|
376
|
-
|
388
|
+
int64_t ne0,
|
389
|
+
int64_t ne1,
|
390
|
+
int64_t ne2);
|
377
391
|
|
378
392
|
struct ggml_tensor * ggml_new_tensor_4d(
|
379
393
|
struct ggml_context * ctx,
|
380
394
|
enum ggml_type type,
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
395
|
+
int64_t ne0,
|
396
|
+
int64_t ne1,
|
397
|
+
int64_t ne2,
|
398
|
+
int64_t ne3);
|
385
399
|
|
386
400
|
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
387
401
|
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
@@ -519,6 +533,11 @@ struct ggml_tensor * ggml_cpy(
|
|
519
533
|
struct ggml_tensor * a,
|
520
534
|
struct ggml_tensor * b);
|
521
535
|
|
536
|
+
// make contiguous
|
537
|
+
struct ggml_tensor * ggml_cont(
|
538
|
+
struct ggml_context * ctx,
|
539
|
+
struct ggml_tensor * a);
|
540
|
+
|
522
541
|
// return view(a), b specifies the new shape
|
523
542
|
// TODO: when we start computing gradient, make a copy instead of view
|
524
543
|
struct ggml_tensor * ggml_reshape(
|
@@ -531,33 +550,43 @@ struct ggml_tensor * ggml_reshape(
|
|
531
550
|
struct ggml_tensor * ggml_reshape_2d(
|
532
551
|
struct ggml_context * ctx,
|
533
552
|
struct ggml_tensor * a,
|
534
|
-
|
535
|
-
|
553
|
+
int64_t ne0,
|
554
|
+
int64_t ne1);
|
536
555
|
|
537
556
|
// return view(a)
|
538
557
|
// TODO: when we start computing gradient, make a copy instead of view
|
539
558
|
struct ggml_tensor * ggml_reshape_3d(
|
540
559
|
struct ggml_context * ctx,
|
541
560
|
struct ggml_tensor * a,
|
542
|
-
|
543
|
-
|
544
|
-
|
561
|
+
int64_t ne0,
|
562
|
+
int64_t ne1,
|
563
|
+
int64_t ne2);
|
545
564
|
|
546
565
|
// offset in bytes
|
547
566
|
struct ggml_tensor * ggml_view_1d(
|
548
567
|
struct ggml_context * ctx,
|
549
568
|
struct ggml_tensor * a,
|
550
|
-
|
569
|
+
int64_t ne0,
|
551
570
|
size_t offset);
|
552
571
|
|
553
572
|
struct ggml_tensor * ggml_view_2d(
|
554
573
|
struct ggml_context * ctx,
|
555
574
|
struct ggml_tensor * a,
|
556
|
-
|
557
|
-
|
575
|
+
int64_t ne0,
|
576
|
+
int64_t ne1,
|
558
577
|
size_t nb1, // row stride in bytes
|
559
578
|
size_t offset);
|
560
579
|
|
580
|
+
struct ggml_tensor * ggml_view_3d(
|
581
|
+
struct ggml_context * ctx,
|
582
|
+
struct ggml_tensor * a,
|
583
|
+
int64_t ne0,
|
584
|
+
int64_t ne1,
|
585
|
+
int64_t ne2,
|
586
|
+
size_t nb1, // row stride in bytes
|
587
|
+
size_t nb2, // slice stride in bytes
|
588
|
+
size_t offset);
|
589
|
+
|
561
590
|
struct ggml_tensor * ggml_permute(
|
562
591
|
struct ggml_context * ctx,
|
563
592
|
struct ggml_tensor * a,
|
@@ -628,6 +657,21 @@ struct ggml_tensor * ggml_flash_ff(
|
|
628
657
|
struct ggml_tensor * c0,
|
629
658
|
struct ggml_tensor * c1);
|
630
659
|
|
660
|
+
// Mapping operations
|
661
|
+
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
662
|
+
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
663
|
+
|
664
|
+
struct ggml_tensor * ggml_map_unary_f32(
|
665
|
+
struct ggml_context * ctx,
|
666
|
+
struct ggml_tensor * a,
|
667
|
+
const ggml_unary_op_f32_t fun);
|
668
|
+
|
669
|
+
struct ggml_tensor * ggml_map_binary_f32(
|
670
|
+
struct ggml_context * ctx,
|
671
|
+
struct ggml_tensor * a,
|
672
|
+
struct ggml_tensor * b,
|
673
|
+
const ggml_binary_op_f32_t fun);
|
674
|
+
|
631
675
|
//
|
632
676
|
// automatic differentiation
|
633
677
|
//
|
@@ -773,6 +817,30 @@ int ggml_cpu_has_blas(void);
|
|
773
817
|
int ggml_cpu_has_sse3(void);
|
774
818
|
int ggml_cpu_has_vsx(void);
|
775
819
|
|
820
|
+
|
821
|
+
//
|
822
|
+
// Internal types and functions exposed for tests and benchmarks
|
823
|
+
//
|
824
|
+
|
825
|
+
#ifdef __cplusplus
|
826
|
+
// restrict not standard in C++
|
827
|
+
#define GGML_RESTRICT
|
828
|
+
#else
|
829
|
+
#define GGML_RESTRICT restrict
|
830
|
+
#endif
|
831
|
+
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
832
|
+
typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
833
|
+
typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
834
|
+
|
835
|
+
typedef struct {
|
836
|
+
dequantize_row_q_t dequantize_row_q;
|
837
|
+
quantize_row_q_t quantize_row_q;
|
838
|
+
quantize_row_q_t quantize_row_q_reference;
|
839
|
+
vec_dot_q_t vec_dot_q;
|
840
|
+
} quantize_fns_t;
|
841
|
+
|
842
|
+
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
|
843
|
+
|
776
844
|
#ifdef __cplusplus
|
777
845
|
}
|
778
846
|
#endif
|