llama_cpp 0.0.2 → 0.0.4
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -2
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +97 -3
- data/ext/llama_cpp/src/ggml.c +1254 -670
- data/ext/llama_cpp/src/ggml.h +110 -42
- data/ext/llama_cpp/src/llama.cpp +878 -757
- data/ext/llama_cpp/src/llama.h +42 -1
- data/ext/llama_cpp/src/llama_util.h +389 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -1
- data/sig/llama_cpp.rbs +55 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -177,11 +177,12 @@ extern "C" {
 #include <stddef.h>
 #include <stdbool.h>
 
-#define GGML_MAX_DIMS     4
-#define GGML_MAX_NODES    4096
-#define GGML_MAX_PARAMS   16
-#define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_OPT      4
+#define GGML_MAX_DIMS          4
+#define GGML_MAX_NODES         4096
+#define GGML_MAX_PARAMS        16
+#define GGML_MAX_CONTEXTS      64
+#define GGML_MAX_OPT           4
+#define GGML_DEFAULT_N_THREADS 4
 
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
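
Note: the only functional addition in this hunk is GGML_DEFAULT_N_THREADS; the five existing limits are merely re-aligned. Below is a minimal sketch of how a caller might use the new constant with the graph API of this vintage; ggml_build_forward, ggml_graph_compute, and the n_threads field are not part of this hunk, so treat them as assumptions about the surrounding header.

    #include "ggml.h"

    /* Sketch: use the new default instead of a hard-coded thread count. */
    void compute_with_default_threads(struct ggml_context * ctx, struct ggml_tensor * out) {
        struct ggml_cgraph gf = ggml_build_forward(out);
        gf.n_threads = GGML_DEFAULT_N_THREADS; // previously a magic 4 in callers
        ggml_graph_compute(ctx, &gf);
    }
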
@@ -198,13 +199,14 @@ struct ggml_object;
 struct ggml_context;
 
 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
+    // explicitly numbered values are used in llama.cpp files
+    GGML_TYPE_F32  = 0,
+    GGML_TYPE_F16  = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
-    GGML_TYPE_F16,
-    GGML_TYPE_F32,
     GGML_TYPE_COUNT,
 };
 
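
Note: the enum values are now pinned because llama.cpp stores tensor types in model files as raw integers; reordering the enum would change the on-disk meaning. A hedged sketch of the failure mode this prevents; the file layout below is hypothetical, only the four numeric values come from this hunk.

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical loader fragment: the type word read from a model file
       is only meaningful because the GGML_TYPE_* values are fixed. */
    int read_tensor_type(FILE * fin, uint32_t * out_type) {
        uint32_t ttype;
        if (fread(&ttype, sizeof(ttype), 1, fin) != 1) {
            return -1; // truncated file
        }
        // 0 = F32, 1 = F16, 2 = Q4_0, 3 = Q4_1 -- stable across versions now
        *out_type = ttype;
        return 0;
    }
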
@@ -236,6 +238,7 @@ enum ggml_op {
 
     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,
@@ -250,19 +253,35 @@ enum ggml_op {
     GGML_OP_FLASH_ATTN,
     GGML_OP_FLASH_FF,
 
+    GGML_OP_MAP_UNARY,
+    GGML_OP_MAP_BINARY,
+
     GGML_OP_COUNT,
 };
 
+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
 
     int     n_dims;
-    int    ne[GGML_MAX_DIMS]; // number of elements
-    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-                              // nb[0] = sizeof(type)
-                              // nb[1] = nb[0] * ne[0] + padding
-                              // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0] * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]
 
     // compute data
     enum ggml_op op;
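
Note: widening ne from int to int64_t is the headline change of this release; element counts may now exceed 2^31. The nb comments define the contiguous layout, which is worth a worked example. This sketch is standalone and only restates the rule from the comments above.

    #include <stdint.h>
    #include <stdio.h>

    #define GGML_MAX_DIMS 4

    /* nb[0] = sizeof(type), nb[i] = nb[i-1] * ne[i-1] (no padding here).
       For a 3x4 F32 tensor this prints nb = {4, 12, 48, 48}. */
    int main(void) {
        int64_t ne[GGML_MAX_DIMS] = {3, 4, 1, 1}; // elements per dimension
        size_t  nb[GGML_MAX_DIMS];

        nb[0] = sizeof(float);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            nb[i] = nb[i - 1] * (size_t) ne[i - 1];
        }

        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            printf("nb[%d] = %zu\n", i, nb[i]);
        }
        return 0;
    }
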
@@ -328,13 +347,15 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);
 
-int    ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
 
 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
+const char * ggml_type_name(enum ggml_type type);
+
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
 struct ggml_context * ggml_init(struct ggml_init_params params);
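
Note: ggml_nelements now returns int64_t to match the widened shape fields, and ggml_type_name is new. A small hypothetical helper combining the two (print_tensor_info is our name, not part of the header):

    #include <stdio.h>
    #include "ggml.h"

    static void print_tensor_info(const struct ggml_tensor * t) {
        printf("%s: %lld elements, %zu bytes\n",
               ggml_type_name(t->type),
               (long long) ggml_nelements(t),
               ggml_nbytes(t));
    }
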
@@ -344,44 +365,37 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
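
Note: the ggml_mlock API is removed here; judging by the file list above, memory locking moved into the new data/ext/llama_cpp/src/llama_util.h. The tensor constructors now take int64_t dimensions. A minimal allocation sketch; the arena size is arbitrary and ggml_free is assumed from elsewhere in the header.

    #include <stddef.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { 0 };
        params.mem_size   = 16 * 1024 * 1024; // arena for tensors + metadata
        params.mem_buffer = NULL;             // let ggml allocate the arena

        struct ggml_context * ctx = ggml_init(params);

        // dimension arguments are int64_t as of this version
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 1024);
        (void) w;

        ggml_free(ctx);
        return 0;
    }
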
@@ -519,6 +533,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b);
 
+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
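
Note: ggml_cont pairs with the new GGML_OP_CONT above. Permuted tensors are views with rearranged strides; ops that require flat memory previously needed an explicit ggml_cpy into a fresh tensor. A sketch of the intended pattern (the transpose wrapper is our own):

    #include "ggml.h"

    struct ggml_tensor * transpose_contiguous(struct ggml_context * ctx,
                                              struct ggml_tensor  * a) {
        // swap the first two axes -- a view, no data movement yet
        struct ggml_tensor * t = ggml_permute(ctx, a, 1, 0, 2, 3);
        // materialize the permuted layout into contiguous memory
        return ggml_cont(ctx, t);
    }
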
@@ -531,33 +550,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1);
+        int64_t               ne0,
+        int64_t               ne1);
 
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
-        int                   ne2);
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2);
 
 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
+        int64_t               ne0,
         size_t                offset);
 
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
+        int64_t               ne0,
+        int64_t               ne1,
         size_t                nb1, // row stride in bytes
         size_t                offset);
 
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1, // row stride in bytes
+        size_t                nb2, // slice stride in bytes
+        size_t                offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
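
Note: ggml_view_3d completes the 1-D/2-D view family. A sketch of slicing one attention head out of a [head_dim, n_head, n_tokens] tensor without copying; the shape names are illustrative, and offsets/strides are in bytes as documented.

    #include "ggml.h"

    struct ggml_tensor * view_head(struct ggml_context * ctx,
                                   struct ggml_tensor  * qkv, // ne = [head_dim, n_head, n_tokens]
                                   int64_t head_dim, int64_t n_tokens,
                                   int64_t head_idx) {
        return ggml_view_3d(ctx, qkv,
                            head_dim, 1, n_tokens,  // ne0, ne1, ne2
                            qkv->nb[1],             // nb1: row stride in bytes
                            qkv->nb[2],             // nb2: slice stride in bytes
                            head_idx * qkv->nb[1]); // byte offset of the chosen head
    }
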
@@ -628,6 +657,21 @@ struct ggml_tensor * ggml_flash_ff(
         struct ggml_tensor  * c0,
         struct ggml_tensor  * c1);
 
+// Mapping operations
+typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t fun);
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t fun);
+
 //
 // automatic differentiation
 //
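
Note: the mapping operations let callers inject custom element-wise kernels without patching ggml. A sketch wiring in a softplus activation; my_softplus is our own function and must match the (n, dst, src) typedef exactly.

    #include <math.h>
    #include "ggml.h"

    static void my_softplus(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; i++) {
            dst[i] = logf(1.0f + expf(src[i])); // softplus(x) = log(1 + e^x)
        }
    }

    struct ggml_tensor * softplus(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_map_unary_f32(ctx, a, my_softplus);
    }
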
@@ -773,6 +817,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
+
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef  __cplusplus
 }
 #endif
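
Note: this block exposes the quantization kernels for tests and benchmarks. A round-trip sketch under two assumptions: the table is indexed by the GGML_TYPE_* value, and one Q4_0 block covers 32 floats in at most 64 bytes (the real block is smaller).

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);

        float src[32], dst[32];
        for (int i = 0; i < 32; i++) src[i] = 0.1f * (float) (i - 16);

        unsigned char q[64];              // scratch for one quantized block
        fns.quantize_row_q(src, q, 32);   // 32 floats -> one Q4_0 block
        fns.dequantize_row_q(q, dst, 32); // and back, with quantization error

        printf("src[5] = %f, dst[5] = %f\n", src[5], dst[5]);
        return 0;
    }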