llama_cpp 0.0.7 → 0.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +829 -51
- data/ext/llama_cpp/src/ggml-cuda.h +9 -32
- data/ext/llama_cpp/src/ggml-opencl.c +169 -24
- data/ext/llama_cpp/src/ggml.c +6672 -4376
- data/ext/llama_cpp/src/ggml.h +250 -15
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +710 -217
- data/ext/llama_cpp/src/llama.h +75 -28
- data/lib/llama_cpp/client.rb +30 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +41 -7
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -190,13 +190,24 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION        1    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -212,6 +223,9 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
     struct ggml_object;
     struct ggml_context;
 
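The new *_row helpers convert whole buffers between F32 and F16 in one call instead of looping over the scalar conversions. A small sketch (the buffer size is arbitrary):

    #include "ggml.h"

    void fp16_row_roundtrip(void) {
        float       src[8] = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f };
        ggml_fp16_t half[8];
        float       dst[8];

        ggml_fp32_to_fp16_row(src,  half, 8); // F32 -> F16 for the whole row
        ggml_fp16_to_fp32_row(half, dst,  8); // back to F32; dst holds the F16-rounded values
    }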
@@ -220,8 +234,8 @@ extern "C" {
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
-        GGML_TYPE_Q4_3 = 5,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
+        // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
@@ -232,18 +246,40 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN     = -1,
+        GGML_FTYPE_ALL_F32     = 0,
+        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+    };
+
     // available tensor operations:
     enum ggml_op {
         GGML_OP_NONE = 0,
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_LOG,
         GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
         GGML_OP_ABS,
@@ -253,12 +289,15 @@ extern "C" {
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
 
         GGML_OP_SCALE,
+        GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
         GGML_OP_RESHAPE,
@@ -266,9 +305,14 @@ extern "C" {
         GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
+        GGML_OP_ALIBI,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -296,7 +340,8 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;
 
         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -324,7 +369,10 @@ extern "C" {
         int64_t perf_time_us;
 
         void * data;
-        char padding[8];
+
+        char name[32];
+
+        char padding[16];
     };
 
     // computation graph
@@ -384,6 +432,9 @@ extern "C" {
 
     GGML_API bool ggml_is_quantized(enum ggml_type type);
 
+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -444,6 +495,9 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
+    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
+
     //
     // operations on tensors with backpropagation
     //
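ggml_set_name / ggml_get_name attach a label (stored in the new name[32] field of ggml_tensor) to a tensor, mainly for debugging and graph dumps. An illustrative sketch; the tensor shape and label are made up:

    #include <stdio.h>
    #include "ggml.h"

    void tensor_name_example(void) {
        struct ggml_init_params params = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        ggml_set_name(w, "layers.0.attention.wq.weight"); // longer labels get cut to fit name[32]
        printf("tensor name: %s\n", ggml_get_name(w));

        ggml_free(ctx);
    }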
@@ -462,6 +516,29 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t               nb1,
+            size_t               nb2,
+            size_t               nb3,
+            size_t               offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t               nb1,
+            size_t               nb2,
+            size_t               nb3,
+            size_t               offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -485,12 +562,24 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // return scalar
-    // TODO: compute sum along rows
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
@@ -532,6 +621,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
@@ -542,6 +638,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // A: m rows, n columns
     // B: p rows, n columns (i.e. we transpose it internally)
     // result is m columns, p rows
@@ -554,12 +657,66 @@ extern "C" {
     // operations on tensors without backpropagation
     //
 
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t                nb1,
+            size_t                offset);
+
+
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
@@ -580,6 +737,11 @@ extern "C" {
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t               ne0);
+
     GGML_API struct ggml_tensor * ggml_reshape_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -595,6 +757,14 @@ extern "C" {
             int64_t               ne1,
             int64_t               ne2);
 
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
     // offset in bytes
     GGML_API struct ggml_tensor * ggml_view_1d(
             struct ggml_context * ctx,
@@ -620,6 +790,18 @@ extern "C" {
             size_t                nb2, // slice stride in bytes
             size_t                offset);
 
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                nb3,
+            size_t                offset);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -638,20 +820,50 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // set elements above the diagonal to -INF
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_diag_mask_inf(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past);
 
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
     GGML_API struct ggml_tensor * ggml_soft_max(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // rotary position embedding
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
@@ -662,6 +874,31 @@ extern "C" {
             int n_dims,
             int mode);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
+    // alibi position embedding
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_alibi(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_head);
+
     // padding = 1
     // TODO: we don't support extra parameters for now
     // that's why we are hard-coding the stride, padding, and dilation
@@ -692,19 +929,19 @@ extern "C" {
             struct ggml_tensor * c1);
 
     // Mapping operations
-    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context        * ctx,
             struct ggml_tensor         * a,
-            const  ggml_unary_op_f32_t fun);
+                   ggml_unary_op_f32_t   fun);
 
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context         * ctx,
             struct ggml_tensor          * a,
             struct ggml_tensor          * b,
-            const  ggml_binary_op_f32_t fun);
+                   ggml_binary_op_f32_t   fun);
 
     //
     // automatic differentiation
@@ -833,8 +1070,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
data/ext/llama_cpp/src/{llama_util.h → llama-util.h}
RENAMED
@@ -14,6 +14,7 @@
 
 #include <string>
 #include <vector>
+#include <stdexcept>
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -107,10 +108,10 @@ struct llama_file {
         errno = 0;
         std::size_t ret = std::fread(ptr, size, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }
 
@@ -133,7 +134,7 @@ struct llama_file {
         errno = 0;
         size_t ret = std::fwrite(ptr, size, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }
 
@@ -180,7 +181,7 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
         if (prefetch) {
@@ -207,7 +208,7 @@ struct llama_mmap {
         DWORD error = GetLastError();
 
         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }
 
         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
         CloseHandle(hMapping);
 
         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -382,8 +384,13 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;
 
-    void raw_lock(const void * addr, size_t size) {
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t size) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }
 
     void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +402,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,5 +413,62 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    bool is_cuda;
+    size_t size = 0;
+
+    llama_ctx_buffer() = default;
+
+    void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
+        this->size = size;
+    }
+
+    void free() {
+        if (addr) {
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
+        }
+        addr = NULL;
+    }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
 #endif