llama_cpp 0.0.7 → 0.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +829 -51
- data/ext/llama_cpp/src/ggml-cuda.h +9 -32
- data/ext/llama_cpp/src/ggml-opencl.c +169 -24
- data/ext/llama_cpp/src/ggml.c +6672 -4376
- data/ext/llama_cpp/src/ggml.h +250 -15
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +710 -217
- data/ext/llama_cpp/src/llama.h +75 -28
- data/lib/llama_cpp/client.rb +30 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +41 -7
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -190,13 +190,24 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION        1    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
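For context: GGML_QNT_VERSION is folded into a model file's ftype field via GGML_QNT_VERSION_FACTOR, so loaders can reject files produced by an older quantization scheme. A minimal sketch of that arithmetic; the encode_ftype helper is illustrative, not part of the header:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Hypothetical helper: fold the quantization version into the ftype field.
    int32_t encode_ftype(int32_t ftype, int32_t qnt_version) {
        return ftype + qnt_version * 1000; // 1000 = GGML_QNT_VERSION_FACTOR
    }

    int main() {
        const int32_t stored = encode_ftype(2 /* GGML_FTYPE_MOSTLY_Q4_0 */, 1);
        const int32_t qntvr  = stored / 1000; // GGML_QNT_VERSION_FACTOR
        const int32_t ftype  = stored % 1000;
        if (qntvr != 1 /* GGML_QNT_VERSION */) {
            fprintf(stderr, "unsupported quantization version %d\n", qntvr);
            abort();
        }
        printf("ftype = %d\n", ftype);
        return 0;
    }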
@@ -212,6 +223,9 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
     struct ggml_object;
     struct ggml_context;
 
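The new row helpers convert whole buffers in one call instead of looping over the scalar conversions. A small sketch, assuming ggml.h is on the include path:

    #include "ggml.h"
    #include <vector>

    int main() {
        std::vector<float>       src = {0.1f, -1.5f, 3.25f, 8.0f};
        std::vector<ggml_fp16_t> half(src.size());
        std::vector<float>       back(src.size());

        // bulk fp32 -> fp16 -> fp32 round trip; back[i] is src[i] at fp16 precision
        ggml_fp32_to_fp16_row(src.data(),  half.data(), src.size());
        ggml_fp16_to_fp32_row(half.data(), back.data(), back.size());
        return 0;
    }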
@@ -220,8 +234,8 @@
         GGML_TYPE_F16 = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
-        GGML_TYPE_Q4_3 = 5,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
+        // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
@@ -232,18 +246,40 @@
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN = -1,
+        GGML_FTYPE_ALL_F32 = 0,
+        GGML_FTYPE_MOSTLY_F16 = 1,           // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2,          // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3,          // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q8_0 = 7,          // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8,          // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9,          // except 1d tensors
+    };
+
     // available tensor operations:
     enum ggml_op {
         GGML_OP_NONE = 0,
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_LOG,
         GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
         GGML_OP_ABS,
@@ -253,12 +289,15 @@
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
 
         GGML_OP_SCALE,
+        GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
         GGML_OP_RESHAPE,
@@ -266,9 +305,14 @@
         GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
+        GGML_OP_ALIBI,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -296,7 +340,8 @@
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;
 
         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -324,7 +369,10 @@
         int64_t perf_time_us;
 
         void * data;
-        char padding[8];
+
+        char name[32];
+
+        char padding[16];
     };
 
     // computation graph
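With the new backend tag and name field on every tensor, host code can sanity-check where data lives before dereferencing it. A hedged sketch:

    #include "ggml.h"
    #include <cstdio>

    // Sketch: only CPU-backend tensors are safe to read through data pointers;
    // GGML_BACKEND_CUDA data lives in device memory.
    void print_first_element(const struct ggml_tensor * t) {
        GGML_ASSERT(t->backend == GGML_BACKEND_CPU);
        GGML_ASSERT(t->type == GGML_TYPE_F32);
        printf("%s[0] = %f\n", ggml_get_name(t), (double) ggml_get_data_f32(t)[0]);
    }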
@@ -384,6 +432,9 @@
 
     GGML_API bool ggml_is_quantized(enum ggml_type type);
 
+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
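ggml_ftype_to_ggml_type gives loaders the per-tensor weight type implied by a file-level ftype. A sketch; the Q5_0 mapping shown is the expected one, not spelled out in the header:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        // file says "mostly Q5_0" -> 2d weight tensors should be GGML_TYPE_Q5_0
        const enum ggml_type wtype = ggml_ftype_to_ggml_type(GGML_FTYPE_MOSTLY_Q5_0);
        printf("weight type %d, quantized: %s\n",
               (int) wtype, ggml_is_quantized(wtype) ? "yes" : "no");
        return 0;
    }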
@@ -444,6 +495,9 @@
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
+    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
+
     //
     // operations on tensors with backpropagation
     //
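Tensor names (backed by the new char name[32] field) make graph dumps and error messages much easier to follow. A sketch of naming tensors as they are created; the helper and its dimensions are illustrative:

    #include "ggml.h"
    #include <cstring>

    struct ggml_tensor * make_embedding(struct ggml_context * ctx,
                                        int64_t n_embd, int64_t n_vocab) {
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
        ggml_set_name(t, "tok_embeddings.weight"); // stored in the fixed 32-byte name field
        GGML_ASSERT(strcmp(ggml_get_name(t), "tok_embeddings.weight") == 0);
        return t;
    }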
@@ -462,6 +516,29 @@
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
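ggml_add1 broadcasts a one-element tensor across all of a, and ggml_acc accumulates b into a strided view of a. A sketch of the simpler case; ggml_new_f32 is an existing ggml helper for a scalar tensor:

    #include "ggml.h"

    // Sketch: shift every element of `logits` by a scalar via ggml_add1.
    struct ggml_tensor * shift_logits(struct ggml_context * ctx,
                                      struct ggml_tensor * logits, float delta) {
        struct ggml_tensor * d = ggml_new_f32(ctx, delta); // 1-element tensor
        return ggml_add1(ctx, logits, d);
    }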
@@ -485,12 +562,24 @@
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // return scalar
-    // TODO: compute sum along rows
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
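Per the new comment, ggml_sum_rows keeps the outer dimensions while ggml_sum still collapses to a single scalar. A shape-only sketch:

    #include "ggml.h"

    void sums(struct ggml_context * ctx, struct ggml_tensor * a /* [64, 32] */) {
        struct ggml_tensor * total   = ggml_sum(ctx, a);      // scalar: [1]
        struct ggml_tensor * per_row = ggml_sum_rows(ctx, a); // [1, 32]
        (void) total; (void) per_row; // evaluate via a ggml_cgraph to get values
    }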
@@ -532,6 +621,13 @@
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
@@ -542,6 +638,13 @@
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // A: m rows, n columns
     // B: p rows, n columns (i.e. we transpose it internally)
     // result is m columns, p rows
@@ -554,12 +657,66 @@
     // operations on tensors without backpropagation
     //
 
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
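The ggml_set family writes b into a view of a at a byte offset, with nb1..nb3 describing the view's strides. A sketch that overwrites one row of a 2d matrix:

    #include "ggml.h"

    struct ggml_tensor * set_row(struct ggml_context * ctx,
                                 struct ggml_tensor * m,   // F32, [n_cols, n_rows]
                                 struct ggml_tensor * row, // F32, [n_cols]
                                 int64_t i) {
        const size_t offset = (size_t) i * m->nb[1]; // nb[1] = row stride in bytes
        return ggml_set_1d(ctx, m, row, offset);
    }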
@@ -580,6 +737,11 @@
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0);
+
     GGML_API struct ggml_tensor * ggml_reshape_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -595,6 +757,14 @@
             int64_t ne1,
             int64_t ne2);
 
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3);
+
     // offset in bytes
     GGML_API struct ggml_tensor * ggml_view_1d(
             struct ggml_context * ctx,
@@ -620,6 +790,18 @@
             size_t nb2, // slice stride in bytes
             size_t offset);
 
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3,
+            size_t nb1, // row stride in bytes
+            size_t nb2, // slice stride in bytes
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
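ggml_view_4d completes the view family; strides are in bytes. For a contiguous F32 tensor the strides follow directly from the dimensions, as in this sketch:

    #include "ggml.h"

    struct ggml_tensor * as_4d(struct ggml_context * ctx, struct ggml_tensor * flat,
                               int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
        const size_t nb1 = ne0 * sizeof(float); // row stride
        const size_t nb2 = nb1 * ne1;           // slice stride
        const size_t nb3 = nb2 * ne2;           // cube stride
        return ggml_view_4d(ctx, flat, ne0, ne1, ne2, ne3, nb1, nb2, nb3, 0 /*offset*/);
    }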
@@ -638,20 +820,50 @@
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // set elements above the diagonal to -INF
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_diag_mask_inf(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past);
 
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
     GGML_API struct ggml_tensor * ggml_soft_max(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // rotary position embedding
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
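The new in-place mask and softmax variants avoid extra allocations in the hot attention path. A hedged sketch of the usual causal tail over a scaled QK^T matrix:

    #include "ggml.h"

    struct ggml_tensor * masked_softmax(struct ggml_context * ctx,
                                        struct ggml_tensor * kq, int n_past) {
        kq = ggml_diag_mask_inf_inplace(ctx, kq, n_past); // future positions -> -INF
        return ggml_soft_max_inplace(ctx, kq);            // each row sums to 1
    }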
@@ -662,6 +874,31 @@
             int n_dims,
             int mode);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
+    // alibi position embedding
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_alibi(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_head);
+
     // padding = 1
     // TODO: we don't support extra parameters for now
     // that's why we are hard-coding the stride, padding, and dilation
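ggml_rope_inplace applies rotary position embeddings without materializing a copy; mode selects the rotation flavor per the comments above. A sketch for query/key tensors, assuming mode 0 is the original LLaMA-style rotation:

    #include "ggml.h"

    void apply_rope(struct ggml_context * ctx,
                    struct ggml_tensor ** q, struct ggml_tensor ** k,
                    int n_past, int n_rot) {
        *q = ggml_rope_inplace(ctx, *q, n_past, n_rot, 0 /*mode*/);
        *k = ggml_rope_inplace(ctx, *k, n_past, n_rot, 0 /*mode*/);
    }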
@@ -692,19 +929,19 @@
             struct ggml_tensor * c1);
 
     // Mapping operations
-    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            const ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t fun);
 
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            const ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t fun);
 
     //
     // automatic differentiation
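The mapping typedefs now appear by name in the prototypes. The callback receives an element count plus destination and source pointers, so custom element-wise ops can be dropped into a graph; a sketch:

    #include "ggml.h"

    // custom op: leaky ReLU over a run of floats
    static void leaky_relu(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = src[i] > 0.0f ? src[i] : 0.01f * src[i];
        }
    }

    struct ggml_tensor * map_leaky_relu(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_map_unary_f32(ctx, a, leaky_relu);
    }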
@@ -833,8 +1070,6 @@
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
data/ext/llama_cpp/src/{llama_util.h → llama-util.h}
RENAMED
@@ -14,6 +14,7 @@
 
 #include <string>
 #include <vector>
+#include <stdexcept>
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -107,10 +108,10 @@ struct llama_file {
         errno = 0;
         std::size_t ret = std::fread(ptr, size, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }
 
@@ -133,7 +134,7 @@ struct llama_file {
         errno = 0;
         size_t ret = std::fwrite(ptr, size, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }
 
@@ -180,7 +181,7 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
         if (prefetch) {
@@ -207,7 +208,7 @@ struct llama_mmap {
         DWORD error = GetLastError();
 
         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }
 
         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
         CloseHandle(hMapping);
 
         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
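Every throw in llama-util.h now raises std::runtime_error (hence the new <stdexcept> include) instead of a raw std::string or format() result, so callers can rely on one standard exception type. A caller-side sketch:

    #include <cstdio>
    #include <stdexcept>
    // assumes llama-util.h (and its format()/llama_file helpers) is included

    int open_or_report(const char * path) {
        try {
            llama_file file(path, "rb");
            return 0;
        } catch (const std::runtime_error & err) {
            fprintf(stderr, "error: %s\n", err.what());
            return 1;
        }
    }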
@@ -243,8 +244,9 @@
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -382,8 +384,13 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;
 
-    void raw_lock(const void * addr, size_t size) {
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t size) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }
 
     void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +402,8 @@ struct llama_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;
 
+    llama_buffer() = default;
+
    void resize(size_t size) {
        delete[] addr;
        addr = new uint8_t[size];
@@ -404,5 +413,62 @@
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    bool is_cuda;
+    size_t size = 0;
+
+    llama_ctx_buffer() = default;
+
+    void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
+        this->size = size;
+    }
+
+    void free() {
+        if (addr) {
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
+        }
+        addr = NULL;
+    }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
 #endif