llama_cpp 0.3.5 → 0.3.7
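Diff of the bundled ggml sources between the two gem versions: the hunks below touch the CUDA header (ggml-cuda.h), the Metal host code (ggml-metal.m), and the Metal kernels (ggml-metal.metal), in that order.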

@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
  void ggml_cuda_set_main_device(int main_device);
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
  void ggml_cuda_set_scratch_size(size_t scratch_size);
  void ggml_cuda_free_scratch(void);
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
@@ -7,6 +7,11 @@
  #import <Metal/Metal.h>
  #import <MetalPerformanceShaders/MetalPerformanceShaders.h>

+ #undef MIN
+ #undef MAX
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
+
  #ifdef GGML_METAL_NDEBUG
  #define metal_printf(...)
  #else
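The MIN/MAX macros added to the Metal host code above are classic function-like macros: after expansion each argument appears twice, so an argument with side effects would be evaluated twice. The hunks below only pass plain values and simple expressions (e.g. MAX(0, level_pos - 2*search_depth)), where this is safe. A minimal C sketch of the pitfall, with a hypothetical next_val() helper:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static int counter = 0;
    static int next_val(void) { return counter++; } // hypothetical helper with a side effect

    int main(void) {
        int m = MIN(next_val(), 10); // expands to ((next_val()) < (10) ? (next_val()) : (10))
        printf("m = %d, calls = %d\n", m, counter); // prints "m = 1, calls = 2": evaluated twice
        return 0;
    }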
@@ -15,6 +20,8 @@

  #define UNUSED(x) (void)(x)

+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
  struct ggml_metal_buffer {
      const char * name;

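The factor of two in GGML_MAX_CONCUR is motivated by the hunks below: concur_list stores node indices plus a -1 barrier entry terminating each concurrency level, so in the degenerate case where every node ends up in its own level the list holds one node entry and one barrier per node. A back-of-the-envelope bound in C, under that worst-case assumption:

    // worst case: n_nodes levels of a single node, each followed by a -1 barrier,
    // so the list needs up to 2*n_nodes slots -- hence GGML_MAX_CONCUR = 2*GGML_MAX_NODES
    static int concur_list_worst_case(int n_nodes) {
        return 2*n_nodes;
    }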
@@ -36,7 +43,7 @@ struct ggml_metal_context {
      int n_buffers;
      struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];

-     int concur_list[GGML_MAX_NODES];
+     int concur_list[GGML_MAX_CONCUR];
      int concur_list_len;

      // custom kernels
@@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency(
          struct ggml_metal_context * ctx,
          struct ggml_cgraph * gf) {
      int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
-     int nodes_unused[GGML_MAX_NODES];
+     int nodes_unused[GGML_MAX_CONCUR];

-     for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
-     for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
+     for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
+     for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; }
      ctx->concur_list_len = 0;

-     int n_left = gf->n_nodes;
-     int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
-     int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
+     int n_left    = gf->n_nodes;
+     int n_start   = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
+     int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos

      while (n_left > 0) {
          // number of nodes at a layer (that can be issued concurrently)
@@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency(
          for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
              if (nodes_unused[i]) {
                  // if the requirements for gf->nodes[i] are satisfied
-                 int exe_flag=1;
+                 int exe_flag = 1;
+
                  // scan all srcs
                  for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
                      struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
                      if (src_cur) {
                          // if is leaf nodes it's satisfied.
-                         if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
+                         // TODO: ggml_is_leaf()
+                         if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
+                             continue;
+                         }

                          // otherwise this src should be the output from previous nodes.
                          int is_found = 0;
+
                          // scan 2*search_depth back because we inserted barrier.
-                         for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
-                             if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
+                         //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+                         for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
+                             if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
+                                 is_found = 1;
+                                 break;
+                             }
+                         }
+                         if (is_found == 0) {
+                             exe_flag = 0;
+                             break;
                          }
-                         if (is_found == 0) {exe_flag = 0; break;}
                      }
                  }
                  if (exe_flag) {
                      // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                      // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                      int64_t data_start = (int64_t) gf->nodes[i]->data;
-                     int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
+                     int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
                      for (int j = n_start; j < i; j++) {
                          if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
                              && gf->nodes[j]->op != GGML_OP_VIEW \
@@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency(
                              if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
                                  ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
                                  continue;
-                             } else {
-                                 exe_flag = 0;
                              }
+
+                             exe_flag = 0;
                          }
                      }
                  }
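The branch restructured above is a standard half-open interval disjointness test: node j's buffer can be skipped only if it ends at or before data_start or begins at or after data_start + length; otherwise the two nodes touch the same memory and exe_flag is cleared. The same predicate as a standalone C sketch (names are hypothetical):

    #include <stdint.h>
    #include <stdbool.h>

    // true when [a, a + a_len) and [b, b + b_len) share at least one byte
    static bool ranges_overlap(int64_t a, int64_t a_len, int64_t b, int64_t b_len) {
        return !(b >= a + a_len || b + b_len <= a);
    }

In the diff this appears inverted: the loop continues (no conflict) when the ranges are disjoint, and clears exe_flag otherwise.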
@@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency(
          ctx->concur_list[level_pos + concurrency] = -1;
          ctx->concur_list_len++;
          // jump all sorted nodes at nodes_bak
-         while (!nodes_unused[n_start]) {n_start++;}
+         while (!nodes_unused[n_start]) {
+             n_start++;
+         }
          level_pos += concurrency + 1;
      }

-     if (ctx->concur_list_len > GGML_MAX_NODES) {
+     if (ctx->concur_list_len > GGML_MAX_CONCUR) {
          fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
      }
  }
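Each completed level is terminated by writing a -1 sentinel into concur_list, so the list interleaves node indices with barriers and concur_list_len counts the sentinels too; that is why the length can legitimately exceed gf->n_nodes and why the overflow check now compares against GGML_MAX_CONCUR. An illustrative layout and a minimal consumer loop (the example values and the dispatch comments are assumptions, not code from this diff):

    // e.g. nodes 0 and 2 can run concurrently, then node 1, then nodes 3 and 4:
    // concur_list = { 0, 2, -1, 1, -1, 3, 4, -1 }, concur_list_len = 8
    for (int i = 0; i < concur_list_len; i++) {
        if (concur_list[i] == -1) {
            continue; // barrier: the next level must wait for this one to finish
        }
        // encode gf->nodes[concur_list[i]] here
    }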
@@ -453,7 +474,7 @@ void ggml_metal_graph_compute(
      // else fallback to serial dispatch
      MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;

-     const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
+     const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;

      const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
      edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
@@ -718,7 +739,8 @@ void ggml_metal_graph_compute(
                      // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

                      GGML_ASSERT(ne00 == ne10);
-                     GGML_ASSERT(ne02 == ne12);
+                     // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+                     GGML_ASSERT(ne03 == ne13);

                      if (ggml_is_contiguous(src0) &&
                          ggml_is_contiguous(src1) &&
@@ -746,11 +768,11 @@ void ggml_metal_graph_compute(
                          initWithDevice:ctx->device transposeLeft:false transposeRight:true
                          resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];

-                     // we need to do ne02 multiplications
+                     // we need to do ne12 multiplications
                      // TODO: is there a way to do this in parallel - currently very slow ..
                      // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                     for (int64_t i02 = 0; i02 < ne02; ++i02) {
-                         size_t offs_src0_cur = offs_src0 + i02*nb02;
+                     for (int64_t i02 = 0; i02 < ne12; ++i02) {
+                         size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
                          size_t offs_src1_cur = offs_src1 + i02*nb12;
                          size_t offs_dst_cur = offs_dst + i02*nb2;

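The new index arithmetic broadcasts src0 across src1's third dimension: when ne12 is a whole multiple of ne02 (the grouped-query-attention layout this change prepares for), every ne12/ne02 consecutive src1 slices reuse the same src0 slice, and when ne12 == ne02 the expression reduces to the old i02*nb02, which is why existing models are unaffected ("gqa not used for now"). A worked C sketch of the mapping, assuming ne02 = 2 and ne12 = 6:

    #include <stdio.h>

    int main(void) {
        const long ne02 = 2, ne12 = 6; // six src1 slices share two src0 slices
        for (long i02 = 0; i02 < ne12; ++i02) {
            // slices 0,1,2 map to src0 slice 0; slices 3,4,5 map to src0 slice 1
            printf("src1 slice %ld -> src0 slice %ld\n", i02, i02/(ne12/ne02));
        }
        return 0;
    }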
@@ -772,8 +794,6 @@ void ggml_metal_graph_compute(
                      switch (src0t) {
                          case GGML_TYPE_F16:
                              {
-                                 GGML_ASSERT(ne02 == ne12);
-
                                  nth0 = 64;
                                  nth1 = 1;
                                  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +873,18 @@ void ggml_metal_graph_compute(
                  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                 [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-                 [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-                 [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-                 [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-                 [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-                 [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-                 [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-                 [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-                 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
-                 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
+                 [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                 [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                 [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                 [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                 [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                 [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                 [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                 [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                 [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                 [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+                 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];

                  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                      src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
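The renumbering above keeps the host-side atIndex values in lockstep with the kernel's argument order: inserting ne02 at index 5 and ne12 at index 11 shifts every later argument up by one, and the kernel signature change in the next hunk mirrors it exactly. A mismatch between the setBytes indices and the Metal function's parameter order would silently bind the wrong constants, so the two hunks have to land together.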
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
          device float * dst,
          constant int64_t & ne00,
          constant int64_t & ne01,
+         constant int64_t & ne02,
          constant uint64_t & nb00,
          constant uint64_t & nb01,
          constant uint64_t & nb02,
          constant int64_t & ne10,
          constant int64_t & ne11,
+         constant int64_t & ne12,
          constant uint64_t & nb10,
          constant uint64_t & nb11,
          constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
      const int64_t r1 = tgpig.y;
      const int64_t im = tgpig.z;

-     device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
      device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

      sum[tpitg.x] = 0.0f;
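This is the kernel-side half of the same broadcast mapping sketched after the host loop above: im runs over the ne12 slices of src1, while im/(ne12/ne02) selects the shared src0 slice.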
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
      }
  }

+
  kernel void kernel_alibi_f32(
          device const float * src0,
          device float * dst,