llama_cpp 0.3.5 → 0.3.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
27
27
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
28
28
  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
29
29
  void ggml_cuda_set_main_device(int main_device);
30
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
30
31
  void ggml_cuda_set_scratch_size(size_t scratch_size);
31
32
  void ggml_cuda_free_scratch(void);
32
33
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
@@ -7,6 +7,11 @@
7
7
  #import <Metal/Metal.h>
8
8
  #import <MetalPerformanceShaders/MetalPerformanceShaders.h>
9
9
 
10
+ #undef MIN
11
+ #undef MAX
12
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
13
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
+
10
15
  #ifdef GGML_METAL_NDEBUG
11
16
  #define metal_printf(...)
12
17
  #else
@@ -15,6 +20,8 @@
15
20
 
16
21
  #define UNUSED(x) (void)(x)
17
22
 
23
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
24
+
18
25
  struct ggml_metal_buffer {
19
26
  const char * name;
20
27
 
@@ -36,7 +43,7 @@ struct ggml_metal_context {
36
43
  int n_buffers;
37
44
  struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
38
45
 
39
- int concur_list[GGML_MAX_NODES];
46
+ int concur_list[GGML_MAX_CONCUR];
40
47
  int concur_list_len;
41
48
 
42
49
  // custom kernels
@@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency(
370
377
  struct ggml_metal_context * ctx,
371
378
  struct ggml_cgraph * gf) {
372
379
  int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
373
- int nodes_unused[GGML_MAX_NODES];
380
+ int nodes_unused[GGML_MAX_CONCUR];
374
381
 
375
- for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
376
- for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
382
+ for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
383
+ for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; }
377
384
  ctx->concur_list_len = 0;
378
385
 
379
- int n_left = gf->n_nodes;
380
- int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
381
- int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
386
+ int n_left = gf->n_nodes;
387
+ int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
388
+ int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
382
389
 
383
390
  while (n_left > 0) {
384
391
  // number of nodes at a layer (that can be issued concurrently)
@@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency(
386
393
  for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
387
394
  if (nodes_unused[i]) {
388
395
  // if the requirements for gf->nodes[i] are satisfied
389
- int exe_flag=1;
396
+ int exe_flag = 1;
397
+
390
398
  // scan all srcs
391
399
  for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
392
400
  struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
393
401
  if (src_cur) {
394
402
  // if is leaf nodes it's satisfied.
395
- if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
403
+ // TODO: ggml_is_leaf()
404
+ if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
405
+ continue;
406
+ }
396
407
 
397
408
  // otherwise this src should be the output from previous nodes.
398
409
  int is_found = 0;
410
+
399
411
  // scan 2*search_depth back because we inserted barrier.
400
- for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
401
- if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
412
+ //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
413
+ for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
414
+ if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
415
+ is_found = 1;
416
+ break;
417
+ }
418
+ }
419
+ if (is_found == 0) {
420
+ exe_flag = 0;
421
+ break;
402
422
  }
403
- if (is_found == 0) {exe_flag = 0; break;}
404
423
  }
405
424
  }
406
425
  if (exe_flag) {
407
426
  // check if nodes[i]'s data will be overwritten by a node before nodes[i].
408
427
  // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
409
428
  int64_t data_start = (int64_t) gf->nodes[i]->data;
410
- int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
429
+ int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
411
430
  for (int j = n_start; j < i; j++) {
412
431
  if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
413
432
  && gf->nodes[j]->op != GGML_OP_VIEW \
@@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency(
416
435
  if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
417
436
  ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
418
437
  continue;
419
- } else {
420
- exe_flag = 0;
421
438
  }
439
+
440
+ exe_flag = 0;
422
441
  }
423
442
  }
424
443
  }
@@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency(
435
454
  ctx->concur_list[level_pos + concurrency] = -1;
436
455
  ctx->concur_list_len++;
437
456
  // jump all sorted nodes at nodes_bak
438
- while (!nodes_unused[n_start]) {n_start++;}
457
+ while (!nodes_unused[n_start]) {
458
+ n_start++;
459
+ }
439
460
  level_pos += concurrency + 1;
440
461
  }
441
462
 
442
- if (ctx->concur_list_len > GGML_MAX_NODES) {
463
+ if (ctx->concur_list_len > GGML_MAX_CONCUR) {
443
464
  fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
444
465
  }
445
466
  }
@@ -453,7 +474,7 @@ void ggml_metal_graph_compute(
453
474
  // else fallback to serial dispatch
454
475
  MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
455
476
 
456
- const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
477
+ const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
457
478
 
458
479
  const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
459
480
  edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
@@ -718,7 +739,8 @@ void ggml_metal_graph_compute(
718
739
  // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
719
740
 
720
741
  GGML_ASSERT(ne00 == ne10);
721
- GGML_ASSERT(ne02 == ne12);
742
+ // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
743
+ GGML_ASSERT(ne03 == ne13);
722
744
 
723
745
  if (ggml_is_contiguous(src0) &&
724
746
  ggml_is_contiguous(src1) &&
@@ -746,11 +768,11 @@ void ggml_metal_graph_compute(
746
768
  initWithDevice:ctx->device transposeLeft:false transposeRight:true
747
769
  resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
748
770
 
749
- // we need to do ne02 multiplications
771
+ // we need to do ne12 multiplications
750
772
  // TODO: is there a way to do this in parallel - currently very slow ..
751
773
  // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
752
- for (int64_t i02 = 0; i02 < ne02; ++i02) {
753
- size_t offs_src0_cur = offs_src0 + i02*nb02;
774
+ for (int64_t i02 = 0; i02 < ne12; ++i02) {
775
+ size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
754
776
  size_t offs_src1_cur = offs_src1 + i02*nb12;
755
777
  size_t offs_dst_cur = offs_dst + i02*nb2;
756
778
 
@@ -772,8 +794,6 @@ void ggml_metal_graph_compute(
772
794
  switch (src0t) {
773
795
  case GGML_TYPE_F16:
774
796
  {
775
- GGML_ASSERT(ne02 == ne12);
776
-
777
797
  nth0 = 64;
778
798
  nth1 = 1;
779
799
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +873,18 @@ void ggml_metal_graph_compute(
853
873
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
854
874
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
855
875
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
856
- [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
857
- [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
858
- [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
859
- [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
860
- [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
861
- [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
862
- [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
863
- [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
864
- [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
865
- [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
876
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
877
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
878
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
879
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
880
+ [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
881
+ [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
882
+ [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
883
+ [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
884
+ [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
885
+ [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
886
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
887
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
866
888
 
867
889
  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
868
890
  src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
509
509
  device float * dst,
510
510
  constant int64_t & ne00,
511
511
  constant int64_t & ne01,
512
+ constant int64_t & ne02,
512
513
  constant uint64_t & nb00,
513
514
  constant uint64_t & nb01,
514
515
  constant uint64_t & nb02,
515
516
  constant int64_t & ne10,
516
517
  constant int64_t & ne11,
518
+ constant int64_t & ne12,
517
519
  constant uint64_t & nb10,
518
520
  constant uint64_t & nb11,
519
521
  constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
529
531
  const int64_t r1 = tgpig.y;
530
532
  const int64_t im = tgpig.z;
531
533
 
532
- device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
534
+ device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
533
535
  device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
534
536
 
535
537
  sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
552
554
  }
553
555
  }
554
556
 
557
+
555
558
  kernel void kernel_alibi_f32(
556
559
  device const float * src0,
557
560
  device float * dst,