llama_cpp 0.1.2 → 0.1.3

ext/llama_cpp/src/ggml-opencl.h CHANGED
@@ -1,23 +1,21 @@
  #pragma once
 
+ #include "ggml.h"
+
  #ifdef __cplusplus
  extern "C" {
  #endif
 
  void ggml_cl_init(void);
 
- enum ggml_blas_order {
-     GGML_BLAS_ORDER_ROW_MAJOR = 101,
-     GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
- };
+ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
 
- enum ggml_blas_op {
-     GGML_BLAS_OP_N = 111,
-     GGML_BLAS_OP_T = 112,
-     GGML_BLAS_OP_C = 113,
- };
+ void * ggml_cl_host_malloc(size_t size);
+ void ggml_cl_host_free(void * ptr);
 
- void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+ void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
 
  #ifdef __cplusplus
  }
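
For context, the header replaces the old ggml_cl_sgemm_wrapper entry point with a can/size/run triple, plus host-buffer helpers and a ggml_cl_transform_tensor call that llama.cpp uses further down to keep layer weights on the device. A minimal sketch of how a caller could drive the new interface directly is shown here; it is not part of the gem, it assumes src0, src1 and dst are already-built ggml tensors, and in practice these calls are made by the graph executor (see the ggml.c hunks that follow).

#include "ggml.h"
#include "ggml-opencl.h"
#include <stdlib.h>

// Sketch only: run one matrix multiplication through the OpenCL backend, if it supports it.
static void mul_mat_via_opencl(const struct ggml_tensor * src0,
                               const struct ggml_tensor * src1,
                               struct ggml_tensor * dst) {
    ggml_cl_init(); // one-time OpenCL setup

    if (!ggml_cl_can_mul_mat(src0, src1, dst)) {
        return; // shapes/types not handled here; fall back to the CPU or BLAS path
    }

    // Ask the backend how much scratch memory it needs, then hand the buffer over with its size.
    const size_t wsize = ggml_cl_mul_mat_get_wsize(src0, src1, dst);
    void * wdata = wsize > 0 ? malloc(wsize) : NULL;

    ggml_cl_mul_mat(src0, src1, dst, wdata, wsize);

    free(wdata);
}
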

ext/llama_cpp/src/ggml.c CHANGED
@@ -9431,7 +9431,7 @@ static void ggml_compute_forward_rms_norm_back(
 
  // ggml_compute_forward_mul_mat
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  // helper function to determine if it is better to use BLAS or not
  // for large matrices, BLAS is faster
  static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9472,7 +9472,7 @@ static void ggml_compute_forward_mul_mat_f32(
      const int64_t ne02 = src0->ne[2];
      const int64_t ne03 = src0->ne[3];
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
      const int64_t ne10 = src1->ne[0];
  #endif
      const int64_t ne11 = src1->ne[1];
@@ -9536,9 +9536,16 @@ static void ggml_compute_forward_mul_mat_f32(
          }
          return;
      }
+ #elif defined(GGML_USE_CLBLAST)
+     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+         }
+         return;
+     }
  #endif
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
      if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
          if (params->ith != 0) {
              return;
@@ -9558,21 +9565,11 @@ static void ggml_compute_forward_mul_mat_f32(
              const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
              float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
- #if defined(GGML_USE_CLBLAST)
-             // zT = y * xT
-             ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                     ne11, ne01, ne10,
-                     1.0f, y, ne10,
-                     x, ne10,
-                     0.0f, d, ne01,
-                     GGML_TYPE_F32);
- #else
              cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                      ne11, ne01, ne10,
                      1.0f, y, ne10,
                      x, ne00,
                      0.0f, d, ne01);
- #endif
          }
      }
      //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9711,9 +9708,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
          }
          return;
      }
+ #elif defined(GGML_USE_CLBLAST)
+     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+         }
+         return;
+     }
  #endif
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
      if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
          GGML_ASSERT(nb10 == sizeof(float));
 
@@ -9743,20 +9747,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                  assert(id*sizeof(float) <= params->wsize);
              }
 
- #if defined(GGML_USE_CLBLAST)
-             const float * x = wdata;
-             const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
-             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-             // zT = y * xT
-             ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                     ne11, ne01, ne10,
-                     1.0f, y, ne10,
-                     x, ne10,
-                     0.0f, d, ne01,
-                     GGML_TYPE_F32);
- #else
              const float * x = wdata;
              const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -9768,7 +9758,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                      1.0f, y, ne10,
                      x, ne00,
                      0.0f, d, ne01);
- #endif
          }
      }
 
@@ -9931,9 +9920,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
          }
          return;
      }
+ #elif defined(GGML_USE_CLBLAST)
+     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+         }
+         return;
+     }
  #endif
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
      if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
          if (params->ith != 0) {
              return;
@@ -9956,9 +9952,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
              float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
- #if defined(GGML_USE_CLBLAST)
-             const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
- #else
              {
                  size_t id = 0;
                  for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9970,23 +9963,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
              }
 
              const float * x = wdata;
- #endif
 
- #if defined(GGML_USE_CLBLAST)
-             // zT = y * xT
-             ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                     ne11, ne01, ne10,
-                     1.0f, y, ne10,
-                     x, ne10,
-                     0.0f, d, ne01,
-                     type);
- #else
              cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                      ne11, ne01, ne10,
                      1.0f, y, ne10,
                      x, ne00,
                      0.0f, d, ne01);
- #endif
          }
      }
 
@@ -14165,9 +14147,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                          cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                      }
                      else
+ #elif defined(GGML_USE_CLBLAST)
+                     if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
+                         node->n_tasks = 1; // TODO: this actually is doing nothing
+                                            //       the threads are still spinning
+                         cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
+                     }
+                     else
  #endif
                      if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                          if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                              node->n_tasks = 1; // TODO: this actually is doing nothing
                                                 //       the threads are still spinning
@@ -14181,13 +14170,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  #endif
                      } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                          cur = 0;
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                          if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                              node->n_tasks = 1;
                          }
  #endif
                      } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                          if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                              node->n_tasks = 1;
                              cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
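
Taken together, the planning hunks above and the dispatch added to the three mul_mat kernels mean that existing ggml programs pick up the OpenCL path without source changes: ggml_graph_compute reserves the scratch size reported by ggml_cl_mul_mat_get_wsize, marks the node single-task, and at compute time thread 0 passes that buffer to ggml_cl_mul_mat. A small end-to-end sketch follows; it is a hypothetical test program rather than anything shipped with the gem, and it only exercises the new path when ggml is built with GGML_USE_CLBLAST and the matrices are large enough for ggml_cl_can_mul_mat to accept them.

#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params ip = {
        /* .mem_size   = */ 64u * 1024 * 1024, // room for the tensors, the graph and scratch
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // Two 1024x1024 F32 matrices; large enough that the GPU/BLAS heuristics kick in.
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 1024);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 1024);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);

    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b); // dispatched as in the hunks above

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_compute(ctx, &gf);

    printf("c[0] = %f\n", ggml_get_f32_1d(c, 0)); // 1024 * 1.0 * 2.0 = 2048.0
    ggml_free(ctx);
    return 0;
}
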

ext/llama_cpp/src/ggml.h CHANGED
@@ -249,6 +249,7 @@ extern "C" {
      enum ggml_backend {
          GGML_BACKEND_CPU = 0,
          GGML_BACKEND_CUDA = 1,
+         GGML_BACKEND_CL = 2,
      };
 
      // model file types

ext/llama_cpp/src/llama.cpp CHANGED
@@ -12,6 +12,8 @@
  #include "ggml.h"
  #ifdef GGML_USE_CUBLAS
  #include "ggml-cuda.h"
+ #elif defined(GGML_USE_CLBLAST)
+ #include "ggml-opencl.h"
  #endif
 
  #include <array>
@@ -1092,7 +1094,7 @@ static void llama_model_load_internal(
          fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
      }
      fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
- #else
+ #elif !defined(GGML_USE_CLBLAST)
      (void) n_gpu_layers;
  #endif
  }
@@ -1125,7 +1127,33 @@ static void llama_model_load_internal(
              done_size += lt.size;
          }
      }
- #endif // GGML_USE_CUBLAS
+ #elif defined(GGML_USE_CLBLAST)
+     {
+         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+         fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+         size_t vram_total = 0;
+
+         for (int i = 0; i < n_gpu; ++i) {
+             const auto & layer = model.layers[i];
+
+             ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+             ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+             ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+             ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+             ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+             ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+             ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+         }
+         if (n_gpu_layers > (int) hparams.n_layer) {
+             fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+             ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+         }
+
+         fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+     }
+ #endif
 
      if (progress_callback) {
          progress_callback(1.0f, progress_callback_user_data);

lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.1.2'
+   VERSION = '0.1.3'
 
    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'master-265db98'
+   LLAMA_CPP_VERSION = 'master-66874d4'
  end

metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
-   version: 0.1.2
+   version: 0.1.3
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-05-22 00:00:00.000000000 Z
+ date: 2023-05-26 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -27,7 +27,7 @@ files:
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
  - ext/llama_cpp/src/ggml-cuda.h
- - ext/llama_cpp/src/ggml-opencl.c
+ - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h