llama_cpp 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,23 +1,21 @@
  #pragma once

+ #include "ggml.h"
+
  #ifdef __cplusplus
  extern "C" {
  #endif

  void ggml_cl_init(void);

- enum ggml_blas_order {
- GGML_BLAS_ORDER_ROW_MAJOR = 101,
- GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
- };
+ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

- enum ggml_blas_op {
- GGML_BLAS_OP_N = 111,
- GGML_BLAS_OP_T = 112,
- GGML_BLAS_OP_C = 113,
- };
+ void * ggml_cl_host_malloc(size_t size);
+ void ggml_cl_host_free(void * ptr);

- void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+ void ggml_cl_transform_tensor(struct ggml_tensor * tensor);

  #ifdef __cplusplus
  }
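The header now exposes a small OpenCL matrix-multiplication interface in place of the old BLAS-style sgemm wrapper. Below is a minimal sketch of how a caller might drive it; it mirrors the GGML_USE_CLBLAST branches added to ggml.c later in this diff (check the operation, query the work size, run the multiplication). The helper name try_cl_mul_mat is illustrative only, ggml_cl_init() is assumed to have been called once beforehand, and using ggml_cl_host_malloc for the scratch buffer is an assumption for the sketch — inside ggml.c the buffer comes from the graph's work data (params->wdata / params->wsize).

    #include <stdbool.h>
    #include <stddef.h>
    #include "ggml.h"
    #include "ggml-opencl.h"

    // Illustrative helper (not part of this diff): run one matrix multiplication
    // through the OpenCL backend when it supports the operand types/shapes,
    // otherwise report failure so the caller can fall back to the CPU/BLAS path.
    static bool try_cl_mul_mat(const struct ggml_tensor * src0,
                               const struct ggml_tensor * src1,
                               struct ggml_tensor * dst) {
        if (!ggml_cl_can_mul_mat(src0, src1, dst)) {
            return false; // unsupported: use the regular ggml path instead
        }

        // Ask the backend how much scratch memory this multiplication needs.
        const size_t wsize = ggml_cl_mul_mat_get_wsize(src0, src1, dst);

        // Assumption: ggml_cl_host_malloc returns host memory suitable for
        // device transfers; ggml.c passes its own work buffer here instead.
        void * wdata = wsize > 0 ? ggml_cl_host_malloc(wsize) : NULL;
        if (wsize > 0 && wdata == NULL) {
            return false;
        }

        ggml_cl_mul_mat(src0, src1, dst, wdata, wsize);

        if (wdata != NULL) {
            ggml_cl_host_free(wdata);
        }
        return true;
    }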
@@ -9431,7 +9431,7 @@ static void ggml_compute_forward_rms_norm_back(

  // ggml_compute_forward_mul_mat

- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  // helper function to determine if it is better to use BLAS or not
  // for large matrices, BLAS is faster
  static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9472,7 +9472,7 @@ static void ggml_compute_forward_mul_mat_f32(
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];

- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  const int64_t ne10 = src1->ne[0];
  #endif
  const int64_t ne11 = src1->ne[1];
@@ -9536,9 +9536,16 @@ static void ggml_compute_forward_mul_mat_f32(
  }
  return;
  }
+ #elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
  #endif

- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
  if (params->ith != 0) {
  return;
@@ -9558,21 +9565,11 @@ static void ggml_compute_forward_mul_mat_f32(
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

- #if defined(GGML_USE_CLBLAST)
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- GGML_TYPE_F32);
- #else
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
  ne11, ne01, ne10,
  1.0f, y, ne10,
  x, ne00,
  0.0f, d, ne01);
- #endif
  }
  }
  //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9711,9 +9708,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
  }
  return;
  }
+ #elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
  #endif

- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
  GGML_ASSERT(nb10 == sizeof(float));

@@ -9743,20 +9747,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
  assert(id*sizeof(float) <= params->wsize);
  }

- #if defined(GGML_USE_CLBLAST)
- const float * x = wdata;
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- GGML_TYPE_F32);
- #else
  const float * x = wdata;
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);

@@ -9768,7 +9758,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
  1.0f, y, ne10,
  x, ne00,
  0.0f, d, ne01);
- #endif
  }
  }

@@ -9931,9 +9920,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
  }
  return;
  }
+ #elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
  #endif

- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
  if (params->ith != 0) {
  return;
@@ -9956,9 +9952,6 @@ static void ggml_compute_forward_mul_mat_q_f32(

  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

- #if defined(GGML_USE_CLBLAST)
- const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
- #else
  {
  size_t id = 0;
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9970,23 +9963,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
  }

  const float * x = wdata;
- #endif

- #if defined(GGML_USE_CLBLAST)
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- type);
- #else
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
  ne11, ne01, ne10,
  1.0f, y, ne10,
  x, ne00,
  0.0f, d, ne01);
- #endif
  }
  }

@@ -14165,9 +14147,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
  }
  else
+ #elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
+ node->n_tasks = 1; // TODO: this actually is doing nothing
+ // the threads are still spinning
+ cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
+ }
+ else
  #endif
  if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
  node->n_tasks = 1; // TODO: this actually is doing nothing
  // the threads are still spinning
@@ -14181,13 +14170,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  #endif
  } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
  cur = 0;
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
  node->n_tasks = 1;
  }
  #endif
  } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
  node->n_tasks = 1;
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -249,6 +249,7 @@ extern "C" {
  enum ggml_backend {
  GGML_BACKEND_CPU = 0,
  GGML_BACKEND_CUDA = 1,
+ GGML_BACKEND_CL = 2,
  };

  // model file types
@@ -12,6 +12,8 @@
  #include "ggml.h"
  #ifdef GGML_USE_CUBLAS
  #include "ggml-cuda.h"
+ #elif defined(GGML_USE_CLBLAST)
+ #include "ggml-opencl.h"
  #endif

  #include <array>
@@ -1092,7 +1094,7 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
  }
  fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
- #else
+ #elif !defined(GGML_USE_CLBLAST)
  (void) n_gpu_layers;
  #endif
  }
@@ -1125,7 +1127,33 @@ static void llama_model_load_internal(
  done_size += lt.size;
  }
  }
- #endif // GGML_USE_CUBLAS
+ #elif defined(GGML_USE_CLBLAST)
+ {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+ size_t vram_total = 0;
+
+ for (int i = 0; i < n_gpu; ++i) {
+ const auto & layer = model.layers[i];
+
+ ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ }
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+ ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ }
+
+ fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+ }
+ #endif

  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.2'
+ VERSION = '0.1.3'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-265db98'
+ LLAMA_CPP_VERSION = 'master-66874d4'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.2
+ version: 0.1.3
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-05-22 00:00:00.000000000 Z
+ date: 2023-05-26 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -27,7 +27,7 @@ files:
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
  - ext/llama_cpp/src/ggml-cuda.h
- - ext/llama_cpp/src/ggml-opencl.c
+ - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h