llama_cpp 0.1.4 → 0.2.0

ext/llama_cpp/src/llama.h CHANGED
@@ -1,6 +1,13 @@
 #ifndef LLAMA_H
 #define LLAMA_H
 
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -31,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -65,9 +72,12 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx;        // text context
-        int n_gpu_layers; // number of layers to store in VRAM
-        int seed;         // RNG seed, -1 for random
+        int n_ctx;                             // text context
+        int n_batch;                           // prompt processing batch size
+        int n_gpu_layers;                      // number of layers to store in VRAM
+        int main_gpu;                          // the GPU that is used for scratch and small tensors
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        int seed;                              // RNG seed, -1 for random
 
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
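The Ruby bindings expose these new fields as accessors on LLaMACpp::ContextParams (see the updated data/sig/llama_cpp.rbs below). A minimal sketch of configuring them, assuming ContextParams.new fills in the library defaults; the concrete values are illustrative only:

    params = LLaMACpp::ContextParams.new
    params.n_ctx = 2048       # text context size
    params.n_batch = 512      # prompt processing batch size
    params.n_gpu_layers = 32  # layers to store in VRAM (only meaningful on GPU-enabled builds)
    params.main_gpu = 0       # GPU used for scratch and small tensors
    params.seed = 42          # RNG seed, -1 for random
    # tensor_split has only a reader in the signatures below; it should report how
    # layers are split across devices, with LLAMA_MAX_DEVICES entries.
    p params.tensor_split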
@@ -94,9 +104,27 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0   = 7,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0   = 8,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1   = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K   = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K   = 18, // except 1d tensors
     };
 
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
     LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
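The bindings mirror this struct as LLaMACpp::ModelQuantizeParams and add the new k-quant ftype constants (both appear in the .rbs diff below). A hedged sketch of building a parameter set for one of the k-quant formats, assuming ModelQuantizeParams.new starts from llama_model_quantize_default_params:

    qparams = LLaMACpp::ModelQuantizeParams.new
    qparams.n_thread = 4                                 # <= 0 falls back to std::thread::hardware_concurrency()
    qparams.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # quantize to Q4_K (medium), one of the new k-quant types
    qparams.quantize_output_tensor = true                # also quantize output.weight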
@@ -118,14 +146,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype,
-            int nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
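The module-level LLaMACpp.model_quantize changes shape accordingly: the ftype and n_threads arguments are replaced by a single params object (see the .rbs diff below). A sketch using the qparams built above; the file paths are hypothetical:

    LLaMACpp.model_quantize(input_path: 'models/7B/ggml-model-f16.bin',     # illustrative paths
                            output_path: 'models/7B/ggml-model-q4_k_m.bin',
                            params: qparams)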
@@ -173,6 +198,12 @@ extern "C" {
             int n_past,
             int n_threads);
 
+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    //       parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
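In the bindings this surfaces as LLaMACpp::Context#eval_export (see the .rbs diff below). A sketch, assuming context is an already-loaded LLaMACpp::Context and the output filename is illustrative:

    # Writes the static computation graph (context 511, batch size 1) to the given file.
    # Debugging/demonstration use only, per the note in llama.h.
    context.eval_export('llama.ggml')  # returns a bool, per the .rbs signature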
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.4'
+  VERSION = '0.2.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-ffb06a3'
+  LLAMA_CPP_VERSION = 'master-4de0334'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -5,6 +5,8 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_MAX_DEVICES: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
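The new constant mirrors the LLAMA_MAX_DEVICES macro from llama.h, so its value depends on how the bundled llama.cpp was built: GGML_CUDA_MAX_DEVICES under cuBLAS, otherwise 1. A quick check, with the expected value inferred from the macro rather than measured:

    puts LLaMACpp::LLAMA_MAX_DEVICES  # 1 on non-cuBLAS builds, per the #ifdef in llama.h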
@@ -13,9 +15,18 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
   def self?.init_backend: () -> void
-  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
@@ -52,6 +63,7 @@ module LLaMACpp
     def embeddings: () -> Array[Float]
     def empty?: () -> bool
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_export: (String) -> bool
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -92,6 +104,13 @@ module LLaMACpp
     def logits_all=: (bool) -> bool
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
+    def n_batch: () -> Integer
+    def n_batch=: (Integer) -> Integer
+    def n_gpu_layers: () -> Integer
+    def n_gpu_layers=: (Integer) -> Integer
+    def main_gpu: () -> Integer
+    def main_gpu=: (Integer) -> Integer
+    def tensor_split: () -> Array[Float]
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
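These accessors mirror the new llama_context_params fields; note that tensor_split only gets a reader. A short sketch of inspecting the defaults, assuming ContextParams.new picks them up from llama_context_default_params:

    params = LLaMACpp::ContextParams.new
    p params.n_batch       # default prompt processing batch size
    p params.n_gpu_layers
    p params.main_gpu
    p params.tensor_split  # Array[Float], presumably LLAMA_MAX_DEVICES entries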
@@ -102,6 +121,19 @@ module LLaMACpp
     def vocab_only=: (bool) -> bool
   end
 
+  class ModelQuantizeParams
+    public
+
+    def n_thread: () -> Integer
+    def n_thread=: (Integer) -> Integer
+    def ftype: () -> Integer
+    def ftype=: (Integer) -> Integer
+    def allow_quantization: () -> bool
+    def allow_quantization=: (bool) -> bool
+    def quantize_output_tensor: () -> bool
+    def quantize_output_tensor=: (bool) -> bool
+  end
+
   class Params = ContextParams
 
   class Client
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.2.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-03 00:00:00.000000000 Z
+date: 2023-06-11 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,11 +26,17 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-metal.h
+- ext/llama_cpp/src/ggml-metal.m
+- ext/llama_cpp/src/ggml-metal.metal
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
+- ext/llama_cpp/src/k_quants.c
+- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h