llama_cpp 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,13 @@
  #ifndef LLAMA_H
  #define LLAMA_H

+ #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+ #else
+ #define LLAMA_MAX_DEVICES 1
+ #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
  #include <stdbool.h>
@@ -31,7 +38,7 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
  #endif
@@ -65,9 +72,12 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- int n_ctx; // text context
- int n_gpu_layers; // number of layers to store in VRAM
- int seed; // RNG seed, -1 for random
+ int n_ctx; // text context
+ int n_batch; // prompt processing batch size
+ int n_gpu_layers; // number of layers to store in VRAM
+ int main_gpu; // the GPU that is used for scratch and small tensors
+ float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ int seed; // RNG seed, -1 for random

  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -94,9 +104,27 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
  };

+ // model quantization parameters
+ typedef struct llama_model_quantize_params {
+ int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ } llama_model_quantize_params;
+
  LLAMA_API struct llama_context_params llama_context_default_params();
+ LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

  LLAMA_API bool llama_mmap_supported();
  LLAMA_API bool llama_mlock_supported();
@@ -118,14 +146,11 @@ extern "C" {
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);

- // TODO: not great API - very likely to change
  // Returns 0 on success
- // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype,
- int nthread);
+ const llama_model_quantize_params * params);

  // Apply a LoRA adapter to a loaded model
  // path_base_model is the path to a higher quality model to use as a base for
@@ -173,6 +198,12 @@ extern "C" {
  int n_past,
  int n_threads);

+ // Export a static computation graph for context of 511 and batch size of 1
+ // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+ // parameters here to keep things simple
+ // IMPORTANT: do not use for anything else other than debugging and testing!
+ LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
  // Convert the provided text into tokens.
  // The tokens pointer must be large enough to hold the resulting tokens.
  // Returns the number of tokens on success, no more than n_max_tokens
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.4'
+ VERSION = '0.2.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-ffb06a3'
+ LLAMA_CPP_VERSION = 'master-4de0334'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -5,6 +5,8 @@ module LLaMACpp
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String

+ LLAMA_MAX_DEVICES: Integer
+
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+ LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q6_K: Integer

  def self?.init_backend: () -> void
- def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+ def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
@@ -52,6 +63,7 @@ module LLaMACpp
  def embeddings: () -> Array[Float]
  def empty?: () -> bool
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_export: (String) -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def logits: () -> Array[Float]
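
The new Context#eval_export wraps llama_eval_export from the bundled llama.h, which writes a static computation graph for a context of 511 and batch size 1 and is intended only for debugging and testing. A minimal sketch follows; the model path and output filename are placeholders, and the no-argument ContextParams.new / Context.new constructors are assumptions (this diff only shows the empty?/load signatures):

    require 'llama_cpp'

    LLaMACpp.init_backend

    context = LLaMACpp::Context.new   # assumed: creates an empty context
    context.load(model_path: 'path/to/ggml-model-q4_0.bin',
                 params: LLaMACpp::ContextParams.new)   # assumed no-arg constructor

    # Debugging/testing only, per the llama.h comment: exports a static
    # computation graph (context 511, batch size 1) to the given file.
    context.eval_export('llama.ggml')   # => true on success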
@@ -92,6 +104,13 @@ module LLaMACpp
  def logits_all=: (bool) -> bool
  def n_ctx: () -> Integer
  def n_ctx=: (Integer) -> Integer
+ def n_batch: () -> Integer
+ def n_batch=: (Integer) -> Integer
+ def n_gpu_layers: () -> Integer
+ def n_gpu_layers=: (Integer) -> Integer
+ def main_gpu: () -> Integer
+ def main_gpu=: (Integer) -> Integer
+ def tensor_split: () -> Array[Float]
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
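
These accessors expose the new GPU-related fields of llama_context_params (see the llama.h hunk above) through ContextParams; note that tensor_split is only declared as a reader here. A minimal sketch of configuring them is below; the layer count, device index, and model path are illustrative values, and the no-argument ContextParams.new / Context.new constructors are assumptions not shown in this diff:

    require 'llama_cpp'

    LLaMACpp.init_backend

    params = LLaMACpp::ContextParams.new   # assumed no-arg constructor
    params.n_ctx        = 2048             # text context
    params.n_batch      = 512              # prompt processing batch size
    params.n_gpu_layers = 32               # layers to keep in VRAM (illustrative)
    params.main_gpu     = 0                # GPU used for scratch and small tensors
    params.seed         = 42

    p params.tensor_split   # => Array[Float], one entry per device (reader only)

    context = LLaMACpp::Context.new        # assumed: creates an empty context
    context.load(model_path: 'path/to/ggml-model-q4_k_m.bin', params: params)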
@@ -102,6 +121,19 @@ module LLaMACpp
  def vocab_only=: (bool) -> bool
  end

+ class ModelQuantizeParams
+ public
+
+ def n_thread: () -> Integer
+ def n_thread=: (Integer) -> Integer
+ def ftype: () -> Integer
+ def ftype=: (Integer) -> Integer
+ def allow_quantization: () -> bool
+ def allow_quantization=: (bool) -> bool
+ def quantize_output_tensor: () -> bool
+ def quantize_output_tensor=: (bool) -> bool
+ end
+
  class Params = ContextParams

  class Client
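
ModelQuantizeParams carries the same options as llama_model_quantize_params in llama.h; the C field nthread is surfaced here as n_thread, and allow_requantize as allow_quantization. A sketch of quantizing a model to one of the new k-quant formats via the changed model_quantize signature follows; the file paths are placeholders and ModelQuantizeParams.new is assumed to take no arguments:

    require 'llama_cpp'

    qparams = LLaMACpp::ModelQuantizeParams.new   # assumed no-arg constructor
    qparams.n_thread = 4                          # <= 0 falls back to hardware_concurrency()
    qparams.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M   # one of the new k-quant types
    qparams.quantize_output_tensor = true         # also quantize output.weight
    qparams.allow_quantization = false            # only quantize f32/f16 tensors
                                                  # (maps to allow_requantize in llama.h)

    LLaMACpp.model_quantize(
      input_path:  'path/to/ggml-model-f16.bin',
      output_path: 'path/to/ggml-model-q4_k_m.bin',
      params:      qparams
    )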
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.4
+ version: 0.2.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-03 00:00:00.000000000 Z
+ date: 2023-06-11 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -26,11 +26,17 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
+ - ext/llama_cpp/src/ggml-metal.h
+ - ext/llama_cpp/src/ggml-metal.m
+ - ext/llama_cpp/src/ggml-metal.metal
  - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/k_quants.c
+ - ext/llama_cpp/src/k_quants.h
  - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h