llama_cpp 0.1.4 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -1,6 +1,13 @@
 #ifndef LLAMA_H
 #define LLAMA_H
 
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -31,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -65,9 +72,12 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx;        // text context
-        int n_gpu_layers; // number of layers to store in VRAM
-        int seed;         // RNG seed, -1 for random
+        int n_ctx;                             // text context
+        int n_batch;                           // prompt processing batch size
+        int n_gpu_layers;                      // number of layers to store in VRAM
+        int main_gpu;                          // the GPU that is used for scratch and small tensors
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        int seed;                              // RNG seed, -1 for random
 
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
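On the Ruby side these new context fields surface as ContextParams accessors (see the sig/llama_cpp.rbs changes below). A minimal sketch of configuring GPU offloading with the 0.2.0 bindings, assuming a no-argument ContextParams.new and purely illustrative values:

require 'llama_cpp'

# Sketch: set the GPU-related fields added to llama_context_params in this release.
params = LLaMACpp::ContextParams.new
params.n_ctx        = 512   # text context
params.n_batch      = 512   # prompt processing batch size
params.n_gpu_layers = 32    # layers to store in VRAM (0 keeps everything on the CPU)
params.main_gpu     = 0     # GPU used for scratch and small tensors
params.seed         = 42    # RNG seed, -1 for random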
@@ -94,9 +104,27 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,    // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10,   // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,   // except 1d tensors
     };
 
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
     LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
@@ -118,14 +146,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-      enum llama_ftype   ftype,
-            int          nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
@@ -173,6 +198,12 @@ extern "C" {
             int   n_past,
             int   n_threads);
 
+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    //       parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.4'
+  VERSION = '0.2.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-4de0334'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -5,6 +5,8 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_MAX_DEVICES: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
   def self?.init_backend: () -> void
-  def self?.model_quantize: (input_path: String, output_path: String,
+  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
@@ -52,6 +63,7 @@ module LLaMACpp
     def embeddings: () -> Array[Float]
     def empty?: () -> bool
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_export: (String) -> bool
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
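The new Context#eval_export wraps llama_eval_export from llama.h, which dumps the hardcoded static computation graph for debugging. A one-line sketch, assuming ctx is an already-loaded LLaMACpp::Context and an illustrative output file name:

# Sketch only: writes the static computation graph to a file for inspection.
ctx.eval_export('llama.ggml')  # returns true on success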
@@ -92,6 +104,13 @@ module LLaMACpp
    def logits_all=: (bool) -> bool
    def n_ctx: () -> Integer
    def n_ctx=: (Integer) -> Integer
+    def n_batch: () -> Integer
+    def n_batch=: (Integer) -> Integer
+    def n_gpu_layers: () -> Integer
+    def n_gpu_layers=: (Integer) -> Integer
+    def main_gpu: () -> Integer
+    def main_gpu=: (Integer) -> Integer
+    def tensor_split: () -> Array[Float]
    def seed: () -> Integer
    def seed=: (Integer) -> Integer
    def use_mlock: () -> bool
@@ -102,6 +121,19 @@ module LLaMACpp
    def vocab_only=: (bool) -> bool
  end
 
+  class ModelQuantizeParams
+    public
+
+    def n_thread: () -> Integer
+    def n_thread=: (Integer) -> Integer
+    def ftype: () -> Integer
+    def ftype=: (Integer) -> Integer
+    def allow_quantization: () -> bool
+    def allow_quantization=: (bool) -> bool
+    def quantize_output_tensor: () -> bool
+    def quantize_output_tensor=: (bool) -> bool
+  end
+
  class Params = ContextParams
 
  class Client
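The new quantization API replaces the old ftype/nthread arguments of model_quantize with a ModelQuantizeParams object, mirroring llama_model_quantize_params in llama.h. A minimal sketch, assuming a no-argument ModelQuantizeParams.new and illustrative input/output paths:

require 'llama_cpp'

# Sketch: re-quantize a model to one of the k-quant formats added in this release.
qparams = LLaMACpp::ModelQuantizeParams.new
qparams.n_thread = 4                                 # <= 0 uses std::thread::hardware_concurrency
qparams.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # quantize to Q4_K (medium)
qparams.quantize_output_tensor = true                # also quantize output.weight

LLaMACpp.model_quantize(
  input_path: 'models/7B/ggml-model-f16.bin',        # illustrative paths
  output_path: 'models/7B/ggml-model-q4_k_m.bin',
  params: qparams
)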
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.2.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-
+date: 2023-06-11 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,11 +26,17 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-metal.h
+- ext/llama_cpp/src/ggml-metal.m
+- ext/llama_cpp/src/ggml-metal.metal
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
+- ext/llama_cpp/src/k_quants.c
+- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h