llama_cpp 0.13.0 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -315,6 +315,16 @@
|
|
315
315
|
extern "C" {
|
316
316
|
#endif
|
317
317
|
|
318
|
+
enum ggml_status {
|
319
|
+
GGML_STATUS_ALLOC_FAILED = -2,
|
320
|
+
GGML_STATUS_FAILED = -1,
|
321
|
+
GGML_STATUS_SUCCESS = 0,
|
322
|
+
GGML_STATUS_ABORTED = 1,
|
323
|
+
};
|
324
|
+
|
325
|
+
// get ggml_status name string
|
326
|
+
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
|
327
|
+
|
318
328
|
typedef uint16_t ggml_fp16_t;
|
319
329
|
|
320
330
|
// convert FP16 <-> FP32
|
@@ -327,24 +337,24 @@ extern "C" {
|
|
327
337
|
struct ggml_object;
|
328
338
|
struct ggml_context;
|
329
339
|
|
340
|
+
// NOTE: always add types at the end of the enum to keep backward compatibility
|
330
341
|
enum ggml_type {
|
331
|
-
GGML_TYPE_F32
|
332
|
-
GGML_TYPE_F16
|
333
|
-
GGML_TYPE_Q4_0
|
334
|
-
GGML_TYPE_Q4_1
|
342
|
+
GGML_TYPE_F32 = 0,
|
343
|
+
GGML_TYPE_F16 = 1,
|
344
|
+
GGML_TYPE_Q4_0 = 2,
|
345
|
+
GGML_TYPE_Q4_1 = 3,
|
335
346
|
// GGML_TYPE_Q4_2 = 4, support has been removed
|
336
|
-
// GGML_TYPE_Q4_3
|
337
|
-
GGML_TYPE_Q5_0
|
338
|
-
GGML_TYPE_Q5_1
|
339
|
-
GGML_TYPE_Q8_0
|
340
|
-
GGML_TYPE_Q8_1
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
GGML_TYPE_Q8_K = 15,
|
347
|
+
// GGML_TYPE_Q4_3 = 5, support has been removed
|
348
|
+
GGML_TYPE_Q5_0 = 6,
|
349
|
+
GGML_TYPE_Q5_1 = 7,
|
350
|
+
GGML_TYPE_Q8_0 = 8,
|
351
|
+
GGML_TYPE_Q8_1 = 9,
|
352
|
+
GGML_TYPE_Q2_K = 10,
|
353
|
+
GGML_TYPE_Q3_K = 11,
|
354
|
+
GGML_TYPE_Q4_K = 12,
|
355
|
+
GGML_TYPE_Q5_K = 13,
|
356
|
+
GGML_TYPE_Q6_K = 14,
|
357
|
+
GGML_TYPE_Q8_K = 15,
|
348
358
|
GGML_TYPE_IQ2_XXS = 16,
|
349
359
|
GGML_TYPE_IQ2_XS = 17,
|
350
360
|
GGML_TYPE_IQ3_XXS = 18,
|
@@ -353,9 +363,11 @@ extern "C" {
|
|
353
363
|
GGML_TYPE_IQ3_S = 21,
|
354
364
|
GGML_TYPE_IQ2_S = 22,
|
355
365
|
GGML_TYPE_IQ4_XS = 23,
|
356
|
-
GGML_TYPE_I8,
|
357
|
-
GGML_TYPE_I16,
|
358
|
-
GGML_TYPE_I32,
|
366
|
+
GGML_TYPE_I8 = 24,
|
367
|
+
GGML_TYPE_I16 = 25,
|
368
|
+
GGML_TYPE_I32 = 26,
|
369
|
+
GGML_TYPE_I64 = 27,
|
370
|
+
GGML_TYPE_F64 = 28,
|
359
371
|
GGML_TYPE_COUNT,
|
360
372
|
};
|
361
373
|
|
@@ -373,20 +385,20 @@ extern "C" {
|
|
373
385
|
|
374
386
|
// model file types
|
375
387
|
enum ggml_ftype {
|
376
|
-
GGML_FTYPE_UNKNOWN
|
377
|
-
GGML_FTYPE_ALL_F32
|
378
|
-
GGML_FTYPE_MOSTLY_F16
|
379
|
-
GGML_FTYPE_MOSTLY_Q4_0
|
380
|
-
GGML_FTYPE_MOSTLY_Q4_1
|
388
|
+
GGML_FTYPE_UNKNOWN = -1,
|
389
|
+
GGML_FTYPE_ALL_F32 = 0,
|
390
|
+
GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
391
|
+
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
392
|
+
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
381
393
|
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
382
|
-
GGML_FTYPE_MOSTLY_Q8_0
|
383
|
-
GGML_FTYPE_MOSTLY_Q5_0
|
384
|
-
GGML_FTYPE_MOSTLY_Q5_1
|
385
|
-
GGML_FTYPE_MOSTLY_Q2_K
|
386
|
-
GGML_FTYPE_MOSTLY_Q3_K
|
387
|
-
GGML_FTYPE_MOSTLY_Q4_K
|
388
|
-
GGML_FTYPE_MOSTLY_Q5_K
|
389
|
-
GGML_FTYPE_MOSTLY_Q6_K
|
394
|
+
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
395
|
+
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
396
|
+
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
397
|
+
GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
398
|
+
GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
|
399
|
+
GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
|
400
|
+
GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
|
401
|
+
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
390
402
|
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
391
403
|
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
392
404
|
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
@@ -454,12 +466,16 @@ extern "C" {
|
|
454
466
|
GGML_OP_POOL_2D,
|
455
467
|
GGML_OP_UPSCALE, // nearest interpolate
|
456
468
|
GGML_OP_PAD,
|
469
|
+
GGML_OP_ARANGE,
|
470
|
+
GGML_OP_TIMESTEP_EMBEDDING,
|
457
471
|
GGML_OP_ARGSORT,
|
458
472
|
GGML_OP_LEAKY_RELU,
|
459
473
|
|
460
474
|
GGML_OP_FLASH_ATTN,
|
461
475
|
GGML_OP_FLASH_FF,
|
462
476
|
GGML_OP_FLASH_ATTN_BACK,
|
477
|
+
GGML_OP_SSM_CONV,
|
478
|
+
GGML_OP_SSM_SCAN,
|
463
479
|
GGML_OP_WIN_PART,
|
464
480
|
GGML_OP_WIN_UNPART,
|
465
481
|
GGML_OP_GET_REL_POS,
|
@@ -1661,6 +1677,15 @@ extern "C" {
|
|
1661
1677
|
int p2,
|
1662
1678
|
int p3);
|
1663
1679
|
|
1680
|
+
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
1681
|
+
// timesteps: [N,]
|
1682
|
+
// return: [N, dim]
|
1683
|
+
GGML_API struct ggml_tensor * ggml_timestep_embedding(
|
1684
|
+
struct ggml_context * ctx,
|
1685
|
+
struct ggml_tensor * timesteps,
|
1686
|
+
int dim,
|
1687
|
+
int max_period);
|
1688
|
+
|
1664
1689
|
// sort rows
|
1665
1690
|
enum ggml_sort_order {
|
1666
1691
|
GGML_SORT_ORDER_ASC,
|
@@ -1672,6 +1697,12 @@ extern "C" {
|
|
1672
1697
|
struct ggml_tensor * a,
|
1673
1698
|
enum ggml_sort_order order);
|
1674
1699
|
|
1700
|
+
GGML_API struct ggml_tensor * ggml_arange(
|
1701
|
+
struct ggml_context * ctx,
|
1702
|
+
float start,
|
1703
|
+
float stop,
|
1704
|
+
float step);
|
1705
|
+
|
1675
1706
|
// top k elements per row
|
1676
1707
|
GGML_API struct ggml_tensor * ggml_top_k(
|
1677
1708
|
struct ggml_context * ctx,
|
@@ -1701,6 +1732,23 @@ extern "C" {
|
|
1701
1732
|
struct ggml_tensor * c0,
|
1702
1733
|
struct ggml_tensor * c1);
|
1703
1734
|
|
1735
|
+
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
1736
|
+
struct ggml_context * ctx,
|
1737
|
+
struct ggml_tensor * s,
|
1738
|
+
struct ggml_tensor * x,
|
1739
|
+
struct ggml_tensor * c,
|
1740
|
+
struct ggml_tensor * sq);
|
1741
|
+
|
1742
|
+
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
1743
|
+
struct ggml_context * ctx,
|
1744
|
+
struct ggml_tensor * s,
|
1745
|
+
struct ggml_tensor * x,
|
1746
|
+
struct ggml_tensor * dt,
|
1747
|
+
struct ggml_tensor * A,
|
1748
|
+
struct ggml_tensor * B,
|
1749
|
+
struct ggml_tensor * C,
|
1750
|
+
struct ggml_tensor * sq);
|
1751
|
+
|
1704
1752
|
// partition into non-overlapping windows with padding if needed
|
1705
1753
|
// example:
|
1706
1754
|
// a: 768 64 64 1
|
@@ -1923,12 +1971,11 @@ extern "C" {
|
|
1923
1971
|
|
1924
1972
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
1925
1973
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
1926
|
-
GGML_API struct ggml_cplan ggml_graph_plan
|
1927
|
-
GGML_API
|
1928
|
-
|
1974
|
+
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
1975
|
+
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1929
1976
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
1930
1977
|
// note: the drawback of this API is that you must ensure that the context has enough memory for the work data
|
1931
|
-
GGML_API
|
1978
|
+
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
1932
1979
|
|
1933
1980
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
1934
1981
|
|
@@ -2149,25 +2196,18 @@ extern "C" {
|
|
2149
2196
|
GGML_API void ggml_quantize_init(enum ggml_type type);
|
2150
2197
|
GGML_API void ggml_quantize_free(void);
|
2151
2198
|
|
2152
|
-
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
2153
|
-
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
2154
|
-
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
2155
|
-
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
2156
|
-
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
2157
|
-
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
2158
|
-
|
2159
|
-
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2160
|
-
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2161
|
-
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2162
|
-
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2163
|
-
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2164
|
-
|
2165
2199
|
// some quantization types cannot be used without an importance matrix
|
2166
2200
|
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
2167
2201
|
|
2168
2202
|
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
2169
|
-
GGML_API size_t ggml_quantize_chunk(
|
2170
|
-
|
2203
|
+
GGML_API size_t ggml_quantize_chunk(
|
2204
|
+
enum ggml_type type,
|
2205
|
+
const float * src,
|
2206
|
+
void * dst,
|
2207
|
+
int start,
|
2208
|
+
int nrows,
|
2209
|
+
int n_per_row,
|
2210
|
+
const float * imatrix);
|
2171
2211
|
|
2172
2212
|
//
|
2173
2213
|
// gguf
|