llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
```diff
@@ -315,6 +315,16 @@
 extern "C" {
 #endif
 
+    enum ggml_status {
+        GGML_STATUS_ALLOC_FAILED = -2,
+        GGML_STATUS_FAILED = -1,
+        GGML_STATUS_SUCCESS = 0,
+        GGML_STATUS_ABORTED = 1,
+    };
+
+    // get ggml_status name string
+    GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+
     typedef uint16_t ggml_fp16_t;
 
     // convert FP16 <-> FP32
```
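The new `ggml_status` enum replaces the bare `int`/`void` returns on the graph-compute entry points later in this header. A minimal sketch of how a caller might surface failures, assuming the `ggml.h` from this release is on the include path (the helper name is illustrative):

```c
#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

// Illustrative helper: map a compute result to a log line.
static bool check_status(enum ggml_status status) {
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "ggml: compute failed: %s\n", ggml_status_to_string(status));
        return false;
    }
    return true;
}
```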
```diff
@@ -327,24 +337,24 @@ extern "C" {
     struct ggml_object;
     struct ggml_context;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
-        GGML_TYPE_F32,
-        GGML_TYPE_F16,
-        GGML_TYPE_Q4_0,
-        GGML_TYPE_Q4_1,
+        GGML_TYPE_F32 = 0,
+        GGML_TYPE_F16 = 1,
+        GGML_TYPE_Q4_0 = 2,
+        GGML_TYPE_Q4_1 = 3,
         // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3
-        GGML_TYPE_Q5_0,
-        GGML_TYPE_Q5_1,
-        GGML_TYPE_Q8_0,
-        GGML_TYPE_Q8_1,
-        // k-quantizations
-        GGML_TYPE_Q2_K,
-        GGML_TYPE_Q3_K,
-        GGML_TYPE_Q4_K,
-        GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K,
-        GGML_TYPE_Q8_K = 15,
+        // GGML_TYPE_Q4_3 = 5, support has been removed
+        GGML_TYPE_Q5_0 = 6,
+        GGML_TYPE_Q5_1 = 7,
+        GGML_TYPE_Q8_0 = 8,
+        GGML_TYPE_Q8_1 = 9,
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS = 17,
         GGML_TYPE_IQ3_XXS = 18,
```
```diff
@@ -353,9 +363,11 @@ extern "C" {
         GGML_TYPE_IQ3_S = 21,
         GGML_TYPE_IQ2_S = 22,
         GGML_TYPE_IQ4_XS = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
+        GGML_TYPE_I8 = 24,
+        GGML_TYPE_I16 = 25,
+        GGML_TYPE_I32 = 26,
+        GGML_TYPE_I64 = 27,
+        GGML_TYPE_F64 = 28,
         GGML_TYPE_COUNT,
     };
 
```
```diff
@@ -373,20 +385,20 @@ extern "C" {
 
     // model file types
     enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN,
-        GGML_FTYPE_ALL_F32,
-        GGML_FTYPE_MOSTLY_F16,
-        GGML_FTYPE_MOSTLY_Q4_0,
-        GGML_FTYPE_MOSTLY_Q4_1,
+        GGML_FTYPE_UNKNOWN = -1,
+        GGML_FTYPE_ALL_F32 = 0,
+        GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q8_0,
-        GGML_FTYPE_MOSTLY_Q5_0,
-        GGML_FTYPE_MOSTLY_Q5_1,
-        GGML_FTYPE_MOSTLY_Q2_K,
-        GGML_FTYPE_MOSTLY_Q3_K,
-        GGML_FTYPE_MOSTLY_Q4_K,
-        GGML_FTYPE_MOSTLY_Q5_K,
-        GGML_FTYPE_MOSTLY_Q6_K,
+        GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
```
```diff
@@ -454,12 +466,16 @@ extern "C" {
         GGML_OP_POOL_2D,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
+        GGML_OP_ARANGE,
+        GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
+        GGML_OP_SSM_CONV,
+        GGML_OP_SSM_SCAN,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
         GGML_OP_GET_REL_POS,
```
```diff
@@ -1661,6 +1677,15 @@ extern "C" {
             int p2,
             int p3);
 
+    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+    // timesteps: [N,]
+    // return: [N, dim]
+    GGML_API struct ggml_tensor * ggml_timestep_embedding(
+            struct ggml_context * ctx,
+            struct ggml_tensor * timesteps,
+            int dim,
+            int max_period);
+
     // sort rows
     enum ggml_sort_order {
         GGML_SORT_ORDER_ASC,
```
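As the header comment notes, this mirrors the sinusoidal timestep embedding from Stable Diffusion's diffusion utilities. A sketch of the call, assuming an initialized `ggml_context` named `ctx`; the `dim`/`max_period` values below are just the common Stable Diffusion defaults, not requirements:

```c
// timesteps: one float per diffusion step, shape [N,]
struct ggml_tensor * timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
// result: shape [N, 320] sinusoidal embeddings
struct ggml_tensor * emb = ggml_timestep_embedding(ctx, timesteps, 320, 10000);
```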
```diff
@@ -1672,6 +1697,12 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_sort_order order);
 
+    GGML_API struct ggml_tensor * ggml_arange(
+            struct ggml_context * ctx,
+            float start,
+            float stop,
+            float step);
+
     // top k elements per row
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,
```
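`ggml_arange` behaves like the usual half-open arange: it yields `start`, `start + step`, and so on, up to but excluding `stop`. A one-line sketch, assuming an initialized `ggml_context` named `ctx`:

```c
// 10 elements: 0.0, 0.5, 1.0, ..., 4.5 (stop is excluded)
struct ggml_tensor * ramp = ggml_arange(ctx, 0.0f, 5.0f, 0.5f);
```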
```diff
@@ -1701,6 +1732,23 @@ extern "C" {
             struct ggml_tensor * c0,
             struct ggml_tensor * c1);
 
+    GGML_API struct ggml_tensor * ggml_ssm_conv(
+            struct ggml_context * ctx,
+            struct ggml_tensor * s,
+            struct ggml_tensor * x,
+            struct ggml_tensor * c,
+            struct ggml_tensor * sq);
+
+    GGML_API struct ggml_tensor * ggml_ssm_scan(
+            struct ggml_context * ctx,
+            struct ggml_tensor * s,
+            struct ggml_tensor * x,
+            struct ggml_tensor * dt,
+            struct ggml_tensor * A,
+            struct ggml_tensor * B,
+            struct ggml_tensor * C,
+            struct ggml_tensor * sq);
+
     // partition into non-overlapping windows with padding if needed
     // example:
     // a: 768 64 64 1
```
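These two ops back the Mamba-style selective state-space layers added in this llama.cpp sync; the parameter names follow the SSM formulation (state `s`, input `x`, step size `dt`, and the `A`/`B`/`C` matrices). The header does not document tensor shapes, so the sketch below only shows the wiring, with every operand assumed to be prepared by the caller according to the Mamba implementation in `llama.cpp`:

```c
// Sketch only: one selective-scan step over pre-built operands.
// Operand shapes must match the layout expected by llama.cpp's Mamba code.
struct ggml_tensor * y = ggml_ssm_scan(ctx, s, x, dt, A, B, C, sq);
```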
```diff
@@ -1923,12 +1971,11 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads);
-    GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
```
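With the status return, the plan/compute flow looks like this. A minimal sketch, assuming the caller already built `graph` in an initialized context; the function name is illustrative:

```c
#include <stdlib.h>
#include "ggml.h"

// Sketch: plan, allocate the work buffer, compute, return the status.
static enum ggml_status compute_graph(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size); // per the header: caller allocates work_data
    }
    enum ggml_status status = ggml_graph_compute(graph, &plan);
    if (plan.work_size > 0) {
        free(plan.work_data);
    }
    return status;
}
```

Callers that previously tested the `int` result of `ggml_graph_compute()` against 0 should now compare against `GGML_STATUS_SUCCESS`.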
```diff
@@ -2149,25 +2196,18 @@ extern "C" {
     GGML_API void ggml_quantize_init(enum ggml_type type);
     GGML_API void ggml_quantize_free(void);
 
-    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
-    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-
     // some quantization type cannot be used without an importance matrix
     GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
 
     // calls ggml_quantize_init internally (i.e. can allocate memory)
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
-            int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+    GGML_API size_t ggml_quantize_chunk(
+            enum ggml_type type,
+            const float * src,
+            void * dst,
+            int start,
+            int nrows,
+            int n_per_row,
+            const float * imatrix);
 
     //
     // gguf
```
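The per-type `ggml_quantize_q*` helpers and the `int64_t` histogram argument are gone; `ggml_quantize_chunk` is now the single entry point, taking the row geometry plus an optional importance matrix. A migration sketch for the common case, with a hypothetical wrapper name; Q4_0 does not require an imatrix, so `NULL` is passed:

```c
#include "ggml.h"

// Sketch: replacement for the removed ggml_quantize_q4_0().
// src holds nrows * n_per_row floats; dst must be large enough for the
// quantized rows. Returns the number of bytes written to dst.
static size_t quantize_rows_q4_0(const float * src, void * dst, int nrows, int n_per_row) {
    return ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst,
                               /*start=*/0, nrows, n_per_row, /*imatrix=*/NULL);
}
```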