llama_cpp 0.12.7 → 0.14.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -315,6 +315,16 @@
|
|
315
315
|
extern "C" {
|
316
316
|
#endif
|
317
317
|
|
318
|
+
enum ggml_status {
|
319
|
+
GGML_STATUS_ALLOC_FAILED = -2,
|
320
|
+
GGML_STATUS_FAILED = -1,
|
321
|
+
GGML_STATUS_SUCCESS = 0,
|
322
|
+
GGML_STATUS_ABORTED = 1,
|
323
|
+
};
|
324
|
+
|
325
|
+
// get ggml_status name string
|
326
|
+
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
|
327
|
+
|
318
328
|
typedef uint16_t ggml_fp16_t;
|
319
329
|
|
320
330
|
// convert FP16 <-> FP32
|
@@ -350,6 +360,9 @@ extern "C" {
|
|
350
360
|
GGML_TYPE_IQ3_XXS = 18,
|
351
361
|
GGML_TYPE_IQ1_S = 19,
|
352
362
|
GGML_TYPE_IQ4_NL = 20,
|
363
|
+
GGML_TYPE_IQ3_S = 21,
|
364
|
+
GGML_TYPE_IQ2_S = 22,
|
365
|
+
GGML_TYPE_IQ4_XS = 23,
|
353
366
|
GGML_TYPE_I8,
|
354
367
|
GGML_TYPE_I16,
|
355
368
|
GGML_TYPE_I32,
|
@@ -363,9 +376,9 @@ extern "C" {
|
|
363
376
|
};
|
364
377
|
|
365
378
|
enum ggml_backend_type {
|
366
|
-
|
367
|
-
|
368
|
-
|
379
|
+
GGML_BACKEND_TYPE_CPU = 0,
|
380
|
+
GGML_BACKEND_TYPE_GPU = 10,
|
381
|
+
GGML_BACKEND_TYPE_GPU_SPLIT = 20,
|
369
382
|
};
|
370
383
|
|
371
384
|
// model file types
|
@@ -389,6 +402,9 @@ extern "C" {
|
|
389
402
|
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
390
403
|
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
391
404
|
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
405
|
+
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
406
|
+
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
407
|
+
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
392
408
|
};
|
393
409
|
|
394
410
|
// available tensor operations:
|
@@ -448,6 +464,8 @@ extern "C" {
|
|
448
464
|
GGML_OP_POOL_2D,
|
449
465
|
GGML_OP_UPSCALE, // nearest interpolate
|
450
466
|
GGML_OP_PAD,
|
467
|
+
GGML_OP_ARANGE,
|
468
|
+
GGML_OP_TIMESTEP_EMBEDDING,
|
451
469
|
GGML_OP_ARGSORT,
|
452
470
|
GGML_OP_LEAKY_RELU,
|
453
471
|
|
@@ -496,9 +514,9 @@ extern "C" {
|
|
496
514
|
};
|
497
515
|
|
498
516
|
enum ggml_object_type {
|
499
|
-
|
500
|
-
|
501
|
-
|
517
|
+
GGML_OBJECT_TYPE_TENSOR,
|
518
|
+
GGML_OBJECT_TYPE_GRAPH,
|
519
|
+
GGML_OBJECT_TYPE_WORK_BUFFER
|
502
520
|
};
|
503
521
|
|
504
522
|
enum ggml_log_level {
|
@@ -640,9 +658,9 @@ extern "C" {
|
|
640
658
|
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
641
659
|
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
|
642
660
|
enum ggml_task_type {
|
643
|
-
|
644
|
-
|
645
|
-
|
661
|
+
GGML_TASK_TYPE_INIT = 0,
|
662
|
+
GGML_TASK_TYPE_COMPUTE,
|
663
|
+
GGML_TASK_TYPE_FINALIZE,
|
646
664
|
};
|
647
665
|
|
648
666
|
struct ggml_compute_params {
|
@@ -666,6 +684,16 @@ extern "C" {
|
|
666
684
|
GGML_NUMA_STRATEGY_COUNT
|
667
685
|
};
|
668
686
|
|
687
|
+
//
|
688
|
+
// GUID
|
689
|
+
//
|
690
|
+
|
691
|
+
// GUID types
|
692
|
+
typedef uint8_t ggml_guid[16];
|
693
|
+
typedef ggml_guid * ggml_guid_t;
|
694
|
+
|
695
|
+
GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
|
696
|
+
|
669
697
|
// misc
|
670
698
|
|
671
699
|
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
@@ -1645,10 +1673,19 @@ extern "C" {
|
|
1645
1673
|
int p2,
|
1646
1674
|
int p3);
|
1647
1675
|
|
1676
|
+
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
1677
|
+
// timesteps: [N,]
|
1678
|
+
// return: [N, dim]
|
1679
|
+
GGML_API struct ggml_tensor * ggml_timestep_embedding(
|
1680
|
+
struct ggml_context * ctx,
|
1681
|
+
struct ggml_tensor * timesteps,
|
1682
|
+
int dim,
|
1683
|
+
int max_period);
|
1684
|
+
|
1648
1685
|
// sort rows
|
1649
1686
|
enum ggml_sort_order {
|
1650
|
-
|
1651
|
-
|
1687
|
+
GGML_SORT_ORDER_ASC,
|
1688
|
+
GGML_SORT_ORDER_DESC,
|
1652
1689
|
};
|
1653
1690
|
|
1654
1691
|
GGML_API struct ggml_tensor * ggml_argsort(
|
@@ -1656,6 +1693,12 @@ extern "C" {
|
|
1656
1693
|
struct ggml_tensor * a,
|
1657
1694
|
enum ggml_sort_order order);
|
1658
1695
|
|
1696
|
+
GGML_API struct ggml_tensor * ggml_arange(
|
1697
|
+
struct ggml_context * ctx,
|
1698
|
+
float start,
|
1699
|
+
float stop,
|
1700
|
+
float step);
|
1701
|
+
|
1659
1702
|
// top k elements per row
|
1660
1703
|
GGML_API struct ggml_tensor * ggml_top_k(
|
1661
1704
|
struct ggml_context * ctx,
|
@@ -1907,12 +1950,11 @@ extern "C" {
|
|
1907
1950
|
|
1908
1951
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
1909
1952
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
1910
|
-
GGML_API struct ggml_cplan ggml_graph_plan
|
1911
|
-
GGML_API
|
1912
|
-
|
1953
|
+
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
1954
|
+
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1913
1955
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
1914
1956
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
1915
|
-
GGML_API
|
1957
|
+
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
1916
1958
|
|
1917
1959
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
1918
1960
|
|
@@ -1941,8 +1983,8 @@ extern "C" {
|
|
1941
1983
|
|
1942
1984
|
// optimization methods
|
1943
1985
|
enum ggml_opt_type {
|
1944
|
-
|
1945
|
-
|
1986
|
+
GGML_OPT_TYPE_ADAM,
|
1987
|
+
GGML_OPT_TYPE_LBFGS,
|
1946
1988
|
};
|
1947
1989
|
|
1948
1990
|
// linesearch methods
|
@@ -1956,12 +1998,12 @@ extern "C" {
|
|
1956
1998
|
|
1957
1999
|
// optimization return values
|
1958
2000
|
enum ggml_opt_result {
|
1959
|
-
|
1960
|
-
|
1961
|
-
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
2001
|
+
GGML_OPT_RESULT_OK = 0,
|
2002
|
+
GGML_OPT_RESULT_DID_NOT_CONVERGE,
|
2003
|
+
GGML_OPT_RESULT_NO_CONTEXT,
|
2004
|
+
GGML_OPT_RESULT_INVALID_WOLFE,
|
2005
|
+
GGML_OPT_RESULT_FAIL,
|
2006
|
+
GGML_OPT_RESULT_CANCEL,
|
1965
2007
|
|
1966
2008
|
GGML_LINESEARCH_FAIL = -128,
|
1967
2009
|
GGML_LINESEARCH_MINIMUM_STEP,
|