llama_cpp 0.12.7 → 0.14.0
This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -315,6 +315,16 @@
 extern "C" {
 #endif
 
+    enum ggml_status {
+        GGML_STATUS_ALLOC_FAILED = -2,
+        GGML_STATUS_FAILED = -1,
+        GGML_STATUS_SUCCESS = 0,
+        GGML_STATUS_ABORTED = 1,
+    };
+
+    // get ggml_status name string
+    GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+
     typedef uint16_t ggml_fp16_t;
 
     // convert FP16 <-> FP32
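The new `ggml_status` enum standardizes return codes across the compute entry points (see the `ggml_graph_compute` signature changes further down). A minimal sketch of how a caller might surface a failure, assuming only the two declarations added above (the `report_status` helper is hypothetical, not part of the header):

```c
#include <stdio.h>
#include "ggml.h"

// Hypothetical helper: print a human-readable name for a non-success
// status using the new ggml_status_to_string().
static void report_status(enum ggml_status status) {
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "ggml compute failed: %s\n", ggml_status_to_string(status));
    }
}
```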
@@ -350,6 +360,9 @@ extern "C" {
         GGML_TYPE_IQ3_XXS = 18,
         GGML_TYPE_IQ1_S   = 19,
         GGML_TYPE_IQ4_NL  = 20,
+        GGML_TYPE_IQ3_S   = 21,
+        GGML_TYPE_IQ2_S   = 22,
+        GGML_TYPE_IQ4_XS  = 23,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -363,9 +376,9 @@ extern "C" {
     };
 
     enum ggml_backend_type {
-        GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_GPU = 10,
-        GGML_BACKEND_GPU_SPLIT = 20,
+        GGML_BACKEND_TYPE_CPU = 0,
+        GGML_BACKEND_TYPE_GPU = 10,
+        GGML_BACKEND_TYPE_GPU_SPLIT = 20,
     };
 
     // model file types
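This hunk is part of a header-wide rename that embeds each enum's name in its values (`GGML_BACKEND_CPU` → `GGML_BACKEND_TYPE_CPU`); the object, task, sort, and optimizer enums below follow the same pattern. Downstream code written against the old spellings could bridge the change with shims like these (hypothetical macros, not shipped by ggml.h):

```c
// Hypothetical compatibility shims for pre-rename code; each new name
// simply inserts the enum's name into the old constant.
#define GGML_BACKEND_CPU       GGML_BACKEND_TYPE_CPU
#define GGML_BACKEND_GPU       GGML_BACKEND_TYPE_GPU
#define GGML_BACKEND_GPU_SPLIT GGML_BACKEND_TYPE_GPU_SPLIT
```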
@@ -389,6 +402,9 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_S   = 18, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_NL  = 19, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ3_S   = 20, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_S   = 21, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
     };
 
     // available tensor operations:
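The three new file types mirror the `GGML_TYPE_IQ3_S`, `GGML_TYPE_IQ2_S`, and `GGML_TYPE_IQ4_XS` tensor types added above. A sketch of the one-to-one correspondence (the helper function is made up for illustration):

```c
#include "ggml.h"

// Illustrative helper (not part of ggml.h): map each new IQ tensor type
// to its "mostly" file type.
static enum ggml_ftype ftype_for_new_iq_type(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_IQ3_S:  return GGML_FTYPE_MOSTLY_IQ3_S;
        case GGML_TYPE_IQ2_S:  return GGML_FTYPE_MOSTLY_IQ2_S;
        case GGML_TYPE_IQ4_XS: return GGML_FTYPE_MOSTLY_IQ4_XS;
        default:               return GGML_FTYPE_UNKNOWN;
    }
}
```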
@@ -448,6 +464,8 @@ extern "C" {
         GGML_OP_POOL_2D,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
+        GGML_OP_ARANGE,
+        GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
@@ -496,9 +514,9 @@ extern "C" {
     };
 
     enum ggml_object_type {
-        GGML_OBJECT_TENSOR,
-        GGML_OBJECT_GRAPH,
-        GGML_OBJECT_WORK_BUFFER
+        GGML_OBJECT_TYPE_TENSOR,
+        GGML_OBJECT_TYPE_GRAPH,
+        GGML_OBJECT_TYPE_WORK_BUFFER
     };
 
     enum ggml_log_level {
@@ -640,9 +658,9 @@ extern "C" {
     // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
     enum ggml_task_type {
-        GGML_TASK_INIT = 0,
-        GGML_TASK_COMPUTE,
-        GGML_TASK_FINALIZE,
+        GGML_TASK_TYPE_INIT = 0,
+        GGML_TASK_TYPE_COMPUTE,
+        GGML_TASK_TYPE_FINALIZE,
     };
 
     struct ggml_compute_params {
@@ -666,6 +684,16 @@ extern "C" {
         GGML_NUMA_STRATEGY_COUNT
     };
 
+    //
+    // GUID
+    //
+
+    // GUID types
+    typedef uint8_t ggml_guid[16];
+    typedef ggml_guid * ggml_guid_t;
+
+    GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
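The GUID API lets a component (a backend, for instance) carry a fixed 16-byte identity that others can test with `ggml_guid_matches`. A minimal sketch; the bytes and the helper are made up:

```c
#include <stdbool.h>
#include "ggml.h"

// Made-up 16-byte identity; real users would pick their own bytes.
static ggml_guid my_guid = {
    0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
    0x0f, 0xed, 0xcb, 0xa9, 0x87, 0x65, 0x43, 0x21,
};

// ggml_guid_t is a pointer to the 16-byte array, so &my_guid matches it.
static bool is_my_component(ggml_guid_t guid) {
    return ggml_guid_matches(guid, &my_guid);
}
```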
@@ -1645,10 +1673,19 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+    // timesteps: [N,]
+    // return: [N, dim]
+    GGML_API struct ggml_tensor * ggml_timestep_embedding(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * timesteps,
+            int                   dim,
+            int                   max_period);
+
     // sort rows
     enum ggml_sort_order {
-        GGML_SORT_ASC,
-        GGML_SORT_DESC,
+        GGML_SORT_ORDER_ASC,
+        GGML_SORT_ORDER_DESC,
     };
 
     GGML_API struct ggml_tensor * ggml_argsort(
@@ -1656,6 +1693,12 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_sort_order  order);
 
+    GGML_API struct ggml_tensor * ggml_arange(
+            struct ggml_context * ctx,
+            float                 start,
+            float                 stop,
+            float                 step);
+
     // top k elements per row
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,
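Together, the two new ops cover the common diffusion-model pattern: generate a range of timesteps, then embed them sinusoidally. A sketch under assumed sizes (N = 1000 steps, dim = 320; `build_timestep_embedding` is a made-up wrapper):

```c
#include "ggml.h"

// Sketch: build an [N, dim] sinusoidal timestep embedding with the two
// new ops. N, dim, and max_period are illustrative values.
static struct ggml_tensor * build_timestep_embedding(struct ggml_context * ctx) {
    // timesteps 0, 1, ..., 999 -> 1000-element tensor
    struct ggml_tensor * timesteps = ggml_arange(ctx, 0.0f, 1000.0f, 1.0f);
    // [1000, 320] embedding, max_period = 10000 as in the referenced code
    return ggml_timestep_embedding(ctx, timesteps, 320, 10000);
}
```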
@@ -1907,12 +1950,11 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
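Both compute entry points now return `enum ggml_status` (previously `int` and `void`, respectively), so an aborted or failed graph can be handled rather than silently ignored. A minimal caller sketch, assuming a graph `gf` already built in `ctx` and an arbitrary thread count:

```c
#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

// Sketch: branch on the status returned by ggml_graph_compute_with_ctx().
static bool compute_graph(struct ggml_context * ctx, struct ggml_cgraph * gf) {
    enum ggml_status status = ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "compute failed: %s\n", ggml_status_to_string(status));
        return false;
    }
    return true;
}
```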
@@ -1941,8 +1983,8 @@ extern "C" {
 
     // optimization methods
     enum ggml_opt_type {
-        GGML_OPT_ADAM,
-        GGML_OPT_LBFGS,
+        GGML_OPT_TYPE_ADAM,
+        GGML_OPT_TYPE_LBFGS,
     };
 
     // linesearch methods
@@ -1956,12 +1998,12 @@ extern "C" {
 
     // optimization return values
     enum ggml_opt_result {
-        GGML_OPT_OK = 0,
-        GGML_OPT_DID_NOT_CONVERGE,
-        GGML_OPT_NO_CONTEXT,
-        GGML_OPT_INVALID_WOLFE,
-        GGML_OPT_FAIL,
-        GGML_OPT_CANCEL,
+        GGML_OPT_RESULT_OK = 0,
+        GGML_OPT_RESULT_DID_NOT_CONVERGE,
+        GGML_OPT_RESULT_NO_CONTEXT,
+        GGML_OPT_RESULT_INVALID_WOLFE,
+        GGML_OPT_RESULT_FAIL,
+        GGML_OPT_RESULT_CANCEL,
 
         GGML_LINESEARCH_FAIL = -128,
         GGML_LINESEARCH_MINIMUM_STEP,
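The optimizer enums get the same rename treatment. A sketch of the renamed values in use with the existing `ggml_opt_default_params`/`ggml_opt` entry points (the `minimize` wrapper is hypothetical; `f` is assumed to be a scalar loss tensor built in `ctx`):

```c
#include <stdbool.h>
#include "ggml.h"

// Hypothetical wrapper: run ADAM on loss tensor f and report success.
static bool minimize(struct ggml_context * ctx, struct ggml_tensor * f) {
    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
    enum ggml_opt_result result = ggml_opt(ctx, params, f);
    return result == GGML_OPT_RESULT_OK;
}
```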