llama_cpp 0.12.2 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -353,6 +353,7 @@ extern "C" {
353
353
  GGML_TYPE_Q8_K = 15,
354
354
  GGML_TYPE_IQ2_XXS = 16,
355
355
  GGML_TYPE_IQ2_XS = 17,
356
+ GGML_TYPE_IQ3_XXS = 18,
356
357
  GGML_TYPE_I8,
357
358
  GGML_TYPE_I16,
358
359
  GGML_TYPE_I32,
@@ -389,6 +390,7 @@ extern "C" {
389
390
  GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
390
391
  GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
391
392
  GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
393
+ GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
392
394
  };
393
395
 
394
396
  // available tensor operations:
@@ -489,6 +491,8 @@ extern "C" {
489
491
  GGML_UNARY_OP_GELU,
490
492
  GGML_UNARY_OP_GELU_QUICK,
491
493
  GGML_UNARY_OP_SILU,
494
+ GGML_UNARY_OP_HARDSWISH,
495
+ GGML_UNARY_OP_HARDSIGMOID,
492
496
 
493
497
  GGML_UNARY_OP_COUNT,
494
498
  };
@@ -1032,6 +1036,16 @@ extern "C" {
1032
1036
  struct ggml_tensor * a,
1033
1037
  struct ggml_tensor * b);
1034
1038
 
1039
+ // hardswish(x) = x * relu6(x + 3) / 6
1040
+ GGML_API struct ggml_tensor * ggml_hardswish(
1041
+ struct ggml_context * ctx,
1042
+ struct ggml_tensor * a);
1043
+
1044
+ // hardsigmoid(x) = relu6(x + 3) / 6
1045
+ GGML_API struct ggml_tensor * ggml_hardsigmoid(
1046
+ struct ggml_context * ctx,
1047
+ struct ggml_tensor * a);
1048
+
1035
1049
  // normalize along rows
1036
1050
  GGML_API struct ggml_tensor * ggml_norm(
1037
1051
  struct ggml_context * ctx,
@@ -1481,7 +1495,19 @@ extern "C" {
1481
1495
  int p1,
1482
1496
  int d0,
1483
1497
  int d1,
1484
- bool is_2D);
1498
+ bool is_2D,
1499
+ enum ggml_type dst_type);
1500
+
1501
+ GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
1502
+ struct ggml_context * ctx,
1503
+ struct ggml_tensor * a,
1504
+ struct ggml_tensor * b,
1505
+ int s0,
1506
+ int s1,
1507
+ int p0,
1508
+ int p1,
1509
+ int d0,
1510
+ int d1);
1485
1511
 
1486
1512
  GGML_API struct ggml_tensor * ggml_conv_1d(
1487
1513
  struct ggml_context * ctx,
@@ -2065,6 +2091,18 @@ extern "C" {
2065
2091
  // quantization
2066
2092
  //
2067
2093
 
2094
+ // - ggml_quantize_init can be called multiple times with the same type
2095
+ // it will only initialize the quantization tables for the first call or after ggml_quantize_free
2096
+ // automatically called by ggml_quantize_chunk for convenience
2097
+ //
2098
+ // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
2099
+ // call this at the end of the program to avoid memory leaks
2100
+ //
2101
+ // note: these are thread-safe
2102
+ //
2103
+ GGML_API void ggml_quantize_init(enum ggml_type type);
2104
+ GGML_API void ggml_quantize_free(void);
2105
+
2068
2106
  // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
2069
2107
  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
2070
2108
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -2078,19 +2116,13 @@ extern "C" {
2078
2116
  GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2079
2117
  GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2080
2118
 
2119
+ // some quantization types cannot be used without an importance matrix
2120
+ GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2121
+
2122
+ // calls ggml_quantize_init internally (i.e. can allocate memory)
2081
2123
  GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
2082
2124
  int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
2083
2125
 
2084
- // These are needed for IQ2_XS and IQ2_XXS quantizations
2085
- GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
2086
- GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
2087
-
2088
- //
2089
- // Importance matrix
2090
- //
2091
- typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
2092
- GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
2093
-
2094
2126
  //
2095
2127
  // gguf
2096
2128
  //
@@ -2234,9 +2266,12 @@ extern "C" {
2234
2266
  GGML_API int ggml_cpu_has_blas (void);
2235
2267
  GGML_API int ggml_cpu_has_cublas (void);
2236
2268
  GGML_API int ggml_cpu_has_clblast (void);
2269
+ GGML_API int ggml_cpu_has_vulkan (void);
2270
+ GGML_API int ggml_cpu_has_kompute (void);
2237
2271
  GGML_API int ggml_cpu_has_gpublas (void);
2238
2272
  GGML_API int ggml_cpu_has_sse3 (void);
2239
2273
  GGML_API int ggml_cpu_has_ssse3 (void);
2274
+ GGML_API int ggml_cpu_has_sycl (void);
2240
2275
  GGML_API int ggml_cpu_has_vsx (void);
2241
2276
 
2242
2277
  //