llama_cpp 0.8.0 → 0.9.1

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -219,7 +219,7 @@
219
219
  #define GGML_MAX_CONTEXTS 64
220
220
  #define GGML_MAX_SRC 6
221
221
  #define GGML_MAX_NAME 64
222
- #define GGML_MAX_OP_PARAMS 32
222
+ #define GGML_MAX_OP_PARAMS 64
223
223
  #define GGML_DEFAULT_N_THREADS 4
224
224
 
225
225
  #if UINTPTR_MAX == 0xFFFFFFFF
@@ -401,15 +401,16 @@ extern "C" {
401
401
  GGML_OP_ALIBI,
402
402
  GGML_OP_CLAMP,
403
403
  GGML_OP_CONV_1D,
404
- GGML_OP_CONV_2D,
404
+ GGML_OP_CONV_1D_STAGE_0, // internal
405
+ GGML_OP_CONV_1D_STAGE_1, // internal
405
406
  GGML_OP_CONV_TRANSPOSE_1D,
407
+ GGML_OP_CONV_2D,
408
+ GGML_OP_CONV_2D_STAGE_0, // internal
409
+ GGML_OP_CONV_2D_STAGE_1, // internal
406
410
  GGML_OP_CONV_TRANSPOSE_2D,
407
411
  GGML_OP_POOL_1D,
408
412
  GGML_OP_POOL_2D,
409
413
 
410
- GGML_OP_CONV_1D_STAGE_0, // internal
411
- GGML_OP_CONV_1D_STAGE_1, // internal
412
-
413
414
  GGML_OP_UPSCALE, // nearest interpolate
414
415
 
415
416
  GGML_OP_FLASH_ATTN,
@@ -708,7 +709,7 @@ extern "C" {
708
709
  // Context tensor enumeration and lookup
709
710
  GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
710
711
  GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
711
- GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
712
+ GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
712
713
 
713
714
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
714
715
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
@@ -1020,9 +1021,9 @@ extern "C" {
1020
1021
  struct ggml_tensor * b,
1021
1022
  float eps);
1022
1023
 
1023
- // A: n columns, m rows
1024
- // B: n columns, p rows (i.e. we transpose it internally)
1025
- // result is m columns, p rows
1024
+ // A: k columns, n rows => [ne03, ne02, n, k]
1025
+ // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
1026
+ // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
1026
1027
  GGML_API struct ggml_tensor * ggml_mul_mat(
1027
1028
  struct ggml_context * ctx,
1028
1029
  struct ggml_tensor * a,
@@ -1325,8 +1326,13 @@ extern "C" {
1325
1326
  int n_dims,
1326
1327
  int mode,
1327
1328
  int n_ctx,
1329
+ int n_orig_ctx,
1328
1330
  float freq_base,
1329
- float freq_scale);
1331
+ float freq_scale,
1332
+ float ext_factor,
1333
+ float attn_factor,
1334
+ float beta_fast,
1335
+ float beta_slow);
1330
1336
 
1331
1337
  // in-place, returns view(a)
1332
1338
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
@@ -1336,8 +1342,17 @@ extern "C" {
1336
1342
  int n_dims,
1337
1343
  int mode,
1338
1344
  int n_ctx,
1345
+ int n_orig_ctx,
1339
1346
  float freq_base,
1340
- float freq_scale);
1347
+ float freq_scale,
1348
+ float ext_factor,
1349
+ float attn_factor,
1350
+ float beta_fast,
1351
+ float beta_slow);
1352
+
1353
+ // compute correction dims for YaRN RoPE scaling
1354
+ void ggml_rope_yarn_corr_dims(
1355
+ int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1341
1356
 
1342
1357
  // xPos RoPE, in-place, returns view(a)
1343
1358
  GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
@@ -1929,12 +1944,19 @@ extern "C" {
1929
1944
  // quantization
1930
1945
  //
1931
1946
 
1947
+ // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
1932
1948
  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
1933
1949
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
1934
1950
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
1935
1951
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
1936
1952
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
1937
1953
 
1954
+ GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
1955
+ GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
1956
+ GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
1957
+ GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
1958
+ GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
1959
+
1938
1960
  GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
1939
1961
 
1940
1962
  //