llama_cpp 0.8.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -219,7 +219,7 @@
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 64
-#define GGML_MAX_OP_PARAMS 32
+#define GGML_MAX_OP_PARAMS 64
 #define GGML_DEFAULT_N_THREADS 4

 #if UINTPTR_MAX == 0xFFFFFFFF
@@ -401,15 +401,16 @@ extern "C" {
     GGML_OP_ALIBI,
     GGML_OP_CLAMP,
     GGML_OP_CONV_1D,
-    GGML_OP_CONV_2D,
+    GGML_OP_CONV_1D_STAGE_0, // internal
+    GGML_OP_CONV_1D_STAGE_1, // internal
     GGML_OP_CONV_TRANSPOSE_1D,
+    GGML_OP_CONV_2D,
+    GGML_OP_CONV_2D_STAGE_0, // internal
+    GGML_OP_CONV_2D_STAGE_1, // internal
     GGML_OP_CONV_TRANSPOSE_2D,
     GGML_OP_POOL_1D,
     GGML_OP_POOL_2D,

-    GGML_OP_CONV_1D_STAGE_0, // internal
-    GGML_OP_CONV_1D_STAGE_1, // internal
-
     GGML_OP_UPSCALE, // nearest interpolate

     GGML_OP_FLASH_ATTN,
@@ -708,7 +709,7 @@ extern "C" {
     // Context tensor enumeration and lookup
     GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
     GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
@@ -1020,9 +1021,9 @@ extern "C" {
             struct ggml_tensor * b,
             float eps);

-    // A: n columns, m rows
-    // B: n columns, p rows (i.e. we transpose it internally)
-    // result is m columns, p rows
+    // A: k columns, n rows => [ne03, ne02, n, k]
+    // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -1325,8 +1326,13 @@ extern "C" {
             int n_dims,
             int mode,
             int n_ctx,
+            int n_orig_ctx,
             float freq_base,
-            float freq_scale);
+            float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow);

     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
@@ -1336,8 +1342,17 @@ extern "C" {
             int n_dims,
             int mode,
             int n_ctx,
+            int n_orig_ctx,
             float freq_base,
-            float freq_scale);
+            float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow);
+
+    // compute correction dims for YaRN RoPE scaling
+    void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

     // xPos RoPE, in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
@@ -1929,12 +1944,19 @@ extern "C" {
     // quantization
     //

+    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);

+    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

     //