whisper.rn 0.4.0-rc.7 → 0.4.0-rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml.h CHANGED
@@ -187,6 +187,16 @@
187
187
  # define WSP_GGML_API
188
188
  #endif
189
189
 
190
+ #ifdef WSP_GGML_MULTIPLATFORM
191
+ # if defined(_WIN32)
192
+ # define WSP_GGML_CALL
193
+ # else
194
+ # define WSP_GGML_CALL __attribute__((__ms_abi__))
195
+ # endif
196
+ #else
197
+ # define WSP_GGML_CALL
198
+ #endif
199
+
190
200
  // TODO: support for clang
191
201
  #ifdef __GNUC__
192
202
  # define WSP_GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -218,7 +228,9 @@
218
228
  #define WSP_GGML_MAX_PARAMS 2048
219
229
  #define WSP_GGML_MAX_CONTEXTS 64
220
230
  #define WSP_GGML_MAX_SRC 10
231
+ #ifndef WSP_GGML_MAX_NAME
221
232
  #define WSP_GGML_MAX_NAME 64
233
+ #endif
222
234
  #define WSP_GGML_MAX_OP_PARAMS 64
223
235
  #define WSP_GGML_DEFAULT_N_THREADS 4
224
236
  #define WSP_GGML_DEFAULT_GRAPH_SIZE 2048
@@ -255,6 +267,8 @@
255
267
  #define WSP_GGML_UNREACHABLE() WSP_GGML_ASSERT(!"statement should not be reached")
256
268
  #elif defined(__GNUC__)
257
269
  #define WSP_GGML_UNREACHABLE() __builtin_unreachable()
270
+ #elif defined(_MSC_VER)
271
+ #define WSP_GGML_UNREACHABLE() __assume(0)
258
272
  #else
259
273
  #define WSP_GGML_UNREACHABLE() ((void) 0)
260
274
  #endif
@@ -303,7 +317,7 @@ extern "C" {
303
317
 
304
318
  #if defined(__ARM_NEON) && defined(__CUDACC__)
305
319
  typedef half wsp_ggml_fp16_t;
306
- #elif defined(__ARM_NEON)
320
+ #elif defined(__ARM_NEON) && !defined(_MSC_VER)
307
321
  typedef __fp16 wsp_ggml_fp16_t;
308
322
  #else
309
323
  typedef uint16_t wsp_ggml_fp16_t;
@@ -337,12 +351,20 @@ extern "C" {
337
351
  WSP_GGML_TYPE_Q5_K = 13,
338
352
  WSP_GGML_TYPE_Q6_K = 14,
339
353
  WSP_GGML_TYPE_Q8_K = 15,
354
+ WSP_GGML_TYPE_IQ2_XXS = 16,
355
+ WSP_GGML_TYPE_IQ2_XS = 17,
340
356
  WSP_GGML_TYPE_I8,
341
357
  WSP_GGML_TYPE_I16,
342
358
  WSP_GGML_TYPE_I32,
343
359
  WSP_GGML_TYPE_COUNT,
344
360
  };
345
361
 
362
+ // precision
363
+ enum wsp_ggml_prec {
364
+ WSP_GGML_PREC_DEFAULT,
365
+ WSP_GGML_PREC_F32,
366
+ };
367
+
346
368
  enum wsp_ggml_backend_type {
347
369
  WSP_GGML_BACKEND_CPU = 0,
348
370
  WSP_GGML_BACKEND_GPU = 10,
@@ -365,6 +387,8 @@ extern "C" {
365
387
  WSP_GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
366
388
  WSP_GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
367
389
  WSP_GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
390
+ WSP_GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
391
+ WSP_GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
368
392
  };
369
393
 
370
394
  // available tensor operations:
@@ -478,7 +502,8 @@ extern "C" {
478
502
  enum wsp_ggml_log_level {
479
503
  WSP_GGML_LOG_LEVEL_ERROR = 2,
480
504
  WSP_GGML_LOG_LEVEL_WARN = 3,
481
- WSP_GGML_LOG_LEVEL_INFO = 4
505
+ WSP_GGML_LOG_LEVEL_INFO = 4,
506
+ WSP_GGML_LOG_LEVEL_DEBUG = 5
482
507
  };
483
508
 
484
509
  // ggml object
@@ -502,7 +527,6 @@ extern "C" {
502
527
 
503
528
  struct wsp_ggml_backend_buffer * buffer;
504
529
 
505
- int n_dims;
506
530
  int64_t ne[WSP_GGML_MAX_DIMS]; // number of elements
507
531
  size_t nb[WSP_GGML_MAX_DIMS]; // stride in bytes:
508
532
  // nb[0] = wsp_ggml_type_size(type)
@@ -534,7 +558,7 @@ extern "C" {
534
558
 
535
559
  void * extra; // extra things e.g. for ggml-cuda.cu
536
560
 
537
- char padding[12];
561
+ char padding[8];
538
562
  };
539
563
 
540
564
  static const size_t WSP_GGML_TENSOR_SIZE = sizeof(struct wsp_ggml_tensor);
@@ -635,33 +659,41 @@ extern "C" {
635
659
  WSP_GGML_API void wsp_ggml_print_object (const struct wsp_ggml_object * obj);
636
660
  WSP_GGML_API void wsp_ggml_print_objects(const struct wsp_ggml_context * ctx);
637
661
 
638
- WSP_GGML_API int64_t wsp_ggml_nelements (const struct wsp_ggml_tensor * tensor);
639
- WSP_GGML_API int64_t wsp_ggml_nrows (const struct wsp_ggml_tensor * tensor);
640
- WSP_GGML_API size_t wsp_ggml_nbytes (const struct wsp_ggml_tensor * tensor);
641
- WSP_GGML_API size_t wsp_ggml_nbytes_pad (const struct wsp_ggml_tensor * tensor); // same as wsp_ggml_nbytes() but padded to WSP_GGML_MEM_ALIGN
642
- WSP_GGML_API size_t wsp_ggml_nbytes_split(const struct wsp_ggml_tensor * tensor, int nrows_split);
662
+ WSP_GGML_API WSP_GGML_CALL int64_t wsp_ggml_nelements (const struct wsp_ggml_tensor * tensor);
663
+ WSP_GGML_API WSP_GGML_CALL int64_t wsp_ggml_nrows (const struct wsp_ggml_tensor * tensor);
664
+ WSP_GGML_API WSP_GGML_CALL size_t wsp_ggml_nbytes (const struct wsp_ggml_tensor * tensor);
665
+ WSP_GGML_API size_t wsp_ggml_nbytes_pad (const struct wsp_ggml_tensor * tensor); // same as wsp_ggml_nbytes() but padded to WSP_GGML_MEM_ALIGN
643
666
 
644
- WSP_GGML_API int wsp_ggml_blck_size (enum wsp_ggml_type type);
645
- WSP_GGML_API size_t wsp_ggml_type_size (enum wsp_ggml_type type); // size in bytes for all elements in a block
646
- WSP_GGML_API float wsp_ggml_type_sizef(enum wsp_ggml_type type); // wsp_ggml_type_size()/wsp_ggml_blck_size() as float
667
+ WSP_GGML_API WSP_GGML_CALL int wsp_ggml_blck_size(enum wsp_ggml_type type);
668
+ WSP_GGML_API WSP_GGML_CALL size_t wsp_ggml_type_size(enum wsp_ggml_type type); // size in bytes for all elements in a block
669
+ WSP_GGML_API WSP_GGML_CALL size_t wsp_ggml_row_size (enum wsp_ggml_type type, int64_t ne); // size in bytes for all elements in a row
647
670
 
648
- WSP_GGML_API const char * wsp_ggml_type_name(enum wsp_ggml_type type);
649
- WSP_GGML_API const char * wsp_ggml_op_name (enum wsp_ggml_op op);
650
- WSP_GGML_API const char * wsp_ggml_op_symbol(enum wsp_ggml_op op);
671
+ WSP_GGML_DEPRECATED(
672
+ WSP_GGML_API double wsp_ggml_type_sizef(enum wsp_ggml_type type), // wsp_ggml_type_size()/wsp_ggml_blck_size() as float
673
+ "use wsp_ggml_row_size() instead");
651
674
 
652
- WSP_GGML_API const char * wsp_ggml_unary_op_name(enum wsp_ggml_unary_op op);
653
- WSP_GGML_API const char * wsp_ggml_op_desc(const struct wsp_ggml_tensor * t); // unary or op name
675
+ WSP_GGML_API WSP_GGML_CALL const char * wsp_ggml_type_name(enum wsp_ggml_type type);
676
+ WSP_GGML_API WSP_GGML_CALL const char * wsp_ggml_op_name (enum wsp_ggml_op op);
677
+ WSP_GGML_API const char * wsp_ggml_op_symbol(enum wsp_ggml_op op);
654
678
 
655
- WSP_GGML_API size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor);
679
+ WSP_GGML_API const char * wsp_ggml_unary_op_name(enum wsp_ggml_unary_op op);
680
+ WSP_GGML_API WSP_GGML_CALL const char * wsp_ggml_op_desc(const struct wsp_ggml_tensor * t); // unary or op name
656
681
 
657
- WSP_GGML_API bool wsp_ggml_is_quantized(enum wsp_ggml_type type);
682
+ WSP_GGML_API WSP_GGML_CALL size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor);
683
+
684
+ WSP_GGML_API WSP_GGML_CALL bool wsp_ggml_is_quantized(enum wsp_ggml_type type);
658
685
 
659
686
  // TODO: temporary until model loading of ggml examples is refactored
660
687
  WSP_GGML_API enum wsp_ggml_type wsp_ggml_ftype_to_wsp_ggml_type(enum wsp_ggml_ftype ftype);
661
688
 
662
- WSP_GGML_API bool wsp_ggml_is_transposed(const struct wsp_ggml_tensor * tensor);
663
- WSP_GGML_API bool wsp_ggml_is_contiguous(const struct wsp_ggml_tensor * tensor);
664
- WSP_GGML_API bool wsp_ggml_is_permuted (const struct wsp_ggml_tensor * tensor);
689
+ WSP_GGML_API WSP_GGML_CALL bool wsp_ggml_is_transposed(const struct wsp_ggml_tensor * tensor);
690
+ WSP_GGML_API WSP_GGML_CALL bool wsp_ggml_is_contiguous(const struct wsp_ggml_tensor * tensor);
691
+ WSP_GGML_API WSP_GGML_CALL bool wsp_ggml_is_permuted (const struct wsp_ggml_tensor * tensor);
692
+ WSP_GGML_API bool wsp_ggml_is_scalar (const struct wsp_ggml_tensor * tensor);
693
+ WSP_GGML_API bool wsp_ggml_is_vector (const struct wsp_ggml_tensor * tensor);
694
+ WSP_GGML_API bool wsp_ggml_is_matrix (const struct wsp_ggml_tensor * tensor);
695
+ WSP_GGML_API bool wsp_ggml_is_3d (const struct wsp_ggml_tensor * tensor);
696
+ WSP_GGML_API int wsp_ggml_n_dims (const struct wsp_ggml_tensor * tensor); // returns 1 for scalars
665
697
 
666
698
  WSP_GGML_API bool wsp_ggml_are_same_shape(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
667
699
 
@@ -722,8 +754,8 @@ extern "C" {
722
754
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_tensor(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * src);
723
755
 
724
756
  // Context tensor enumeration and lookup
725
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_first_tensor(struct wsp_ggml_context * ctx);
726
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_next_tensor (struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor);
757
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_first_tensor(const struct wsp_ggml_context * ctx);
758
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_next_tensor (const struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor);
727
759
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_tensor(struct wsp_ggml_context * ctx, const char * name);
728
760
 
729
761
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_zero(struct wsp_ggml_tensor * tensor);
@@ -748,7 +780,7 @@ extern "C" {
748
780
  WSP_GGML_API void * wsp_ggml_get_data (const struct wsp_ggml_tensor * tensor);
749
781
  WSP_GGML_API float * wsp_ggml_get_data_f32(const struct wsp_ggml_tensor * tensor);
750
782
 
751
- WSP_GGML_API enum wsp_ggml_unary_op wsp_ggml_get_unary_op(const struct wsp_ggml_tensor * tensor);
783
+ WSP_GGML_API WSP_GGML_CALL enum wsp_ggml_unary_op wsp_ggml_get_unary_op(const struct wsp_ggml_tensor * tensor);
752
784
 
753
785
  WSP_GGML_API const char * wsp_ggml_get_name (const struct wsp_ggml_tensor * tensor);
754
786
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_name ( struct wsp_ggml_tensor * tensor, const char * name);
@@ -1050,6 +1082,12 @@ extern "C" {
1050
1082
  struct wsp_ggml_tensor * a,
1051
1083
  struct wsp_ggml_tensor * b);
1052
1084
 
1085
+ // change the precision of a matrix multiplication
1086
+ // set to WSP_GGML_PREC_F32 for higher precision (useful for phi-2)
1087
+ WSP_GGML_API void wsp_ggml_mul_mat_set_prec(
1088
+ struct wsp_ggml_tensor * a,
1089
+ enum wsp_ggml_prec prec);
1090
+
1053
1091
  // indirect matrix multiplication
1054
1092
  // wsp_ggml_mul_mat_id(ctx, as, ids, id, b) ~= wsp_ggml_mul_mat(as[ids[id]], b)
1055
1093
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul_mat_id(
@@ -1075,13 +1113,13 @@ extern "C" {
1075
1113
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_scale(
1076
1114
  struct wsp_ggml_context * ctx,
1077
1115
  struct wsp_ggml_tensor * a,
1078
- struct wsp_ggml_tensor * b);
1116
+ float s);
1079
1117
 
1080
1118
  // in-place, returns view(a)
1081
1119
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_scale_inplace(
1082
1120
  struct wsp_ggml_context * ctx,
1083
1121
  struct wsp_ggml_tensor * a,
1084
- struct wsp_ggml_tensor * b);
1122
+ float s);
1085
1123
 
1086
1124
  // b -> view(a,offset,nb1,nb2,3), return modified a
1087
1125
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set(
@@ -1137,22 +1175,16 @@ extern "C" {
1137
1175
  struct wsp_ggml_tensor * a,
1138
1176
  struct wsp_ggml_tensor * b);
1139
1177
 
1140
- // a -> b, in-place, return view(b)
1141
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cpy_inplace(
1178
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cast(
1142
1179
  struct wsp_ggml_context * ctx,
1143
1180
  struct wsp_ggml_tensor * a,
1144
- struct wsp_ggml_tensor * b);
1181
+ enum wsp_ggml_type type);
1145
1182
 
1146
1183
  // make contiguous
1147
1184
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont(
1148
1185
  struct wsp_ggml_context * ctx,
1149
1186
  struct wsp_ggml_tensor * a);
1150
1187
 
1151
- // make contiguous, in-place
1152
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_inplace(
1153
- struct wsp_ggml_context * ctx,
1154
- struct wsp_ggml_tensor * a);
1155
-
1156
1188
  // make contiguous, with new shape
1157
1189
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_1d(
1158
1190
  struct wsp_ggml_context * ctx,
@@ -1391,7 +1423,7 @@ extern "C" {
1391
1423
  float beta_slow);
1392
1424
 
1393
1425
  // compute correction dims for YaRN RoPE scaling
1394
- void wsp_ggml_rope_yarn_corr_dims(
1426
+ WSP_GGML_CALL void wsp_ggml_rope_yarn_corr_dims(
1395
1427
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1396
1428
 
1397
1429
  // xPos RoPE, in-place, returns view(a)
@@ -1825,8 +1857,8 @@ extern "C" {
1825
1857
 
1826
1858
  // wsp_ggml_graph_plan() has to be called before wsp_ggml_graph_compute()
1827
1859
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1828
- WSP_GGML_API struct wsp_ggml_cplan wsp_ggml_graph_plan (struct wsp_ggml_cgraph * cgraph, int n_threads /*= WSP_GGML_DEFAULT_N_THREADS*/);
1829
- WSP_GGML_API int wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_cplan * cplan);
1860
+ WSP_GGML_API struct wsp_ggml_cplan wsp_ggml_graph_plan (const struct wsp_ggml_cgraph * cgraph, int n_threads /*= WSP_GGML_DEFAULT_N_THREADS*/);
1861
+ WSP_GGML_API int wsp_ggml_graph_compute( struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_cplan * cplan);
1830
1862
 
1831
1863
  // same as wsp_ggml_graph_compute() but the work data is allocated as a part of the context
1832
1864
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
@@ -2033,6 +2065,18 @@ extern "C" {
2033
2065
  // quantization
2034
2066
  //
2035
2067
 
2068
+ // - wsp_ggml_wsp_quantize_init can be called multiple times with the same type
2069
+ // it will only initialize the quantization tables for the first call or after wsp_ggml_wsp_quantize_free
2070
+ // automatically called by wsp_ggml_wsp_quantize_chunk for convenience
2071
+ //
2072
+ // - wsp_ggml_wsp_quantize_free will free any memory allocated by wsp_ggml_wsp_quantize_init
2073
+ // call this at the end of the program to avoid memory leaks
2074
+ //
2075
+ // note: these are thread-safe
2076
+ //
2077
+ WSP_GGML_API void wsp_ggml_wsp_quantize_init(enum wsp_ggml_type type);
2078
+ WSP_GGML_API void wsp_ggml_wsp_quantize_free(void);
2079
+
2036
2080
  // TODO: these would probably get removed in favor of the more general wsp_ggml_wsp_quantize_chunk
2037
2081
  WSP_GGML_API size_t wsp_ggml_wsp_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
2038
2082
  WSP_GGML_API size_t wsp_ggml_wsp_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -2046,7 +2090,12 @@ extern "C" {
2046
2090
  WSP_GGML_API size_t wsp_ggml_wsp_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2047
2091
  WSP_GGML_API size_t wsp_ggml_wsp_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2048
2092
 
2049
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
2093
+ // some quantization types cannot be used without an importance matrix
2094
+ WSP_GGML_API bool wsp_ggml_wsp_quantize_requires_imatrix(enum wsp_ggml_type type);
2095
+
2096
+ // calls wsp_ggml_wsp_quantize_init internally (i.e. can allocate memory)
2097
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst,
2098
+ int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
2050
2099
 
2051
2100
  //
2052
2101
  // gguf
@@ -2116,10 +2165,11 @@ extern "C" {
2116
2165
  WSP_GGML_API const void * wsp_gguf_get_arr_data(const struct wsp_gguf_context * ctx, int key_id);
2117
2166
  WSP_GGML_API const char * wsp_gguf_get_arr_str (const struct wsp_gguf_context * ctx, int key_id, int i);
2118
2167
 
2119
- WSP_GGML_API int wsp_gguf_get_n_tensors (const struct wsp_gguf_context * ctx);
2120
- WSP_GGML_API int wsp_gguf_find_tensor (const struct wsp_gguf_context * ctx, const char * name);
2121
- WSP_GGML_API size_t wsp_gguf_get_tensor_offset(const struct wsp_gguf_context * ctx, int i);
2122
- WSP_GGML_API char * wsp_gguf_get_tensor_name (const struct wsp_gguf_context * ctx, int i);
2168
+ WSP_GGML_API int wsp_gguf_get_n_tensors (const struct wsp_gguf_context * ctx);
2169
+ WSP_GGML_API int wsp_gguf_find_tensor (const struct wsp_gguf_context * ctx, const char * name);
2170
+ WSP_GGML_API size_t wsp_gguf_get_tensor_offset(const struct wsp_gguf_context * ctx, int i);
2171
+ WSP_GGML_API char * wsp_gguf_get_tensor_name (const struct wsp_gguf_context * ctx, int i);
2172
+ WSP_GGML_API enum wsp_ggml_type wsp_gguf_get_tensor_type (const struct wsp_gguf_context * ctx, int i);
2123
2173
 
2124
2174
  // overrides existing values or adds a new one
2125
2175
  WSP_GGML_API void wsp_gguf_set_val_u8 (struct wsp_gguf_context * ctx, const char * key, uint8_t val);
@@ -2175,6 +2225,7 @@ extern "C" {
2175
2225
  //
2176
2226
 
2177
2227
  WSP_GGML_API int wsp_ggml_cpu_has_avx (void);
2228
+ WSP_GGML_API int wsp_ggml_cpu_has_avx_vnni (void);
2178
2229
  WSP_GGML_API int wsp_ggml_cpu_has_avx2 (void);
2179
2230
  WSP_GGML_API int wsp_ggml_cpu_has_avx512 (void);
2180
2231
  WSP_GGML_API int wsp_ggml_cpu_has_avx512_vbmi(void);