whisper.rn 0.4.0-rc.10 → 0.4.0-rc.11

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -1,30 +1,8 @@
  #pragma once

+ #include "ggml-cpu-traits.h"
  #include "ggml.h"

  // GGML internal header

- #ifdef __cplusplus
- extern "C" {
- #endif
-
- // Quantization
- void wsp_quantize_mat_q8_0(const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
-
- // GEMV
- void wsp_ggml_gemv_q4_0_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
- void wsp_ggml_gemv_q4_0_4x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
- void wsp_ggml_gemv_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
-
- // GEMM
- void wsp_ggml_gemm_q4_0_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
- void wsp_ggml_gemm_q4_0_4x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
- void wsp_ggml_gemm_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
-
- void wsp_ggml_aarch64_repack_tensor(struct wsp_ggml_tensor * cur, enum wsp_ggml_type repack_type, const void * data, size_t data_size);
- enum wsp_ggml_type wsp_ggml_aarch64_get_optimal_repack_type(const struct wsp_ggml_tensor * cur);
-
- #ifdef __cplusplus
- }
- #endif
-
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_aarch64_buffer_type(void);
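For context, the aarch64 header now exposes the repacking machinery through a dedicated CPU buffer type rather than free-standing GEMV/GEMM declarations. A minimal sketch of querying that buffer type, assuming the declaration above is in scope and that wsp_ggml_backend_buft_name() follows the usual wsp_-prefixed ggml backend API (the helper below is illustrative only, not part of the diff):

// Illustrative sketch; not part of the published package.
#include <stdio.h>

static void sketch_print_aarch64_buft(void) {
    // Ask the CPU backend for its aarch64 repack buffer type and print its name.
    wsp_ggml_backend_buffer_type_t buft = wsp_ggml_backend_cpu_aarch64_buffer_type();
    if (buft != NULL) {
        printf("aarch64 repack buffer type: %s\n", wsp_ggml_backend_buft_name(buft));
    }
}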
@@ -15,6 +15,18 @@
  extern "C" {
  #endif

+ struct wsp_ggml_compute_params {
+     // ith = thread index, nth = number of threads
+     int ith, nth;
+
+     // work buffer for all threads
+     size_t wsize;
+     void * wdata;
+
+     struct wsp_ggml_threadpool * threadpool;
+ };
+
+
  #if defined(_MSC_VER)

  #define m512bh(p) p
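For context, wsp_ggml_compute_params is the per-thread view a CPU kernel receives: ith/nth identify the thread within the pool, wsize/wdata describe the shared scratch buffer, and threadpool links back to the pool. A minimal sketch of the usual row-splitting arithmetic built on ith/nth (the helper itself is hypothetical and not part of the diff):

// Illustrative sketch; not part of the published package.
#include <stdint.h>

// Hypothetical helper: compute the half-open row range [*ir0, *ir1) that a
// given thread should process, mirroring the common ggml ith/nth split.
static void sketch_rows_for_thread(const struct wsp_ggml_compute_params * params,
                                   int64_t nrows, int64_t * ir0, int64_t * ir1) {
    const int64_t dr = (nrows + params->nth - 1) / params->nth; // rows per thread, rounded up
    *ir0 = dr * params->ith;                                    // first row owned by this thread
    *ir1 = (*ir0 + dr < nrows) ? *ir0 + dr : nrows;             // clamp to the total row count
}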
@@ -47,6 +59,15 @@ extern "C" {
  #endif
  #endif

+ #if defined(__s390x__) && defined(__VEC__)
+ #ifndef __VXE__
+ #define __VXE__
+ #endif
+ #ifndef __VXE2__
+ #define __VXE2__
+ #endif
+ #endif
+
  #if defined(__ARM_FEATURE_SVE)
  #include <arm_sve.h>
  #include <sys/prctl.h>
@@ -347,25 +368,164 @@ inline static int32x4_t wsp_ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t
  #endif
  #endif

- #if defined(__loongarch_asx)
+ #if defined(__VXE__) || defined(__VXE2__)
+ #include <vecintrin.h>
+
+ #define vec_neg(a) (-(a)) // Vector Negate
+ #define vec_add(a, b) ((a) + (b)) // Vector Add
+ #define vec_sub(a, b) ((a) - (b)) // Vector Subtract
+ #define vec_mul(a, b) ((a) * (b)) // Vector Multiply
+ #define vec_div(a, b) ((a) / (b)) // Vector Divide
+ #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
+ #define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right
+ #define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
+ #define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
+ #define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
+
+ #ifndef vec_and
+ #define vec_and(a, b) ((a) & (b)) // Vector AND
+ #endif
+
+ #ifndef vec_or
+ #define vec_or(a, b) ((a) | (b)) // Vector OR
+ #endif
+
+ #ifndef vec_xor
+ #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
+ #endif
+
+ typedef signed char char8x16_t __attribute__((vector_size(16)));
+ typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
+
+ typedef int8_t int8x16_t __attribute__((vector_size(16)));
+ typedef int16_t int16x8_t __attribute__((vector_size(16)));
+ typedef int32_t int32x4_t __attribute__((vector_size(16)));
+
+ typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
+ typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
+ typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
+
+ typedef float float32x4_t __attribute__((vector_size(16)));
+ typedef double double64x2_t __attribute((vector_size(16)));
+
+ typedef signed long long long64x2_t __attribute((vector_size(16)));
+ typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
+
+ typedef struct wsp_ggml_uint8x16x2_t {
+     uint8x16_t val[2];
+ } wsp_ggml_uint8x16x2_t;
+
+ inline static wsp_ggml_uint8x16x2_t wsp_ggml_vec_xl_u8x2(const uint8_t * ptr) {
+     wsp_ggml_uint8x16x2_t res;
+
+     res.val[0] = vec_xl( 0, ptr);
+     res.val[1] = vec_xl(16, ptr);
+
+     return res;
+ }
+
+ typedef struct wsp_ggml_uint8x16x4_t {
+     uint8x16_t val[4];
+ } wsp_ggml_uint8x16x4_t;
+
+ inline static wsp_ggml_uint8x16x4_t wsp_ggml_vec_xl_u8x4(const uint8_t * ptr) {
+     wsp_ggml_uint8x16x4_t res;
+
+     res.val[0] = vec_xl( 0, ptr);
+     res.val[1] = vec_xl(16, ptr);
+     res.val[2] = vec_xl(32, ptr);
+     res.val[3] = vec_xl(48, ptr);
+
+     return res;
+ }
+
+ typedef struct wsp_ggml_int8x16x4_t {
+     int8x16_t val[4];
+ } wsp_ggml_int8x16x4_t;
+
+ inline static wsp_ggml_int8x16x4_t wsp_ggml_vec_xl_s8x4(const int8_t * ptr) {
+     wsp_ggml_int8x16x4_t res;

- typedef union {
-     int32_t i;
-     float f;
- } ft_union;
+     res.val[0] = vec_xl( 0, ptr);
+     res.val[1] = vec_xl(16, ptr);
+     res.val[2] = vec_xl(32, ptr);
+     res.val[3] = vec_xl(48, ptr);

+     return res;
+ }
+
+ typedef struct wsp_ggml_int16x8x2_t {
+     int16x8_t val[2];
+ } wsp_ggml_int16x8x2_t;
+
+ inline static wsp_ggml_int16x8x2_t wsp_ggml_vec_xl_s16x2(const int16_t * ptr) {
+     wsp_ggml_int16x8x2_t res;
+
+     res.val[0] = vec_xl( 0, ptr);
+     res.val[1] = vec_xl(16, ptr);
+
+     return res;
+ }
+
+ /*
+     ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
+     ! or iq4_nl for example implementation.
+ */
+ inline static int8x16_t wsp_ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
+     int8x16_t res;
+
+     res[ 0] = a[b[ 0]];
+     res[ 1] = a[b[ 1]];
+     res[ 2] = a[b[ 2]];
+     res[ 3] = a[b[ 3]];
+     res[ 4] = a[b[ 4]];
+     res[ 5] = a[b[ 5]];
+     res[ 6] = a[b[ 6]];
+     res[ 7] = a[b[ 7]];
+     res[ 8] = a[b[ 8]];
+     res[ 9] = a[b[ 9]];
+     res[10] = a[b[10]];
+     res[11] = a[b[11]];
+     res[12] = a[b[12]];
+     res[13] = a[b[13]];
+     res[14] = a[b[14]];
+     res[15] = a[b[15]];
+
+     return res;
+ }
+
+ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
+     const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13,
+                                   16, 17, 20, 21, 24, 25, 28, 29 };
+
+     const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
+     const int16x8_t v_abe = vec_perm(a, b, v_maske);
+     return v_abo + v_abe;
+ }
+
+ inline static int32x4_t wsp_ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
+     const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
+     return acc + (vec_unpackh(p) + vec_unpackl(p));
+ }
+
+ #endif
+
+ #if defined(__loongarch_asx)
  /* float type data load instructions */
- static __m128 __lsx_vreplfr2vr_s(float val) {
-     ft_union fi_tmpval = {.f = val};
-     return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+ static __m128 __lsx_vreplfr2vr_s(const float val) {
+     v4f32 res = {val, val, val, val};
+     return (__m128)res;
  }

- static __m256 __lasx_xvreplfr2vr_s(float val) {
-     ft_union fi_tmpval = {.f = val};
-     return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+ static __m256 __lasx_xvreplfr2vr_s(const float val) {
+     v8f32 res = {val, val, val, val, val, val, val, val};
+     return (__m256)res;
  }
  #endif

+ // TODO: move to ggml-threading
+ void wsp_ggml_barrier(struct wsp_ggml_threadpool * tp);
+
  #ifdef __cplusplus
  }
  #endif
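For context, the new s390x helpers mirror the NEON-style names used elsewhere in this header. A minimal sketch of how wsp_ggml_vec_dot could accumulate an int8 dot product 16 lanes at a time on the VXE path (the loop and names are illustrative, not part of the diff; n is assumed to be a multiple of 16):

// Illustrative sketch; compiles only on the __VXE__/__VXE2__ path added above.
static int32_t sketch_dot_s8(const int8_t * x, const int8_t * y, int n) {
    int32x4_t acc = {0, 0, 0, 0};
    for (int i = 0; i < n; i += 16) {
        // load 16 signed bytes from each input and accumulate their products
        const int8x16_t vx = vec_xl(0, x + i);
        const int8x16_t vy = vec_xl(0, y + i);
        acc = wsp_ggml_vec_dot(acc, vx, vy);
    }
    // horizontal sum of the four 32-bit lanes
    return acc[0] + acc[1] + acc[2] + acc[3];
}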