llama_cpp 0.3.1 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -220,9 +220,27 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #define GGML_ALIGNED_FREE(ptr) free(ptr)
 #endif
 
-#define UNUSED
+#define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
+//
+// tensor access macros
+//
+
+#define GGML_TENSOR_UNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne); \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb);
+
+#define GGML_TENSOR_BINARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb); \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne); \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb);
+
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
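These new macros replace the long blocks of hand-written `ne`/`nb` locals that the later hunks delete. `GGML_TENSOR_LOCALS` itself lives in ggml.h; the sketch below only illustrates the assumed shape of its expansion (one const local per dimension), with a stand-in struct and macro name so it compiles on its own.

```c
#include <stdint.h>
#include <stddef.h>

/* Minimal stand-in for ggml_tensor, only so the sketch is self-contained. */
struct tensor { int64_t ne[4]; size_t nb[4]; };

/* Assumed expansion of GGML_TENSOR_LOCALS: four const locals named
 * prefix##0 .. prefix##3 (e.g. ne00..ne03). The real macro is in ggml.h. */
#define TENSOR_LOCALS(type, prefix, pointer, array) \
    const type prefix##0 = (pointer)->array[0];     \
    const type prefix##1 = (pointer)->array[1];     \
    const type prefix##2 = (pointer)->array[2];     \
    const type prefix##3 = (pointer)->array[3];

/* One macro line stands in for the ~16 declarations the old code repeated per op. */
static int64_t total_elements(const struct tensor * src0) {
    TENSOR_LOCALS(int64_t, ne0, src0, ne);  /* declares ne00, ne01, ne02, ne03 */
    return ne00 * ne01 * ne02 * ne03;
}
```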
@@ -463,14 +481,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
     return GGML_FP32_TO_FP16(x);
 }
 
-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
-    for (
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
+    for (int i = 0; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
     }
 }
 
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
-
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
+    int i = 0;
 #if defined(__F16C__)
     for (; i + 7 < n; i += 8) {
         __m256 x_vec = _mm256_loadu_ps(x + i);
@@ -1609,109 +1627,112 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
     }
 }
 
+static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
+static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
-static const
+static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_F32] = {
+        .vec_dot              = (ggml_vec_dot_t) ggml_vec_dot_f32,
+        .vec_dot_type         = GGML_TYPE_F32,
+    },
+    [GGML_TYPE_F16] = {
+        .to_float             = (ggml_to_float_t) ggml_fp16_to_fp32_row,
+        .from_float           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .vec_dot              = (ggml_vec_dot_t) ggml_vec_dot_f16,
+        .vec_dot_type         = GGML_TYPE_F16,
+    },
     [GGML_TYPE_Q4_0] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q4_0_q8_0,
+        .to_float             = (ggml_to_float_t) dequantize_row_q4_0,
+        .from_float           = quantize_row_q4_0,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
+        .vec_dot              = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q4_1_q8_1,
+        .to_float             = (ggml_to_float_t) dequantize_row_q4_1,
+        .from_float           = quantize_row_q4_1,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
+        .vec_dot              = ggml_vec_dot_q4_1_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q5_0] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q5_0_q8_0,
+        .to_float             = (ggml_to_float_t) dequantize_row_q5_0,
+        .from_float           = quantize_row_q5_0,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
+        .vec_dot              = ggml_vec_dot_q5_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q5_1] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q5_1_q8_1,
+        .to_float             = (ggml_to_float_t) dequantize_row_q5_1,
+        .from_float           = quantize_row_q5_1,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
+        .vec_dot              = ggml_vec_dot_q5_1_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q8_0] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q8_0_q8_0,
+        .to_float             = dequantize_row_q8_0,
+        .from_float           = quantize_row_q8_0,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
+        .vec_dot              = ggml_vec_dot_q8_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q8_1] = {
-        .
-        .
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference,
-        .quantize_row_q_dot       = quantize_row_q8_1,
-        .vec_dot_q                = NULL, // TODO
+        .from_float           = quantize_row_q8_1,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
 #ifdef GGML_USE_K_QUANTS
    [GGML_TYPE_Q2_K] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q2_K_q8_K,
+        .to_float             = (ggml_to_float_t) dequantize_row_q2_K,
+        .from_float           = quantize_row_q2_K,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
+        .vec_dot              = ggml_vec_dot_q2_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q3_K] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q3_K_q8_K,
+        .to_float             = (ggml_to_float_t) dequantize_row_q3_K,
+        .from_float           = quantize_row_q3_K,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
+        .vec_dot              = ggml_vec_dot_q3_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q4_K] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q4_K_q8_K,
+        .to_float             = (ggml_to_float_t) dequantize_row_q4_K,
+        .from_float           = quantize_row_q4_K,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
+        .vec_dot              = ggml_vec_dot_q4_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q5_K] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q5_K_q8_K,
+        .to_float             = (ggml_to_float_t) dequantize_row_q5_K,
+        .from_float           = quantize_row_q5_K,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
+        .vec_dot              = ggml_vec_dot_q5_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q6_K] = {
-        .
-        .
-        .
-        .
-        .vec_dot_q = ggml_vec_dot_q6_K_q8_K,
+        .to_float             = (ggml_to_float_t) dequantize_row_q6_K,
+        .from_float           = quantize_row_q6_K,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
+        .vec_dot              = ggml_vec_dot_q6_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
    },
+    [GGML_TYPE_Q8_K] = {
+        .from_float = quantize_row_q8_K,
+    }
 #endif
 };
 
 // For internal test use
-
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
     GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return
+    return type_traits[i];
 }
 
 
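The refactor replaces the old per-type `quantize_fns` table with a generic `type_traits` table and exposes it through `ggml_internal_get_type_traits` ("for internal test use"). A minimal sketch of how a caller could use it, assuming the `ggml_type_traits_t` typedef and the accessor are visible via ggml.h as the diff suggests; the scratch buffer sizing and the choice of Q8_0 are illustration only, and `n` must be a multiple of the quantization block size:

```c
#include "ggml.h"

/* Quantize one row of F32 data with the traits table, then dequantize it back.
 * traits.from_float / traits.to_float are the function pointers this diff adds
 * (quantize_row_q8_0 / dequantize_row_q8_0 for GGML_TYPE_Q8_0). */
static void roundtrip_row(const float * src, float * dst, void * quantized_scratch, int n) {
    ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q8_0);
    traits.from_float(src, quantized_scratch, n); /* F32 -> Q8_0 blocks */
    traits.to_float(quantized_scratch, dst, n);   /* Q8_0 blocks -> F32 */
}
```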
@@ -2257,7 +2278,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
-
+static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
 #ifdef GGML_SIMD
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -2294,7 +2315,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
     *s = sumf;
 }
 
-
+static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     ggml_float sumf = 0.0;
 
 #if defined(GGML_SIMD)
@@ -3447,6 +3468,8 @@ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
+inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 
 static const float GELU_COEF_A = 0.044715f;
@@ -3598,6 +3621,16 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
     *s = 1.f/(*s);
 }
 
+inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
+    float max = -INFINITY;
+    int idx = 0;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+        if (max == x[i]) { idx = i; }
+    }
+    *s = idx;
+}
+
 //
 // data types
 //
@@ -3707,12 +3740,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "SUM",
     "SUM_ROWS",
     "MEAN",
+    "ARGMAX",
     "REPEAT",
     "REPEAT_BACK",
     "ABS",
     "SGN",
     "NEG",
     "STEP",
+    "TANH",
+    "ELU",
     "RELU",
     "GELU",
     "GELU_QUICK",
@@ -3744,9 +3780,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ROPE_BACK",
     "ALIBI",
     "CLAMP",
-    "
-    "
-    "CONV_2D_SK_P0",
+    "CONV_1D",
+    "CONV_2D",
 
     "FLASH_ATTN",
     "FLASH_FF",
@@ -3765,7 +3800,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3783,12 +3818,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "Σx",
     "Σx_k",
     "Σx/n",
+    "argmax(x)",
     "repeat(x)",
     "repeat_back(x)",
     "abs(x)",
     "sgn(x)",
     "-x",
     "step(x)",
+    "tanh(x)",
+    "elu(x)",
     "relu(x)",
     "gelu(x)",
     "gelu_quick(x)",
@@ -3820,9 +3858,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rope_back(x)",
     "alibi(x)",
     "clamp(x)",
-    "
-    "
-    "conv_2d_sk_p0(x)",
+    "conv_1d(x)",
+    "conv_2d(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
@@ -3841,11 +3878,45 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
+// WARN:
+// Mis-confguration can lead to problem that's hard to reason about:
+// * At best it crash or talks nosense.
+// * At worst it talks slightly difference but hard to perceive.
+//
+// An op has to enable INIT or FINALIZE when any of it's branch needs that pass.
+// Take care about compile options (e.g., GGML_USE_xxx).
+static bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { 0 };
+static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
+
+static void ggml_setup_op_has_task_pass(void) {
+    {   // INIT
+        bool * p = GGML_OP_HAS_INIT;
+
+        p[GGML_OP_ACC               ] = true;
+        p[GGML_OP_MUL_MAT           ] = true;
+        p[GGML_OP_OUT_PROD          ] = true;
+        p[GGML_OP_SET               ] = true;
+        p[GGML_OP_GET_ROWS_BACK     ] = true;
+        p[GGML_OP_DIAG_MASK_INF     ] = true;
+        p[GGML_OP_DIAG_MASK_ZERO    ] = true;
+        p[GGML_OP_CONV_1D           ] = true;
+        p[GGML_OP_CONV_2D           ] = true;
+        p[GGML_OP_FLASH_ATTN_BACK   ] = true;
+        p[GGML_OP_CROSS_ENTROPY_LOSS] = true;
+    }
+
+    {   // FINALIZE
+        bool * p = GGML_OP_HAS_FINALIZE;
+
+        p[GGML_OP_CROSS_ENTROPY_LOSS] = true;
+    }
+}
+
 //
 // ggml context
 //
@@ -4267,6 +4338,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         ggml_cl_init();
 #endif
 
+        ggml_setup_op_has_task_pass();
+
         is_first_call = false;
     }
 
@@ -5403,6 +5476,30 @@ struct ggml_tensor * ggml_mean(
     return result;
 }
 
+// ggml_argmax
+
+struct ggml_tensor * ggml_argmax(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    GGML_ASSERT(ggml_is_matrix(a));
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false);
+        is_node = true;
+    }
+
+    int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
+
+    result->op   = GGML_OP_ARGMAX;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+
+    return result;
+}
+
 // ggml_repeat
 
 struct ggml_tensor * ggml_repeat(
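A small usage sketch for the new `ggml_argmax` operator: it takes a 2-D F32 tensor and produces an I32 tensor with one index per row. The tensor sizes, the 16 MB context, and the graph-compute calls below are assumptions about the ggml API of this vintage, not something this diff specifies; in practice the caller would also fill `x->data` before computing.

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(params);

    /* 4 columns (ne[0]) x 3 rows (ne[1]); ggml_argmax requires a matrix. */
    struct ggml_tensor * x   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * idx = ggml_argmax(ctx, x); /* I32, ne[0] == x->ne[1] == 3 */

    struct ggml_cgraph gf = ggml_build_forward(idx);
    ggml_graph_compute(ctx, &gf); /* idx->data now holds one argmax per row of x */

    ggml_free(ctx);
    return 0;
}
```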
@@ -5596,6 +5693,74 @@ struct ggml_tensor * ggml_step_inplace(
     return ggml_step_impl(ctx, a, true);
 }
 
+// ggml_tanh
+
+struct ggml_tensor * ggml_tanh_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_TANH;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_tanh(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_tanh_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_tanh_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_tanh_impl(ctx, a, true);
+}
+
+// ggml_elu
+
+struct ggml_tensor * ggml_elu_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_ELU;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_elu(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_elu_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_elu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_elu_impl(ctx, a, true);
+}
+
 // ggml_relu
 
 struct ggml_tensor * ggml_relu_impl(
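Element-wise, the two new ops apply exactly the math of `ggml_vec_tanh_f32` and `ggml_vec_elu_f32` from the earlier hunk; restated as standalone scalar helpers (names here are illustrative only):

```c
#include <math.h>

/* tanh(x), and ELU(x) = x for x > 0, exp(x) - 1 otherwise,
 * matching the per-element loops in the vec ops above. */
static inline float tanh_scalar(float x) { return tanhf(x); }
static inline float elu_scalar(float x)  { return (x > 0.f) ? x : expf(x) - 1.f; }
```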
@@ -6837,6 +7002,8 @@ struct ggml_tensor * ggml_rope_back(
         int                   n_dims,
         int                   mode) {
     GGML_ASSERT(n_past >= 0);
+    GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
+
     bool is_node = false;
 
     if (a->grad) {
@@ -6937,15 +7104,21 @@ struct ggml_tensor * ggml_clamp(
     return result;
 }
 
-//
+// ggml_conv_1d
+
+static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
+    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+}
 
-struct ggml_tensor *
+GGML_API struct ggml_tensor * ggml_conv_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor * b
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   p0,
+        int                   d0) {
     GGML_ASSERT(ggml_is_matrix(b));
     GGML_ASSERT(a->ne[1] == b->ne[1]);
-    GGML_ASSERT(a->ne[3] == 1);
     bool is_node = false;
 
     if (a->grad || b->grad) {
@@ -6953,26 +7126,43 @@ struct ggml_tensor * ggml_conv_1d_s1_ph(
         is_node = true;
     }
 
-    const int64_t ne[4] = {
-
+    const int64_t ne[4] = {
+        ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
+        a->ne[2], 1, 1,
+    };
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+
+    ggml_scratch_save(ctx);
+    struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    ((int32_t*)c->data)[0] = s0;
+    ((int32_t*)c->data)[1] = p0;
+    ((int32_t*)c->data)[2] = d0;
+    ggml_scratch_load(ctx);
 
-    result->op
+    result->op   = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = b;
+    result->opt[0] = c;
 
     return result;
 }
 
-//
+// ggml_conv_2d
 
-struct ggml_tensor
-
-
-
-
-
-
+struct ggml_tensor* ggml_conv_2d(
+    struct ggml_context* ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int                  s0,
+    int                  s1,
+    int                  p0,
+    int                  p1,
+    int                  d0,
+    int                  d1) {
+
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
     bool is_node = false;
 
     if (a->grad || b->grad) {
@@ -6980,43 +7170,42 @@ struct ggml_tensor * ggml_conv_1d_s2_ph(
         is_node = true;
     }
 
-    const int64_t ne[4] = {
-
+    const int64_t ne[4] = {
+        ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
+        ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
+        a->ne[3], 1,
+    };
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_scratch_save(ctx);
+    struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
+    ((int32_t*)c->data)[0] = s0;
+    ((int32_t*)c->data)[1] = s1;
+    ((int32_t*)c->data)[2] = p0;
+    ((int32_t*)c->data)[3] = p1;
+    ((int32_t*)c->data)[4] = d0;
+    ((int32_t*)c->data)[5] = d1;
+    ggml_scratch_load(ctx);
 
-    result->op
+    result->op   = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = b;
+    result->opt[0] = c;
 
     return result;
+
 }
 
-//
+// ggml_conv_1d_ph
 
-struct ggml_tensor
+struct ggml_tensor* ggml_conv_1d_ph(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor * b
-
-
-    GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op = GGML_OP_CONV_2D_SK_P0;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
+        struct ggml_tensor  * b,
+        int                   s,
+        int                   d) {
+    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
 }
 
 // ggml_flash_attn
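`ggml_calc_conv_output_size` above is the standard convolution output-length formula in terms of input length, kernel size, stride, padding, and dilation. A quick worked check of the same arithmetic (values chosen arbitrarily; `conv_out` is just a local copy of the formula, not a ggml API):

```c
#include <assert.h>
#include <stdint.h>

/* Same expression as ggml_calc_conv_output_size in the diff above. */
static int64_t conv_out(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2*p - d*(ks - 1) - 1) / s + 1;
}

int main(void) {
    assert(conv_out(10, 3, 1, 0, 1) == 8);   /* length 10, kernel 3, stride 1 -> 8  */
    assert(conv_out(10, 3, 1, 3/2, 1) == 10);/* "half" padding ks/2 keeps the length,
                                                which is what ggml_conv_1d_ph passes */
    assert(conv_out(10, 3, 2, 0, 1) == 4);   /* stride 2: (10 - 2 - 1)/2 + 1 = 4     */
    return 0;
}
```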
@@ -7566,25 +7755,7 @@ static void ggml_compute_forward_dup_f16(
         return;
     }
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
@@ -7657,8 +7828,8 @@ static void ggml_compute_forward_dup_f16(
                     id += ne00 * (ne01 - ir1);
                 }
             }
-        } else if (
-
+        } else if (type_traits[dst->type].from_float) {
+            ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
             float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
             size_t id = 0;
@@ -7855,25 +8026,7 @@ static void ggml_compute_forward_dup_f32(
         return;
     }
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
@@ -7928,26 +8081,8 @@ static void ggml_compute_forward_dup_f32(
                     id += rs * (ne01 - ir1);
                 }
             }
-        } else if (dst->type
-
-            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-            for (int i03 = 0; i03 < ne03; i03++) {
-                for (int i02 = 0; i02 < ne02; i02++) {
-                    id += ne00 * ir0;
-                    for (int i01 = ir0; i01 < ir1; i01++) {
-                        for (int i00 = 0; i00 < ne00; i00++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                            id++;
-                        }
-                    }
-                    id += ne00 * (ne01 - ir1);
-                }
-            }
-        } else if (ggml_is_quantized(dst->type)) {
-            quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+        } else if (type_traits[dst->type].from_float) {
+            ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
 
             size_t id = 0;
             size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
@@ -8171,24 +8306,8 @@ static void ggml_compute_forward_add_f32(
     const int nth = params->nth;
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
 
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
@@ -8257,28 +8376,12 @@ static void ggml_compute_forward_add_f16_f32(
     const int nth = params->nth;
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
 
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
 
     GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8327,24 +8430,8 @@ static void ggml_compute_forward_add_f16_f16(
     const int nth = params->nth;
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
 
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F16);
@@ -8394,32 +8481,15 @@ static void ggml_compute_forward_add_q_f32(
     }
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
 
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     const int ith = params->ith;
     const int nth = params->nth;
 
     const enum ggml_type type = src0->type;
-
-
+    ggml_to_float_t   const dequantize_row_q = type_traits[type].to_float;
+    ggml_from_float_t const quantize_row_q   = type_traits[type].from_float;
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
@@ -8533,19 +8603,8 @@ static void ggml_compute_forward_add1_f32(
     const int nth = params->nth;
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
 
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
@@ -8599,23 +8658,12 @@ static void ggml_compute_forward_add1_f16_f32(
     const int nth = params->nth;
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
 
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
 
     GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8660,23 +8708,12 @@ static void ggml_compute_forward_add1_f16_f16(
     const int nth = params->nth;
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
 
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
 
     GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
@@ -8721,23 +8758,12 @@ static void ggml_compute_forward_add1_q_f32(
     const int nth = params->nth;
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
 
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     const enum ggml_type type = src0->type;
-
-
+    ggml_to_float_t   const dequantize_row_q = type_traits[type].to_float;
+    ggml_from_float_t const quantize_row_q   = type_traits[type].from_float;
 
     // we don't support permuted src0
     GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
@@ -8865,15 +8891,8 @@ static void ggml_compute_forward_acc_f32(
     const int nr = ggml_nrows(src1);
     const int nc = src1->ne[0];
 
-
-
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
 
     // src0 and dst as viewed during acc
     const size_t nb0 = ggml_element_size(src0);
@@ -8962,24 +8981,8 @@ static void ggml_compute_forward_sub_f32(
     }
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
 
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
@@ -9069,29 +9072,7 @@ static void ggml_compute_forward_mul_f32(
 
     const int64_t nr = ggml_nrows(src0);
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
@@ -9179,24 +9160,8 @@ static void ggml_compute_forward_div_f32(
     }
 
     const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
 
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
@@ -9403,14 +9368,8 @@ static void ggml_compute_forward_sum_f32(
     assert(ggml_is_scalar(dst));
     assert(src0->nb[0] == sizeof(float));
 
-
-
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb);
 
     ggml_float sum     = 0;
     ggml_float row_sum = 0;
@@ -9459,29 +9418,13 @@ static void ggml_compute_forward_sum_rows_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));
     GGML_ASSERT(dst->nb[0] == sizeof(float));
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     GGML_ASSERT(ne0 == 1);
     GGML_ASSERT(ne1 == ne01);
     GGML_ASSERT(ne2 == ne02);
     GGML_ASSERT(ne3 == ne03);
 
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
     for (int64_t i3 = 0; i3 < ne03; i3++) {
         for (int64_t i2 = 0; i2 < ne02; i2++) {
             for (int64_t i1 = 0; i1 < ne01; i1++) {
@@ -9525,19 +9468,7 @@ static void ggml_compute_forward_mean_f32(
 
     assert(src0->nb[0] == sizeof(float));
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     assert(ne0 == 1);
     assert(ne1 == ne01);
@@ -9549,10 +9480,6 @@ static void ggml_compute_forward_mean_f32(
     UNUSED(ne2);
     UNUSED(ne3);
 
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
@@ -9582,38 +9509,66 @@ static void ggml_compute_forward_mean(
     }
 }
 
-//
+// ggml_compute_forward_argmax
 
-static void
+static void ggml_compute_forward_argmax_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
-
-    GGML_ASSERT(ggml_can_repeat(src0, dst));
+    assert(params->ith == 0);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-
-
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
+    assert(src0->nb[0] == sizeof(float));
+    assert(dst->nb[0] == sizeof(float));
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
 
-    const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];
-    const size_t
-
+    const size_t nb0  = dst->nb[0];
+
+    for (int64_t i1 = 0; i1 < ne01; i1++) {
+        float * src = (float *) ((char *) src0->data + i1*nb01);
+        int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0);
+        int v = 0;
+        ggml_vec_argmax_f32(ne00, &v, src);
+        dst_[0] = v;
+    }
+}
+
+static void ggml_compute_forward_argmax(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argmax_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_repeat
+
+static void ggml_compute_forward_repeat_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(params->ith == 0);
+    GGML_ASSERT(ggml_can_repeat(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     // guaranteed to be an integer due to the check in ggml_can_repeat
     const int nr0 = (int)(ne0/ne00);
@@ -9674,25 +9629,7 @@ static void ggml_compute_forward_repeat_back_f32(
         return;
     }
 
-
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     // guaranteed to be an integer due to the check in ggml_can_repeat
     const int nr0 = (int)(ne00/ne0);
@@ -9922,6 +9859,90 @@ static void ggml_compute_forward_step(
     }
 }
 
+// ggml_compute_forward_tanh
+
+static void ggml_compute_forward_tanh_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_tanh_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_tanh(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_tanh_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_elu
+
+static void ggml_compute_forward_elu_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_elu_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_elu(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_elu_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_relu
 
 static void ggml_compute_forward_relu_f32(
@@ -10223,18 +10244,7 @@ static void ggml_compute_forward_norm_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     const float eps = 1e-5f; // TODO: make this a parameter
 
@@ -10300,18 +10310,7 @@ static void ggml_compute_forward_rms_norm_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
     const float eps = 1e-6f; // TODO: make this a parameter
 
@@ -10376,22 +10375,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
-
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     const float eps = 1e-6f; // TODO: make this a parameter
 
@@ -10541,416 +10525,45 @@ static void ggml_compute_forward_rms_norm_back(
             {
                 ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
             } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-
-// ggml_compute_forward_mul_mat
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
-static void ggml_compute_forward_mul_mat_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    const int64_t ne10 = src1->ne[0];
-#endif
-    const int64_t ne11 = src1->ne[1];
-#ifndef NDEBUG
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int nb00 = src0->nb[0];
-#endif
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-#ifndef NDEBUG
-    const int nb10 = src1->nb[0];
-#endif
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    assert(ne02 == ne12);
-    assert(ne03 == ne13);
-    assert(ne2  == ne12);
-    assert(ne3  == ne13);
-
-    // we don't support permuted src0 or src1
-    assert(nb00 == sizeof(float));
-    assert(nb10 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    assert(nb0 == sizeof(float));
-    assert(nb0 <= nb1);
-    assert(nb1 <= nb2);
-    assert(nb2 <= nb3);
-
-    assert(ne0 == ne01);
-    assert(ne1 == ne11);
-    assert(ne2 == ne02);
-    assert(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-#if defined(GGML_USE_CLBLAST)
-    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        if (params->ith != 0) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_INIT) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_FINALIZE) {
-            return;
-        }
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne00,
-                        0.0f,    d, ne01);
-            }
-        }
-        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by src0 rows using ggml_vec_dot_f32
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            // src1 indices
-            const int i13 = i03;
-            const int i12 = i02;
-            const int i11 = ic;
-
-            // dst indices
-            const int i0 = i01;
-            const int i1 = i11;
-            const int i2 = i02;
-            const int i3 = i03;
-
-            ggml_vec_dot_f32(ne00,
-                    (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
-                    (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
-        }
-    }
-
-    //int64_t t1 = ggml_perf_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
|
10745
|
-
// printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
|
10746
|
-
// printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
|
10747
|
-
// printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
|
10748
|
-
|
10749
|
-
// printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
|
10750
|
-
//}
|
10751
|
-
}
|
10752
|
-
|
10753
|
-
static void ggml_compute_forward_mul_mat_f16_f32(
|
10754
|
-
const struct ggml_compute_params * params,
|
10755
|
-
const struct ggml_tensor * src0,
|
10756
|
-
const struct ggml_tensor * src1,
|
10757
|
-
struct ggml_tensor * dst) {
|
10758
|
-
int64_t t0 = ggml_perf_time_us();
|
10759
|
-
UNUSED(t0);
|
10760
|
-
|
10761
|
-
const int64_t ne00 = src0->ne[0];
|
10762
|
-
const int64_t ne01 = src0->ne[1];
|
10763
|
-
const int64_t ne02 = src0->ne[2];
|
10764
|
-
const int64_t ne03 = src0->ne[3];
|
10765
|
-
|
10766
|
-
const int64_t ne10 = src1->ne[0];
|
10767
|
-
const int64_t ne11 = src1->ne[1];
|
10768
|
-
const int64_t ne12 = src1->ne[2];
|
10769
|
-
const int64_t ne13 = src1->ne[3];
|
10770
|
-
|
10771
|
-
const int64_t ne0 = dst->ne[0];
|
10772
|
-
const int64_t ne1 = dst->ne[1];
|
10773
|
-
const int64_t ne2 = dst->ne[2];
|
10774
|
-
const int64_t ne3 = dst->ne[3];
|
10775
|
-
//const int64_t ne = ne0*ne1*ne2*ne3;
|
10776
|
-
|
10777
|
-
const int nb00 = src0->nb[0];
|
10778
|
-
const int nb01 = src0->nb[1];
|
10779
|
-
const int nb02 = src0->nb[2];
|
10780
|
-
const int nb03 = src0->nb[3];
|
10781
|
-
|
10782
|
-
const int nb10 = src1->nb[0];
|
10783
|
-
const int nb11 = src1->nb[1];
|
10784
|
-
const int nb12 = src1->nb[2];
|
10785
|
-
const int nb13 = src1->nb[3];
|
10786
|
-
|
10787
|
-
const int nb0 = dst->nb[0];
|
10788
|
-
const int nb1 = dst->nb[1];
|
10789
|
-
const int nb2 = dst->nb[2];
|
10790
|
-
const int nb3 = dst->nb[3];
|
10791
|
-
|
10792
|
-
const int ith = params->ith;
|
10793
|
-
const int nth = params->nth;
|
10794
|
-
|
10795
|
-
GGML_ASSERT(ne02 == ne12);
|
10796
|
-
GGML_ASSERT(ne03 == ne13);
|
10797
|
-
GGML_ASSERT(ne2 == ne12);
|
10798
|
-
GGML_ASSERT(ne3 == ne13);
|
10799
|
-
|
10800
|
-
// TODO: we don't support permuted src0
|
10801
|
-
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
10802
|
-
|
10803
|
-
// dst cannot be transposed or permuted
|
10804
|
-
GGML_ASSERT(nb0 == sizeof(float));
|
10805
|
-
GGML_ASSERT(nb0 <= nb1);
|
10806
|
-
GGML_ASSERT(nb1 <= nb2);
|
10807
|
-
GGML_ASSERT(nb2 <= nb3);
|
10808
|
-
|
10809
|
-
GGML_ASSERT(ne0 == ne01);
|
10810
|
-
GGML_ASSERT(ne1 == ne11);
|
10811
|
-
GGML_ASSERT(ne2 == ne02);
|
10812
|
-
GGML_ASSERT(ne3 == ne03);
|
10813
|
-
|
10814
|
-
// nb01 >= nb00 - src0 is not transposed
|
10815
|
-
// compute by src0 rows
|
10816
|
-
|
10817
|
-
#if defined(GGML_USE_CLBLAST)
|
10818
|
-
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
10819
|
-
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
10820
|
-
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
10821
|
-
}
|
10822
|
-
return;
|
10823
|
-
}
|
10824
|
-
#endif
|
10825
|
-
|
10826
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
10827
|
-
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
10828
|
-
GGML_ASSERT(nb10 == sizeof(float));
|
10829
|
-
|
10830
|
-
if (params->ith != 0) {
|
10831
|
-
return;
|
10832
|
-
}
|
10833
|
-
|
10834
|
-
if (params->type == GGML_TASK_INIT) {
|
10835
|
-
return;
|
10836
|
-
}
|
10837
|
-
|
10838
|
-
if (params->type == GGML_TASK_FINALIZE) {
|
10839
|
-
return;
|
10840
|
-
}
|
10841
|
-
|
10842
|
-
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
10843
|
-
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
10844
|
-
float * const wdata = params->wdata;
|
10845
|
-
{
|
10846
|
-
size_t id = 0;
|
10847
|
-
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
10848
|
-
for (int64_t i00 = 0; i00 < ne00; ++i00) {
|
10849
|
-
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
10850
|
-
}
|
10851
|
-
}
|
10852
|
-
|
10853
|
-
assert(id*sizeof(float) <= params->wsize);
|
10854
|
-
}
|
10855
|
-
|
10856
|
-
const float * x = wdata;
|
10857
|
-
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
10858
|
-
|
10859
|
-
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
10860
|
-
|
10861
|
-
// zT = y * xT
|
10862
|
-
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
10863
|
-
ne11, ne01, ne10,
|
10864
|
-
1.0f, y, ne10,
|
10865
|
-
x, ne00,
|
10866
|
-
0.0f, d, ne01);
|
10867
|
-
}
|
10868
|
-
}
|
10869
|
-
|
10870
|
-
/*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
|
10871
|
-
|
10872
|
-
return;
|
10873
|
-
}
|
10874
|
-
#endif
|
10875
|
-
|
10876
|
-
if (params->type == GGML_TASK_INIT) {
|
10877
|
-
ggml_fp16_t * const wdata = params->wdata;
|
10878
|
-
|
10879
|
-
size_t id = 0;
|
10880
|
-
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
10881
|
-
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
10882
|
-
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
10883
|
-
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
10884
|
-
wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
|
10885
|
-
}
|
10886
|
-
}
|
10887
|
-
}
|
10888
|
-
}
|
10889
|
-
|
10890
|
-
GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize);
|
10891
|
-
|
10892
|
-
return;
|
10893
|
-
}
|
10894
|
-
|
10895
|
-
if (params->type == GGML_TASK_FINALIZE) {
|
10896
|
-
return;
|
10528
|
+
default:
|
10529
|
+
{
|
10530
|
+
GGML_ASSERT(false);
|
10531
|
+
} break;
|
10897
10532
|
}
|
10533
|
+
}
|
10898
10534
|
|
10899
|
-
// fp16 -> half the size, so divide by 2
|
10900
|
-
// TODO: do not support transposed src1
|
10901
|
-
assert(nb10/2 == sizeof(ggml_fp16_t));
|
10902
|
-
|
10903
|
-
// parallelize by src0 rows using ggml_vec_dot_f16
|
10904
|
-
|
10905
|
-
// total rows in src0
|
10906
|
-
const int nr = ne01*ne02*ne03;
|
10907
|
-
|
10908
|
-
// rows per thread
|
10909
|
-
const int dr = (nr + nth - 1)/nth;
|
10910
|
-
|
10911
|
-
// row range for this thread
|
10912
|
-
const int ir0 = dr*ith;
|
10913
|
-
const int ir1 = MIN(ir0 + dr, nr);
|
10914
|
-
|
10915
|
-
ggml_fp16_t * wdata = params->wdata;
|
10916
10535
|
|
10917
|
-
|
10918
|
-
// src0 indices
|
10919
|
-
const int i03 = ir/(ne02*ne01);
|
10920
|
-
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
10921
|
-
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
10536
|
+
// ggml_compute_forward_mul_mat
|
10922
10537
|
|
10923
|
-
|
10924
|
-
|
10538
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
10539
|
+
// helper function to determine if it is better to use BLAS or not
|
10540
|
+
// for large matrices, BLAS is faster
|
10541
|
+
static bool ggml_compute_forward_mul_mat_use_blas(
|
10542
|
+
const struct ggml_tensor * src0,
|
10543
|
+
const struct ggml_tensor * src1,
|
10544
|
+
struct ggml_tensor * dst) {
|
10545
|
+
//const int64_t ne00 = src0->ne[0];
|
10546
|
+
//const int64_t ne01 = src0->ne[1];
|
10925
10547
|
|
10926
|
-
|
10927
|
-
const int i2 = i02;
|
10928
|
-
const int i3 = i03;
|
10548
|
+
const int64_t ne10 = src1->ne[0];
|
10929
10549
|
|
10930
|
-
|
10931
|
-
|
10550
|
+
const int64_t ne0 = dst->ne[0];
|
10551
|
+
const int64_t ne1 = dst->ne[1];
|
10932
10552
|
|
10933
|
-
|
10553
|
+
// TODO: find the optimal values for these
|
10554
|
+
if (ggml_is_contiguous(src0) &&
|
10555
|
+
ggml_is_contiguous(src1) &&
|
10556
|
+
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
10934
10557
|
|
10935
|
-
|
10936
|
-
|
10937
|
-
}
|
10558
|
+
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
10559
|
+
return true;
|
10938
10560
|
}
|
10939
10561
|
|
10940
|
-
|
10941
|
-
//static int64_t acc = 0;
|
10942
|
-
//acc += t1 - t0;
|
10943
|
-
//if (t1 - t0 > 10) {
|
10944
|
-
// printf("\n");
|
10945
|
-
// printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
|
10946
|
-
// printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
|
10947
|
-
// printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
|
10948
|
-
|
10949
|
-
// printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
|
10950
|
-
//}
|
10562
|
+
return false;
|
10951
10563
|
}
|
10564
|
+
#endif
|
10952
10565
|
|
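Note on the helper kept above: ggml_compute_forward_mul_mat_use_blas routes to BLAS only for contiguous operands whose dimensions are all at least 32. A minimal standalone sketch of that check, with the thresholds taken from the diff and everything else illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Sketch of the heuristic shown above: BLAS is only worth the dispatch
// overhead for contiguous operands with every dimension >= 32.
static bool use_blas(bool contig0, bool contig1,
                     int64_t ne0, int64_t ne1, int64_t ne10) {
    return contig0 && contig1 && ne0 >= 32 && ne1 >= 32 && ne10 >= 32;
}

int main(void) {
    printf("%d\n", use_blas(true, true, 4096, 512, 4096)); // 1: large, use BLAS
    printf("%d\n", use_blas(true, true, 4096, 1, 4096));   // 0: ne1 too small
    return 0;
}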
10953
|
-
static void
|
10566
|
+
static void ggml_compute_forward_mul_mat(
|
10954
10567
|
const struct ggml_compute_params * params,
|
10955
10568
|
const struct ggml_tensor * src0,
|
10956
10569
|
const struct ggml_tensor * src1,
|
@@ -10958,35 +10571,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
10958
10571
|
int64_t t0 = ggml_perf_time_us();
|
10959
10572
|
UNUSED(t0);
|
10960
10573
|
|
10961
|
-
|
10962
|
-
const int64_t ne01 = src0->ne[1];
|
10963
|
-
const int64_t ne02 = src0->ne[2];
|
10964
|
-
const int64_t ne03 = src0->ne[3];
|
10965
|
-
|
10966
|
-
const int64_t ne10 = src1->ne[0];
|
10967
|
-
const int64_t ne11 = src1->ne[1];
|
10968
|
-
const int64_t ne12 = src1->ne[2];
|
10969
|
-
const int64_t ne13 = src1->ne[3];
|
10970
|
-
|
10971
|
-
const int64_t ne0 = dst->ne[0];
|
10972
|
-
const int64_t ne1 = dst->ne[1];
|
10973
|
-
const int64_t ne2 = dst->ne[2];
|
10974
|
-
const int64_t ne3 = dst->ne[3];
|
10975
|
-
|
10976
|
-
const int nb00 = src0->nb[0];
|
10977
|
-
const int nb01 = src0->nb[1];
|
10978
|
-
const int nb02 = src0->nb[2];
|
10979
|
-
const int nb03 = src0->nb[3];
|
10980
|
-
|
10981
|
-
const int nb10 = src1->nb[0];
|
10982
|
-
const int nb11 = src1->nb[1];
|
10983
|
-
const int nb12 = src1->nb[2];
|
10984
|
-
const int nb13 = src1->nb[3];
|
10985
|
-
|
10986
|
-
const int nb0 = dst->nb[0];
|
10987
|
-
const int nb1 = dst->nb[1];
|
10988
|
-
const int nb2 = dst->nb[2];
|
10989
|
-
const int nb3 = dst->nb[3];
|
10574
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
10990
10575
|
|
10991
10576
|
const int ith = params->ith;
|
10992
10577
|
const int nth = params->nth;
|
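The hand-written ne*/nb* declarations deleted above collapse into a single GGML_TENSOR_BINARY_OP_LOCALS line. A simplified, self-contained sketch of the underlying pattern (toy names, not the ggml macro itself): one token-pasting macro declares a local per dimension of a tensor field.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct toy_tensor { int64_t ne[4]; size_t nb[4]; };

// Declare prefix0..prefix3 from a 4-element array field of a tensor.
#define TOY_TENSOR_LOCALS(type, prefix, t, field) \
    const type prefix##0 = (t)->field[0];         \
    const type prefix##1 = (t)->field[1];         \
    const type prefix##2 = (t)->field[2];         \
    const type prefix##3 = (t)->field[3];

int main(void) {
    struct toy_tensor dst = { { 2, 3, 4, 5 }, { 4, 8, 24, 96 } };
    TOY_TENSOR_LOCALS(int64_t, ne, &dst, ne);  // ne0..ne3
    TOY_TENSOR_LOCALS(size_t,  nb, &dst, nb);  // nb0..nb3
    printf("ne0=%lld nb3=%zu\n", (long long) ne0, nb3);
    return 0;
}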
@@ -10997,12 +10582,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
10997
10582
|
GGML_ASSERT(ne3 == ne13);
|
10998
10583
|
|
10999
10584
|
const enum ggml_type type = src0->type;
|
11000
|
-
|
11001
|
-
|
11002
|
-
enum ggml_type
|
10585
|
+
|
10586
|
+
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
10587
|
+
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
10588
|
+
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
11003
10589
|
|
11004
10590
|
// we don't support permuted src0 or src1
|
11005
|
-
GGML_ASSERT(nb00 ==
|
10591
|
+
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
|
11006
10592
|
GGML_ASSERT(nb10 == sizeof(float));
|
11007
10593
|
|
11008
10594
|
// dst cannot be transposed or permuted
|
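The separate f32/f16/quantized mul_mat variants are replaced by one function that looks up vec_dot, vec_dot_type and from_float once from a traits table indexed by src0->type. A reduced, runnable illustration of that dispatch pattern; the table and names below are stand-ins, not the ggml type_traits API:

#include <stdio.h>

typedef void (*vec_dot_fn)(int n, float * s, const float * x, const float * y);

static void dot_f32(int n, float * s, const float * x, const float * y) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i]*y[i];
    *s = sum;
}

// One traits entry per element type; the kernel fetches the function
// pointer once and reuses it for every row.
struct traits { vec_dot_fn vec_dot; };
enum { TYPE_F32, TYPE_COUNT };
static const struct traits type_table[TYPE_COUNT] = { [TYPE_F32] = { dot_f32 } };

int main(void) {
    const float row[4] = { 1, 2, 3, 4 };
    const float col[4] = { 1, 1, 1, 1 };
    float s = 0.0f;
    vec_dot_fn const vec_dot = type_table[TYPE_F32].vec_dot;
    vec_dot(4, &s, row, col);
    printf("dot = %f\n", s);   // 10.0
    return 0;
}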
@@ -11042,27 +10628,27 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
11042
10628
|
return;
|
11043
10629
|
}
|
11044
10630
|
|
11045
|
-
float * const wdata = params->wdata;
|
11046
|
-
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
|
11047
|
-
|
11048
10631
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
11049
10632
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
10633
|
+
const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
|
11050
10634
|
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
11051
10635
|
|
11052
10636
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
11053
10637
|
|
11054
|
-
{
|
10638
|
+
if (type != GGML_TYPE_F32) {
|
10639
|
+
float * const wdata = params->wdata;
|
10640
|
+
ggml_to_float_t const to_float = type_traits[type].to_float;
|
10641
|
+
|
11055
10642
|
size_t id = 0;
|
11056
10643
|
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
11057
|
-
|
10644
|
+
to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
|
11058
10645
|
id += ne00;
|
11059
10646
|
}
|
11060
10647
|
|
11061
10648
|
assert(id*sizeof(float) <= params->wsize);
|
10649
|
+
x = wdata;
|
11062
10650
|
}
|
11063
10651
|
|
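In the BLAS branch above, dequantization is now done lazily: only when src0 is not already F32 is each row expanded into the scratch buffer and x repointed at it. A small stand-alone sketch of that buffer swap; the converter below just copies floats and stands in for type_traits[type].to_float:

#include <stdio.h>
#include <string.h>

static void to_float(const void * src, float * dst, int n) {
    memcpy(dst, src, (size_t) n * sizeof(float));  // pretend dequantize
}

int main(void) {
    const float src0[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
    float wdata[6];
    size_t id = 0;
    for (int i01 = 0; i01 < 2; ++i01) {
        to_float(src0[i01], wdata + id, 3);
        id += 3;
    }
    const float * x = wdata;   // BLAS reads the converted copy
    printf("x[0]=%.1f x[5]=%.1f\n", x[0], x[5]);
    return 0;
}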
11064
|
-
const float * x = wdata;
|
11065
|
-
|
11066
10652
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
11067
10653
|
ne11, ne01, ne10,
|
11068
10654
|
1.0f, y, ne10,
|
@@ -11078,14 +10664,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
11078
10664
|
#endif
|
11079
10665
|
|
11080
10666
|
if (params->type == GGML_TASK_INIT) {
|
11081
|
-
|
11082
|
-
|
11083
|
-
|
11084
|
-
|
11085
|
-
for (int64_t
|
11086
|
-
for (int64_t
|
11087
|
-
|
11088
|
-
|
10667
|
+
if (src1->type != vec_dot_type) {
|
10668
|
+
char * wdata = params->wdata;
|
10669
|
+
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
|
10670
|
+
|
10671
|
+
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
10672
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
10673
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
10674
|
+
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
|
10675
|
+
wdata += row_size;
|
10676
|
+
}
|
11089
10677
|
}
|
11090
10678
|
}
|
11091
10679
|
}
|
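During GGML_TASK_INIT each src1 row is now converted to vec_dot_type into params->wdata, advancing a byte pointer by row_size per row. A compact sketch of that loop shape; the converter is a stand-in for from_float_to_vec_dot:

#include <stdio.h>
#include <string.h>

static void from_float_stub(const float * x, void * y, int n) {
    memcpy(y, x, (size_t) n * sizeof(float));  // real code quantizes here
}

int main(void) {
    const int ne10 = 4, ne11 = 2;                // row length, number of rows
    float src1[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };
    char  wbuf[2*4*sizeof(float)];
    const size_t row_size = ne10*sizeof(float);  // ne10*TYPE_SIZE/BLCK_SIZE in the diff
    char * wdata = wbuf;
    for (int i11 = 0; i11 < ne11; ++i11) {
        from_float_stub(src1[i11], wdata, ne10);
        wdata += row_size;                       // next converted row
    }
    printf("converted %d rows of %zu bytes\n", ne11, row_size);
    return 0;
}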
@@ -11109,7 +10697,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
11109
10697
|
const int ir0 = dr*ith;
|
11110
10698
|
const int ir1 = MIN(ir0 + dr, nr);
|
11111
10699
|
|
11112
|
-
void * wdata = params->wdata;
|
10700
|
+
void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
11113
10701
|
const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
|
11114
10702
|
|
11115
10703
|
for (int ir = ir0; ir < ir1; ++ir) {
|
@@ -11133,7 +10721,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
11133
10721
|
assert(ne00 % 32 == 0);
|
11134
10722
|
|
11135
10723
|
for (int64_t ic = 0; ic < ne11; ++ic) {
|
11136
|
-
|
10724
|
+
vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
|
11137
10725
|
}
|
11138
10726
|
}
|
11139
10727
|
|
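The compute phase keeps the usual row split: nr rows are divided into chunks of dr = (nr + nth - 1)/nth and thread ith handles the half-open range [ir0, ir1). A tiny demo of that partitioning:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int nr = 100, nth = 8;           // total rows, number of threads
    const int dr = (nr + nth - 1)/nth;     // rows per thread, rounded up
    for (int ith = 0; ith < nth; ++ith) {
        const int ir0 = dr*ith;
        const int ir1 = MIN(ir0 + dr, nr);
        printf("thread %d: rows [%3d, %3d)\n", ith, ir0, ir1);
    }
    return 0;
}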
@@ -11150,40 +10738,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
11150
10738
|
//}
|
11151
10739
|
}
|
11152
10740
|
|
11153
|
-
static void ggml_compute_forward_mul_mat(
|
11154
|
-
const struct ggml_compute_params * params,
|
11155
|
-
const struct ggml_tensor * src0,
|
11156
|
-
const struct ggml_tensor * src1,
|
11157
|
-
struct ggml_tensor * dst) {
|
11158
|
-
switch (src0->type) {
|
11159
|
-
case GGML_TYPE_Q4_0:
|
11160
|
-
case GGML_TYPE_Q4_1:
|
11161
|
-
case GGML_TYPE_Q5_0:
|
11162
|
-
case GGML_TYPE_Q5_1:
|
11163
|
-
case GGML_TYPE_Q8_0:
|
11164
|
-
case GGML_TYPE_Q8_1:
|
11165
|
-
case GGML_TYPE_Q2_K:
|
11166
|
-
case GGML_TYPE_Q3_K:
|
11167
|
-
case GGML_TYPE_Q4_K:
|
11168
|
-
case GGML_TYPE_Q5_K:
|
11169
|
-
case GGML_TYPE_Q6_K:
|
11170
|
-
{
|
11171
|
-
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
|
11172
|
-
} break;
|
11173
|
-
case GGML_TYPE_F16:
|
11174
|
-
{
|
11175
|
-
ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst);
|
11176
|
-
} break;
|
11177
|
-
case GGML_TYPE_F32:
|
11178
|
-
{
|
11179
|
-
ggml_compute_forward_mul_mat_f32(params, src0, src1, dst);
|
11180
|
-
} break;
|
11181
|
-
default:
|
11182
|
-
{
|
11183
|
-
GGML_ASSERT(false);
|
11184
|
-
} break;
|
11185
|
-
}
|
11186
|
-
}
|
11187
10741
|
|
11188
10742
|
// ggml_compute_forward_out_prod
|
11189
10743
|
|
@@ -11196,35 +10750,7 @@ static void ggml_compute_forward_out_prod_f32(
|
|
11196
10750
|
int64_t t0 = ggml_perf_time_us();
|
11197
10751
|
UNUSED(t0);
|
11198
10752
|
|
11199
|
-
|
11200
|
-
const int64_t ne01 = src0->ne[1];
|
11201
|
-
const int64_t ne02 = src0->ne[2];
|
11202
|
-
const int64_t ne03 = src0->ne[3];
|
11203
|
-
|
11204
|
-
const int64_t ne10 = src1->ne[0];
|
11205
|
-
//const int64_t ne11 = src1->ne[1];
|
11206
|
-
const int64_t ne12 = src1->ne[2];
|
11207
|
-
const int64_t ne13 = src1->ne[3];
|
11208
|
-
|
11209
|
-
const int64_t ne0 = dst->ne[0];
|
11210
|
-
const int64_t ne1 = dst->ne[1];
|
11211
|
-
const int64_t ne2 = dst->ne[2];
|
11212
|
-
const int64_t ne3 = dst->ne[3];
|
11213
|
-
|
11214
|
-
const int nb00 = src0->nb[0];
|
11215
|
-
const int nb01 = src0->nb[1];
|
11216
|
-
const int nb02 = src0->nb[2];
|
11217
|
-
const int nb03 = src0->nb[3];
|
11218
|
-
|
11219
|
-
const int nb10 = src1->nb[0];
|
11220
|
-
const int nb11 = src1->nb[1];
|
11221
|
-
const int nb12 = src1->nb[2];
|
11222
|
-
const int nb13 = src1->nb[3];
|
11223
|
-
|
11224
|
-
const int nb0 = dst->nb[0];
|
11225
|
-
const int nb1 = dst->nb[1];
|
11226
|
-
const int nb2 = dst->nb[2];
|
11227
|
-
const int nb3 = dst->nb[3];
|
10753
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
11228
10754
|
|
11229
10755
|
const int ith = params->ith;
|
11230
10756
|
const int nth = params->nth;
|
@@ -11459,15 +10985,8 @@ static void ggml_compute_forward_set_f32(
|
|
11459
10985
|
const int nr = ggml_nrows(src1);
|
11460
10986
|
const int nc = src1->ne[0];
|
11461
10987
|
|
11462
|
-
|
11463
|
-
|
11464
|
-
const int64_t ne12 = src1->ne[2];
|
11465
|
-
const int64_t ne13 = src1->ne[3];
|
11466
|
-
|
11467
|
-
const size_t nb10 = src1->nb[0];
|
11468
|
-
const size_t nb11 = src1->nb[1];
|
11469
|
-
const size_t nb12 = src1->nb[2];
|
11470
|
-
const size_t nb13 = src1->nb[3];
|
10988
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
10989
|
+
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
|
11471
10990
|
|
11472
10991
|
// src0 and dst as viewed during set
|
11473
10992
|
const size_t nb0 = ggml_element_size(src0);
|
@@ -11608,7 +11127,7 @@ static void ggml_compute_forward_get_rows_q(
|
|
11608
11127
|
const int nc = src0->ne[0];
|
11609
11128
|
const int nr = ggml_nelements(src1);
|
11610
11129
|
const enum ggml_type type = src0->type;
|
11611
|
-
|
11130
|
+
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
|
11612
11131
|
|
11613
11132
|
assert( dst->ne[0] == nc);
|
11614
11133
|
assert( dst->ne[1] == nr);
|
@@ -11858,29 +11377,14 @@ static void ggml_compute_forward_diag_f32(
|
|
11858
11377
|
|
11859
11378
|
// TODO: handle transposed/permuted matrices
|
11860
11379
|
|
11861
|
-
|
11862
|
-
|
11863
|
-
const int ne02 = src0->ne[2];
|
11864
|
-
const int ne03 = src0->ne[3];
|
11865
|
-
const int ne0 = dst->ne[0];
|
11866
|
-
const int ne1 = dst->ne[1];
|
11867
|
-
const int ne2 = dst->ne[2];
|
11868
|
-
const int ne3 = dst->ne[3];
|
11380
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
11381
|
+
|
11869
11382
|
GGML_ASSERT(ne00 == ne0);
|
11870
11383
|
GGML_ASSERT(ne00 == ne1);
|
11871
11384
|
GGML_ASSERT(ne01 == 1);
|
11872
11385
|
GGML_ASSERT(ne02 == ne2);
|
11873
11386
|
GGML_ASSERT(ne03 == ne3);
|
11874
11387
|
|
11875
|
-
const int nb00 = src0->nb[0];
|
11876
|
-
//const int nb01 = src0->nb[1];
|
11877
|
-
const int nb02 = src0->nb[2];
|
11878
|
-
const int nb03 = src0->nb[3];
|
11879
|
-
const int nb0 = dst->nb[0];
|
11880
|
-
const int nb1 = dst->nb[1];
|
11881
|
-
const int nb2 = dst->nb[2];
|
11882
|
-
const int nb3 = dst->nb[3];
|
11883
|
-
|
11884
11388
|
GGML_ASSERT(nb00 == sizeof(float));
|
11885
11389
|
GGML_ASSERT(nb0 == sizeof(float));
|
11886
11390
|
|
@@ -12457,20 +11961,7 @@ static void ggml_compute_forward_rope_f32(
|
|
12457
11961
|
|
12458
11962
|
assert(n_past >= 0);
|
12459
11963
|
|
12460
|
-
|
12461
|
-
const size_t nb01 = src0->nb[1];
|
12462
|
-
const size_t nb02 = src0->nb[2];
|
12463
|
-
const size_t nb03 = src0->nb[3];
|
12464
|
-
|
12465
|
-
const int64_t ne0 = dst->ne[0];
|
12466
|
-
const int64_t ne1 = dst->ne[1];
|
12467
|
-
const int64_t ne2 = dst->ne[2];
|
12468
|
-
const int64_t ne3 = dst->ne[3];
|
12469
|
-
|
12470
|
-
const size_t nb0 = dst->nb[0];
|
12471
|
-
const size_t nb1 = dst->nb[1];
|
12472
|
-
const size_t nb2 = dst->nb[2];
|
12473
|
-
const size_t nb3 = dst->nb[3];
|
11964
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
12474
11965
|
|
12475
11966
|
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
12476
11967
|
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
|
@@ -12597,20 +12088,7 @@ static void ggml_compute_forward_rope_f16(
|
|
12597
12088
|
|
12598
12089
|
assert(n_past >= 0);
|
12599
12090
|
|
12600
|
-
|
12601
|
-
const size_t nb01 = src0->nb[1];
|
12602
|
-
const size_t nb02 = src0->nb[2];
|
12603
|
-
const size_t nb03 = src0->nb[3];
|
12604
|
-
|
12605
|
-
const int64_t ne0 = dst->ne[0];
|
12606
|
-
const int64_t ne1 = dst->ne[1];
|
12607
|
-
const int64_t ne2 = dst->ne[2];
|
12608
|
-
const int64_t ne3 = dst->ne[3];
|
12609
|
-
|
12610
|
-
const size_t nb0 = dst->nb[0];
|
12611
|
-
const size_t nb1 = dst->nb[1];
|
12612
|
-
const size_t nb2 = dst->nb[2];
|
12613
|
-
const size_t nb3 = dst->nb[3];
|
12091
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
12614
12092
|
|
12615
12093
|
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
12616
12094
|
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
|
@@ -12763,21 +12241,7 @@ static void ggml_compute_forward_rope_back_f32(
|
|
12763
12241
|
|
12764
12242
|
assert(n_past >= 0);
|
12765
12243
|
|
12766
|
-
|
12767
|
-
const size_t nb01 = src0->nb[1];
|
12768
|
-
const size_t nb02 = src0->nb[2];
|
12769
|
-
const size_t nb03 = src0->nb[3];
|
12770
|
-
|
12771
|
-
const int64_t ne0 = dst->ne[0];
|
12772
|
-
const int64_t ne1 = dst->ne[1];
|
12773
|
-
const int64_t ne2 = dst->ne[2];
|
12774
|
-
const int64_t ne3 = dst->ne[3];
|
12775
|
-
|
12776
|
-
const size_t nb0 = dst->nb[0];
|
12777
|
-
const size_t nb1 = dst->nb[1];
|
12778
|
-
const size_t nb2 = dst->nb[2];
|
12779
|
-
const size_t nb3 = dst->nb[3];
|
12780
|
-
|
12244
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
12781
12245
|
|
12782
12246
|
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
12783
12247
|
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
|
@@ -12876,21 +12340,7 @@ static void ggml_compute_forward_rope_back_f16(
|
|
12876
12340
|
|
12877
12341
|
assert(n_past >= 0);
|
12878
12342
|
|
12879
|
-
|
12880
|
-
const size_t nb01 = src0->nb[1];
|
12881
|
-
const size_t nb02 = src0->nb[2];
|
12882
|
-
const size_t nb03 = src0->nb[3];
|
12883
|
-
|
12884
|
-
const int64_t ne0 = dst->ne[0];
|
12885
|
-
const int64_t ne1 = dst->ne[1];
|
12886
|
-
const int64_t ne2 = dst->ne[2];
|
12887
|
-
const int64_t ne3 = dst->ne[3];
|
12888
|
-
|
12889
|
-
const size_t nb0 = dst->nb[0];
|
12890
|
-
const size_t nb1 = dst->nb[1];
|
12891
|
-
const size_t nb2 = dst->nb[2];
|
12892
|
-
const size_t nb3 = dst->nb[3];
|
12893
|
-
|
12343
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
12894
12344
|
|
12895
12345
|
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
12896
12346
|
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
|
@@ -12988,7 +12438,7 @@ static void ggml_compute_forward_rope_back(
|
|
12988
12438
|
}
|
12989
12439
|
}
|
12990
12440
|
|
12991
|
-
//
|
12441
|
+
// ggml_compute_forward_conv_1d
|
12992
12442
|
|
12993
12443
|
static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
|
12994
12444
|
const struct ggml_compute_params * params,
|
@@ -13002,36 +12452,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
|
|
13002
12452
|
int64_t t0 = ggml_perf_time_us();
|
13003
12453
|
UNUSED(t0);
|
13004
12454
|
|
13005
|
-
|
13006
|
-
const int64_t ne01 = src0->ne[1];
|
13007
|
-
const int64_t ne02 = src0->ne[2];
|
13008
|
-
//const int64_t ne03 = src0->ne[3];
|
13009
|
-
|
13010
|
-
const int64_t ne10 = src1->ne[0];
|
13011
|
-
const int64_t ne11 = src1->ne[1];
|
13012
|
-
//const int64_t ne12 = src1->ne[2];
|
13013
|
-
//const int64_t ne13 = src1->ne[3];
|
13014
|
-
|
13015
|
-
//const int64_t ne0 = dst->ne[0];
|
13016
|
-
//const int64_t ne1 = dst->ne[1];
|
13017
|
-
//const int64_t ne2 = dst->ne[2];
|
13018
|
-
//const int64_t ne3 = dst->ne[3];
|
13019
|
-
//const int64_t ne = ne0*ne1*ne2*ne3;
|
13020
|
-
|
13021
|
-
const int nb00 = src0->nb[0];
|
13022
|
-
const int nb01 = src0->nb[1];
|
13023
|
-
const int nb02 = src0->nb[2];
|
13024
|
-
//const int nb03 = src0->nb[3];
|
13025
|
-
|
13026
|
-
const int nb10 = src1->nb[0];
|
13027
|
-
const int nb11 = src1->nb[1];
|
13028
|
-
//const int nb12 = src1->nb[2];
|
13029
|
-
//const int nb13 = src1->nb[3];
|
13030
|
-
|
13031
|
-
//const int nb0 = dst->nb[0];
|
13032
|
-
const int nb1 = dst->nb[1];
|
13033
|
-
//const int nb2 = dst->nb[2];
|
13034
|
-
//const int nb3 = dst->nb[3];
|
12455
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
13035
12456
|
|
13036
12457
|
const int ith = params->ith;
|
13037
12458
|
const int nth = params->nth;
|
@@ -13122,36 +12543,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32(
|
|
13122
12543
|
int64_t t0 = ggml_perf_time_us();
|
13123
12544
|
UNUSED(t0);
|
13124
12545
|
|
13125
|
-
|
13126
|
-
const int64_t ne01 = src0->ne[1];
|
13127
|
-
const int64_t ne02 = src0->ne[2];
|
13128
|
-
//const int64_t ne03 = src0->ne[3];
|
13129
|
-
|
13130
|
-
const int64_t ne10 = src1->ne[0];
|
13131
|
-
const int64_t ne11 = src1->ne[1];
|
13132
|
-
//const int64_t ne12 = src1->ne[2];
|
13133
|
-
//const int64_t ne13 = src1->ne[3];
|
13134
|
-
|
13135
|
-
//const int64_t ne0 = dst->ne[0];
|
13136
|
-
//const int64_t ne1 = dst->ne[1];
|
13137
|
-
//const int64_t ne2 = dst->ne[2];
|
13138
|
-
//const int64_t ne3 = dst->ne[3];
|
13139
|
-
//const int64_t ne = ne0*ne1*ne2*ne3;
|
13140
|
-
|
13141
|
-
const int nb00 = src0->nb[0];
|
13142
|
-
const int nb01 = src0->nb[1];
|
13143
|
-
const int nb02 = src0->nb[2];
|
13144
|
-
//const int nb03 = src0->nb[3];
|
13145
|
-
|
13146
|
-
const int nb10 = src1->nb[0];
|
13147
|
-
const int nb11 = src1->nb[1];
|
13148
|
-
//const int nb12 = src1->nb[2];
|
13149
|
-
//const int nb13 = src1->nb[3];
|
13150
|
-
|
13151
|
-
//const int nb0 = dst->nb[0];
|
13152
|
-
const int nb1 = dst->nb[1];
|
13153
|
-
//const int nb2 = dst->nb[2];
|
13154
|
-
//const int nb3 = dst->nb[3];
|
12546
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
13155
12547
|
|
13156
12548
|
const int ith = params->ith;
|
13157
12549
|
const int nth = params->nth;
|
@@ -13251,8 +12643,6 @@ static void ggml_compute_forward_conv_1d_s1_ph(
|
|
13251
12643
|
}
|
13252
12644
|
}
|
13253
12645
|
|
13254
|
-
// ggml_compute_forward_conv_1d_s2_ph
|
13255
|
-
|
13256
12646
|
static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
|
13257
12647
|
const struct ggml_compute_params * params,
|
13258
12648
|
const struct ggml_tensor * src0,
|
@@ -13265,36 +12655,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
|
|
13265
12655
|
int64_t t0 = ggml_perf_time_us();
|
13266
12656
|
UNUSED(t0);
|
13267
12657
|
|
13268
|
-
|
13269
|
-
const int64_t ne01 = src0->ne[1];
|
13270
|
-
const int64_t ne02 = src0->ne[2];
|
13271
|
-
//const int64_t ne03 = src0->ne[3];
|
13272
|
-
|
13273
|
-
const int64_t ne10 = src1->ne[0];
|
13274
|
-
const int64_t ne11 = src1->ne[1];
|
13275
|
-
//const int64_t ne12 = src1->ne[2];
|
13276
|
-
//const int64_t ne13 = src1->ne[3];
|
13277
|
-
|
13278
|
-
//const int64_t ne0 = dst->ne[0];
|
13279
|
-
//const int64_t ne1 = dst->ne[1];
|
13280
|
-
//const int64_t ne2 = dst->ne[2];
|
13281
|
-
//const int64_t ne3 = dst->ne[3];
|
13282
|
-
//const int64_t ne = ne0*ne1*ne2*ne3;
|
13283
|
-
|
13284
|
-
const int nb00 = src0->nb[0];
|
13285
|
-
const int nb01 = src0->nb[1];
|
13286
|
-
const int nb02 = src0->nb[2];
|
13287
|
-
//const int nb03 = src0->nb[3];
|
13288
|
-
|
13289
|
-
const int nb10 = src1->nb[0];
|
13290
|
-
const int nb11 = src1->nb[1];
|
13291
|
-
//const int nb12 = src1->nb[2];
|
13292
|
-
//const int nb13 = src1->nb[3];
|
13293
|
-
|
13294
|
-
//const int nb0 = dst->nb[0];
|
13295
|
-
const int nb1 = dst->nb[1];
|
13296
|
-
//const int nb2 = dst->nb[2];
|
13297
|
-
//const int nb3 = dst->nb[3];
|
12658
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
13298
12659
|
|
13299
12660
|
const int ith = params->ith;
|
13300
12661
|
const int nth = params->nth;
|
@@ -13385,36 +12746,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32(
|
|
13385
12746
|
int64_t t0 = ggml_perf_time_us();
|
13386
12747
|
UNUSED(t0);
|
13387
12748
|
|
13388
|
-
|
13389
|
-
const int64_t ne01 = src0->ne[1];
|
13390
|
-
const int64_t ne02 = src0->ne[2];
|
13391
|
-
//const int64_t ne03 = src0->ne[3];
|
13392
|
-
|
13393
|
-
const int64_t ne10 = src1->ne[0];
|
13394
|
-
const int64_t ne11 = src1->ne[1];
|
13395
|
-
//const int64_t ne12 = src1->ne[2];
|
13396
|
-
//const int64_t ne13 = src1->ne[3];
|
13397
|
-
|
13398
|
-
//const int64_t ne0 = dst->ne[0];
|
13399
|
-
//const int64_t ne1 = dst->ne[1];
|
13400
|
-
//const int64_t ne2 = dst->ne[2];
|
13401
|
-
//const int64_t ne3 = dst->ne[3];
|
13402
|
-
//const int64_t ne = ne0*ne1*ne2*ne3;
|
13403
|
-
|
13404
|
-
const int nb00 = src0->nb[0];
|
13405
|
-
const int nb01 = src0->nb[1];
|
13406
|
-
const int nb02 = src0->nb[2];
|
13407
|
-
//const int nb03 = src0->nb[3];
|
13408
|
-
|
13409
|
-
const int nb10 = src1->nb[0];
|
13410
|
-
const int nb11 = src1->nb[1];
|
13411
|
-
//const int nb12 = src1->nb[2];
|
13412
|
-
//const int nb13 = src1->nb[3];
|
13413
|
-
|
13414
|
-
//const int nb0 = dst->nb[0];
|
13415
|
-
const int nb1 = dst->nb[1];
|
13416
|
-
//const int nb2 = dst->nb[2];
|
13417
|
-
//const int nb3 = dst->nb[3];
|
12749
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
13418
12750
|
|
13419
12751
|
const int ith = params->ith;
|
13420
12752
|
const int nth = params->nth;
|
@@ -13514,6 +12846,28 @@ static void ggml_compute_forward_conv_1d_s2_ph(
|
|
13514
12846
|
}
|
13515
12847
|
}
|
13516
12848
|
|
12849
|
+
// ggml_compute_forward_conv_1d
|
12850
|
+
|
12851
|
+
static void ggml_compute_forward_conv_1d(
|
12852
|
+
const struct ggml_compute_params * params,
|
12853
|
+
const struct ggml_tensor * src0,
|
12854
|
+
const struct ggml_tensor * src1,
|
12855
|
+
const struct ggml_tensor * opt0,
|
12856
|
+
struct ggml_tensor * dst) {
|
12857
|
+
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
|
12858
|
+
const int32_t p0 = ((const int32_t*)(opt0->data))[1];
|
12859
|
+
const int32_t d0 = ((const int32_t*)(opt0->data))[2];
|
12860
|
+
GGML_ASSERT(d0 == 1); // dilation not supported
|
12861
|
+
GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
|
12862
|
+
if (s0 == 1) {
|
12863
|
+
ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst);
|
12864
|
+
} else if (s0 == 2) {
|
12865
|
+
ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst);
|
12866
|
+
} else {
|
12867
|
+
GGML_ASSERT(false); // only stride 1 and 2 supported
|
12868
|
+
};
|
12869
|
+
}
|
12870
|
+
|
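The new ggml_compute_forward_conv_1d wrapper reads stride, padding and dilation as packed int32 values from opt0->data and then dispatches to the existing stride-1/stride-2 half-padding kernels. A self-contained sketch of that parameter unpacking and dispatch (printf stands in for the asserts and kernel calls):

#include <stdint.h>
#include <stdio.h>

static void conv_1d_dispatch(const int32_t * opt0_data) {
    const int32_t s0 = opt0_data[0];  // stride
    const int32_t p0 = opt0_data[1];  // padding
    const int32_t d0 = opt0_data[2];  // dilation
    if (d0 != 1) { printf("dilation not supported\n"); return; }
    if (s0 == 1)      printf("stride-1 kernel, padding %d\n", (int) p0);
    else if (s0 == 2) printf("stride-2 kernel, padding %d\n", (int) p0);
    else              printf("unsupported stride %d\n", (int) s0);
}

int main(void) {
    const int32_t params[3] = { 2, 8, 1 };  // s0, p0, d0 as stored in opt0
    conv_1d_dispatch(params);
    return 0;
}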
13517
12871
|
// ggml_compute_forward_conv_2d_sk_p0
|
13518
12872
|
|
13519
12873
|
static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
@@ -13528,36 +12882,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
|
13528
12882
|
int64_t t0 = ggml_perf_time_us();
|
13529
12883
|
UNUSED(t0);
|
13530
12884
|
|
13531
|
-
|
13532
|
-
const int ne01 = src0->ne[1];
|
13533
|
-
const int ne02 = src0->ne[2];
|
13534
|
-
//const int ne03 = src0->ne[3];
|
13535
|
-
|
13536
|
-
const int ne10 = src1->ne[0];
|
13537
|
-
//const int ne11 = src1->ne[1];
|
13538
|
-
const int ne12 = src1->ne[2];
|
13539
|
-
//const int ne13 = src1->ne[3];
|
13540
|
-
|
13541
|
-
const int ne0 = dst->ne[0];
|
13542
|
-
const int ne1 = dst->ne[1];
|
13543
|
-
const int ne2 = dst->ne[2];
|
13544
|
-
//const int ne3 = dst->ne[3];
|
13545
|
-
//const int ne = ne0*ne1*ne2*ne3;
|
13546
|
-
|
13547
|
-
const int nb00 = src0->nb[0];
|
13548
|
-
//const int nb01 = src0->nb[1];
|
13549
|
-
//const int nb02 = src0->nb[2];
|
13550
|
-
const int nb03 = src0->nb[3];
|
13551
|
-
|
13552
|
-
const int nb10 = src1->nb[0];
|
13553
|
-
//const int nb11 = src1->nb[1];
|
13554
|
-
const int nb12 = src1->nb[2];
|
13555
|
-
//const int nb13 = src1->nb[3];
|
13556
|
-
|
13557
|
-
//const int nb0 = dst->nb[0];
|
13558
|
-
//const int nb1 = dst->nb[1];
|
13559
|
-
const int nb2 = dst->nb[2];
|
13560
|
-
//const int nb3 = dst->nb[3];
|
12885
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
13561
12886
|
|
13562
12887
|
const int ith = params->ith;
|
13563
12888
|
const int nth = params->nth;
|
@@ -13650,6 +12975,34 @@ static void ggml_compute_forward_conv_2d_sk_p0(
|
|
13650
12975
|
}
|
13651
12976
|
}
|
13652
12977
|
|
12978
|
+
// ggml_compute_forward_conv_2d
|
12979
|
+
|
12980
|
+
static void ggml_compute_forward_conv_2d(
|
12981
|
+
const struct ggml_compute_params* params,
|
12982
|
+
const struct ggml_tensor* src0,
|
12983
|
+
const struct ggml_tensor* src1,
|
12984
|
+
const struct ggml_tensor* opt0,
|
12985
|
+
struct ggml_tensor* dst) {
|
12986
|
+
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
|
12987
|
+
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
|
12988
|
+
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
|
12989
|
+
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
|
12990
|
+
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
|
12991
|
+
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
|
12992
|
+
GGML_ASSERT(d0 == 1); // dilation not supported
|
12993
|
+
GGML_ASSERT(d1 == 1);
|
12994
|
+
GGML_ASSERT(p0 == 0); // padding not supported
|
12995
|
+
GGML_ASSERT(p1 == 0);
|
12996
|
+
|
12997
|
+
if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
|
12998
|
+
ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
|
12999
|
+
}
|
13000
|
+
else {
|
13001
|
+
GGML_ASSERT(false); // only stride equal to kernel size is supported
|
13002
|
+
};
|
13003
|
+
}
|
13004
|
+
|
13005
|
+
|
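ggml_compute_forward_conv_2d above follows the same scheme with six packed parameters and only wires up the sk_p0 kernel: no dilation, no padding, and stride equal to the kernel size in both directions. A one-function sketch of that support check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool conv_2d_supported(int32_t s0, int32_t s1, int32_t p0, int32_t p1,
                              int32_t d0, int32_t d1, int64_t ne00, int64_t ne01) {
    // only stride == kernel size, zero padding, no dilation (the sk_p0 case)
    return d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0 && s0 == ne00 && s1 == ne01;
}

int main(void) {
    printf("%d\n", conv_2d_supported(16, 16, 0, 0, 1, 1, 16, 16)); // 1: supported
    printf("%d\n", conv_2d_supported(1, 1, 0, 0, 1, 1, 16, 16));   // 0: rejected
    return 0;
}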
13653
13006
|
// ggml_compute_forward_flash_attn
|
13654
13007
|
|
13655
13008
|
static void ggml_compute_forward_flash_attn_f32(
|
@@ -13662,45 +13015,14 @@ static void ggml_compute_forward_flash_attn_f32(
|
|
13662
13015
|
int64_t t0 = ggml_perf_time_us();
|
13663
13016
|
UNUSED(t0);
|
13664
13017
|
|
13665
|
-
|
13666
|
-
|
13667
|
-
|
13668
|
-
|
13669
|
-
|
13670
|
-
|
13671
|
-
|
13672
|
-
|
13673
|
-
//const int64_t nek3 = k->ne[3];
|
13674
|
-
|
13675
|
-
//const int64_t nev0 = v->ne[0];
|
13676
|
-
const int64_t nev1 = v->ne[1];
|
13677
|
-
//const int64_t nev2 = v->ne[2];
|
13678
|
-
//const int64_t nev3 = v->ne[3];
|
13679
|
-
|
13680
|
-
const int64_t ne0 = dst->ne[0];
|
13681
|
-
const int64_t ne1 = dst->ne[1];
|
13682
|
-
//const int64_t ne2 = dst->ne[2];
|
13683
|
-
//const int64_t ne3 = dst->ne[3];
|
13684
|
-
|
13685
|
-
const int nbk0 = k->nb[0];
|
13686
|
-
const int nbk1 = k->nb[1];
|
13687
|
-
const int nbk2 = k->nb[2];
|
13688
|
-
const int nbk3 = k->nb[3];
|
13689
|
-
|
13690
|
-
const int nbq0 = q->nb[0];
|
13691
|
-
const int nbq1 = q->nb[1];
|
13692
|
-
const int nbq2 = q->nb[2];
|
13693
|
-
const int nbq3 = q->nb[3];
|
13694
|
-
|
13695
|
-
const int nbv0 = v->nb[0];
|
13696
|
-
const int nbv1 = v->nb[1];
|
13697
|
-
const int nbv2 = v->nb[2];
|
13698
|
-
const int nbv3 = v->nb[3];
|
13699
|
-
|
13700
|
-
const int nb0 = dst->nb[0];
|
13701
|
-
const int nb1 = dst->nb[1];
|
13702
|
-
const int nb2 = dst->nb[2];
|
13703
|
-
const int nb3 = dst->nb[3];
|
13018
|
+
GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
|
13019
|
+
GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
|
13020
|
+
GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
|
13021
|
+
GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
|
13022
|
+
GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
|
13023
|
+
GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
|
13024
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
|
13025
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
|
13704
13026
|
|
13705
13027
|
const int ith = params->ith;
|
13706
13028
|
const int nth = params->nth;
|
@@ -13871,45 +13193,14 @@ static void ggml_compute_forward_flash_attn_f16(
|
|
13871
13193
|
int64_t t0 = ggml_perf_time_us();
|
13872
13194
|
UNUSED(t0);
|
13873
13195
|
|
13874
|
-
|
13875
|
-
|
13876
|
-
|
13877
|
-
|
13878
|
-
|
13879
|
-
|
13880
|
-
|
13881
|
-
|
13882
|
-
//const int64_t nek3 = k->ne[3];
|
13883
|
-
|
13884
|
-
//const int64_t nev0 = v->ne[0];
|
13885
|
-
const int64_t nev1 = v->ne[1];
|
13886
|
-
//const int64_t nev2 = v->ne[2];
|
13887
|
-
//const int64_t nev3 = v->ne[3];
|
13888
|
-
|
13889
|
-
const int64_t ne0 = dst->ne[0];
|
13890
|
-
const int64_t ne1 = dst->ne[1];
|
13891
|
-
//const int64_t ne2 = dst->ne[2];
|
13892
|
-
//const int64_t ne3 = dst->ne[3];
|
13893
|
-
|
13894
|
-
const int nbk0 = k->nb[0];
|
13895
|
-
const int nbk1 = k->nb[1];
|
13896
|
-
const int nbk2 = k->nb[2];
|
13897
|
-
const int nbk3 = k->nb[3];
|
13898
|
-
|
13899
|
-
const int nbq0 = q->nb[0];
|
13900
|
-
const int nbq1 = q->nb[1];
|
13901
|
-
const int nbq2 = q->nb[2];
|
13902
|
-
const int nbq3 = q->nb[3];
|
13903
|
-
|
13904
|
-
const int nbv0 = v->nb[0];
|
13905
|
-
const int nbv1 = v->nb[1];
|
13906
|
-
const int nbv2 = v->nb[2];
|
13907
|
-
const int nbv3 = v->nb[3];
|
13908
|
-
|
13909
|
-
const int nb0 = dst->nb[0];
|
13910
|
-
const int nb1 = dst->nb[1];
|
13911
|
-
const int nb2 = dst->nb[2];
|
13912
|
-
const int nb3 = dst->nb[3];
|
13196
|
+
GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
|
13197
|
+
GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
|
13198
|
+
GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
|
13199
|
+
GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
|
13200
|
+
GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
|
13201
|
+
GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
|
13202
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
|
13203
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
|
13913
13204
|
|
13914
13205
|
const int ith = params->ith;
|
13915
13206
|
const int nth = params->nth;
|
@@ -14143,65 +13434,18 @@ static void ggml_compute_forward_flash_ff_f16(
|
|
14143
13434
|
int64_t t0 = ggml_perf_time_us();
|
14144
13435
|
UNUSED(t0);
|
14145
13436
|
|
14146
|
-
|
14147
|
-
|
14148
|
-
|
14149
|
-
|
14150
|
-
|
14151
|
-
|
14152
|
-
|
14153
|
-
|
14154
|
-
|
14155
|
-
|
14156
|
-
|
14157
|
-
|
14158
|
-
//const int64_t neb12 = b1->ne[2];
|
14159
|
-
//const int64_t neb13 = b1->ne[3];
|
14160
|
-
|
14161
|
-
const int64_t nec00 = c0->ne[0];
|
14162
|
-
const int64_t nec01 = c0->ne[1];
|
14163
|
-
//const int64_t nec02 = c0->ne[2];
|
14164
|
-
//const int64_t nec03 = c0->ne[3];
|
14165
|
-
|
14166
|
-
const int64_t nec10 = c1->ne[0];
|
14167
|
-
const int64_t nec11 = c1->ne[1];
|
14168
|
-
//const int64_t nec12 = c1->ne[2];
|
14169
|
-
//const int64_t nec13 = c1->ne[3];
|
14170
|
-
|
14171
|
-
const int64_t ne0 = dst->ne[0];
|
14172
|
-
const int64_t ne1 = dst->ne[1];
|
14173
|
-
const int64_t ne2 = dst->ne[2];
|
14174
|
-
//const int64_t ne3 = dst->ne[3];
|
14175
|
-
|
14176
|
-
const int nba0 = a->nb[0];
|
14177
|
-
const int nba1 = a->nb[1];
|
14178
|
-
const int nba2 = a->nb[2];
|
14179
|
-
const int nba3 = a->nb[3];
|
14180
|
-
|
14181
|
-
const int nbb00 = b0->nb[0];
|
14182
|
-
const int nbb01 = b0->nb[1];
|
14183
|
-
const int nbb02 = b0->nb[2];
|
14184
|
-
const int nbb03 = b0->nb[3];
|
14185
|
-
|
14186
|
-
const int nbb10 = b1->nb[0];
|
14187
|
-
//const int nbb11 = b1->nb[1];
|
14188
|
-
//const int nbb12 = b1->nb[2];
|
14189
|
-
//const int nbb13 = b1->nb[3];
|
14190
|
-
|
14191
|
-
const int nbc00 = c0->nb[0];
|
14192
|
-
const int nbc01 = c0->nb[1];
|
14193
|
-
const int nbc02 = c0->nb[2];
|
14194
|
-
const int nbc03 = c0->nb[3];
|
14195
|
-
|
14196
|
-
const int nbc10 = c1->nb[0];
|
14197
|
-
//const int nbc11 = c1->nb[1];
|
14198
|
-
//const int nbc12 = c1->nb[2];
|
14199
|
-
//const int nbc13 = c1->nb[3];
|
14200
|
-
|
14201
|
-
const int nb0 = dst->nb[0];
|
14202
|
-
const int nb1 = dst->nb[1];
|
14203
|
-
const int nb2 = dst->nb[2];
|
14204
|
-
const int nb3 = dst->nb[3];
|
13437
|
+
GGML_TENSOR_LOCALS(int64_t, nea, a, ne);
|
13438
|
+
GGML_TENSOR_LOCALS(size_t, nba, a, nb);
|
13439
|
+
GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne);
|
13440
|
+
GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb);
|
13441
|
+
GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne);
|
13442
|
+
GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb);
|
13443
|
+
GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne);
|
13444
|
+
GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb);
|
13445
|
+
GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne);
|
13446
|
+
GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb);
|
13447
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
|
13448
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
|
14205
13449
|
|
14206
13450
|
const int ith = params->ith;
|
14207
13451
|
const int nth = params->nth;
|
@@ -14349,55 +13593,16 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|
14349
13593
|
int64_t t0 = ggml_perf_time_us();
|
14350
13594
|
UNUSED(t0);
|
14351
13595
|
|
14352
|
-
|
14353
|
-
|
14354
|
-
|
14355
|
-
|
14356
|
-
|
14357
|
-
|
14358
|
-
|
14359
|
-
|
14360
|
-
|
14361
|
-
|
14362
|
-
const int64_t nev0 = v->ne[0];
|
14363
|
-
const int64_t nev1 = v->ne[1];
|
14364
|
-
//const int64_t nev2 = v->ne[2];
|
14365
|
-
//const int64_t nev3 = v->ne[3];
|
14366
|
-
|
14367
|
-
const int64_t ned0 = d->ne[0];
|
14368
|
-
const int64_t ned1 = d->ne[1];
|
14369
|
-
//const int64_t ned2 = d->ne[2];
|
14370
|
-
//const int64_t ned3 = d->ne[3];
|
14371
|
-
|
14372
|
-
const int64_t ne0 = dst->ne[0];
|
14373
|
-
const int64_t ne1 = dst->ne[1];
|
14374
|
-
const int64_t ne2 = dst->ne[2];
|
14375
|
-
const int64_t ne3 = dst->ne[3];
|
14376
|
-
|
14377
|
-
const int nbk0 = k->nb[0];
|
14378
|
-
const int nbk1 = k->nb[1];
|
14379
|
-
const int nbk2 = k->nb[2];
|
14380
|
-
const int nbk3 = k->nb[3];
|
14381
|
-
|
14382
|
-
const int nbq0 = q->nb[0];
|
14383
|
-
const int nbq1 = q->nb[1];
|
14384
|
-
const int nbq2 = q->nb[2];
|
14385
|
-
const int nbq3 = q->nb[3];
|
14386
|
-
|
14387
|
-
const int nbv0 = v->nb[0];
|
14388
|
-
const int nbv1 = v->nb[1];
|
14389
|
-
const int nbv2 = v->nb[2];
|
14390
|
-
const int nbv3 = v->nb[3];
|
14391
|
-
|
14392
|
-
const int nbd0 = d->nb[0];
|
14393
|
-
const int nbd1 = d->nb[1];
|
14394
|
-
const int nbd2 = d->nb[2];
|
14395
|
-
const int nbd3 = d->nb[3];
|
14396
|
-
|
14397
|
-
const int nb0 = dst->nb[0];
|
14398
|
-
const int nb1 = dst->nb[1];
|
14399
|
-
const int nb2 = dst->nb[2];
|
14400
|
-
const int nb3 = dst->nb[3];
|
13596
|
+
GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
|
13597
|
+
GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
|
13598
|
+
GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
|
13599
|
+
GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
|
13600
|
+
GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
|
13601
|
+
GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
|
13602
|
+
GGML_TENSOR_LOCALS(int64_t, ned, d, ne);
|
13603
|
+
GGML_TENSOR_LOCALS(size_t, nbd, d, nb);
|
13604
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
|
13605
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
|
14401
13606
|
|
14402
13607
|
const int ith = params->ith;
|
14403
13608
|
const int nth = params->nth;
|
@@ -14755,15 +13960,8 @@ static void ggml_compute_forward_win_part_f32(
|
|
14755
13960
|
return;
|
14756
13961
|
}
|
14757
13962
|
|
14758
|
-
|
14759
|
-
|
14760
|
-
const int64_t ne02 = src0->ne[2];
|
14761
|
-
const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
|
14762
|
-
|
14763
|
-
const int64_t ne0 = dst->ne[0];
|
14764
|
-
const int64_t ne1 = dst->ne[1];
|
14765
|
-
const int64_t ne2 = dst->ne[2];
|
14766
|
-
const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
|
13963
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
13964
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
|
14767
13965
|
|
14768
13966
|
const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
|
14769
13967
|
const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
|
@@ -14826,14 +14024,8 @@ static void ggml_compute_forward_win_unpart_f32(
|
|
14826
14024
|
return;
|
14827
14025
|
}
|
14828
14026
|
|
14829
|
-
|
14830
|
-
|
14831
|
-
const int64_t ne02 = src0->ne[2];
|
14832
|
-
//const int64_t ne03 = src0->ne[3];
|
14833
|
-
|
14834
|
-
const int64_t ne0 = dst->ne[0];
|
14835
|
-
const int64_t ne1 = dst->ne[1];
|
14836
|
-
const int64_t ne2 = dst->ne[2];
|
14027
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
14028
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
|
14837
14029
|
|
14838
14030
|
const int32_t w = ((const int32_t *)(opt0->data))[0];
|
14839
14031
|
|
@@ -15431,6 +14623,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
15431
14623
|
{
|
15432
14624
|
ggml_compute_forward_mean(params, tensor->src0, tensor);
|
15433
14625
|
} break;
|
14626
|
+
case GGML_OP_ARGMAX:
|
14627
|
+
{
|
14628
|
+
ggml_compute_forward_argmax(params, tensor->src0, tensor);
|
14629
|
+
} break;
|
15434
14630
|
case GGML_OP_REPEAT:
|
15435
14631
|
{
|
15436
14632
|
ggml_compute_forward_repeat(params, tensor->src0, tensor);
|
@@ -15455,6 +14651,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
15455
14651
|
{
|
15456
14652
|
ggml_compute_forward_step(params, tensor->src0, tensor);
|
15457
14653
|
} break;
|
14654
|
+
case GGML_OP_TANH:
|
14655
|
+
{
|
14656
|
+
ggml_compute_forward_tanh(params, tensor->src0, tensor);
|
14657
|
+
} break;
|
14658
|
+
case GGML_OP_ELU:
|
14659
|
+
{
|
14660
|
+
ggml_compute_forward_elu(params, tensor->src0, tensor);
|
14661
|
+
} break;
|
15458
14662
|
case GGML_OP_RELU:
|
15459
14663
|
{
|
15460
14664
|
ggml_compute_forward_relu(params, tensor->src0, tensor);
|
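For reference, the ops newly routed in this switch (GGML_OP_ARGMAX, GGML_OP_TANH, GGML_OP_ELU) carry the usual element-wise/row-wise semantics; a plain-C restatement, not the ggml kernels themselves:

#include <math.h>
#include <stdio.h>

static float elu(float x) { return x > 0.0f ? x : expf(x) - 1.0f; }

static int argmax_row(const float * x, int n) {
    int best = 0;
    for (int i = 1; i < n; ++i) if (x[i] > x[best]) best = i;
    return best;
}

int main(void) {
    const float row[4] = { -1.0f, 0.5f, 2.0f, 1.5f };
    printf("tanh(0.5) = %f\n", tanhf(0.5f));
    printf("elu(-1)   = %f\n", elu(-1.0f));
    printf("argmax    = %d\n", argmax_row(row, 4));
    return 0;
}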
@@ -15571,17 +14775,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
15571
14775
|
{
|
15572
14776
|
ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
|
15573
14777
|
} break;
|
15574
|
-
case
|
15575
|
-
{
|
15576
|
-
ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
|
15577
|
-
} break;
|
15578
|
-
case GGML_OP_CONV_1D_S2_PH:
|
14778
|
+
case GGML_OP_CONV_1D:
|
15579
14779
|
{
|
15580
|
-
|
14780
|
+
ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
|
15581
14781
|
} break;
|
15582
|
-
case
|
14782
|
+
case GGML_OP_CONV_2D:
|
15583
14783
|
{
|
15584
|
-
|
14784
|
+
ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
|
15585
14785
|
} break;
|
15586
14786
|
case GGML_OP_FLASH_ATTN:
|
15587
14787
|
{
|
@@ -15830,6 +15030,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15830
15030
|
}
|
15831
15031
|
} break;
|
15832
15032
|
case GGML_OP_MEAN:
|
15033
|
+
case GGML_OP_ARGMAX:
|
15833
15034
|
{
|
15834
15035
|
GGML_ASSERT(false); // TODO: implement
|
15835
15036
|
} break;
|
@@ -15883,6 +15084,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15883
15084
|
// noop
|
15884
15085
|
}
|
15885
15086
|
} break;
|
15087
|
+
case GGML_OP_TANH:
|
15088
|
+
{
|
15089
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15090
|
+
} break;
|
15091
|
+
case GGML_OP_ELU:
|
15092
|
+
{
|
15093
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15094
|
+
} break;
|
15886
15095
|
case GGML_OP_RELU:
|
15887
15096
|
{
|
15888
15097
|
if (src0->grad) {
|
@@ -15902,14 +15111,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15902
15111
|
{
|
15903
15112
|
GGML_ASSERT(false); // TODO: not implemented
|
15904
15113
|
} break;
|
15905
|
-
case GGML_OP_ALIBI:
|
15906
|
-
{
|
15907
|
-
GGML_ASSERT(false); // TODO: not implemented
|
15908
|
-
} break;
|
15909
|
-
case GGML_OP_CLAMP:
|
15910
|
-
{
|
15911
|
-
GGML_ASSERT(false); // TODO: not implemented
|
15912
|
-
} break;
|
15913
15114
|
case GGML_OP_SILU:
|
15914
15115
|
{
|
15915
15116
|
// necessary for llama
|
@@ -16226,7 +15427,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
16226
15427
|
// necessary for llama
|
16227
15428
|
if (src0->grad) {
|
16228
15429
|
assert(src1->type == GGML_TYPE_I32);
|
16229
|
-
assert(ggml_nelements(src1) ==
|
15430
|
+
assert(ggml_nelements(src1) == 4);
|
16230
15431
|
const int n_past = ((int32_t *) src1->data)[0];
|
16231
15432
|
const int n_dims = ((int32_t *) src1->data)[1];
|
16232
15433
|
const int mode = ((int32_t *) src1->data)[2];
|
@@ -16266,15 +15467,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
16266
15467
|
// noop
|
16267
15468
|
}
|
16268
15469
|
} break;
|
16269
|
-
case
|
15470
|
+
case GGML_OP_ALIBI:
|
15471
|
+
{
|
15472
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15473
|
+
} break;
|
15474
|
+
case GGML_OP_CLAMP:
|
16270
15475
|
{
|
16271
15476
|
GGML_ASSERT(false); // TODO: not implemented
|
16272
15477
|
} break;
|
16273
|
-
case
|
15478
|
+
case GGML_OP_CONV_1D:
|
16274
15479
|
{
|
16275
15480
|
GGML_ASSERT(false); // TODO: not implemented
|
16276
15481
|
} break;
|
16277
|
-
case
|
15482
|
+
case GGML_OP_CONV_2D:
|
16278
15483
|
{
|
16279
15484
|
GGML_ASSERT(false); // TODO: not implemented
|
16280
15485
|
} break;
|
@@ -16791,9 +15996,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16791
15996
|
if (node_n != -1) {
|
16792
15997
|
/* FINALIZE */
|
16793
15998
|
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
|
16794
|
-
|
16795
|
-
|
16796
|
-
|
15999
|
+
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
16000
|
+
params.nth = node->n_tasks;
|
16001
|
+
ggml_compute_forward(¶ms, node);
|
16002
|
+
ggml_graph_compute_perf_stats_node(node, state->shared);
|
16003
|
+
}
|
16797
16004
|
}
|
16798
16005
|
|
16799
16006
|
// distribute new work or execute it direct if 1T
|
@@ -16805,10 +16012,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16805
16012
|
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
16806
16013
|
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
16807
16014
|
|
16015
|
+
params.nth = node->n_tasks;
|
16016
|
+
|
16808
16017
|
/* INIT */
|
16809
|
-
|
16810
|
-
|
16811
|
-
|
16018
|
+
if (GGML_OP_HAS_INIT[node->op]) {
|
16019
|
+
params.type = GGML_TASK_INIT;
|
16020
|
+
ggml_compute_forward(¶ms, node);
|
16021
|
+
}
|
16812
16022
|
|
16813
16023
|
if (node->n_tasks == 1) {
|
16814
16024
|
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
@@ -16816,9 +16026,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16816
16026
|
params.type = GGML_TASK_COMPUTE;
|
16817
16027
|
ggml_compute_forward(¶ms, node);
|
16818
16028
|
|
16819
|
-
|
16820
|
-
|
16821
|
-
|
16029
|
+
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
16030
|
+
params.type = GGML_TASK_FINALIZE;
|
16031
|
+
ggml_compute_forward(¶ms, node);
|
16032
|
+
ggml_graph_compute_perf_stats_node(node, state->shared);
|
16033
|
+
}
|
16822
16034
|
} else {
|
16823
16035
|
break;
|
16824
16036
|
}
|
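The thread loop above now consults per-op tables (GGML_OP_HAS_INIT / GGML_OP_HAS_FINALIZE) before dispatching the INIT and FINALIZE phases, so ops without those phases skip the calls entirely. A toy version of that gating; the tables and op names here are illustrative:

#include <stdbool.h>
#include <stdio.h>

enum op { OP_ADD, OP_MUL_MAT, OP_COUNT };

static const bool op_has_init[OP_COUNT]     = { false, true  };
static const bool op_has_finalize[OP_COUNT] = { false, false };

static void run_node(enum op o) {
    if (op_has_init[o])     printf("INIT\n");     // skipped for most ops
    printf("COMPUTE\n");
    if (op_has_finalize[o]) printf("FINALIZE\n");
}

int main(void) {
    run_node(OP_ADD);      // COMPUTE only
    run_node(OP_MUL_MAT);  // INIT + COMPUTE
    return 0;
}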
@@ -16924,12 +16136,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
16924
16136
|
case GGML_OP_SUM:
|
16925
16137
|
case GGML_OP_SUM_ROWS:
|
16926
16138
|
case GGML_OP_MEAN:
|
16139
|
+
case GGML_OP_ARGMAX:
|
16927
16140
|
case GGML_OP_REPEAT:
|
16928
16141
|
case GGML_OP_REPEAT_BACK:
|
16929
16142
|
case GGML_OP_ABS:
|
16930
16143
|
case GGML_OP_SGN:
|
16931
16144
|
case GGML_OP_NEG:
|
16932
16145
|
case GGML_OP_STEP:
|
16146
|
+
case GGML_OP_TANH:
|
16147
|
+
case GGML_OP_ELU:
|
16933
16148
|
case GGML_OP_RELU:
|
16934
16149
|
{
|
16935
16150
|
node->n_tasks = 1;
|
@@ -16958,6 +16173,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
16958
16173
|
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
|
16959
16174
|
|
16960
16175
|
size_t cur = 0;
|
16176
|
+
const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
|
16961
16177
|
|
16962
16178
|
#if defined(GGML_USE_CUBLAS)
|
16963
16179
|
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
|
@@ -16973,39 +16189,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
16973
16189
|
}
|
16974
16190
|
else
|
16975
16191
|
#endif
|
16976
|
-
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
16977
16192
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
16978
|
-
|
16979
|
-
|
16980
|
-
|
16193
|
+
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
16194
|
+
node->n_tasks = 1; // TODO: this actually is doing nothing
|
16195
|
+
// the threads are still spinning
|
16196
|
+
if (node->src0->type != GGML_TYPE_F32) {
|
16981
16197
|
// here we need memory just for single 2D matrix from src0
|
16982
16198
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
16983
|
-
} else {
|
16984
|
-
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
|
16985
|
-
}
|
16986
|
-
#else
|
16987
|
-
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
|
16988
|
-
#endif
|
16989
|
-
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
16990
|
-
cur = 0;
|
16991
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
16992
|
-
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
16993
|
-
node->n_tasks = 1;
|
16994
16199
|
}
|
16200
|
+
} else
|
16995
16201
|
#endif
|
16996
|
-
|
16997
|
-
|
16998
|
-
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
16999
|
-
node->n_tasks = 1;
|
17000
|
-
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
17001
|
-
} else
|
17002
|
-
#endif
|
17003
|
-
{
|
17004
|
-
const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type;
|
17005
|
-
cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q];
|
17006
|
-
}
|
16202
|
+
if (node->src1->type != vec_dot_type) {
|
16203
|
+
cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type];
|
17007
16204
|
} else {
|
17008
|
-
|
16205
|
+
cur = 0;
|
17009
16206
|
}
|
17010
16207
|
|
17011
16208
|
work_size = MAX(work_size, cur);
|
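The scratch-size rule above reduces to: if src1 must be converted to vec_dot_type, reserve nelements(src1) scaled by that type's size/block ratio, otherwise nothing. A standalone restatement; the size and block values below are made up for illustration:

#include <stddef.h>
#include <stdio.h>

enum { T_F32, T_Q8_0, T_COUNT };
static const size_t TYPE_SIZE[T_COUNT] = { 4, 34 };  // bytes per block (illustrative)
static const size_t BLCK_SIZE[T_COUNT] = { 1, 32 };  // elements per block

static size_t mul_mat_work_size(int src1_type, int vec_dot_type, size_t nelements_src1) {
    if (src1_type == vec_dot_type) return 0;
    return TYPE_SIZE[vec_dot_type]*nelements_src1/BLCK_SIZE[vec_dot_type];
}

int main(void) {
    printf("%zu bytes\n", mul_mat_work_size(T_F32, T_Q8_0, 4096));  // 4352
    printf("%zu bytes\n", mul_mat_work_size(T_F32, T_F32, 4096));   // 0
    return 0;
}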
@@ -17043,8 +16240,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
17043
16240
|
{
|
17044
16241
|
node->n_tasks = 1; //TODO
|
17045
16242
|
} break;
|
17046
|
-
case
|
17047
|
-
case GGML_OP_CONV_1D_S2_PH:
|
16243
|
+
case GGML_OP_CONV_1D:
|
17048
16244
|
{
|
17049
16245
|
node->n_tasks = n_threads;
|
17050
16246
|
|
@@ -17073,7 +16269,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
17073
16269
|
|
17074
16270
|
work_size = MAX(work_size, cur);
|
17075
16271
|
} break;
|
17076
|
-
case
|
16272
|
+
case GGML_OP_CONV_2D:
|
17077
16273
|
{
|
17078
16274
|
node->n_tasks = n_threads;
|
17079
16275
|
|
@@ -17435,13 +16631,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
17435
16631
|
fwrite(&nb, sizeof(uint64_t), 1, fout);
|
17436
16632
|
}
|
17437
16633
|
|
17438
|
-
// store the pointer address
|
17439
|
-
{
|
17440
|
-
const uint64_t ptr = (uint64_t) tensor->data;
|
17441
|
-
|
17442
|
-
fwrite(&ptr, sizeof(uint64_t), 1, fout);
|
17443
|
-
}
|
17444
|
-
|
17445
16634
|
fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
|
17446
16635
|
|
17447
16636
|
// dump the data
|
@@ -17475,13 +16664,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
17475
16664
|
fwrite(&nb, sizeof(uint64_t), 1, fout);
|
17476
16665
|
}
|
17477
16666
|
|
17478
|
-
// store the pointer address
|
17479
|
-
{
|
17480
|
-
const uint64_t ptr = (uint64_t) tensor->data;
|
17481
|
-
|
17482
|
-
fwrite(&ptr, sizeof(uint64_t), 1, fout);
|
17483
|
-
}
|
17484
|
-
|
17485
16667
|
fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
|
17486
16668
|
|
17487
16669
|
// output the op arguments
|
@@ -17666,8 +16848,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
17666
16848
|
|
17667
16849
|
tensor->op = (enum ggml_op) op;
|
17668
16850
|
|
17669
|
-
uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
|
17670
|
-
|
17671
16851
|
memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
|
17672
16852
|
|
17673
16853
|
tensor->data = (void *) ptr;
|
@@ -17713,8 +16893,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
17713
16893
|
nb[j] = nb_cur;
|
17714
16894
|
}
|
17715
16895
|
|
17716
|
-
uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
|
17717
|
-
|
17718
16896
|
const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
|
17719
16897
|
|
17720
16898
|
const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
|