llama_cpp 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -220,9 +220,27 @@ inline static void* ggml_aligned_malloc(size_t size) {
|
|
220
220
|
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
221
221
|
#endif
|
222
222
|
|
223
|
-
#define UNUSED
|
223
|
+
#define UNUSED GGML_UNUSED
|
224
224
|
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
225
225
|
|
226
|
+
//
|
227
|
+
// tensor access macros
|
228
|
+
//
|
229
|
+
|
230
|
+
#define GGML_TENSOR_UNARY_OP_LOCALS \
|
231
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
|
232
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \
|
233
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \
|
234
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
|
235
|
+
|
236
|
+
#define GGML_TENSOR_BINARY_OP_LOCALS \
|
237
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
|
238
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \
|
239
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \
|
240
|
+
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \
|
241
|
+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \
|
242
|
+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
|
243
|
+
|
226
244
|
#if defined(GGML_USE_ACCELERATE)
|
227
245
|
#include <Accelerate/Accelerate.h>
|
228
246
|
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
@@ -463,14 +481,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|
463
481
|
return GGML_FP32_TO_FP16(x);
|
464
482
|
}
|
465
483
|
|
466
|
-
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
|
467
|
-
for (
|
484
|
+
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
|
485
|
+
for (int i = 0; i < n; i++) {
|
468
486
|
y[i] = GGML_FP16_TO_FP32(x[i]);
|
469
487
|
}
|
470
488
|
}
|
471
489
|
|
472
|
-
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
|
473
|
-
|
490
|
+
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
|
491
|
+
int i = 0;
|
474
492
|
#if defined(__F16C__)
|
475
493
|
for (; i + 7 < n; i += 8) {
|
476
494
|
__m256 x_vec = _mm256_loadu_ps(x + i);
|
@@ -1609,109 +1627,112 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
|
|
1609
1627
|
}
|
1610
1628
|
}
|
1611
1629
|
|
1630
|
+
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
1631
|
+
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
1612
1632
|
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
1613
1633
|
static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
1614
1634
|
static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
1615
1635
|
static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
1616
1636
|
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
1617
1637
|
|
1618
|
-
static const
|
1638
|
+
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
1639
|
+
[GGML_TYPE_F32] = {
|
1640
|
+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
1641
|
+
.vec_dot_type = GGML_TYPE_F32,
|
1642
|
+
},
|
1643
|
+
[GGML_TYPE_F16] = {
|
1644
|
+
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
1645
|
+
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
1646
|
+
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
1647
|
+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
1648
|
+
.vec_dot_type = GGML_TYPE_F16,
|
1649
|
+
},
|
1619
1650
|
[GGML_TYPE_Q4_0] = {
|
1620
|
-
.
|
1621
|
-
.
|
1622
|
-
.
|
1623
|
-
.
|
1624
|
-
.vec_dot_q = ggml_vec_dot_q4_0_q8_0,
|
1651
|
+
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
|
1652
|
+
.from_float = quantize_row_q4_0,
|
1653
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
1654
|
+
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
1625
1655
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
1626
1656
|
},
|
1627
1657
|
[GGML_TYPE_Q4_1] = {
|
1628
|
-
.
|
1629
|
-
.
|
1630
|
-
.
|
1631
|
-
.
|
1632
|
-
.vec_dot_q = ggml_vec_dot_q4_1_q8_1,
|
1658
|
+
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
|
1659
|
+
.from_float = quantize_row_q4_1,
|
1660
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
1661
|
+
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
1633
1662
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
1634
1663
|
},
|
1635
1664
|
[GGML_TYPE_Q5_0] = {
|
1636
|
-
.
|
1637
|
-
.
|
1638
|
-
.
|
1639
|
-
.
|
1640
|
-
.vec_dot_q = ggml_vec_dot_q5_0_q8_0,
|
1665
|
+
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
|
1666
|
+
.from_float = quantize_row_q5_0,
|
1667
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
1668
|
+
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
1641
1669
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
1642
1670
|
},
|
1643
1671
|
[GGML_TYPE_Q5_1] = {
|
1644
|
-
.
|
1645
|
-
.
|
1646
|
-
.
|
1647
|
-
.
|
1648
|
-
.vec_dot_q = ggml_vec_dot_q5_1_q8_1,
|
1672
|
+
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
|
1673
|
+
.from_float = quantize_row_q5_1,
|
1674
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
1675
|
+
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
1649
1676
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
1650
1677
|
},
|
1651
1678
|
[GGML_TYPE_Q8_0] = {
|
1652
|
-
.
|
1653
|
-
.
|
1654
|
-
.
|
1655
|
-
.
|
1656
|
-
.vec_dot_q = ggml_vec_dot_q8_0_q8_0,
|
1679
|
+
.to_float = dequantize_row_q8_0,
|
1680
|
+
.from_float = quantize_row_q8_0,
|
1681
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
1682
|
+
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
1657
1683
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
1658
1684
|
},
|
1659
1685
|
[GGML_TYPE_Q8_1] = {
|
1660
|
-
.
|
1661
|
-
.
|
1662
|
-
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference,
|
1663
|
-
.quantize_row_q_dot = quantize_row_q8_1,
|
1664
|
-
.vec_dot_q = NULL, // TODO
|
1686
|
+
.from_float = quantize_row_q8_1,
|
1687
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
1665
1688
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
1666
1689
|
},
|
1667
1690
|
#ifdef GGML_USE_K_QUANTS
|
1668
1691
|
[GGML_TYPE_Q2_K] = {
|
1669
|
-
.
|
1670
|
-
.
|
1671
|
-
.
|
1672
|
-
.
|
1673
|
-
.vec_dot_q = ggml_vec_dot_q2_K_q8_K,
|
1692
|
+
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
|
1693
|
+
.from_float = quantize_row_q2_K,
|
1694
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
1695
|
+
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
1674
1696
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
1675
1697
|
},
|
1676
1698
|
[GGML_TYPE_Q3_K] = {
|
1677
|
-
.
|
1678
|
-
.
|
1679
|
-
.
|
1680
|
-
.
|
1681
|
-
.vec_dot_q = ggml_vec_dot_q3_K_q8_K,
|
1699
|
+
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
|
1700
|
+
.from_float = quantize_row_q3_K,
|
1701
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
1702
|
+
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
1682
1703
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
1683
1704
|
},
|
1684
1705
|
[GGML_TYPE_Q4_K] = {
|
1685
|
-
.
|
1686
|
-
.
|
1687
|
-
.
|
1688
|
-
.
|
1689
|
-
.vec_dot_q = ggml_vec_dot_q4_K_q8_K,
|
1706
|
+
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
|
1707
|
+
.from_float = quantize_row_q4_K,
|
1708
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
1709
|
+
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
1690
1710
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
1691
1711
|
},
|
1692
1712
|
[GGML_TYPE_Q5_K] = {
|
1693
|
-
.
|
1694
|
-
.
|
1695
|
-
.
|
1696
|
-
.
|
1697
|
-
.vec_dot_q = ggml_vec_dot_q5_K_q8_K,
|
1713
|
+
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
|
1714
|
+
.from_float = quantize_row_q5_K,
|
1715
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
1716
|
+
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
1698
1717
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
1699
1718
|
},
|
1700
1719
|
[GGML_TYPE_Q6_K] = {
|
1701
|
-
.
|
1702
|
-
.
|
1703
|
-
.
|
1704
|
-
.
|
1705
|
-
.vec_dot_q = ggml_vec_dot_q6_K_q8_K,
|
1720
|
+
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
|
1721
|
+
.from_float = quantize_row_q6_K,
|
1722
|
+
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
1723
|
+
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
1706
1724
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
1707
1725
|
},
|
1726
|
+
[GGML_TYPE_Q8_K] = {
|
1727
|
+
.from_float = quantize_row_q8_K,
|
1728
|
+
}
|
1708
1729
|
#endif
|
1709
1730
|
};
|
1710
1731
|
|
1711
1732
|
// For internal test use
|
1712
|
-
|
1733
|
+
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
|
1713
1734
|
GGML_ASSERT(i < GGML_TYPE_COUNT);
|
1714
|
-
return
|
1735
|
+
return type_traits[i];
|
1715
1736
|
}
|
1716
1737
|
|
1717
1738
|
|
@@ -2257,7 +2278,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
|
|
2257
2278
|
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
2258
2279
|
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
2259
2280
|
|
2260
|
-
|
2281
|
+
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
|
2261
2282
|
#ifdef GGML_SIMD
|
2262
2283
|
float sumf = 0.0f;
|
2263
2284
|
const int np = (n & ~(GGML_F32_STEP - 1));
|
@@ -2294,7 +2315,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
|
|
2294
2315
|
*s = sumf;
|
2295
2316
|
}
|
2296
2317
|
|
2297
|
-
|
2318
|
+
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
|
2298
2319
|
ggml_float sumf = 0.0;
|
2299
2320
|
|
2300
2321
|
#if defined(GGML_SIMD)
|
@@ -3447,6 +3468,8 @@ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) {
|
|
3447
3468
|
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
|
3448
3469
|
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
|
3449
3470
|
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
|
3471
|
+
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
|
3472
|
+
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
|
3450
3473
|
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
3451
3474
|
|
3452
3475
|
static const float GELU_COEF_A = 0.044715f;
|
@@ -3598,6 +3621,16 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
|
|
3598
3621
|
*s = 1.f/(*s);
|
3599
3622
|
}
|
3600
3623
|
|
3624
|
+
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
|
3625
|
+
float max = -INFINITY;
|
3626
|
+
int idx = 0;
|
3627
|
+
for (int i = 0; i < n; ++i) {
|
3628
|
+
max = MAX(max, x[i]);
|
3629
|
+
if (max == x[i]) { idx = i; }
|
3630
|
+
}
|
3631
|
+
*s = idx;
|
3632
|
+
}
|
3633
|
+
|
3601
3634
|
//
|
3602
3635
|
// data types
|
3603
3636
|
//
|
@@ -3707,12 +3740,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3707
3740
|
"SUM",
|
3708
3741
|
"SUM_ROWS",
|
3709
3742
|
"MEAN",
|
3743
|
+
"ARGMAX",
|
3710
3744
|
"REPEAT",
|
3711
3745
|
"REPEAT_BACK",
|
3712
3746
|
"ABS",
|
3713
3747
|
"SGN",
|
3714
3748
|
"NEG",
|
3715
3749
|
"STEP",
|
3750
|
+
"TANH",
|
3751
|
+
"ELU",
|
3716
3752
|
"RELU",
|
3717
3753
|
"GELU",
|
3718
3754
|
"GELU_QUICK",
|
@@ -3744,9 +3780,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3744
3780
|
"ROPE_BACK",
|
3745
3781
|
"ALIBI",
|
3746
3782
|
"CLAMP",
|
3747
|
-
"
|
3748
|
-
"
|
3749
|
-
"CONV_2D_SK_P0",
|
3783
|
+
"CONV_1D",
|
3784
|
+
"CONV_2D",
|
3750
3785
|
|
3751
3786
|
"FLASH_ATTN",
|
3752
3787
|
"FLASH_FF",
|
@@ -3765,7 +3800,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3765
3800
|
"CROSS_ENTROPY_LOSS_BACK",
|
3766
3801
|
};
|
3767
3802
|
|
3768
|
-
static_assert(GGML_OP_COUNT ==
|
3803
|
+
static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
|
3769
3804
|
|
3770
3805
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
3771
3806
|
"none",
|
@@ -3783,12 +3818,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3783
3818
|
"Σx",
|
3784
3819
|
"Σx_k",
|
3785
3820
|
"Σx/n",
|
3821
|
+
"argmax(x)",
|
3786
3822
|
"repeat(x)",
|
3787
3823
|
"repeat_back(x)",
|
3788
3824
|
"abs(x)",
|
3789
3825
|
"sgn(x)",
|
3790
3826
|
"-x",
|
3791
3827
|
"step(x)",
|
3828
|
+
"tanh(x)",
|
3829
|
+
"elu(x)",
|
3792
3830
|
"relu(x)",
|
3793
3831
|
"gelu(x)",
|
3794
3832
|
"gelu_quick(x)",
|
@@ -3820,9 +3858,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3820
3858
|
"rope_back(x)",
|
3821
3859
|
"alibi(x)",
|
3822
3860
|
"clamp(x)",
|
3823
|
-
"
|
3824
|
-
"
|
3825
|
-
"conv_2d_sk_p0(x)",
|
3861
|
+
"conv_1d(x)",
|
3862
|
+
"conv_2d(x)",
|
3826
3863
|
|
3827
3864
|
"flash_attn(x)",
|
3828
3865
|
"flash_ff(x)",
|
@@ -3841,11 +3878,45 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3841
3878
|
"cross_entropy_loss_back(x,y)",
|
3842
3879
|
};
|
3843
3880
|
|
3844
|
-
static_assert(GGML_OP_COUNT ==
|
3881
|
+
static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
|
3845
3882
|
|
3846
3883
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
3847
3884
|
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
3848
3885
|
|
3886
|
+
// WARN:
|
3887
|
+
// Mis-confguration can lead to problem that's hard to reason about:
|
3888
|
+
// * At best it crash or talks nosense.
|
3889
|
+
// * At worst it talks slightly difference but hard to perceive.
|
3890
|
+
//
|
3891
|
+
// An op has to enable INIT or FINALIZE when any of it's branch needs that pass.
|
3892
|
+
// Take care about compile options (e.g., GGML_USE_xxx).
|
3893
|
+
static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
|
3894
|
+
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
|
3895
|
+
|
3896
|
+
static void ggml_setup_op_has_task_pass(void) {
|
3897
|
+
{ // INIT
|
3898
|
+
bool * p = GGML_OP_HAS_INIT;
|
3899
|
+
|
3900
|
+
p[GGML_OP_ACC ] = true;
|
3901
|
+
p[GGML_OP_MUL_MAT ] = true;
|
3902
|
+
p[GGML_OP_OUT_PROD ] = true;
|
3903
|
+
p[GGML_OP_SET ] = true;
|
3904
|
+
p[GGML_OP_GET_ROWS_BACK ] = true;
|
3905
|
+
p[GGML_OP_DIAG_MASK_INF ] = true;
|
3906
|
+
p[GGML_OP_DIAG_MASK_ZERO ] = true;
|
3907
|
+
p[GGML_OP_CONV_1D ] = true;
|
3908
|
+
p[GGML_OP_CONV_2D ] = true;
|
3909
|
+
p[GGML_OP_FLASH_ATTN_BACK ] = true;
|
3910
|
+
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
|
3911
|
+
}
|
3912
|
+
|
3913
|
+
{ // FINALIZE
|
3914
|
+
bool * p = GGML_OP_HAS_FINALIZE;
|
3915
|
+
|
3916
|
+
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
|
3917
|
+
}
|
3918
|
+
}
|
3919
|
+
|
3849
3920
|
//
|
3850
3921
|
// ggml context
|
3851
3922
|
//
|
@@ -4267,6 +4338,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
4267
4338
|
ggml_cl_init();
|
4268
4339
|
#endif
|
4269
4340
|
|
4341
|
+
ggml_setup_op_has_task_pass();
|
4342
|
+
|
4270
4343
|
is_first_call = false;
|
4271
4344
|
}
|
4272
4345
|
|
@@ -5403,6 +5476,30 @@ struct ggml_tensor * ggml_mean(
|
|
5403
5476
|
return result;
|
5404
5477
|
}
|
5405
5478
|
|
5479
|
+
// ggml_argmax
|
5480
|
+
|
5481
|
+
struct ggml_tensor * ggml_argmax(
|
5482
|
+
struct ggml_context * ctx,
|
5483
|
+
struct ggml_tensor * a) {
|
5484
|
+
GGML_ASSERT(ggml_is_matrix(a));
|
5485
|
+
bool is_node = false;
|
5486
|
+
|
5487
|
+
if (a->grad) {
|
5488
|
+
GGML_ASSERT(false);
|
5489
|
+
is_node = true;
|
5490
|
+
}
|
5491
|
+
|
5492
|
+
int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
|
5493
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
|
5494
|
+
|
5495
|
+
result->op = GGML_OP_ARGMAX;
|
5496
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5497
|
+
result->src0 = a;
|
5498
|
+
result->src1 = NULL;
|
5499
|
+
|
5500
|
+
return result;
|
5501
|
+
}
|
5502
|
+
|
5406
5503
|
// ggml_repeat
|
5407
5504
|
|
5408
5505
|
struct ggml_tensor * ggml_repeat(
|
@@ -5596,6 +5693,74 @@ struct ggml_tensor * ggml_step_inplace(
|
|
5596
5693
|
return ggml_step_impl(ctx, a, true);
|
5597
5694
|
}
|
5598
5695
|
|
5696
|
+
// ggml_tanh
|
5697
|
+
|
5698
|
+
struct ggml_tensor * ggml_tanh_impl(
|
5699
|
+
struct ggml_context * ctx,
|
5700
|
+
struct ggml_tensor * a,
|
5701
|
+
bool inplace) {
|
5702
|
+
bool is_node = false;
|
5703
|
+
|
5704
|
+
if (!inplace && (a->grad)) {
|
5705
|
+
is_node = true;
|
5706
|
+
}
|
5707
|
+
|
5708
|
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
5709
|
+
|
5710
|
+
result->op = GGML_OP_TANH;
|
5711
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5712
|
+
result->src0 = a;
|
5713
|
+
result->src1 = NULL;
|
5714
|
+
|
5715
|
+
return result;
|
5716
|
+
}
|
5717
|
+
|
5718
|
+
struct ggml_tensor * ggml_tanh(
|
5719
|
+
struct ggml_context * ctx,
|
5720
|
+
struct ggml_tensor * a) {
|
5721
|
+
return ggml_tanh_impl(ctx, a, false);
|
5722
|
+
}
|
5723
|
+
|
5724
|
+
struct ggml_tensor * ggml_tanh_inplace(
|
5725
|
+
struct ggml_context * ctx,
|
5726
|
+
struct ggml_tensor * a) {
|
5727
|
+
return ggml_tanh_impl(ctx, a, true);
|
5728
|
+
}
|
5729
|
+
|
5730
|
+
// ggml_elu
|
5731
|
+
|
5732
|
+
struct ggml_tensor * ggml_elu_impl(
|
5733
|
+
struct ggml_context * ctx,
|
5734
|
+
struct ggml_tensor * a,
|
5735
|
+
bool inplace) {
|
5736
|
+
bool is_node = false;
|
5737
|
+
|
5738
|
+
if (!inplace && (a->grad)) {
|
5739
|
+
is_node = true;
|
5740
|
+
}
|
5741
|
+
|
5742
|
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
5743
|
+
|
5744
|
+
result->op = GGML_OP_ELU;
|
5745
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5746
|
+
result->src0 = a;
|
5747
|
+
result->src1 = NULL;
|
5748
|
+
|
5749
|
+
return result;
|
5750
|
+
}
|
5751
|
+
|
5752
|
+
struct ggml_tensor * ggml_elu(
|
5753
|
+
struct ggml_context * ctx,
|
5754
|
+
struct ggml_tensor * a) {
|
5755
|
+
return ggml_elu_impl(ctx, a, false);
|
5756
|
+
}
|
5757
|
+
|
5758
|
+
struct ggml_tensor * ggml_elu_inplace(
|
5759
|
+
struct ggml_context * ctx,
|
5760
|
+
struct ggml_tensor * a) {
|
5761
|
+
return ggml_elu_impl(ctx, a, true);
|
5762
|
+
}
|
5763
|
+
|
5599
5764
|
// ggml_relu
|
5600
5765
|
|
5601
5766
|
struct ggml_tensor * ggml_relu_impl(
|
@@ -6837,6 +7002,8 @@ struct ggml_tensor * ggml_rope_back(
|
|
6837
7002
|
int n_dims,
|
6838
7003
|
int mode) {
|
6839
7004
|
GGML_ASSERT(n_past >= 0);
|
7005
|
+
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
|
7006
|
+
|
6840
7007
|
bool is_node = false;
|
6841
7008
|
|
6842
7009
|
if (a->grad) {
|
@@ -6937,15 +7104,21 @@ struct ggml_tensor * ggml_clamp(
|
|
6937
7104
|
return result;
|
6938
7105
|
}
|
6939
7106
|
|
6940
|
-
//
|
7107
|
+
// ggml_conv_1d
|
7108
|
+
|
7109
|
+
static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
|
7110
|
+
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
|
7111
|
+
}
|
6941
7112
|
|
6942
|
-
struct ggml_tensor *
|
7113
|
+
GGML_API struct ggml_tensor * ggml_conv_1d(
|
6943
7114
|
struct ggml_context * ctx,
|
6944
7115
|
struct ggml_tensor * a,
|
6945
|
-
struct ggml_tensor * b
|
7116
|
+
struct ggml_tensor * b,
|
7117
|
+
int s0,
|
7118
|
+
int p0,
|
7119
|
+
int d0) {
|
6946
7120
|
GGML_ASSERT(ggml_is_matrix(b));
|
6947
7121
|
GGML_ASSERT(a->ne[1] == b->ne[1]);
|
6948
|
-
GGML_ASSERT(a->ne[3] == 1);
|
6949
7122
|
bool is_node = false;
|
6950
7123
|
|
6951
7124
|
if (a->grad || b->grad) {
|
@@ -6953,26 +7126,43 @@ struct ggml_tensor * ggml_conv_1d_s1_ph(
|
|
6953
7126
|
is_node = true;
|
6954
7127
|
}
|
6955
7128
|
|
6956
|
-
const int64_t ne[4] = {
|
6957
|
-
|
7129
|
+
const int64_t ne[4] = {
|
7130
|
+
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
|
7131
|
+
a->ne[2], 1, 1,
|
7132
|
+
};
|
7133
|
+
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
7134
|
+
|
7135
|
+
ggml_scratch_save(ctx);
|
7136
|
+
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
|
7137
|
+
((int32_t*)c->data)[0] = s0;
|
7138
|
+
((int32_t*)c->data)[1] = p0;
|
7139
|
+
((int32_t*)c->data)[2] = d0;
|
7140
|
+
ggml_scratch_load(ctx);
|
6958
7141
|
|
6959
|
-
result->op
|
7142
|
+
result->op = GGML_OP_CONV_1D;
|
6960
7143
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6961
7144
|
result->src0 = a;
|
6962
7145
|
result->src1 = b;
|
7146
|
+
result->opt[0] = c;
|
6963
7147
|
|
6964
7148
|
return result;
|
6965
7149
|
}
|
6966
7150
|
|
6967
|
-
//
|
7151
|
+
// ggml_conv_2d
|
6968
7152
|
|
6969
|
-
struct ggml_tensor
|
6970
|
-
|
6971
|
-
|
6972
|
-
|
6973
|
-
|
6974
|
-
|
6975
|
-
|
7153
|
+
struct ggml_tensor* ggml_conv_2d(
|
7154
|
+
struct ggml_context* ctx,
|
7155
|
+
struct ggml_tensor * a,
|
7156
|
+
struct ggml_tensor * b,
|
7157
|
+
int s0,
|
7158
|
+
int s1,
|
7159
|
+
int p0,
|
7160
|
+
int p1,
|
7161
|
+
int d0,
|
7162
|
+
int d1) {
|
7163
|
+
|
7164
|
+
GGML_ASSERT(b->ne[3] == 1);
|
7165
|
+
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
6976
7166
|
bool is_node = false;
|
6977
7167
|
|
6978
7168
|
if (a->grad || b->grad) {
|
@@ -6980,43 +7170,42 @@ struct ggml_tensor * ggml_conv_1d_s2_ph(
|
|
6980
7170
|
is_node = true;
|
6981
7171
|
}
|
6982
7172
|
|
6983
|
-
const int64_t ne[4] = {
|
6984
|
-
|
7173
|
+
const int64_t ne[4] = {
|
7174
|
+
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
|
7175
|
+
ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
|
7176
|
+
a->ne[3], 1,
|
7177
|
+
};
|
7178
|
+
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
7179
|
+
|
7180
|
+
ggml_scratch_save(ctx);
|
7181
|
+
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
|
7182
|
+
((int32_t*)c->data)[0] = s0;
|
7183
|
+
((int32_t*)c->data)[1] = s1;
|
7184
|
+
((int32_t*)c->data)[2] = p0;
|
7185
|
+
((int32_t*)c->data)[3] = p1;
|
7186
|
+
((int32_t*)c->data)[4] = d0;
|
7187
|
+
((int32_t*)c->data)[5] = d1;
|
7188
|
+
ggml_scratch_load(ctx);
|
6985
7189
|
|
6986
|
-
result->op
|
7190
|
+
result->op = GGML_OP_CONV_2D;
|
6987
7191
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6988
7192
|
result->src0 = a;
|
6989
7193
|
result->src1 = b;
|
7194
|
+
result->opt[0] = c;
|
6990
7195
|
|
6991
7196
|
return result;
|
7197
|
+
|
6992
7198
|
}
|
6993
7199
|
|
6994
|
-
//
|
7200
|
+
// ggml_conv_1d_ph
|
6995
7201
|
|
6996
|
-
struct ggml_tensor
|
7202
|
+
struct ggml_tensor* ggml_conv_1d_ph(
|
6997
7203
|
struct ggml_context * ctx,
|
6998
7204
|
struct ggml_tensor * a,
|
6999
|
-
struct ggml_tensor * b
|
7000
|
-
|
7001
|
-
|
7002
|
-
|
7003
|
-
GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
|
7004
|
-
bool is_node = false;
|
7005
|
-
|
7006
|
-
if (a->grad || b->grad) {
|
7007
|
-
GGML_ASSERT(false); // TODO: implement backward
|
7008
|
-
is_node = true;
|
7009
|
-
}
|
7010
|
-
|
7011
|
-
const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
|
7012
|
-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
7013
|
-
|
7014
|
-
result->op = GGML_OP_CONV_2D_SK_P0;
|
7015
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7016
|
-
result->src0 = a;
|
7017
|
-
result->src1 = b;
|
7018
|
-
|
7019
|
-
return result;
|
7205
|
+
struct ggml_tensor * b,
|
7206
|
+
int s,
|
7207
|
+
int d) {
|
7208
|
+
return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
|
7020
7209
|
}
|
7021
7210
|
|
7022
7211
|
// ggml_flash_attn
|
@@ -7566,25 +7755,7 @@ static void ggml_compute_forward_dup_f16(
|
|
7566
7755
|
return;
|
7567
7756
|
}
|
7568
7757
|
|
7569
|
-
|
7570
|
-
const int64_t ne01 = src0->ne[1];
|
7571
|
-
const int64_t ne02 = src0->ne[2];
|
7572
|
-
const int64_t ne03 = src0->ne[3];
|
7573
|
-
|
7574
|
-
const int64_t ne0 = dst->ne[0];
|
7575
|
-
const int64_t ne1 = dst->ne[1];
|
7576
|
-
const int64_t ne2 = dst->ne[2];
|
7577
|
-
const int64_t ne3 = dst->ne[3];
|
7578
|
-
|
7579
|
-
const size_t nb00 = src0->nb[0];
|
7580
|
-
const size_t nb01 = src0->nb[1];
|
7581
|
-
const size_t nb02 = src0->nb[2];
|
7582
|
-
const size_t nb03 = src0->nb[3];
|
7583
|
-
|
7584
|
-
const size_t nb0 = dst->nb[0];
|
7585
|
-
const size_t nb1 = dst->nb[1];
|
7586
|
-
const size_t nb2 = dst->nb[2];
|
7587
|
-
const size_t nb3 = dst->nb[3];
|
7758
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
7588
7759
|
|
7589
7760
|
const int ith = params->ith; // thread index
|
7590
7761
|
const int nth = params->nth; // number of threads
|
@@ -7657,8 +7828,8 @@ static void ggml_compute_forward_dup_f16(
|
|
7657
7828
|
id += ne00 * (ne01 - ir1);
|
7658
7829
|
}
|
7659
7830
|
}
|
7660
|
-
} else if (
|
7661
|
-
|
7831
|
+
} else if (type_traits[dst->type].from_float) {
|
7832
|
+
ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
|
7662
7833
|
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
7663
7834
|
|
7664
7835
|
size_t id = 0;
|
@@ -7855,25 +8026,7 @@ static void ggml_compute_forward_dup_f32(
|
|
7855
8026
|
return;
|
7856
8027
|
}
|
7857
8028
|
|
7858
|
-
|
7859
|
-
const int64_t ne01 = src0->ne[1];
|
7860
|
-
const int64_t ne02 = src0->ne[2];
|
7861
|
-
const int64_t ne03 = src0->ne[3];
|
7862
|
-
|
7863
|
-
const int64_t ne0 = dst->ne[0];
|
7864
|
-
const int64_t ne1 = dst->ne[1];
|
7865
|
-
const int64_t ne2 = dst->ne[2];
|
7866
|
-
const int64_t ne3 = dst->ne[3];
|
7867
|
-
|
7868
|
-
const size_t nb00 = src0->nb[0];
|
7869
|
-
const size_t nb01 = src0->nb[1];
|
7870
|
-
const size_t nb02 = src0->nb[2];
|
7871
|
-
const size_t nb03 = src0->nb[3];
|
7872
|
-
|
7873
|
-
const size_t nb0 = dst->nb[0];
|
7874
|
-
const size_t nb1 = dst->nb[1];
|
7875
|
-
const size_t nb2 = dst->nb[2];
|
7876
|
-
const size_t nb3 = dst->nb[3];
|
8029
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
7877
8030
|
|
7878
8031
|
const int ith = params->ith; // thread index
|
7879
8032
|
const int nth = params->nth; // number of threads
|
@@ -7928,26 +8081,8 @@ static void ggml_compute_forward_dup_f32(
|
|
7928
8081
|
id += rs * (ne01 - ir1);
|
7929
8082
|
}
|
7930
8083
|
}
|
7931
|
-
} else if (dst->type
|
7932
|
-
|
7933
|
-
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
|
7934
|
-
|
7935
|
-
for (int i03 = 0; i03 < ne03; i03++) {
|
7936
|
-
for (int i02 = 0; i02 < ne02; i02++) {
|
7937
|
-
id += ne00 * ir0;
|
7938
|
-
for (int i01 = ir0; i01 < ir1; i01++) {
|
7939
|
-
for (int i00 = 0; i00 < ne00; i00++) {
|
7940
|
-
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
7941
|
-
|
7942
|
-
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
7943
|
-
id++;
|
7944
|
-
}
|
7945
|
-
}
|
7946
|
-
id += ne00 * (ne01 - ir1);
|
7947
|
-
}
|
7948
|
-
}
|
7949
|
-
} else if (ggml_is_quantized(dst->type)) {
|
7950
|
-
quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
|
8084
|
+
} else if (type_traits[dst->type].from_float) {
|
8085
|
+
ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
|
7951
8086
|
|
7952
8087
|
size_t id = 0;
|
7953
8088
|
size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
|
@@ -8171,24 +8306,8 @@ static void ggml_compute_forward_add_f32(
|
|
8171
8306
|
const int nth = params->nth;
|
8172
8307
|
|
8173
8308
|
const int nr = ggml_nrows(src0);
|
8174
|
-
const int64_t ne0 = src0->ne[0];
|
8175
|
-
const int64_t ne1 = src0->ne[1];
|
8176
|
-
const int64_t ne2 = src0->ne[2];
|
8177
|
-
|
8178
|
-
const size_t nb00 = src0->nb[0];
|
8179
|
-
const size_t nb01 = src0->nb[1];
|
8180
|
-
const size_t nb02 = src0->nb[2];
|
8181
|
-
const size_t nb03 = src0->nb[3];
|
8182
|
-
|
8183
|
-
const size_t nb10 = src1->nb[0];
|
8184
|
-
const size_t nb11 = src1->nb[1];
|
8185
|
-
const size_t nb12 = src1->nb[2];
|
8186
|
-
const size_t nb13 = src1->nb[3];
|
8187
8309
|
|
8188
|
-
|
8189
|
-
const size_t nb1 = dst->nb[1];
|
8190
|
-
const size_t nb2 = dst->nb[2];
|
8191
|
-
const size_t nb3 = dst->nb[3];
|
8310
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
8192
8311
|
|
8193
8312
|
GGML_ASSERT( nb0 == sizeof(float));
|
8194
8313
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -8257,28 +8376,12 @@ static void ggml_compute_forward_add_f16_f32(
|
|
8257
8376
|
const int nth = params->nth;
|
8258
8377
|
|
8259
8378
|
const int nr = ggml_nrows(src0);
|
8260
|
-
const int64_t ne0 = src0->ne[0];
|
8261
|
-
const int64_t ne1 = src0->ne[1];
|
8262
|
-
const int64_t ne2 = src0->ne[2];
|
8263
8379
|
|
8264
|
-
|
8265
|
-
const size_t nb01 = src0->nb[1];
|
8266
|
-
const size_t nb02 = src0->nb[2];
|
8267
|
-
const size_t nb03 = src0->nb[3];
|
8268
|
-
|
8269
|
-
const size_t nb10 = src1->nb[0];
|
8270
|
-
const size_t nb11 = src1->nb[1];
|
8271
|
-
const size_t nb12 = src1->nb[2];
|
8272
|
-
const size_t nb13 = src1->nb[3];
|
8273
|
-
|
8274
|
-
const size_t nb0 = dst->nb[0];
|
8275
|
-
const size_t nb1 = dst->nb[1];
|
8276
|
-
const size_t nb2 = dst->nb[2];
|
8277
|
-
const size_t nb3 = dst->nb[3];
|
8380
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
8278
8381
|
|
8279
8382
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
8280
8383
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
8281
|
-
GGML_ASSERT(dst->type
|
8384
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F16);
|
8282
8385
|
|
8283
8386
|
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
|
8284
8387
|
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
@@ -8327,24 +8430,8 @@ static void ggml_compute_forward_add_f16_f16(
|
|
8327
8430
|
const int nth = params->nth;
|
8328
8431
|
|
8329
8432
|
const int nr = ggml_nrows(src0);
|
8330
|
-
const int64_t ne0 = src0->ne[0];
|
8331
|
-
const int64_t ne1 = src0->ne[1];
|
8332
|
-
const int64_t ne2 = src0->ne[2];
|
8333
|
-
|
8334
|
-
const size_t nb00 = src0->nb[0];
|
8335
|
-
const size_t nb01 = src0->nb[1];
|
8336
|
-
const size_t nb02 = src0->nb[2];
|
8337
|
-
const size_t nb03 = src0->nb[3];
|
8338
8433
|
|
8339
|
-
|
8340
|
-
const size_t nb11 = src1->nb[1];
|
8341
|
-
const size_t nb12 = src1->nb[2];
|
8342
|
-
const size_t nb13 = src1->nb[3];
|
8343
|
-
|
8344
|
-
const size_t nb0 = dst->nb[0];
|
8345
|
-
const size_t nb1 = dst->nb[1];
|
8346
|
-
const size_t nb2 = dst->nb[2];
|
8347
|
-
const size_t nb3 = dst->nb[3];
|
8434
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
8348
8435
|
|
8349
8436
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
8350
8437
|
GGML_ASSERT(src1->type == GGML_TYPE_F16);
|
@@ -8394,32 +8481,15 @@ static void ggml_compute_forward_add_q_f32(
|
|
8394
8481
|
}
|
8395
8482
|
|
8396
8483
|
const int nr = ggml_nrows(src0);
|
8397
|
-
const int64_t ne00 = src0->ne[0];
|
8398
|
-
const int64_t ne01 = src0->ne[1];
|
8399
|
-
const int64_t ne02 = src0->ne[2];
|
8400
|
-
//const int64_t ne03 = src0->ne[3];
|
8401
|
-
|
8402
|
-
const size_t nb00 = src0->nb[0];
|
8403
|
-
const size_t nb01 = src0->nb[1];
|
8404
|
-
const size_t nb02 = src0->nb[2];
|
8405
|
-
const size_t nb03 = src0->nb[3];
|
8406
8484
|
|
8407
|
-
|
8408
|
-
const size_t nb11 = src1->nb[1];
|
8409
|
-
const size_t nb12 = src1->nb[2];
|
8410
|
-
const size_t nb13 = src1->nb[3];
|
8411
|
-
|
8412
|
-
const size_t nb0 = dst->nb[0];
|
8413
|
-
const size_t nb1 = dst->nb[1];
|
8414
|
-
const size_t nb2 = dst->nb[2];
|
8415
|
-
const size_t nb3 = dst->nb[3];
|
8485
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
8416
8486
|
|
8417
8487
|
const int ith = params->ith;
|
8418
8488
|
const int nth = params->nth;
|
8419
8489
|
|
8420
8490
|
const enum ggml_type type = src0->type;
|
8421
|
-
|
8422
|
-
|
8491
|
+
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
|
8492
|
+
ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
|
8423
8493
|
|
8424
8494
|
// we don't support permuted src0 or src1
|
8425
8495
|
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
|
@@ -8533,19 +8603,8 @@ static void ggml_compute_forward_add1_f32(
|
|
8533
8603
|
const int nth = params->nth;
|
8534
8604
|
|
8535
8605
|
const int nr = ggml_nrows(src0);
|
8536
|
-
const int64_t ne0 = src0->ne[0];
|
8537
|
-
const int64_t ne1 = src0->ne[1];
|
8538
|
-
const int64_t ne2 = src0->ne[2];
|
8539
|
-
|
8540
|
-
const size_t nb00 = src0->nb[0];
|
8541
|
-
const size_t nb01 = src0->nb[1];
|
8542
|
-
const size_t nb02 = src0->nb[2];
|
8543
|
-
const size_t nb03 = src0->nb[3];
|
8544
8606
|
|
8545
|
-
|
8546
|
-
const size_t nb1 = dst->nb[1];
|
8547
|
-
const size_t nb2 = dst->nb[2];
|
8548
|
-
const size_t nb3 = dst->nb[3];
|
8607
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
8549
8608
|
|
8550
8609
|
GGML_ASSERT( nb0 == sizeof(float));
|
8551
8610
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -8599,23 +8658,12 @@ static void ggml_compute_forward_add1_f16_f32(
|
|
8599
8658
|
const int nth = params->nth;
|
8600
8659
|
|
8601
8660
|
const int nr = ggml_nrows(src0);
|
8602
|
-
const int64_t ne0 = src0->ne[0];
|
8603
|
-
const int64_t ne1 = src0->ne[1];
|
8604
|
-
const int64_t ne2 = src0->ne[2];
|
8605
8661
|
|
8606
|
-
|
8607
|
-
const size_t nb01 = src0->nb[1];
|
8608
|
-
const size_t nb02 = src0->nb[2];
|
8609
|
-
const size_t nb03 = src0->nb[3];
|
8610
|
-
|
8611
|
-
const size_t nb0 = dst->nb[0];
|
8612
|
-
const size_t nb1 = dst->nb[1];
|
8613
|
-
const size_t nb2 = dst->nb[2];
|
8614
|
-
const size_t nb3 = dst->nb[3];
|
8662
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
8615
8663
|
|
8616
8664
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
8617
8665
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
8618
|
-
GGML_ASSERT(dst->type
|
8666
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F16);
|
8619
8667
|
|
8620
8668
|
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
|
8621
8669
|
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
@@ -8660,23 +8708,12 @@ static void ggml_compute_forward_add1_f16_f16(
|
|
8660
8708
|
const int nth = params->nth;
|
8661
8709
|
|
8662
8710
|
const int nr = ggml_nrows(src0);
|
8663
|
-
const int64_t ne0 = src0->ne[0];
|
8664
|
-
const int64_t ne1 = src0->ne[1];
|
8665
|
-
const int64_t ne2 = src0->ne[2];
|
8666
|
-
|
8667
|
-
const size_t nb00 = src0->nb[0];
|
8668
|
-
const size_t nb01 = src0->nb[1];
|
8669
|
-
const size_t nb02 = src0->nb[2];
|
8670
|
-
const size_t nb03 = src0->nb[3];
|
8671
8711
|
|
8672
|
-
|
8673
|
-
const size_t nb1 = dst->nb[1];
|
8674
|
-
const size_t nb2 = dst->nb[2];
|
8675
|
-
const size_t nb3 = dst->nb[3];
|
8712
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
8676
8713
|
|
8677
8714
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
8678
8715
|
GGML_ASSERT(src1->type == GGML_TYPE_F16);
|
8679
|
-
GGML_ASSERT(dst->type
|
8716
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F16);
|
8680
8717
|
|
8681
8718
|
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
|
8682
8719
|
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
@@ -8721,23 +8758,12 @@ static void ggml_compute_forward_add1_q_f32(
|
|
8721
8758
|
const int nth = params->nth;
|
8722
8759
|
|
8723
8760
|
const int nr = ggml_nrows(src0);
|
8724
|
-
const int64_t ne0 = src0->ne[0];
|
8725
|
-
const int64_t ne1 = src0->ne[1];
|
8726
|
-
const int64_t ne2 = src0->ne[2];
|
8727
8761
|
|
8728
|
-
|
8729
|
-
const size_t nb01 = src0->nb[1];
|
8730
|
-
const size_t nb02 = src0->nb[2];
|
8731
|
-
const size_t nb03 = src0->nb[3];
|
8732
|
-
|
8733
|
-
const size_t nb0 = dst->nb[0];
|
8734
|
-
const size_t nb1 = dst->nb[1];
|
8735
|
-
const size_t nb2 = dst->nb[2];
|
8736
|
-
const size_t nb3 = dst->nb[3];
|
8762
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
8737
8763
|
|
8738
8764
|
const enum ggml_type type = src0->type;
|
8739
|
-
|
8740
|
-
|
8765
|
+
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
|
8766
|
+
ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
|
8741
8767
|
|
8742
8768
|
// we don't support permuted src0
|
8743
8769
|
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
|
@@ -8865,15 +8891,8 @@ static void ggml_compute_forward_acc_f32(
|
|
8865
8891
|
const int nr = ggml_nrows(src1);
|
8866
8892
|
const int nc = src1->ne[0];
|
8867
8893
|
|
8868
|
-
|
8869
|
-
|
8870
|
-
const int64_t ne12 = src1->ne[2];
|
8871
|
-
const int64_t ne13 = src1->ne[3];
|
8872
|
-
|
8873
|
-
const size_t nb10 = src1->nb[0];
|
8874
|
-
const size_t nb11 = src1->nb[1];
|
8875
|
-
const size_t nb12 = src1->nb[2];
|
8876
|
-
const size_t nb13 = src1->nb[3];
|
8894
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
8895
|
+
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
|
8877
8896
|
|
8878
8897
|
// src0 and dst as viewed during acc
|
8879
8898
|
const size_t nb0 = ggml_element_size(src0);
|
@@ -8962,24 +8981,8 @@ static void ggml_compute_forward_sub_f32(
|
|
8962
8981
|
}
|
8963
8982
|
|
8964
8983
|
const int nr = ggml_nrows(src0);
|
8965
|
-
const int64_t ne0 = src0->ne[0];
|
8966
|
-
const int64_t ne1 = src0->ne[1];
|
8967
|
-
const int64_t ne2 = src0->ne[2];
|
8968
|
-
|
8969
|
-
const size_t nb00 = src0->nb[0];
|
8970
|
-
const size_t nb01 = src0->nb[1];
|
8971
|
-
const size_t nb02 = src0->nb[2];
|
8972
|
-
const size_t nb03 = src0->nb[3];
|
8973
8984
|
|
8974
|
-
|
8975
|
-
const size_t nb11 = src1->nb[1];
|
8976
|
-
const size_t nb12 = src1->nb[2];
|
8977
|
-
const size_t nb13 = src1->nb[3];
|
8978
|
-
|
8979
|
-
const size_t nb0 = dst->nb[0];
|
8980
|
-
const size_t nb1 = dst->nb[1];
|
8981
|
-
const size_t nb2 = dst->nb[2];
|
8982
|
-
const size_t nb3 = dst->nb[3];
|
8985
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
8983
8986
|
|
8984
8987
|
GGML_ASSERT( nb0 == sizeof(float));
|
8985
8988
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -9069,29 +9072,7 @@ static void ggml_compute_forward_mul_f32(
|
|
9069
9072
|
|
9070
9073
|
const int64_t nr = ggml_nrows(src0);
|
9071
9074
|
|
9072
|
-
|
9073
|
-
const int64_t ne01 = src0->ne[1];
|
9074
|
-
const int64_t ne02 = src0->ne[2];
|
9075
|
-
|
9076
|
-
const int64_t ne10 = src1->ne[0];
|
9077
|
-
const int64_t ne11 = src1->ne[1];
|
9078
|
-
const int64_t ne12 = src1->ne[2];
|
9079
|
-
const int64_t ne13 = src1->ne[3];
|
9080
|
-
|
9081
|
-
const size_t nb00 = src0->nb[0];
|
9082
|
-
const size_t nb01 = src0->nb[1];
|
9083
|
-
const size_t nb02 = src0->nb[2];
|
9084
|
-
const size_t nb03 = src0->nb[3];
|
9085
|
-
|
9086
|
-
const size_t nb10 = src1->nb[0];
|
9087
|
-
const size_t nb11 = src1->nb[1];
|
9088
|
-
const size_t nb12 = src1->nb[2];
|
9089
|
-
const size_t nb13 = src1->nb[3];
|
9090
|
-
|
9091
|
-
const size_t nb0 = dst->nb[0];
|
9092
|
-
const size_t nb1 = dst->nb[1];
|
9093
|
-
const size_t nb2 = dst->nb[2];
|
9094
|
-
const size_t nb3 = dst->nb[3];
|
9075
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
9095
9076
|
|
9096
9077
|
GGML_ASSERT( nb0 == sizeof(float));
|
9097
9078
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -9179,24 +9160,8 @@ static void ggml_compute_forward_div_f32(
|
|
9179
9160
|
}
|
9180
9161
|
|
9181
9162
|
const int nr = ggml_nrows(src0);
|
9182
|
-
const int64_t ne0 = src0->ne[0];
|
9183
|
-
const int64_t ne1 = src0->ne[1];
|
9184
|
-
const int64_t ne2 = src0->ne[2];
|
9185
|
-
|
9186
|
-
const size_t nb00 = src0->nb[0];
|
9187
|
-
const size_t nb01 = src0->nb[1];
|
9188
|
-
const size_t nb02 = src0->nb[2];
|
9189
|
-
const size_t nb03 = src0->nb[3];
|
9190
|
-
|
9191
|
-
const size_t nb10 = src1->nb[0];
|
9192
|
-
const size_t nb11 = src1->nb[1];
|
9193
|
-
const size_t nb12 = src1->nb[2];
|
9194
|
-
const size_t nb13 = src1->nb[3];
|
9195
9163
|
|
9196
|
-
|
9197
|
-
const size_t nb1 = dst->nb[1];
|
9198
|
-
const size_t nb2 = dst->nb[2];
|
9199
|
-
const size_t nb3 = dst->nb[3];
|
9164
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
9200
9165
|
|
9201
9166
|
GGML_ASSERT( nb0 == sizeof(float));
|
9202
9167
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -9403,14 +9368,8 @@ static void ggml_compute_forward_sum_f32(
|
|
9403
9368
|
assert(ggml_is_scalar(dst));
|
9404
9369
|
assert(src0->nb[0] == sizeof(float));
|
9405
9370
|
|
9406
|
-
|
9407
|
-
|
9408
|
-
const int64_t ne02 = src0->ne[2];
|
9409
|
-
const int64_t ne03 = src0->ne[3];
|
9410
|
-
|
9411
|
-
const size_t nb01 = src0->nb[1];
|
9412
|
-
const size_t nb02 = src0->nb[2];
|
9413
|
-
const size_t nb03 = src0->nb[3];
|
9371
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
9372
|
+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb);
|
9414
9373
|
|
9415
9374
|
ggml_float sum = 0;
|
9416
9375
|
ggml_float row_sum = 0;
|
@@ -9459,29 +9418,13 @@ static void ggml_compute_forward_sum_rows_f32(
|
|
9459
9418
|
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
9460
9419
|
GGML_ASSERT(dst->nb[0] == sizeof(float));
|
9461
9420
|
|
9462
|
-
|
9463
|
-
const int64_t ne01 = src0->ne[1];
|
9464
|
-
const int64_t ne02 = src0->ne[2];
|
9465
|
-
const int64_t ne03 = src0->ne[3];
|
9466
|
-
|
9467
|
-
const int64_t ne0 = dst->ne[0];
|
9468
|
-
const int64_t ne1 = dst->ne[1];
|
9469
|
-
const int64_t ne2 = dst->ne[2];
|
9470
|
-
const int64_t ne3 = dst->ne[3];
|
9421
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
9471
9422
|
|
9472
9423
|
GGML_ASSERT(ne0 == 1);
|
9473
9424
|
GGML_ASSERT(ne1 == ne01);
|
9474
9425
|
GGML_ASSERT(ne2 == ne02);
|
9475
9426
|
GGML_ASSERT(ne3 == ne03);
|
9476
9427
|
|
9477
|
-
const size_t nb01 = src0->nb[1];
|
9478
|
-
const size_t nb02 = src0->nb[2];
|
9479
|
-
const size_t nb03 = src0->nb[3];
|
9480
|
-
|
9481
|
-
const size_t nb1 = dst->nb[1];
|
9482
|
-
const size_t nb2 = dst->nb[2];
|
9483
|
-
const size_t nb3 = dst->nb[3];
|
9484
|
-
|
9485
9428
|
for (int64_t i3 = 0; i3 < ne03; i3++) {
|
9486
9429
|
for (int64_t i2 = 0; i2 < ne02; i2++) {
|
9487
9430
|
for (int64_t i1 = 0; i1 < ne01; i1++) {
|
@@ -9525,19 +9468,7 @@ static void ggml_compute_forward_mean_f32(
|
|
9525
9468
|
|
9526
9469
|
assert(src0->nb[0] == sizeof(float));
|
9527
9470
|
|
9528
|
-
|
9529
|
-
const int64_t ne01 = src0->ne[1];
|
9530
|
-
const int64_t ne02 = src0->ne[2];
|
9531
|
-
const int64_t ne03 = src0->ne[3];
|
9532
|
-
|
9533
|
-
const size_t nb01 = src0->nb[1];
|
9534
|
-
const size_t nb02 = src0->nb[2];
|
9535
|
-
const size_t nb03 = src0->nb[3];
|
9536
|
-
|
9537
|
-
const int64_t ne0 = dst->ne[0];
|
9538
|
-
const int64_t ne1 = dst->ne[1];
|
9539
|
-
const int64_t ne2 = dst->ne[2];
|
9540
|
-
const int64_t ne3 = dst->ne[3];
|
9471
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
9541
9472
|
|
9542
9473
|
assert(ne0 == 1);
|
9543
9474
|
assert(ne1 == ne01);
|
@@ -9549,10 +9480,6 @@ static void ggml_compute_forward_mean_f32(
|
|
9549
9480
|
UNUSED(ne2);
|
9550
9481
|
UNUSED(ne3);
|
9551
9482
|
|
9552
|
-
const size_t nb1 = dst->nb[1];
|
9553
|
-
const size_t nb2 = dst->nb[2];
|
9554
|
-
const size_t nb3 = dst->nb[3];
|
9555
|
-
|
9556
9483
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
9557
9484
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
9558
9485
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
@@ -9582,38 +9509,66 @@ static void ggml_compute_forward_mean(
|
|
9582
9509
|
}
|
9583
9510
|
}
|
9584
9511
|
|
9585
|
-
//
|
9512
|
+
// ggml_compute_forward_argmax
|
9586
9513
|
|
9587
|
-
static void
|
9514
|
+
static void ggml_compute_forward_argmax_f32(
|
9588
9515
|
const struct ggml_compute_params * params,
|
9589
9516
|
const struct ggml_tensor * src0,
|
9590
9517
|
struct ggml_tensor * dst) {
|
9591
|
-
|
9592
|
-
GGML_ASSERT(ggml_can_repeat(src0, dst));
|
9518
|
+
assert(params->ith == 0);
|
9593
9519
|
|
9594
9520
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9595
9521
|
return;
|
9596
9522
|
}
|
9597
9523
|
|
9598
|
-
|
9599
|
-
|
9600
|
-
const int64_t ne2 = dst->ne[2];
|
9601
|
-
const int64_t ne3 = dst->ne[3];
|
9524
|
+
assert(src0->nb[0] == sizeof(float));
|
9525
|
+
assert(dst->nb[0] == sizeof(float));
|
9602
9526
|
|
9603
9527
|
const int64_t ne00 = src0->ne[0];
|
9604
9528
|
const int64_t ne01 = src0->ne[1];
|
9605
|
-
const int64_t ne02 = src0->ne[2];
|
9606
|
-
const int64_t ne03 = src0->ne[3];
|
9607
|
-
|
9608
|
-
const size_t nb0 = dst->nb[0];
|
9609
|
-
const size_t nb1 = dst->nb[1];
|
9610
|
-
const size_t nb2 = dst->nb[2];
|
9611
|
-
const size_t nb3 = dst->nb[3];
|
9612
9529
|
|
9613
|
-
const size_t nb00 = src0->nb[0];
|
9614
9530
|
const size_t nb01 = src0->nb[1];
|
9615
|
-
const size_t
|
9616
|
-
|
9531
|
+
const size_t nb0 = dst->nb[0];
|
9532
|
+
|
9533
|
+
for (int64_t i1 = 0; i1 < ne01; i1++) {
|
9534
|
+
float * src = (float *) ((char *) src0->data + i1*nb01);
|
9535
|
+
int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0);
|
9536
|
+
int v = 0;
|
9537
|
+
ggml_vec_argmax_f32(ne00, &v, src);
|
9538
|
+
dst_[0] = v;
|
9539
|
+
}
|
9540
|
+
}
|
9541
|
+
|
9542
|
+
static void ggml_compute_forward_argmax(
|
9543
|
+
const struct ggml_compute_params * params,
|
9544
|
+
const struct ggml_tensor * src0,
|
9545
|
+
struct ggml_tensor * dst) {
|
9546
|
+
switch (src0->type) {
|
9547
|
+
case GGML_TYPE_F32:
|
9548
|
+
{
|
9549
|
+
ggml_compute_forward_argmax_f32(params, src0, dst);
|
9550
|
+
} break;
|
9551
|
+
default:
|
9552
|
+
{
|
9553
|
+
GGML_ASSERT(false);
|
9554
|
+
} break;
|
9555
|
+
}
|
9556
|
+
}
|
9557
|
+
|
9558
|
+
// ggml_compute_forward_repeat
|
9559
|
+
|
9560
|
+
static void ggml_compute_forward_repeat_f32(
|
9561
|
+
const struct ggml_compute_params * params,
|
9562
|
+
const struct ggml_tensor * src0,
|
9563
|
+
struct ggml_tensor * dst) {
|
9564
|
+
GGML_ASSERT(params->ith == 0);
|
9565
|
+
GGML_ASSERT(ggml_can_repeat(src0, dst));
|
9566
|
+
|
9567
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9568
|
+
return;
|
9569
|
+
}
|
9570
|
+
|
9571
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
9617
9572
|
|
9618
9573
|
// guaranteed to be an integer due to the check in ggml_can_repeat
|
9619
9574
|
const int nr0 = (int)(ne0/ne00);
|
@@ -9674,25 +9629,7 @@ static void ggml_compute_forward_repeat_back_f32(
|
|
9674
9629
|
return;
|
9675
9630
|
}
|
9676
9631
|
|
9677
|
-
|
9678
|
-
const int64_t ne1 = dst->ne[1];
|
9679
|
-
const int64_t ne2 = dst->ne[2];
|
9680
|
-
const int64_t ne3 = dst->ne[3];
|
9681
|
-
|
9682
|
-
const int64_t ne00 = src0->ne[0];
|
9683
|
-
const int64_t ne01 = src0->ne[1];
|
9684
|
-
const int64_t ne02 = src0->ne[2];
|
9685
|
-
const int64_t ne03 = src0->ne[3];
|
9686
|
-
|
9687
|
-
const size_t nb0 = dst->nb[0];
|
9688
|
-
const size_t nb1 = dst->nb[1];
|
9689
|
-
const size_t nb2 = dst->nb[2];
|
9690
|
-
const size_t nb3 = dst->nb[3];
|
9691
|
-
|
9692
|
-
const size_t nb00 = src0->nb[0];
|
9693
|
-
const size_t nb01 = src0->nb[1];
|
9694
|
-
const size_t nb02 = src0->nb[2];
|
9695
|
-
const size_t nb03 = src0->nb[3];
|
9632
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
9696
9633
|
|
9697
9634
|
// guaranteed to be an integer due to the check in ggml_can_repeat
|
9698
9635
|
const int nr0 = (int)(ne00/ne0);
|
@@ -9922,6 +9859,90 @@ static void ggml_compute_forward_step(
|
|
9922
9859
|
}
|
9923
9860
|
}
|
9924
9861
|
|
9862
|
+
// ggml_compute_forward_tanh
|
9863
|
+
|
9864
|
+
static void ggml_compute_forward_tanh_f32(
|
9865
|
+
const struct ggml_compute_params * params,
|
9866
|
+
const struct ggml_tensor * src0,
|
9867
|
+
struct ggml_tensor * dst) {
|
9868
|
+
assert(params->ith == 0);
|
9869
|
+
assert(ggml_are_same_shape(src0, dst));
|
9870
|
+
|
9871
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9872
|
+
return;
|
9873
|
+
}
|
9874
|
+
|
9875
|
+
const int n = ggml_nrows(src0);
|
9876
|
+
const int nc = src0->ne[0];
|
9877
|
+
|
9878
|
+
assert(dst->nb[0] == sizeof(float));
|
9879
|
+
assert(src0->nb[0] == sizeof(float));
|
9880
|
+
|
9881
|
+
for (int i = 0; i < n; i++) {
|
9882
|
+
ggml_vec_tanh_f32(nc,
|
9883
|
+
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
9884
|
+
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
9885
|
+
}
|
9886
|
+
}
|
9887
|
+
|
9888
|
+
static void ggml_compute_forward_tanh(
|
9889
|
+
const struct ggml_compute_params * params,
|
9890
|
+
const struct ggml_tensor * src0,
|
9891
|
+
struct ggml_tensor * dst) {
|
9892
|
+
switch (src0->type) {
|
9893
|
+
case GGML_TYPE_F32:
|
9894
|
+
{
|
9895
|
+
ggml_compute_forward_tanh_f32(params, src0, dst);
|
9896
|
+
} break;
|
9897
|
+
default:
|
9898
|
+
{
|
9899
|
+
GGML_ASSERT(false);
|
9900
|
+
} break;
|
9901
|
+
}
|
9902
|
+
}
|
9903
|
+
|
9904
|
+
// ggml_compute_forward_elu
|
9905
|
+
|
9906
|
+
static void ggml_compute_forward_elu_f32(
|
9907
|
+
const struct ggml_compute_params * params,
|
9908
|
+
const struct ggml_tensor * src0,
|
9909
|
+
struct ggml_tensor * dst) {
|
9910
|
+
assert(params->ith == 0);
|
9911
|
+
assert(ggml_are_same_shape(src0, dst));
|
9912
|
+
|
9913
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9914
|
+
return;
|
9915
|
+
}
|
9916
|
+
|
9917
|
+
const int n = ggml_nrows(src0);
|
9918
|
+
const int nc = src0->ne[0];
|
9919
|
+
|
9920
|
+
assert(dst->nb[0] == sizeof(float));
|
9921
|
+
assert(src0->nb[0] == sizeof(float));
|
9922
|
+
|
9923
|
+
for (int i = 0; i < n; i++) {
|
9924
|
+
ggml_vec_elu_f32(nc,
|
9925
|
+
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
9926
|
+
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
9927
|
+
}
|
9928
|
+
}
|
9929
|
+
|
9930
|
+
static void ggml_compute_forward_elu(
|
9931
|
+
const struct ggml_compute_params * params,
|
9932
|
+
const struct ggml_tensor * src0,
|
9933
|
+
struct ggml_tensor * dst) {
|
9934
|
+
switch (src0->type) {
|
9935
|
+
case GGML_TYPE_F32:
|
9936
|
+
{
|
9937
|
+
ggml_compute_forward_elu_f32(params, src0, dst);
|
9938
|
+
} break;
|
9939
|
+
default:
|
9940
|
+
{
|
9941
|
+
GGML_ASSERT(false);
|
9942
|
+
} break;
|
9943
|
+
}
|
9944
|
+
}
|
9945
|
+
|
9925
9946
|
// ggml_compute_forward_relu
|
9926
9947
|
|
9927
9948
|
static void ggml_compute_forward_relu_f32(
|
@@ -10223,18 +10244,7 @@ static void ggml_compute_forward_norm_f32(
|
|
10223
10244
|
const int ith = params->ith;
|
10224
10245
|
const int nth = params->nth;
|
10225
10246
|
|
10226
|
-
|
10227
|
-
const int64_t ne01 = src0->ne[1];
|
10228
|
-
const int64_t ne02 = src0->ne[2];
|
10229
|
-
const int64_t ne03 = src0->ne[3];
|
10230
|
-
|
10231
|
-
const size_t nb01 = src0->nb[1];
|
10232
|
-
const size_t nb02 = src0->nb[2];
|
10233
|
-
const size_t nb03 = src0->nb[3];
|
10234
|
-
|
10235
|
-
const size_t nb1 = dst->nb[1];
|
10236
|
-
const size_t nb2 = dst->nb[2];
|
10237
|
-
const size_t nb3 = dst->nb[3];
|
10247
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
10238
10248
|
|
10239
10249
|
const float eps = 1e-5f; // TODO: make this a parameter
|
10240
10250
|
|
@@ -10300,18 +10310,7 @@ static void ggml_compute_forward_rms_norm_f32(
|
|
10300
10310
|
const int ith = params->ith;
|
10301
10311
|
const int nth = params->nth;
|
10302
10312
|
|
10303
|
-
|
10304
|
-
const int64_t ne01 = src0->ne[1];
|
10305
|
-
const int64_t ne02 = src0->ne[2];
|
10306
|
-
const int64_t ne03 = src0->ne[3];
|
10307
|
-
|
10308
|
-
const size_t nb01 = src0->nb[1];
|
10309
|
-
const size_t nb02 = src0->nb[2];
|
10310
|
-
const size_t nb03 = src0->nb[3];
|
10311
|
-
|
10312
|
-
const size_t nb1 = dst->nb[1];
|
10313
|
-
const size_t nb2 = dst->nb[2];
|
10314
|
-
const size_t nb3 = dst->nb[3];
|
10313
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
10315
10314
|
|
10316
10315
|
const float eps = 1e-6f; // TODO: make this a parameter
|
10317
10316
|
|
@@ -10376,22 +10375,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
|
10376
10375
|
const int ith = params->ith;
|
10377
10376
|
const int nth = params->nth;
|
10378
10377
|
|
10379
|
-
|
10380
|
-
const int64_t ne01 = src0->ne[1];
|
10381
|
-
const int64_t ne02 = src0->ne[2];
|
10382
|
-
const int64_t ne03 = src0->ne[3];
|
10383
|
-
|
10384
|
-
const size_t nb01 = src0->nb[1];
|
10385
|
-
const size_t nb02 = src0->nb[2];
|
10386
|
-
const size_t nb03 = src0->nb[3];
|
10387
|
-
|
10388
|
-
const size_t nb11 = src1->nb[1];
|
10389
|
-
const size_t nb12 = src1->nb[2];
|
10390
|
-
const size_t nb13 = src1->nb[3];
|
10391
|
-
|
10392
|
-
const size_t nb1 = dst->nb[1];
|
10393
|
-
const size_t nb2 = dst->nb[2];
|
10394
|
-
const size_t nb3 = dst->nb[3];
|
10378
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
10395
10379
|
|
10396
10380
|
const float eps = 1e-6f; // TODO: make this a parameter
|
10397
10381
|
|
@@ -10541,416 +10525,45 @@ static void ggml_compute_forward_rms_norm_back(
             {
                 ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
             } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-
-// ggml_compute_forward_mul_mat
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
-static void ggml_compute_forward_mul_mat_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    const int64_t ne10 = src1->ne[0];
-#endif
-    const int64_t ne11 = src1->ne[1];
-#ifndef NDEBUG
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int nb00 = src0->nb[0];
-#endif
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-#ifndef NDEBUG
-    const int nb10 = src1->nb[0];
-#endif
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    assert(ne02 == ne12);
-    assert(ne03 == ne13);
-    assert(ne2 == ne12);
-    assert(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    assert(nb00 == sizeof(float));
-    assert(nb10 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    assert(nb0 == sizeof(float));
-    assert(nb0 <= nb1);
-    assert(nb1 <= nb2);
-    assert(nb2 <= nb3);
-
-    assert(ne0 == ne01);
-    assert(ne1 == ne11);
-    assert(ne2 == ne02);
-    assert(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    // compute by src0 rows
-
-#if defined(GGML_USE_CLBLAST)
-    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        if (params->ith != 0) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_INIT) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_FINALIZE) {
-            return;
-        }
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f, y, ne10,
-                              x, ne00,
-                        0.0f, d, ne01);
-            }
-        }
-        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by src0 rows using ggml_vec_dot_f32
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            // src1 indices
-            const int i13 = i03;
-            const int i12 = i02;
-            const int i11 = ic;
-
-            // dst indices
-            const int i0 = i01;
-            const int i1 = i11;
-            const int i2 = i02;
-            const int i3 = i03;
-
-            ggml_vec_dot_f32(ne00,
-                    (float *) ((char *)  dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
-                    (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
-        }
-    }
-
-    //int64_t t1 = ggml_perf_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-    //    printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
-
-static void ggml_compute_forward_mul_mat_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-    //const int64_t ne = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // TODO: we don't support permuted src0
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    // compute by src0 rows
-
-#if defined(GGML_USE_CLBLAST)
-    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        GGML_ASSERT(nb10 == sizeof(float));
-
-        if (params->ith != 0) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_INIT) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_FINALIZE) {
-            return;
-        }
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                float * const wdata = params->wdata;
-                {
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        for (int64_t i00 = 0; i00 < ne00; ++i00) {
-                            wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
-                        }
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                }
-
-                const float * x = wdata;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-                // zT = y * xT
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f, y, ne10,
-                              x, ne00,
-                        0.0f, d, ne01);
-            }
-        }
-
-        /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
-
-        return;
-    }
-#endif
-
-    if (params->type == GGML_TASK_INIT) {
-        ggml_fp16_t * const wdata = params->wdata;
-
-        size_t id = 0;
-        for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                    for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                        wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
-                    }
-                }
-            }
-        }
-
-        GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize);
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
     }
+}

-    // fp16 -> half the size, so divide by 2
-    // TODO: do not support transposed src1
-    assert(nb10/2 == sizeof(ggml_fp16_t));
-
-    // parallelize by src0 rows using ggml_vec_dot_f16
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    ggml_fp16_t * wdata = params->wdata;

-
-        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+// ggml_compute_forward_mul_mat

-
-
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+// helper function to determine if it is better to use BLAS or not
+// for large matrices, BLAS is faster
+static bool ggml_compute_forward_mul_mat_use_blas(
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    //const int64_t ne00 = src0->ne[0];
+    //const int64_t ne01 = src0->ne[1];

-
-        const int i2 = i02;
-        const int i3 = i03;
+    const int64_t ne10 = src1->ne[0];

-
-
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];

-
+    // TODO: find the optimal values for these
+    if (ggml_is_contiguous(src0) &&
+        ggml_is_contiguous(src1) &&
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {

-
-
-        }
+        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
+        return true;
     }

-
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
+    return false;
 }
+#endif

-static void
+static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -10958,35 +10571,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;

     const int ith = params->ith;
     const int nth = params->nth;
@@ -10997,12 +10582,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
     GGML_ASSERT(ne3 == ne13);

     const enum ggml_type type = src0->type;
-
-
-    enum ggml_type
+
+    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
+    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
+    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 ==
+    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
     GGML_ASSERT(nb10 == sizeof(float));

     // dst cannot be transposed or permuted
@@ -11042,27 +10628,27 @@ static void ggml_compute_forward_mul_mat_q_f32(
         return;
     }

-        float * const wdata = params->wdata;
-        dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
-
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
+                const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);

                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

-                {
+                if (type != GGML_TYPE_F32) {
+                    float * const wdata = params->wdata;
+                    ggml_to_float_t const to_float = type_traits[type].to_float;
+
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne01; ++i01) {
-
+                        to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }

                     assert(id*sizeof(float) <= params->wsize);
+                    x = wdata;
                 }

-                const float * x = wdata;
-
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f, y, ne10,
@@ -11078,14 +10664,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
 #endif

     if (params->type == GGML_TASK_INIT) {
-
-
-
-
-        for (int64_t
-            for (int64_t
-
-
+        if (src1->type != vec_dot_type) {
+            char * wdata = params->wdata;
+            const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+                        wdata += row_size;
+                    }
                 }
             }
         }
@@ -11109,7 +10697,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);

-    void * wdata = params->wdata;
+    void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

     for (int ir = ir0; ir < ir1; ++ir) {
@@ -11133,7 +10721,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
         assert(ne00 % 32 == 0);

         for (int64_t ic = 0; ic < ne11; ++ic) {
-
+            vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
         }
     }

@@ -11150,40 +10738,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
     //}
 }

-static void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            {
-                ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mul_mat_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}

 // ggml_compute_forward_out_prod

@@ -11196,35 +10750,7 @@ static void ggml_compute_forward_out_prod_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    //const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;

     const int ith = params->ith;
     const int nth = params->nth;
@@ -11459,15 +10985,8 @@ static void ggml_compute_forward_set_f32(
     const int nr = ggml_nrows(src1);
     const int nc = src1->ne[0];

-
-
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-    const size_t nb12 = src1->nb[2];
-    const size_t nb13 = src1->nb[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);

     // src0 and dst as viewed during set
     const size_t nb0 = ggml_element_size(src0);
@@ -11608,7 +11127,7 @@ static void ggml_compute_forward_get_rows_q(
     const int nc = src0->ne[0];
     const int nr = ggml_nelements(src1);
     const enum ggml_type type = src0->type;
-
+    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;

     assert( dst->ne[0] == nc);
     assert( dst->ne[1] == nr);
@@ -11858,29 +11377,14 @@ static void ggml_compute_forward_diag_f32(

     // TODO: handle transposed/permuted matrices

-
-
-    const int ne02 = src0->ne[2];
-    const int ne03 = src0->ne[3];
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    const int ne3 = dst->ne[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
     GGML_ASSERT(ne00 == ne0);
     GGML_ASSERT(ne00 == ne1);
     GGML_ASSERT(ne01 == 1);
     GGML_ASSERT(ne02 == ne2);
     GGML_ASSERT(ne03 == ne3);

-    const int nb00 = src0->nb[0];
-    //const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
     GGML_ASSERT(nb00 == sizeof(float));
     GGML_ASSERT(nb0 == sizeof(float));

@@ -12457,20 +11961,7 @@ static void ggml_compute_forward_rope_f32(

     assert(n_past >= 0);

-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;

     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12597,20 +12088,7 @@ static void ggml_compute_forward_rope_f16(

     assert(n_past >= 0);

-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
+    GGML_TENSOR_UNARY_OP_LOCALS;

     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12763,21 +12241,7 @@ static void ggml_compute_forward_rope_back_f32(

     assert(n_past >= 0);

-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
+    GGML_TENSOR_UNARY_OP_LOCALS;

     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12876,21 +12340,7 @@ static void ggml_compute_forward_rope_back_f16(

     assert(n_past >= 0);

-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
+    GGML_TENSOR_UNARY_OP_LOCALS;

     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@@ -12988,7 +12438,7 @@ static void ggml_compute_forward_rope_back(
     }
 }

-//
+// ggml_compute_forward_conv_1d

 static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
         const struct ggml_compute_params * params,
@@ -13002,36 +12452,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    //const int64_t ne12 = src1->ne[2];
-    //const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0 = dst->ne[0];
-    //const int64_t ne1 = dst->ne[1];
-    //const int64_t ne2 = dst->ne[2];
-    //const int64_t ne3 = dst->ne[3];
-    //const int64_t ne = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    //const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    //const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    //const int nb2 = dst->nb[2];
-    //const int nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;

     const int ith = params->ith;
     const int nth = params->nth;
@@ -13122,36 +12543,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    //const int64_t ne12 = src1->ne[2];
-    //const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0 = dst->ne[0];
-    //const int64_t ne1 = dst->ne[1];
-    //const int64_t ne2 = dst->ne[2];
-    //const int64_t ne3 = dst->ne[3];
-    //const int64_t ne = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    //const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    //const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    //const int nb2 = dst->nb[2];
-    //const int nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;

     const int ith = params->ith;
     const int nth = params->nth;
@@ -13251,8 +12643,6 @@ static void ggml_compute_forward_conv_1d_s1_ph(
     }
 }

-// ggml_compute_forward_conv_1d_s2_ph
-
 static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -13265,36 +12655,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    //const int64_t ne12 = src1->ne[2];
-    //const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0 = dst->ne[0];
-    //const int64_t ne1 = dst->ne[1];
-    //const int64_t ne2 = dst->ne[2];
-    //const int64_t ne3 = dst->ne[3];
-    //const int64_t ne = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    //const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    //const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    //const int nb2 = dst->nb[2];
-    //const int nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;

     const int ith = params->ith;
     const int nth = params->nth;
@@ -13385,36 +12746,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    //const int64_t ne12 = src1->ne[2];
-    //const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0 = dst->ne[0];
-    //const int64_t ne1 = dst->ne[1];
-    //const int64_t ne2 = dst->ne[2];
-    //const int64_t ne3 = dst->ne[3];
-    //const int64_t ne = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    //const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    //const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    //const int nb2 = dst->nb[2];
-    //const int nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;

     const int ith = params->ith;
     const int nth = params->nth;
@@ -13514,6 +12846,28 @@ static void ggml_compute_forward_conv_1d_s2_ph(
     }
 }

+// ggml_compute_forward_conv_1d
+
+static void ggml_compute_forward_conv_1d(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
+              struct ggml_tensor * dst) {
+    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
+    const int32_t p0 = ((const int32_t*)(opt0->data))[1];
+    const int32_t d0 = ((const int32_t*)(opt0->data))[2];
+    GGML_ASSERT(d0 == 1); // dilation not supported
+    GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported
+    if (s0 == 1) {
+        ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst);
+    } else if (s0 == 2) {
+        ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst);
+    } else {
+        GGML_ASSERT(false); // only stride 1 and 2 supported
+    };
+}
+
 // ggml_compute_forward_conv_2d_sk_p0

 static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
@@ -13528,36 +12882,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-    const int ne01 = src0->ne[1];
-    const int ne02 = src0->ne[2];
-    //const int ne03 = src0->ne[3];
-
-    const int ne10 = src1->ne[0];
-    //const int ne11 = src1->ne[1];
-    const int ne12 = src1->ne[2];
-    //const int ne13 = src1->ne[3];
-
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    //const int nb01 = src0->nb[1];
-    //const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    //const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0 = dst->nb[0];
-    //const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    //const int nb3 = dst->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;

     const int ith = params->ith;
     const int nth = params->nth;
@@ -13650,6 +12975,34 @@ static void ggml_compute_forward_conv_2d_sk_p0(
     }
 }

+// ggml_compute_forward_conv_2d
+
+static void ggml_compute_forward_conv_2d(
+        const struct ggml_compute_params* params,
+        const struct ggml_tensor* src0,
+        const struct ggml_tensor* src1,
+        const struct ggml_tensor* opt0,
+        struct ggml_tensor* dst) {
+    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
+    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
+    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
+    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
+    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
+    const int32_t d1 = ((const int32_t*)(opt0->data))[5];
+    GGML_ASSERT(d0 == 1); // dilation not supported
+    GGML_ASSERT(d1 == 1);
+    GGML_ASSERT(p0 == 0); // padding not supported
+    GGML_ASSERT(p1 == 0);
+
+    if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
+        ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
+    }
+    else {
+        GGML_ASSERT(false); // only stride equal to kernel size is supported
+    };
+}
+
+
 // ggml_compute_forward_flash_attn

 static void ggml_compute_forward_flash_attn_f32(
@@ -13662,45 +13015,14 @@ static void ggml_compute_forward_flash_attn_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-
-
-
-
-
-
-
-    //const int64_t nek3 = k->ne[3];
-
-    //const int64_t nev0 = v->ne[0];
-    const int64_t nev1 = v->ne[1];
-    //const int64_t nev2 = v->ne[2];
-    //const int64_t nev3 = v->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    //const int64_t ne2 = dst->ne[2];
-    //const int64_t ne3 = dst->ne[3];
-
-    const int nbk0 = k->nb[0];
-    const int nbk1 = k->nb[1];
-    const int nbk2 = k->nb[2];
-    const int nbk3 = k->nb[3];
-
-    const int nbq0 = q->nb[0];
-    const int nbq1 = q->nb[1];
-    const int nbq2 = q->nb[2];
-    const int nbq3 = q->nb[3];
-
-    const int nbv0 = v->nb[0];
-    const int nbv1 = v->nb[1];
-    const int nbv2 = v->nb[2];
-    const int nbv3 = v->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
+    GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
+    GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
+    GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
+    GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
+    GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb);

     const int ith = params->ith;
     const int nth = params->nth;
@@ -13871,45 +13193,14 @@ static void ggml_compute_forward_flash_attn_f16(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-
-
-
-
-
-
-
-    //const int64_t nek3 = k->ne[3];
-
-    //const int64_t nev0 = v->ne[0];
-    const int64_t nev1 = v->ne[1];
-    //const int64_t nev2 = v->ne[2];
-    //const int64_t nev3 = v->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    //const int64_t ne2 = dst->ne[2];
-    //const int64_t ne3 = dst->ne[3];
-
-    const int nbk0 = k->nb[0];
-    const int nbk1 = k->nb[1];
-    const int nbk2 = k->nb[2];
-    const int nbk3 = k->nb[3];
-
-    const int nbq0 = q->nb[0];
-    const int nbq1 = q->nb[1];
-    const int nbq2 = q->nb[2];
-    const int nbq3 = q->nb[3];
-
-    const int nbv0 = v->nb[0];
-    const int nbv1 = v->nb[1];
-    const int nbv2 = v->nb[2];
-    const int nbv3 = v->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
+    GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
+    GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
+    GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
+    GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
+    GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb);

     const int ith = params->ith;
     const int nth = params->nth;
@@ -14143,65 +13434,18 @@ static void ggml_compute_forward_flash_ff_f16(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-
-
-
-
-
-
-
-
-
-
-
-    //const int64_t neb12 = b1->ne[2];
-    //const int64_t neb13 = b1->ne[3];
-
-    const int64_t nec00 = c0->ne[0];
-    const int64_t nec01 = c0->ne[1];
-    //const int64_t nec02 = c0->ne[2];
-    //const int64_t nec03 = c0->ne[3];
-
-    const int64_t nec10 = c1->ne[0];
-    const int64_t nec11 = c1->ne[1];
-    //const int64_t nec12 = c1->ne[2];
-    //const int64_t nec13 = c1->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    //const int64_t ne3 = dst->ne[3];
-
-    const int nba0 = a->nb[0];
-    const int nba1 = a->nb[1];
-    const int nba2 = a->nb[2];
-    const int nba3 = a->nb[3];
-
-    const int nbb00 = b0->nb[0];
-    const int nbb01 = b0->nb[1];
-    const int nbb02 = b0->nb[2];
-    const int nbb03 = b0->nb[3];
-
-    const int nbb10 = b1->nb[0];
-    //const int nbb11 = b1->nb[1];
-    //const int nbb12 = b1->nb[2];
-    //const int nbb13 = b1->nb[3];
-
-    const int nbc00 = c0->nb[0];
-    const int nbc01 = c0->nb[1];
-    const int nbc02 = c0->nb[2];
-    const int nbc03 = c0->nb[3];
-
-    const int nbc10 = c1->nb[0];
-    //const int nbc11 = c1->nb[1];
-    //const int nbc12 = c1->nb[2];
-    //const int nbc13 = c1->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    GGML_TENSOR_LOCALS(int64_t, nea, a, ne);
+    GGML_TENSOR_LOCALS(size_t, nba, a, nb);
+    GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne);
+    GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb);
+    GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne);
+    GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb);
+    GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne);
+    GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb);
+    GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne);
+    GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb);
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb);

     const int ith = params->ith;
     const int nth = params->nth;
@@ -14349,55 +13593,16 @@ static void ggml_compute_forward_flash_attn_back_f32(
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);

-
-
-
-
-
-
-
-
-
-
-    const int64_t nev0 = v->ne[0];
-    const int64_t nev1 = v->ne[1];
-    //const int64_t nev2 = v->ne[2];
-    //const int64_t nev3 = v->ne[3];
-
-    const int64_t ned0 = d->ne[0];
-    const int64_t ned1 = d->ne[1];
-    //const int64_t ned2 = d->ne[2];
-    //const int64_t ned3 = d->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int nbk0 = k->nb[0];
-    const int nbk1 = k->nb[1];
-    const int nbk2 = k->nb[2];
-    const int nbk3 = k->nb[3];
-
-    const int nbq0 = q->nb[0];
-    const int nbq1 = q->nb[1];
-    const int nbq2 = q->nb[2];
-    const int nbq3 = q->nb[3];
-
-    const int nbv0 = v->nb[0];
-    const int nbv1 = v->nb[1];
-    const int nbv2 = v->nb[2];
-    const int nbv3 = v->nb[3];
-
-    const int nbd0 = d->nb[0];
-    const int nbd1 = d->nb[1];
-    const int nbd2 = d->nb[2];
-    const int nbd3 = d->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    GGML_TENSOR_LOCALS(int64_t, neq, q, ne);
+    GGML_TENSOR_LOCALS(size_t, nbq, q, nb);
+    GGML_TENSOR_LOCALS(int64_t, nek, k, ne);
+    GGML_TENSOR_LOCALS(size_t, nbk, k, nb);
+    GGML_TENSOR_LOCALS(int64_t, nev, v, ne);
+    GGML_TENSOR_LOCALS(size_t, nbv, v, nb);
+    GGML_TENSOR_LOCALS(int64_t, ned, d, ne);
+    GGML_TENSOR_LOCALS(size_t, nbd, d, nb);
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb);

     const int ith = params->ith;
     const int nth = params->nth;
@@ -14755,15 +13960,8 @@ static void ggml_compute_forward_win_part_f32(
         return;
     }

-
-
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);

     const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
     const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
@@ -14826,14 +14024,8 @@ static void ggml_compute_forward_win_unpart_f32(
         return;
     }

-
-
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);

     const int32_t w = ((const int32_t *)(opt0->data))[0];

@@ -15431,6 +14623,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_mean(params, tensor->src0, tensor);
             } break;
+        case GGML_OP_ARGMAX:
+            {
+                ggml_compute_forward_argmax(params, tensor->src0, tensor);
+            } break;
         case GGML_OP_REPEAT:
             {
                 ggml_compute_forward_repeat(params, tensor->src0, tensor);
@@ -15455,6 +14651,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_step(params, tensor->src0, tensor);
             } break;
+        case GGML_OP_TANH:
+            {
+                ggml_compute_forward_tanh(params, tensor->src0, tensor);
+            } break;
+        case GGML_OP_ELU:
+            {
+                ggml_compute_forward_elu(params, tensor->src0, tensor);
+            } break;
         case GGML_OP_RELU:
             {
                 ggml_compute_forward_relu(params, tensor->src0, tensor);
@@ -15571,17 +14775,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
             } break;
-        case
-            {
-                ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_CONV_1D_S2_PH:
+        case GGML_OP_CONV_1D:
             {
-
+                ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
             } break;
-        case
+        case GGML_OP_CONV_2D:
             {
-
+                ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
             } break;
         case GGML_OP_FLASH_ATTN:
             {
@@ -15830,6 +15030,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             }
         } break;
     case GGML_OP_MEAN:
+    case GGML_OP_ARGMAX:
         {
             GGML_ASSERT(false); // TODO: implement
         } break;
@@ -15883,6 +15084,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 // noop
             }
         } break;
+    case GGML_OP_TANH:
+        {
+            GGML_ASSERT(false); // TODO: not implemented
+        } break;
+    case GGML_OP_ELU:
+        {
+            GGML_ASSERT(false); // TODO: not implemented
+        } break;
     case GGML_OP_RELU:
         {
             if (src0->grad) {
@@ -15902,14 +15111,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         {
             GGML_ASSERT(false); // TODO: not implemented
         } break;
-    case GGML_OP_ALIBI:
-        {
-            GGML_ASSERT(false); // TODO: not implemented
-        } break;
-    case GGML_OP_CLAMP:
-        {
-            GGML_ASSERT(false); // TODO: not implemented
-        } break;
     case GGML_OP_SILU:
         {
             // necessary for llama
@@ -16226,7 +15427,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             // necessary for llama
             if (src0->grad) {
                 assert(src1->type == GGML_TYPE_I32);
-                assert(ggml_nelements(src1) ==
+                assert(ggml_nelements(src1) == 4);
                 const int n_past = ((int32_t *) src1->data)[0];
                 const int n_dims = ((int32_t *) src1->data)[1];
                 const int mode   = ((int32_t *) src1->data)[2];
@@ -16266,15 +15467,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 // noop
             }
         } break;
-    case
+    case GGML_OP_ALIBI:
+        {
+            GGML_ASSERT(false); // TODO: not implemented
+        } break;
+    case GGML_OP_CLAMP:
         {
             GGML_ASSERT(false); // TODO: not implemented
         } break;
-    case
+    case GGML_OP_CONV_1D:
         {
             GGML_ASSERT(false); // TODO: not implemented
         } break;
-    case
+    case GGML_OP_CONV_2D:
         {
             GGML_ASSERT(false); // TODO: not implemented
         } break;
@@ -16791,9 +15996,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     if (node_n != -1) {
         /* FINALIZE */
         struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
-
-
-
+        if (GGML_OP_HAS_FINALIZE[node->op]) {
+            params.nth = node->n_tasks;
+            ggml_compute_forward(&params, node);
+            ggml_graph_compute_perf_stats_node(node, state->shared);
+        }
     }

     // distribute new work or execute it direct if 1T
@@ -16805,10 +16012,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         state->shared->perf_node_start_cycles  = ggml_perf_cycles();
         state->shared->perf_node_start_time_us = ggml_perf_time_us();

+        params.nth = node->n_tasks;
+
         /* INIT */
-
-
-
+        if (GGML_OP_HAS_INIT[node->op]) {
+            params.type = GGML_TASK_INIT;
+            ggml_compute_forward(&params, node);
+        }

         if (node->n_tasks == 1) {
             // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
@@ -16816,9 +16026,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);

-
-
-
+            if (GGML_OP_HAS_FINALIZE[node->op]) {
+                params.type = GGML_TASK_FINALIZE;
+                ggml_compute_forward(&params, node);
+                ggml_graph_compute_perf_stats_node(node, state->shared);
+            }
         } else {
             break;
         }
@@ -16924,12 +16136,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_SUM:
             case GGML_OP_SUM_ROWS:
             case GGML_OP_MEAN:
+            case GGML_OP_ARGMAX:
             case GGML_OP_REPEAT:
             case GGML_OP_REPEAT_BACK:
             case GGML_OP_ABS:
            case GGML_OP_SGN:
            case GGML_OP_NEG:
            case GGML_OP_STEP:
+           case GGML_OP_TANH:
+           case GGML_OP_ELU:
            case GGML_OP_RELU:
                 {
                     node->n_tasks = 1;
@@ -16958,6 +16173,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);

                     size_t cur = 0;
+                    const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;

#if defined(GGML_USE_CUBLAS)
                    if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
@@ -16973,39 +16189,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                    }
                    else
#endif
-                    if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-
-
-
+                    if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
+                        node->n_tasks = 1; // TODO: this actually is doing nothing
+                                           //       the threads are still spinning
+                        if (node->src0->type != GGML_TYPE_F32) {
                            // here we need memory just for single 2D matrix from src0
                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
-                        } else {
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
-                        }
-#else
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
-#endif
-                    } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
-                        cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                        if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                            node->n_tasks = 1;
                        }
+                    } else
#endif
-
-
-                        if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                            node->n_tasks = 1;
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
-                        } else
-#endif
-                        {
-                            const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type;
-                            cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q];
-                        }
+                    if (node->src1->type != vec_dot_type) {
+                        cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type];
                    } else {
-
+                        cur = 0;
                    }

                    work_size = MAX(work_size, cur);
@@ -17043,8 +16240,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                {
                    node->n_tasks = 1; //TODO
                } break;
-            case
-            case GGML_OP_CONV_1D_S2_PH:
+            case GGML_OP_CONV_1D:
                {
                    node->n_tasks = n_threads;

@@ -17073,7 +16269,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)

                    work_size = MAX(work_size, cur);
                } break;
-            case
+            case GGML_OP_CONV_2D:
                {
                    node->n_tasks = n_threads;

@@ -17435,13 +16631,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                fwrite(&nb, sizeof(uint64_t), 1, fout);
            }

-            // store the pointer address
-            {
-                const uint64_t ptr = (uint64_t) tensor->data;
-
-                fwrite(&ptr, sizeof(uint64_t), 1, fout);
-            }
-
            fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);

            // dump the data
@@ -17475,13 +16664,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                fwrite(&nb, sizeof(uint64_t), 1, fout);
            }

-            // store the pointer address
-            {
-                const uint64_t ptr = (uint64_t) tensor->data;
-
-                fwrite(&ptr, sizeof(uint64_t), 1, fout);
-            }
-
            fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);

            // output the op arguments
@@ -17666,8 +16848,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **

                tensor->op = (enum ggml_op) op;

-                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
-
                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;

                tensor->data = (void *) ptr;
@@ -17713,8 +16893,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
                    nb[j] = nb_cur;
                }

-                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
-
                const char * ptr_name = ptr; ptr += GGML_MAX_NAME;

                const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);