llama_cpp 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +59 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -4
- data/vendor/tmp/llama.cpp/Makefile +2 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +4 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -21
- data/vendor/tmp/llama.cpp/ggml-backend.h +16 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +63 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +120 -75
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +178 -133
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3432 -1118
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1327 -773
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +227 -15
- data/vendor/tmp/llama.cpp/ggml.h +30 -4
- data/vendor/tmp/llama.cpp/llama.cpp +631 -211
- data/vendor/tmp/llama.cpp/llama.h +28 -10
- metadata +2 -2
|
@@ -10,6 +10,7 @@ extern "C" {
|
|
|
10
10
|
#define GGML_VK_NAME "Vulkan"
|
|
11
11
|
#define GGML_VK_MAX_DEVICES 16
|
|
12
12
|
|
|
13
|
+
GGML_API void ggml_vk_instance_init(void);
|
|
13
14
|
GGML_API void ggml_vk_init_cpu_assist(void);
|
|
14
15
|
|
|
15
16
|
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
|
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
|
@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
|
|
|
320
320
|
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
|
321
321
|
float ggml_table_f32_f16[1 << 16];
|
|
322
322
|
|
|
323
|
+
const char * ggml_status_to_string(enum ggml_status status) {
|
|
324
|
+
switch (status) {
|
|
325
|
+
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
|
326
|
+
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
|
|
327
|
+
case GGML_STATUS_SUCCESS: return "GGML status: success";
|
|
328
|
+
case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
return "GGML status: unknown";
|
|
332
|
+
}
|
|
333
|
+
|
|
323
334
|
// note: do not use these inside ggml.c
|
|
324
335
|
// these are meant to be used via the ggml.h API
|
|
325
336
|
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
|
@@ -1822,6 +1833,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
|
1822
1833
|
"POOL_2D",
|
|
1823
1834
|
"UPSCALE",
|
|
1824
1835
|
"PAD",
|
|
1836
|
+
"ARANGE",
|
|
1837
|
+
"TIMESTEP_EMBEDDING",
|
|
1825
1838
|
"ARGSORT",
|
|
1826
1839
|
"LEAKY_RELU",
|
|
1827
1840
|
|
|
@@ -1850,7 +1863,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
|
1850
1863
|
"CROSS_ENTROPY_LOSS_BACK",
|
|
1851
1864
|
};
|
|
1852
1865
|
|
|
1853
|
-
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
|
|
1866
|
+
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
|
|
1854
1867
|
|
|
1855
1868
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1856
1869
|
"none",
|
|
@@ -1908,6 +1921,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
|
1908
1921
|
"pool_2d(x)",
|
|
1909
1922
|
"upscale(x)",
|
|
1910
1923
|
"pad(x)",
|
|
1924
|
+
"arange(start, stop, step)",
|
|
1925
|
+
"timestep_embedding(timesteps, dim, max_period)",
|
|
1911
1926
|
"argsort(x)",
|
|
1912
1927
|
"leaky_relu(x)",
|
|
1913
1928
|
|
|
@@ -1936,7 +1951,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
|
1936
1951
|
"cross_entropy_loss_back(x,y)",
|
|
1937
1952
|
};
|
|
1938
1953
|
|
|
1939
|
-
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
|
|
1954
|
+
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
|
|
1940
1955
|
|
|
1941
1956
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
|
1942
1957
|
|
|
@@ -2139,7 +2154,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
|
|
|
2139
2154
|
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
|
|
2140
2155
|
#else
|
|
2141
2156
|
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
|
|
2142
|
-
|
|
2157
|
+
# if !defined(SYS_getcpu) && defined(SYS_get_cpu)
|
|
2158
|
+
# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
|
|
2159
|
+
# endif
|
|
2160
|
+
getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
|
|
2143
2161
|
#endif
|
|
2144
2162
|
|
|
2145
2163
|
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
|
|
@@ -2895,11 +2913,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
|
|
|
2895
2913
|
return ((const int32_t *)(tensor->op_params))[i];
|
|
2896
2914
|
}
|
|
2897
2915
|
|
|
2916
|
+
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
|
|
2917
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
|
2918
|
+
return ((const float *)(tensor->op_params))[i];
|
|
2919
|
+
}
|
|
2920
|
+
|
|
2898
2921
|
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
|
2899
2922
|
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
|
2900
2923
|
((int32_t *)(tensor->op_params))[i] = value;
|
|
2901
2924
|
}
|
|
2902
2925
|
|
|
2926
|
+
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
|
|
2927
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
|
2928
|
+
((float *)(tensor->op_params))[i] = value;
|
|
2929
|
+
}
|
|
2930
|
+
|
|
2903
2931
|
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
|
|
2904
2932
|
memset(tensor->data, 0, ggml_nbytes(tensor));
|
|
2905
2933
|
return tensor;
|
|
@@ -5898,6 +5926,55 @@ struct ggml_tensor * ggml_upscale(
|
|
|
5898
5926
|
return ggml_upscale_impl(ctx, a, scale_factor);
|
|
5899
5927
|
}
|
|
5900
5928
|
|
|
5929
|
+
struct ggml_tensor * ggml_arange(
|
|
5930
|
+
struct ggml_context * ctx,
|
|
5931
|
+
float start,
|
|
5932
|
+
float stop,
|
|
5933
|
+
float step) {
|
|
5934
|
+
|
|
5935
|
+
GGML_ASSERT(stop > start);
|
|
5936
|
+
|
|
5937
|
+
const int64_t steps = (int64_t) ceilf((stop - start) / step);
|
|
5938
|
+
|
|
5939
|
+
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
|
|
5940
|
+
|
|
5941
|
+
result->op = GGML_OP_ARANGE;
|
|
5942
|
+
ggml_set_op_params_f32(result, 0, start);
|
|
5943
|
+
ggml_set_op_params_f32(result, 1, stop);
|
|
5944
|
+
ggml_set_op_params_f32(result, 2, step);
|
|
5945
|
+
|
|
5946
|
+
return result;
|
|
5947
|
+
}
|
|
5948
|
+
|
|
5949
|
+
struct ggml_tensor * ggml_timestep_embedding(
|
|
5950
|
+
struct ggml_context * ctx,
|
|
5951
|
+
struct ggml_tensor * timesteps,
|
|
5952
|
+
int dim,
|
|
5953
|
+
int max_period) {
|
|
5954
|
+
bool is_node = false;
|
|
5955
|
+
|
|
5956
|
+
if (timesteps->grad) {
|
|
5957
|
+
GGML_ASSERT(false); // TODO: implement backward
|
|
5958
|
+
is_node = true;
|
|
5959
|
+
}
|
|
5960
|
+
|
|
5961
|
+
int actual_dim = dim;
|
|
5962
|
+
if (dim % 2 != 0) {
|
|
5963
|
+
actual_dim = dim + 1;
|
|
5964
|
+
}
|
|
5965
|
+
|
|
5966
|
+
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
|
|
5967
|
+
|
|
5968
|
+
result->op = GGML_OP_TIMESTEP_EMBEDDING;
|
|
5969
|
+
ggml_set_op_params_i32(result, 0, dim);
|
|
5970
|
+
ggml_set_op_params_i32(result, 1, max_period);
|
|
5971
|
+
|
|
5972
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
5973
|
+
result->src[0] = timesteps;
|
|
5974
|
+
|
|
5975
|
+
return result;
|
|
5976
|
+
}
|
|
5977
|
+
|
|
5901
5978
|
// ggml_argsort
|
|
5902
5979
|
|
|
5903
5980
|
struct ggml_tensor * ggml_argsort(
|
|
@@ -10231,7 +10308,7 @@ static void ggml_compute_forward_group_norm_f32(
|
|
|
10231
10308
|
int n_channels = src0->ne[2];
|
|
10232
10309
|
int n_groups = dst->op_params[0];
|
|
10233
10310
|
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
|
|
10234
|
-
for (int i = ith; i < n_groups; i+=nth) {
|
|
10311
|
+
for (int i = ith; i < n_groups; i += nth) {
|
|
10235
10312
|
int start = i * n_channels_per_group;
|
|
10236
10313
|
int end = start + n_channels_per_group;
|
|
10237
10314
|
if (end > n_channels) {
|
|
@@ -10245,28 +10322,32 @@ static void ggml_compute_forward_group_norm_f32(
|
|
|
10245
10322
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
|
10246
10323
|
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
|
|
10247
10324
|
|
|
10325
|
+
ggml_float sumr = 0.0;
|
|
10248
10326
|
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
10249
|
-
|
|
10327
|
+
sumr += (ggml_float)x[i00];
|
|
10250
10328
|
}
|
|
10329
|
+
sum += sumr;
|
|
10251
10330
|
}
|
|
10252
10331
|
}
|
|
10253
|
-
float mean = sum / (ne00 * ne01 * step);
|
|
10254
|
-
ggml_float sum2 = 0.0;
|
|
10332
|
+
const float mean = sum / (ne00 * ne01 * step);
|
|
10255
10333
|
|
|
10334
|
+
ggml_float sum2 = 0.0;
|
|
10256
10335
|
for (int64_t i02 = start; i02 < end; i02++) {
|
|
10257
10336
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
|
10258
10337
|
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
|
|
10259
10338
|
|
|
10260
10339
|
float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
|
|
10261
10340
|
|
|
10341
|
+
ggml_float sumr = 0.0;
|
|
10262
10342
|
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
|
10263
10343
|
float v = x[i00] - mean;
|
|
10264
10344
|
y[i00] = v;
|
|
10265
|
-
|
|
10345
|
+
sumr += (ggml_float)(v * v);
|
|
10266
10346
|
}
|
|
10347
|
+
sum2 += sumr;
|
|
10267
10348
|
}
|
|
10268
10349
|
}
|
|
10269
|
-
float variance = sum2 / (ne00 * ne01 * step);
|
|
10350
|
+
const float variance = sum2 / (ne00 * ne01 * step);
|
|
10270
10351
|
const float scale = 1.0f / sqrtf(variance + eps);
|
|
10271
10352
|
|
|
10272
10353
|
for (int64_t i02 = start; i02 < end; i02++) {
|
|
@@ -13547,6 +13628,106 @@ static void ggml_compute_forward_pad(
|
|
|
13547
13628
|
}
|
|
13548
13629
|
}
|
|
13549
13630
|
|
|
13631
|
+
|
|
13632
|
+
// ggml_compute_forward_arange
|
|
13633
|
+
|
|
13634
|
+
static void ggml_compute_forward_arange_f32(
|
|
13635
|
+
const struct ggml_compute_params * params,
|
|
13636
|
+
struct ggml_tensor * dst) {
|
|
13637
|
+
|
|
13638
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
13639
|
+
return;
|
|
13640
|
+
}
|
|
13641
|
+
|
|
13642
|
+
GGML_ASSERT(dst->nb[0] == sizeof(float));
|
|
13643
|
+
|
|
13644
|
+
const int ith = params->ith;
|
|
13645
|
+
const int nth = params->nth;
|
|
13646
|
+
|
|
13647
|
+
const float start = ggml_get_op_params_f32(dst, 0);
|
|
13648
|
+
const float stop = ggml_get_op_params_f32(dst, 1);
|
|
13649
|
+
const float step = ggml_get_op_params_f32(dst, 2);
|
|
13650
|
+
|
|
13651
|
+
const int64_t steps = (int64_t) ceilf((stop - start) / step);
|
|
13652
|
+
|
|
13653
|
+
GGML_ASSERT(ggml_nelements(dst) == steps);
|
|
13654
|
+
|
|
13655
|
+
for (int64_t i = ith; i < steps; i+= nth) {
|
|
13656
|
+
float value = start + step * i;
|
|
13657
|
+
((float *)dst->data)[i] = value;
|
|
13658
|
+
}
|
|
13659
|
+
}
|
|
13660
|
+
|
|
13661
|
+
static void ggml_compute_forward_arange(
|
|
13662
|
+
const struct ggml_compute_params * params,
|
|
13663
|
+
struct ggml_tensor * dst) {
|
|
13664
|
+
switch (dst->type) {
|
|
13665
|
+
case GGML_TYPE_F32:
|
|
13666
|
+
{
|
|
13667
|
+
ggml_compute_forward_arange_f32(params, dst);
|
|
13668
|
+
} break;
|
|
13669
|
+
default:
|
|
13670
|
+
{
|
|
13671
|
+
GGML_ASSERT(false);
|
|
13672
|
+
} break;
|
|
13673
|
+
}
|
|
13674
|
+
}
|
|
13675
|
+
|
|
13676
|
+
static void ggml_compute_forward_timestep_embedding_f32(
|
|
13677
|
+
const struct ggml_compute_params * params,
|
|
13678
|
+
struct ggml_tensor * dst) {
|
|
13679
|
+
|
|
13680
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
13681
|
+
return;
|
|
13682
|
+
}
|
|
13683
|
+
|
|
13684
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
13685
|
+
|
|
13686
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
|
13687
|
+
|
|
13688
|
+
const int ith = params->ith;
|
|
13689
|
+
const int nth = params->nth;
|
|
13690
|
+
|
|
13691
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
|
13692
|
+
|
|
13693
|
+
const int dim = ggml_get_op_params_i32(dst, 0);
|
|
13694
|
+
const int max_period = ggml_get_op_params_i32(dst, 1);
|
|
13695
|
+
|
|
13696
|
+
int half = dim / 2;
|
|
13697
|
+
|
|
13698
|
+
for (int64_t i = 0; i < ne00; i++) {
|
|
13699
|
+
float * embed_data = (float *)((char *) dst->data + i*nb1);
|
|
13700
|
+
for (int64_t j = ith; j < half; j += nth) {
|
|
13701
|
+
float timestep = ((float *)src0->data)[i];
|
|
13702
|
+
float freq = (float)expf(-logf(max_period) * j / half);
|
|
13703
|
+
float arg = timestep * freq;
|
|
13704
|
+
embed_data[j] = cosf(arg);
|
|
13705
|
+
embed_data[j + half] = sinf(arg);
|
|
13706
|
+
}
|
|
13707
|
+
if (dim % 2 != 0 && ith == 0) {
|
|
13708
|
+
embed_data[dim] = 0.f;
|
|
13709
|
+
}
|
|
13710
|
+
}
|
|
13711
|
+
}
|
|
13712
|
+
|
|
13713
|
+
static void ggml_compute_forward_timestep_embedding(
|
|
13714
|
+
const struct ggml_compute_params * params,
|
|
13715
|
+
struct ggml_tensor * dst) {
|
|
13716
|
+
|
|
13717
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
13718
|
+
|
|
13719
|
+
switch (src0->type) {
|
|
13720
|
+
case GGML_TYPE_F32:
|
|
13721
|
+
{
|
|
13722
|
+
ggml_compute_forward_timestep_embedding_f32(params, dst);
|
|
13723
|
+
} break;
|
|
13724
|
+
default:
|
|
13725
|
+
{
|
|
13726
|
+
GGML_ASSERT(false);
|
|
13727
|
+
} break;
|
|
13728
|
+
}
|
|
13729
|
+
}
|
|
13730
|
+
|
|
13550
13731
|
// ggml_compute_forward_argsort
|
|
13551
13732
|
|
|
13552
13733
|
static void ggml_compute_forward_argsort_f32(
|
|
@@ -15615,6 +15796,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
|
15615
15796
|
{
|
|
15616
15797
|
ggml_compute_forward_pad(params, tensor);
|
|
15617
15798
|
} break;
|
|
15799
|
+
case GGML_OP_ARANGE:
|
|
15800
|
+
{
|
|
15801
|
+
ggml_compute_forward_arange(params, tensor);
|
|
15802
|
+
} break;
|
|
15803
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
15804
|
+
{
|
|
15805
|
+
ggml_compute_forward_timestep_embedding(params, tensor);
|
|
15806
|
+
} break;
|
|
15618
15807
|
case GGML_OP_ARGSORT:
|
|
15619
15808
|
{
|
|
15620
15809
|
ggml_compute_forward_argsort(params, tensor);
|
|
@@ -16617,6 +16806,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
16617
16806
|
{
|
|
16618
16807
|
GGML_ASSERT(false); // TODO: not implemented
|
|
16619
16808
|
} break;
|
|
16809
|
+
case GGML_OP_ARANGE:
|
|
16810
|
+
{
|
|
16811
|
+
GGML_ASSERT(false); // TODO: not implemented
|
|
16812
|
+
} break;
|
|
16813
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
16814
|
+
{
|
|
16815
|
+
GGML_ASSERT(false); // TODO: not implemented
|
|
16816
|
+
} break;
|
|
16620
16817
|
case GGML_OP_ARGSORT:
|
|
16621
16818
|
{
|
|
16622
16819
|
GGML_ASSERT(false); // TODO: not implemented
|
|
@@ -17217,6 +17414,7 @@ struct ggml_compute_state {
|
|
|
17217
17414
|
ggml_thread_t thrd;
|
|
17218
17415
|
int ith;
|
|
17219
17416
|
struct ggml_compute_state_shared * shared;
|
|
17417
|
+
enum ggml_status ec;
|
|
17220
17418
|
};
|
|
17221
17419
|
|
|
17222
17420
|
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
|
@@ -17368,6 +17566,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
|
17368
17566
|
{
|
|
17369
17567
|
n_tasks = n_threads;
|
|
17370
17568
|
} break;
|
|
17569
|
+
case GGML_OP_ARANGE:
|
|
17570
|
+
{
|
|
17571
|
+
n_tasks = n_threads;
|
|
17572
|
+
} break;
|
|
17573
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
17574
|
+
{
|
|
17575
|
+
n_tasks = n_threads;
|
|
17576
|
+
} break;
|
|
17371
17577
|
case GGML_OP_ARGSORT:
|
|
17372
17578
|
{
|
|
17373
17579
|
n_tasks = n_threads;
|
|
@@ -17502,7 +17708,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
|
17502
17708
|
while (true) {
|
|
17503
17709
|
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
|
17504
17710
|
state->shared->node_n += 1;
|
|
17505
|
-
|
|
17711
|
+
state->ec = GGML_STATUS_ABORTED;
|
|
17712
|
+
return 0;
|
|
17506
17713
|
}
|
|
17507
17714
|
|
|
17508
17715
|
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
|
@@ -17624,7 +17831,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
|
17624
17831
|
}
|
|
17625
17832
|
}
|
|
17626
17833
|
|
|
17627
|
-
return
|
|
17834
|
+
return 0;
|
|
17628
17835
|
}
|
|
17629
17836
|
|
|
17630
17837
|
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
|
|
@@ -17820,7 +18027,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
|
17820
18027
|
return cplan;
|
|
17821
18028
|
}
|
|
17822
18029
|
|
|
17823
|
-
|
|
18030
|
+
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
17824
18031
|
{
|
|
17825
18032
|
GGML_ASSERT(cplan);
|
|
17826
18033
|
GGML_ASSERT(cplan->n_threads > 0);
|
|
@@ -17864,6 +18071,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
|
17864
18071
|
.thrd = 0,
|
|
17865
18072
|
.ith = j,
|
|
17866
18073
|
.shared = &state_shared,
|
|
18074
|
+
.ec = GGML_STATUS_SUCCESS,
|
|
17867
18075
|
};
|
|
17868
18076
|
|
|
17869
18077
|
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
|
@@ -17874,12 +18082,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
|
17874
18082
|
|
|
17875
18083
|
workers[0].ith = 0;
|
|
17876
18084
|
workers[0].shared = &state_shared;
|
|
18085
|
+
workers[0].ec = GGML_STATUS_SUCCESS;
|
|
17877
18086
|
|
|
17878
18087
|
const int64_t perf_start_cycles = ggml_perf_cycles();
|
|
17879
18088
|
const int64_t perf_start_time_us = ggml_perf_time_us();
|
|
17880
18089
|
|
|
17881
18090
|
// this is a work thread too
|
|
17882
|
-
|
|
18091
|
+
ggml_graph_compute_thread(&workers[0]);
|
|
18092
|
+
enum ggml_status compute_status = workers[0].ec;
|
|
17883
18093
|
|
|
17884
18094
|
// don't leave affinity set on the main thread
|
|
17885
18095
|
clear_numa_thread_affinity();
|
|
@@ -17889,6 +18099,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
|
17889
18099
|
for (int j = 1; j < n_threads; j++) {
|
|
17890
18100
|
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
|
17891
18101
|
GGML_ASSERT(rc == 0);
|
|
18102
|
+
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
|
18103
|
+
compute_status = workers[j].ec;
|
|
17892
18104
|
}
|
|
17893
18105
|
}
|
|
17894
18106
|
|
|
@@ -17916,14 +18128,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
|
17916
18128
|
return compute_status;
|
|
17917
18129
|
}
|
|
17918
18130
|
|
|
17919
|
-
|
|
18131
|
+
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
|
|
17920
18132
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
|
|
17921
18133
|
|
|
17922
18134
|
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
|
|
17923
18135
|
|
|
17924
18136
|
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
|
17925
18137
|
|
|
17926
|
-
ggml_graph_compute(cgraph, &cplan);
|
|
18138
|
+
return ggml_graph_compute(cgraph, &cplan);
|
|
17927
18139
|
}
|
|
17928
18140
|
|
|
17929
18141
|
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
|
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
|
@@ -315,6 +315,16 @@
|
|
|
315
315
|
extern "C" {
|
|
316
316
|
#endif
|
|
317
317
|
|
|
318
|
+
enum ggml_status {
|
|
319
|
+
GGML_STATUS_ALLOC_FAILED = -2,
|
|
320
|
+
GGML_STATUS_FAILED = -1,
|
|
321
|
+
GGML_STATUS_SUCCESS = 0,
|
|
322
|
+
GGML_STATUS_ABORTED = 1,
|
|
323
|
+
};
|
|
324
|
+
|
|
325
|
+
// get ggml_status name string
|
|
326
|
+
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
|
|
327
|
+
|
|
318
328
|
typedef uint16_t ggml_fp16_t;
|
|
319
329
|
|
|
320
330
|
// convert FP16 <-> FP32
|
|
@@ -454,6 +464,8 @@ extern "C" {
|
|
|
454
464
|
GGML_OP_POOL_2D,
|
|
455
465
|
GGML_OP_UPSCALE, // nearest interpolate
|
|
456
466
|
GGML_OP_PAD,
|
|
467
|
+
GGML_OP_ARANGE,
|
|
468
|
+
GGML_OP_TIMESTEP_EMBEDDING,
|
|
457
469
|
GGML_OP_ARGSORT,
|
|
458
470
|
GGML_OP_LEAKY_RELU,
|
|
459
471
|
|
|
@@ -1661,6 +1673,15 @@ extern "C" {
|
|
|
1661
1673
|
int p2,
|
|
1662
1674
|
int p3);
|
|
1663
1675
|
|
|
1676
|
+
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
|
1677
|
+
// timesteps: [N,]
|
|
1678
|
+
// return: [N, dim]
|
|
1679
|
+
GGML_API struct ggml_tensor * ggml_timestep_embedding(
|
|
1680
|
+
struct ggml_context * ctx,
|
|
1681
|
+
struct ggml_tensor * timesteps,
|
|
1682
|
+
int dim,
|
|
1683
|
+
int max_period);
|
|
1684
|
+
|
|
1664
1685
|
// sort rows
|
|
1665
1686
|
enum ggml_sort_order {
|
|
1666
1687
|
GGML_SORT_ORDER_ASC,
|
|
@@ -1672,6 +1693,12 @@ extern "C" {
|
|
|
1672
1693
|
struct ggml_tensor * a,
|
|
1673
1694
|
enum ggml_sort_order order);
|
|
1674
1695
|
|
|
1696
|
+
GGML_API struct ggml_tensor * ggml_arange(
|
|
1697
|
+
struct ggml_context * ctx,
|
|
1698
|
+
float start,
|
|
1699
|
+
float stop,
|
|
1700
|
+
float step);
|
|
1701
|
+
|
|
1675
1702
|
// top k elements per row
|
|
1676
1703
|
GGML_API struct ggml_tensor * ggml_top_k(
|
|
1677
1704
|
struct ggml_context * ctx,
|
|
@@ -1923,12 +1950,11 @@ extern "C" {
|
|
|
1923
1950
|
|
|
1924
1951
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
|
1925
1952
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
|
1926
|
-
GGML_API struct ggml_cplan ggml_graph_plan
|
|
1927
|
-
GGML_API
|
|
1928
|
-
|
|
1953
|
+
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
|
1954
|
+
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
|
1929
1955
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
|
1930
1956
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
|
1931
|
-
GGML_API
|
|
1957
|
+
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
|
1932
1958
|
|
|
1933
1959
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
|
1934
1960
|
|