llama_cpp 0.16.1 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -1753,9 +1753,8 @@ struct ggml_compute_state_shared {
     int n_threads;
 
     // synchronization primitives
-    atomic_int n_active;
-    atomic_int node_n;
-    atomic_int node_task; // active graph node task phase
+    atomic_int n_barrier;
+    atomic_int n_barrier_passed;
 
     ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void* abort_callback_data;
@@ -18972,47 +18971,49 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     return n_tasks;
 }
 
-static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_node_n = *node_n;
-
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
-
-        *node_n = atomic_load(&state->shared->node_n);
-        if (*node_n != last_node_n) {
-            break;
-        }
-
-#if defined(__SSE3__)
-        // Tell the processor we're spinning. It's a processor hint for spinlocks.
-        _mm_pause();
-#endif
+#ifdef GGML_USE_OPENMP
+static void ggml_barrier(struct ggml_compute_state * state) {
+    if (state->shared->n_threads == 1) {
+        return;
     }
+
+    #pragma omp barrier
 }
+#else
+static void ggml_barrier(struct ggml_compute_state * state) {
+    if (state->shared->n_threads == 1) {
+        return;
+    }
 
-static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_task_phase = *task_phase;
+    atomic_int * n_barrier = &state->shared->n_barrier;
+    atomic_int * n_barrier_passed = &state->shared->n_barrier_passed;
 
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
+    int n_threads = state->shared->n_threads;
+    int passed_old = atomic_load(n_barrier_passed);
 
-        *task_phase = atomic_load(&state->shared->node_task);
-        if (*task_phase != last_task_phase) {
-            break;
+    if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
+        // last thread
+        atomic_store(n_barrier, 0);
+        atomic_fetch_add(n_barrier_passed, 1);
+    } else {
+        // wait for other threads
+        //while (atomic_load(n_barrier_passed) == passed_old) {
+        //}
+        const int n_spin_before_sleep = 100000;
+        while (true) {
+            for (int i = 0; i < n_spin_before_sleep; i++) {
+                if (atomic_load(n_barrier_passed) != passed_old) {
+                    return;
+                }
+#if defined(__SSE3__)
+                _mm_pause();
+#endif
+            }
+            sched_yield();
         }
-
-#if defined(__SSE3__)
-        // Tell the processor we're spinning. It's a processor hint for spinlocks.
-        _mm_pause();
-#endif
     }
 }
+#endif
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
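The non-OpenMP `ggml_barrier` added above is a counter-based barrier: each thread bumps an arrival counter, the last arrival resets it and increments a generation counter, and the others spin (with `_mm_pause`) and then yield until the generation changes. Below is a minimal standalone sketch of the same idea, assuming C11 atomics and pthreads and simplified to yield immediately instead of spinning first; the names are demo-only and not part of this gem.

```c
// Standalone sketch of a counter-based barrier (demo code, not llama.cpp).
// Build with: cc -std=c11 -pthread barrier_demo.c
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define N_THREADS 4

static atomic_int n_barrier        = 0;  // threads that have arrived
static atomic_int n_barrier_passed = 0;  // barrier "generation" counter

static void demo_barrier(void) {
    int passed_old = atomic_load(&n_barrier_passed);

    if (atomic_fetch_add(&n_barrier, 1) == N_THREADS - 1) {
        // last thread to arrive: reset the arrival count and release the others
        atomic_store(&n_barrier, 0);
        atomic_fetch_add(&n_barrier_passed, 1);
    } else {
        // wait until the generation changes (the real code spins before yielding)
        while (atomic_load(&n_barrier_passed) == passed_old) {
            sched_yield();
        }
    }
}

static void * worker(void * arg) {
    int id = *(int *) arg;
    for (int step = 0; step < 3; step++) {
        printf("thread %d: step %d\n", id, step);
        demo_barrier(); // no thread starts step+1 before all finish step
    }
    return NULL;
}

int main(void) {
    pthread_t threads[N_THREADS];
    int ids[N_THREADS];
    for (int i = 0; i < N_THREADS; i++) {
        ids[i] = i;
        pthread_create(&threads[i], NULL, worker, &ids[i]);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
    return 0;
}
```

The separate generation counter is what makes the barrier safely reusable: a thread that races ahead into the next barrier sees a fresh arrival count but an unchanged generation, so it still waits.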
@@ -19020,136 +19021,54 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
     const struct ggml_cplan * cplan = state->shared->cplan;
 
-    const int n_threads = state->shared->n_threads;
+    const int ith = state->ith;
+    const int n_threads = state->shared->n_threads;
 
-    set_numa_thread_affinity(state->ith);
+    set_numa_thread_affinity(ith);
 
-    int node_n = -1;
-    int task_phase = GGML_TASK_TYPE_FINALIZE;
+    struct ggml_compute_params params = {
+        /*.type =*/ GGML_TASK_TYPE_INIT,
+        /*.ith =*/ ith,
+        /*.nth =*/ state->shared->n_threads,
+        /*.wsize =*/ cplan->work_size,
+        /*.wdata =*/ cplan->work_data,
+    };
 
-    while (true) {
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->shared->node_n += 1;
             state->ec = GGML_STATUS_ABORTED;
             return 0;
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            // all other threads are finished and spinning
-            // do finalize and init here so we don't have synchronize again
-            struct ggml_compute_params params = {
-                /*.type =*/ GGML_TASK_TYPE_FINALIZE,
-                /*.ith =*/ 0,
-                /*.nth =*/ 0,
-                /*.wsize =*/ cplan->work_size,
-                /*.wdata =*/ cplan->work_data,
-            };
-
-            if (node_n != -1) {
-                /* FINALIZE */
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                    ggml_compute_forward(&params, node, state);
-                }
-                ggml_graph_compute_perf_stats_node(node, state->shared);
-            }
-
-            // distribute new work or execute it direct if 1T
-            while (++node_n < cgraph->n_nodes) {
-                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-
-                state->shared->perf_node_start_cycles = ggml_perf_cycles();
-                state->shared->perf_node_start_time_us = ggml_perf_time_us();
-
-                params.nth = n_tasks;
-
-                if (n_tasks == 1) {
-                    /* INIT */
-                    if (GGML_OP_HAS_INIT[node->op]) {
-                        params.type = GGML_TASK_TYPE_INIT;
-                        ggml_compute_forward(&params, node, state);
-                    }
-
-                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
-                    // they do something more efficient than spinning (?)
-                    params.type = GGML_TASK_TYPE_COMPUTE;
-                    ggml_compute_forward(&params, node, state);
-
-                    if (GGML_OP_HAS_FINALIZE[node->op]) {
-                        params.type = GGML_TASK_TYPE_FINALIZE;
-                        ggml_compute_forward(&params, node, state);
-                    }
-
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
-                } else {
-                    break;
-                }
-
-                if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-                    break;
-                }
-            }
-
-            task_phase = GGML_TASK_TYPE_INIT;
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n, node_n);
-            atomic_store(&state->shared->node_task, task_phase);
-        } else {
-            ggml_graph_compute_thread_sync_node(&node_n, state, false);
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
-        }
-
-        // check if we should stop
-        if (node_n >= cgraph->n_nodes) break;
-
-        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
 
-        struct ggml_compute_params params = {
-            /*.type =*/ GGML_TASK_TYPE_INIT,
-            /*.ith =*/ state->ith,
-            /*.nth =*/ n_tasks,
-            /*.wsize =*/ cplan->work_size,
-            /*.wdata =*/ cplan->work_data,
-        };
+        params.nth = n_tasks;
 
-        if (state->ith < n_tasks) {
-            if (GGML_OP_HAS_INIT[node->op]) {
+        /* INIT */
+        if (GGML_OP_HAS_INIT[node->op]) {
+            if (ith < n_tasks) {
+                params.type = GGML_TASK_TYPE_INIT;
                 ggml_compute_forward(&params, node, state);
             }
+            ggml_barrier(state);
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_COMPUTE;
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-            // depending on the workload and the operating system.
-            // since it is not clear what is the best approach, it should potentially become user-configurable
-            // ref: https://github.com/ggerganov/ggml/issues/291
-            // UPD: adding the do_yield flag seems to resolve the issue universally
-            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
-            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
-        }
-
-        if (state->ith < n_tasks) {
+        /* COMPUTE */
+        if (ith < n_tasks) {
             params.type = GGML_TASK_TYPE_COMPUTE;
             ggml_compute_forward(&params, node, state);
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_FINALIZE;
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        ggml_barrier(state);
+
+        /* FINALIZE */
+        if (GGML_OP_HAS_FINALIZE[node->op]) {
+            if (params.ith == 0) {
+                params.type = GGML_TASK_TYPE_FINALIZE;
+                ggml_compute_forward(&params, node, state);
+            }
+            ggml_barrier(state);
         }
     }
 
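The rewritten `ggml_graph_compute_thread` above walks the graph node by node and separates each node into INIT, COMPUTE, and FINALIZE phases with a barrier between them, replacing the old `n_active`/`node_n`/`node_task` hand-off. A rough standalone sketch of that phase pattern, using OpenMP barriers (illustrative demo only, not llama.cpp code):

```c
// Sketch of a per-node phase loop with barriers between phases (demo code).
// Build with: cc -std=c11 -fopenmp phases_demo.c
#include <omp.h>
#include <stdio.h>

#define N_NODES 3

int main(void) {
    #pragma omp parallel
    {
        const int ith = omp_get_thread_num();

        for (int node = 0; node < N_NODES; node++) {
            // INIT: every participating thread prepares its slice
            printf("node %d: init     by thread %d\n", node, ith);
            #pragma omp barrier

            // COMPUTE: the main work for this node
            printf("node %d: compute  by thread %d\n", node, ith);
            #pragma omp barrier

            // FINALIZE: a single thread reduces / cleans up
            if (ith == 0) {
                printf("node %d: finalize by thread 0\n", node);
            }
            #pragma omp barrier
        }
    }
    return 0;
}
```

Every thread hits the same barriers in the same order, so no node's COMPUTE starts before its INIT has finished on all threads, which is the invariant the diff above preserves without the old atomic hand-off.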
@@ -19336,7 +19255,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
                 // update the number of threads from the actual number of threads that we got from OpenMP
                 n_threads = omp_get_num_threads();
                 workers[0].shared->n_threads = n_threads;
-                workers[0].shared->n_active = n_threads;
             }
             ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
         }
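With `n_active` gone, the OpenMP path only needs to capture the actual team size, since the runtime may grant fewer threads than requested. A small hedged sketch of that pattern (hypothetical demo, not the gem's code):

```c
// Why the thread count is re-read inside the parallel region (demo code).
// Build with: cc -std=c11 -fopenmp team_size_demo.c
#include <omp.h>
#include <stdio.h>

int main(void) {
    int n_threads = 8; // requested

    #pragma omp parallel num_threads(n_threads)
    {
        #pragma omp single
        {
            // actual team size, possibly smaller than requested;
            // the implicit barrier at the end of "single" publishes it
            n_threads = omp_get_num_threads();
            printf("got %d threads\n", n_threads);
        }
        printf("thread %d of %d\n", omp_get_thread_num(), n_threads);
    }
    return 0;
}
```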
@@ -19399,9 +19317,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.perf_node_start_cycles =*/ 0,
         /*.perf_node_start_time_us =*/ 0,
         /*.n_threads =*/ n_threads,
-        /*.n_active =*/ n_threads,
-        /*.node_n =*/ -1,
-        /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
+        /*.n_barrier =*/ 0,
+        /*.n_barrier_passed =*/ 0,
         /*.abort_callback =*/ NULL,
         /*.abort_callback_data =*/ NULL,
         /*.current_chunk; =*/ 0,
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -312,6 +312,12 @@
     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
+#define GGML_TENSOR_BINARY_OP_LOCALS01 \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
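The new `GGML_TENSOR_BINARY_OP_LOCALS01` composes the existing `GGML_TENSOR_LOCALS` helper for the two source tensors of a binary op, without the `dst` locals that the macro above it declares. The sketch below imitates the pattern with simplified standalone macros; `demo_tensor`, `DEMO_TENSOR_LOCALS`, and friends are illustrative names, not ggml's exact definitions.

```c
// Illustrative sketch of the GGML_TENSOR_LOCALS token-pasting pattern that
// GGML_TENSOR_BINARY_OP_LOCALS01 builds on (simplified, standalone demo).
#include <stdint.h>
#include <stdio.h>

struct demo_tensor {
    int64_t ne[4]; // number of elements per dimension
    size_t  nb[4]; // stride in bytes per dimension
};

// declare prefix0..prefix3 locals from (pointer)->array[0..3]
#define DEMO_TENSOR_LOCALS(type, prefix, pointer, array) \
    const type prefix##0 = (pointer)->array[0];          \
    const type prefix##1 = (pointer)->array[1];          \
    const type prefix##2 = (pointer)->array[2];          \
    const type prefix##3 = (pointer)->array[3];

// locals for the two sources of a binary op, skipping dst -- the same shape
// as the macro added in the diff above
#define DEMO_TENSOR_BINARY_OP_LOCALS01 \
    DEMO_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    DEMO_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
    DEMO_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
    DEMO_TENSOR_LOCALS(size_t,  nb1, src1, nb)

static void print_shapes(const struct demo_tensor * src0, const struct demo_tensor * src1) {
    DEMO_TENSOR_BINARY_OP_LOCALS01 // expands to ne00..ne03, nb00..nb03, ne10..ne13, nb10..nb13

    printf("src0: %lld x %lld (row stride %zu bytes)\n", (long long) ne00, (long long) ne01, nb01);
    printf("src1: %lld x %lld (row stride %zu bytes)\n", (long long) ne10, (long long) ne11, nb11);

    // silence unused-variable warnings for the remaining locals
    (void) ne02; (void) ne03; (void) nb00; (void) nb02; (void) nb03;
    (void) ne12; (void) ne13; (void) nb10; (void) nb12; (void) nb13;
}

int main(void) {
    struct demo_tensor a = { .ne = {16, 8, 1, 1}, .nb = {4, 64, 512, 512} };
    struct demo_tensor b = { .ne = {16, 8, 1, 1}, .nb = {4, 64, 512, 512} };
    print_shapes(&a, &b);
    return 0;
}
```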