llama_cpp 0.16.1 → 0.16.2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -1753,9 +1753,8 @@ struct ggml_compute_state_shared {
     int n_threads;
 
     // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
+    atomic_int n_barrier;
+    atomic_int n_barrier_passed;
 
     ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void* abort_callback_data;
@@ -18972,47 +18971,49 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
     return n_tasks;
 }
 
-static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_node_n = *node_n;
-
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
-
-        *node_n = atomic_load(&state->shared->node_n);
-        if (*node_n != last_node_n) {
-            break;
-        }
-
-#if defined(__SSE3__)
-        // Tell the processor we're spinning. It's a processor hint for spinlocks.
-        _mm_pause();
-#endif
+#ifdef GGML_USE_OPENMP
+static void ggml_barrier(struct ggml_compute_state * state) {
+    if (state->shared->n_threads == 1) {
+        return;
     }
+
+    #pragma omp barrier
 }
+#else
+static void ggml_barrier(struct ggml_compute_state * state) {
+    if (state->shared->n_threads == 1) {
+        return;
+    }
 
-static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_task_phase = *task_phase;
+    atomic_int * n_barrier = &state->shared->n_barrier;
+    atomic_int * n_barrier_passed = &state->shared->n_barrier_passed;
 
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
+    int n_threads = state->shared->n_threads;
+    int passed_old = atomic_load(n_barrier_passed);
 
-        *task_phase = atomic_load(&state->shared->node_task);
-        if (*task_phase != last_task_phase) {
-            break;
-        }
+    if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
+        // last thread
+        atomic_store(n_barrier, 0);
+        atomic_fetch_add(n_barrier_passed, 1);
+    } else {
+        // wait for other threads
+        //while (atomic_load(n_barrier_passed) == passed_old) {
+        //}
+        const int n_spin_before_sleep = 100000;
+        while (true) {
+            for (int i = 0; i < n_spin_before_sleep; i++) {
+                if (atomic_load(n_barrier_passed) != passed_old) {
+                    return;
+                }
+            #if defined(__SSE3__)
+                _mm_pause();
+            #endif
+            }
+            sched_yield();
         }
-
-#if defined(__SSE3__)
-        // Tell the processor we're spinning. It's a processor hint for spinlocks.
-        _mm_pause();
-#endif
     }
 }
+#endif
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
@@ -19020,136 +19021,54 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
     const struct ggml_cplan * cplan = state->shared->cplan;
 
-    const int n_threads = state->shared->n_threads;
+    const int ith = state->ith;
+    const int n_threads = state->shared->n_threads;
 
-    set_numa_thread_affinity(state->ith);
+    set_numa_thread_affinity(ith);
 
-    int node_n     = -1;
-    int task_phase = GGML_TASK_TYPE_FINALIZE;
+    struct ggml_compute_params params = {
+        /*.type  =*/ GGML_TASK_TYPE_INIT,
+        /*.ith   =*/ ith,
+        /*.nth   =*/ state->shared->n_threads,
+        /*.wsize =*/ cplan->work_size,
+        /*.wdata =*/ cplan->work_data,
+    };
 
-    while (true) {
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->shared->node_n += 1;
             state->ec = GGML_STATUS_ABORTED;
             return 0;
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            // all other threads are finished and spinning
-            // do finalize and init here so we don't have synchronize again
-            struct ggml_compute_params params = {
-                /*.type  =*/ GGML_TASK_TYPE_FINALIZE,
-                /*.ith   =*/ 0,
-                /*.nth   =*/ 0,
-                /*.wsize =*/ cplan->work_size,
-                /*.wdata =*/ cplan->work_data,
-            };
-
-            if (node_n != -1) {
-                /* FINALIZE */
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                    ggml_compute_forward(&params, node, state);
-                }
-                ggml_graph_compute_perf_stats_node(node, state->shared);
-            }
-
-            // distribute new work or execute it direct if 1T
-            while (++node_n < cgraph->n_nodes) {
-                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-
-                state->shared->perf_node_start_cycles  = ggml_perf_cycles();
-                state->shared->perf_node_start_time_us = ggml_perf_time_us();
-
-                params.nth = n_tasks;
-
-                if (n_tasks == 1) {
-                    /* INIT */
-                    if (GGML_OP_HAS_INIT[node->op]) {
-                        params.type = GGML_TASK_TYPE_INIT;
-                        ggml_compute_forward(&params, node, state);
-                    }
-
-                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
-                    // they do something more efficient than spinning (?)
-                    params.type = GGML_TASK_TYPE_COMPUTE;
-                    ggml_compute_forward(&params, node, state);
-
-                    if (GGML_OP_HAS_FINALIZE[node->op]) {
-                        params.type = GGML_TASK_TYPE_FINALIZE;
-                        ggml_compute_forward(&params, node, state);
-                    }
-
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
-                } else {
-                    break;
-                }
-
-                if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-                    break;
-                }
-            }
-
-            task_phase = GGML_TASK_TYPE_INIT;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_n,    node_n);
-            atomic_store(&state->shared->node_task, task_phase);
-        } else {
-            ggml_graph_compute_thread_sync_node(&node_n, state, false);
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
-        }
-
-        // check if we should stop
-        if (node_n >= cgraph->n_nodes) break;
-
-        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
 
-        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_TYPE_INIT,
-            /*.ith   =*/ state->ith,
-            /*.nth   =*/ n_tasks,
-            /*.wsize =*/ cplan->work_size,
-            /*.wdata =*/ cplan->work_data,
-        };
+        params.nth = n_tasks;
 
-        if (state->ith < n_tasks) {
-            if (GGML_OP_HAS_INIT[node->op]) {
+        /* INIT */
+        if (GGML_OP_HAS_INIT[node->op]) {
+            if (ith < n_tasks) {
+                params.type = GGML_TASK_TYPE_INIT;
                 ggml_compute_forward(&params, node, state);
             }
+            ggml_barrier(state);
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_COMPUTE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-            //       depending on the workload and the operating system.
-            //       since it is not clear what is the best approach, it should potentially become user-configurable
-            //       ref: https://github.com/ggerganov/ggml/issues/291
-            // UPD: adding the do_yield flag seems to resolve the issue universally
-            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
-            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
-        }
-
-        if (state->ith < n_tasks) {
+        /* COMPUTE */
+        if (ith < n_tasks) {
             params.type = GGML_TASK_TYPE_COMPUTE;
             ggml_compute_forward(&params, node, state);
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_FINALIZE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
-        }
+        ggml_barrier(state);
+
+        /* FINALIZE */
+        if (GGML_OP_HAS_FINALIZE[node->op]) {
+            if (params.ith == 0) {
+                params.type = GGML_TASK_TYPE_FINALIZE;
+                ggml_compute_forward(&params, node, state);
+            }
+            ggml_barrier(state);
         }
     }
 
@@ -19336,7 +19255,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
                 // update the number of threads from the actual number of threads that we got from OpenMP
                 n_threads = omp_get_num_threads();
                 workers[0].shared->n_threads = n_threads;
-                workers[0].shared->n_active  = n_threads;
             }
             ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
         }
@@ -19399,9 +19317,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.perf_node_start_cycles  =*/ 0,
         /*.perf_node_start_time_us =*/ 0,
         /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-        /*.node_task               =*/ GGML_TASK_TYPE_FINALIZE,
+        /*.n_barrier               =*/ 0,
+        /*.n_barrier_passed        =*/ 0,
         /*.abort_callback          =*/ NULL,
         /*.abort_callback_data     =*/ NULL,
         /*.current_chunk; =*/ 0,
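Taken together, the ggml.c changes drop the old per-node spin synchronization (the n_active / node_n / node_task atomics and the two ggml_graph_compute_thread_sync_* helpers) and replace it with a single ggml_barrier(): an OpenMP barrier when GGML_USE_OPENMP is set, otherwise a spin-then-yield barrier built on the two new counters n_barrier (threads arrived) and n_barrier_passed (how many times the barrier has opened). The worker loop then simply walks the graph nodes and separates the INIT, COMPUTE, and FINALIZE phases with barriers. The standalone C sketch below illustrates the same counter/generation pattern outside of ggml; the names (toy_barrier, n_arrived, n_passed, worker) are hypothetical, and the dummy spin loop merely stands in for _mm_pause().

// Sketch of the spin-then-yield barrier pattern used above (hypothetical
// names, not the ggml API). Every thread bumps an arrival counter; the last
// one to arrive resets it and advances a "generation" counter that the
// waiting threads are spinning on.
#include <stdatomic.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define N_THREADS 4

static atomic_int n_arrived = 0; // threads that reached the barrier
static atomic_int n_passed  = 0; // how many times the barrier has opened

static void toy_barrier(int n_threads) {
    const int passed_old = atomic_load(&n_passed);

    if (atomic_fetch_add(&n_arrived, 1) == n_threads - 1) {
        // last thread: reset the arrival counter and release the others
        atomic_store(&n_arrived, 0);
        atomic_fetch_add(&n_passed, 1);
        return;
    }

    // spin briefly, then yield, until the generation counter changes
    while (atomic_load(&n_passed) == passed_old) {
        for (volatile int i = 0; i < 1000; i++) { /* busy spin */ }
        sched_yield();
    }
}

static void * worker(void * arg) {
    const int ith = (int)(long)arg;
    for (int step = 0; step < 3; step++) {
        printf("thread %d finished step %d\n", ith, step);
        toy_barrier(N_THREADS); // nobody starts step+1 before all finish step
    }
    return NULL;
}

int main(void) {
    pthread_t threads[N_THREADS];
    for (long i = 0; i < N_THREADS; i++) pthread_create(&threads[i], NULL, worker, (void *) i);
    for (int  i = 0; i < N_THREADS; i++) pthread_join(threads[i], NULL);
    return 0;
}

Spinning before yielding keeps the latency low when all threads arrive close together, while the sched_yield() fallback avoids pinning a core when one thread lags far behind, which is the same trade-off the n_spin_before_sleep constant expresses in the diff.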
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -312,6 +312,12 @@
     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
+#define GGML_TENSOR_BINARY_OP_LOCALS01 \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
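For context, GGML_TENSOR_LOCALS(type, prefix, pointer, array) declares local copies prefix0..prefix3 of pointer->array[0..3], so the new GGML_TENSOR_BINARY_OP_LOCALS01 macro gives an op kernel the ne*/nb* locals for src0 and src1 only, without also pulling in the dst shape and strides the way GGML_TENSOR_BINARY_OP_LOCALS does. Below is a rough, self-contained sketch of the pattern; toy_tensor, TOY_TENSOR_LOCALS, and TOY_TENSOR_BINARY_OP_LOCALS01 are stand-ins, not the real ggml definitions.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct toy_tensor {
    int64_t ne[4]; // number of elements per dimension
    size_t  nb[4]; // stride in bytes per dimension
};

// Declare <prefix>0..<prefix>3 locals from a tensor pointer, in the spirit of
// GGML_TENSOR_LOCALS(type, prefix, pointer, array).
#define TOY_TENSOR_LOCALS(type, prefix, pointer, array) \
    const type prefix##0 = (pointer)->array[0];         \
    const type prefix##1 = (pointer)->array[1];         \
    const type prefix##2 = (pointer)->array[2];         \
    const type prefix##3 = (pointer)->array[3];

// Analogue of GGML_TENSOR_BINARY_OP_LOCALS01: locals for src0 and src1 only,
// for kernels that never touch the dst shape or strides.
#define TOY_TENSOR_BINARY_OP_LOCALS01         \
    TOY_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    TOY_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
    TOY_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
    TOY_TENSOR_LOCALS(size_t,  nb1, src1, nb)

static void print_shapes(const struct toy_tensor * src0, const struct toy_tensor * src1) {
    TOY_TENSOR_BINARY_OP_LOCALS01

    // ne00..ne03 / nb00..nb03 describe src0, ne10..ne13 / nb10..nb13 describe src1
    printf("src0: %lld x %lld (row stride %zu bytes)\n", (long long) ne00, (long long) ne01, nb01);
    printf("src1: %lld x %lld (row stride %zu bytes)\n", (long long) ne10, (long long) ne11, nb11);
    (void) ne02; (void) ne03; (void) nb00; (void) nb02; (void) nb03;
    (void) ne12; (void) ne13; (void) nb10; (void) nb12; (void) nb13;
}

int main(void) {
    struct toy_tensor a = { {4, 3, 1, 1}, {4, 16, 48, 48} };
    struct toy_tensor b = { {4, 1, 1, 1}, {4, 16, 16, 16} };
    print_shapes(&a, &b);
    return 0;
}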