llama_cpp 0.16.1 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1753,9 +1753,8 @@ struct ggml_compute_state_shared {
     int n_threads;
 
     // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
+    atomic_int n_barrier;
+    atomic_int n_barrier_passed;
 
     ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
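Note: the three per-node synchronization counters (n_active, node_n, node_task) are replaced by two counters that back a reusable counting barrier. The barrier itself is introduced in the next hunk; a standalone sketch of the same pattern follows it.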
@@ -18972,47 +18971,49 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     return n_tasks;
 }
 
-static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_node_n = * node_n;
-
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
-
-        *node_n = atomic_load(&state->shared->node_n);
-        if (*node_n != last_node_n) {
-            break;
-        }
-
-#if defined(__SSE3__)
-        // Tell the processor we're spinning. It's a processor hint for spinlocks.
-        _mm_pause();
-#endif
+#ifdef GGML_USE_OPENMP
+static void ggml_barrier(struct ggml_compute_state * state) {
+    if (state->shared->n_threads == 1) {
+        return;
     }
+
+    #pragma omp barrier
 }
+#else
+static void ggml_barrier(struct ggml_compute_state * state) {
+    if (state->shared->n_threads == 1) {
+        return;
+    }
 
-static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_task_phase = *task_phase;
+    atomic_int * n_barrier = &state->shared->n_barrier;
+    atomic_int * n_barrier_passed = &state->shared->n_barrier_passed;
 
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
+    int n_threads = state->shared->n_threads;
+    int passed_old = atomic_load(n_barrier_passed);
 
-        *task_phase = atomic_load(&state->shared->node_task);
-        if (*task_phase != last_task_phase) {
-            break;
+    if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
+        // last thread
+        atomic_store(n_barrier, 0);
+        atomic_fetch_add(n_barrier_passed, 1);
+    } else {
+        // wait for other threads
+        //while (atomic_load(n_barrier_passed) == passed_old) {
+        //}
+        const int n_spin_before_sleep = 100000;
+        while (true) {
+            for (int i = 0; i < n_spin_before_sleep; i++) {
+                if (atomic_load(n_barrier_passed) != passed_old) {
+                    return;
+                }
+            #if defined(__SSE3__)
+                _mm_pause();
+            #endif
+            }
+            sched_yield();
         }
-
-#if defined(__SSE3__)
-        // Tell the processor we're spinning. It's a processor hint for spinlocks.
-        _mm_pause();
-#endif
     }
 }
+#endif
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
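Note: the non-OpenMP ggml_barrier above is a two-counter barrier: n_barrier counts arrivals, and n_barrier_passed is a generation counter that the last arriving thread bumps to release the others, which spin (with _mm_pause and an occasional sched_yield) until they observe the change. The sketch below is a minimal, self-contained illustration of the same pattern, assuming C11 <stdatomic.h> and POSIX threads; the names demo_shared, demo_barrier, and worker are illustrative and are not part of ggml or the llama_cpp crate.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define N_THREADS 4

struct demo_shared {
    atomic_int n_barrier;        // threads currently waiting at the barrier
    atomic_int n_barrier_passed; // how many times the barrier has been released
};

static void demo_barrier(struct demo_shared * shared) {
    int passed_old = atomic_load(&shared->n_barrier_passed);
    if (atomic_fetch_add(&shared->n_barrier, 1) == N_THREADS - 1) {
        // last thread to arrive: reset the arrival count and release everyone
        atomic_store(&shared->n_barrier, 0);
        atomic_fetch_add(&shared->n_barrier_passed, 1);
    } else {
        // spin until the generation counter changes, yielding each iteration
        // (the real implementation spins with _mm_pause first, then yields)
        while (atomic_load(&shared->n_barrier_passed) == passed_old) {
            sched_yield();
        }
    }
}

struct demo_arg {
    struct demo_shared * shared;
    int ith;
};

static void * worker(void * data) {
    struct demo_arg * arg = data;
    for (int phase = 0; phase < 3; phase++) {
        printf("thread %d: phase %d\n", arg->ith, phase);
        demo_barrier(arg->shared); // no thread starts phase+1 before all finish phase
    }
    return NULL;
}

int main(void) {
    struct demo_shared shared = { 0, 0 };
    struct demo_arg args[N_THREADS];
    pthread_t threads[N_THREADS];
    for (int i = 0; i < N_THREADS; i++) {
        args[i] = (struct demo_arg){ &shared, i };
        pthread_create(&threads[i], NULL, worker, &args[i]);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
    return 0;
}

Because the generation counter distinguishes successive releases, the same barrier can be reused back to back, which is what lets the compute loop in the next hunk call it once per phase of every graph node.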
@@ -19020,136 +19021,54 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
     const struct ggml_cplan  * cplan  = state->shared->cplan;
 
-    const int n_threads = state->shared->n_threads;
+    const int ith       = state->ith;
+    const int n_threads = state->shared->n_threads;
 
-    set_numa_thread_affinity(state->ith);
+    set_numa_thread_affinity(ith);
 
-    int node_n     = -1;
-    int task_phase = GGML_TASK_TYPE_FINALIZE;
+    struct ggml_compute_params params = {
+        /*.type  =*/ GGML_TASK_TYPE_INIT,
+        /*.ith   =*/ ith,
+        /*.nth   =*/ state->shared->n_threads,
+        /*.wsize =*/ cplan->work_size,
+        /*.wdata =*/ cplan->work_data,
+    };
 
-    while (true) {
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->shared->node_n += 1;
             state->ec = GGML_STATUS_ABORTED;
             return 0;
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            // all other threads are finished and spinning
-            // do finalize and init here so we don't have synchronize again
-            struct ggml_compute_params params = {
-                /*.type  =*/ GGML_TASK_TYPE_FINALIZE,
-                /*.ith   =*/ 0,
-                /*.nth   =*/ 0,
-                /*.wsize =*/ cplan->work_size,
-                /*.wdata =*/ cplan->work_data,
-            };
-
-            if (node_n != -1) {
-                /* FINALIZE */
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                    ggml_compute_forward(&params, node, state);
-                }
-                ggml_graph_compute_perf_stats_node(node, state->shared);
-            }
-
-            // distribute new work or execute it direct if 1T
-            while (++node_n < cgraph->n_nodes) {
-                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-
-                state->shared->perf_node_start_cycles  = ggml_perf_cycles();
-                state->shared->perf_node_start_time_us = ggml_perf_time_us();
-
-                params.nth = n_tasks;
-
-                if (n_tasks == 1) {
-                    /* INIT */
-                    if (GGML_OP_HAS_INIT[node->op]) {
-                        params.type = GGML_TASK_TYPE_INIT;
-                        ggml_compute_forward(&params, node, state);
-                    }
-
-                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
-                    // they do something more efficient than spinning (?)
-                    params.type = GGML_TASK_TYPE_COMPUTE;
-                    ggml_compute_forward(&params, node, state);
-
-                    if (GGML_OP_HAS_FINALIZE[node->op]) {
-                        params.type = GGML_TASK_TYPE_FINALIZE;
-                        ggml_compute_forward(&params, node, state);
-                    }
-
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
-                } else {
-                    break;
-                }
-
-                if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-                    break;
-                }
-            }
-
-            task_phase = GGML_TASK_TYPE_INIT;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_n,    node_n);
-            atomic_store(&state->shared->node_task, task_phase);
-        } else {
-            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
-        }
-
-        // check if we should stop
-        if (node_n >= cgraph->n_nodes) break;
-
-        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
 
-        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_TYPE_INIT,
-            /*.ith   =*/ state->ith,
-            /*.nth   =*/ n_tasks,
-            /*.wsize =*/ cplan->work_size,
-            /*.wdata =*/ cplan->work_data,
-        };
+        params.nth = n_tasks;
 
-        if (state->ith < n_tasks) {
-            if (GGML_OP_HAS_INIT[node->op]) {
+        /* INIT */
+        if (GGML_OP_HAS_INIT[node->op]) {
+            if (ith < n_tasks) {
+                params.type = GGML_TASK_TYPE_INIT;
                 ggml_compute_forward(&params, node, state);
             }
+            ggml_barrier(state);
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_COMPUTE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-            //       depending on the workload and the operating system.
-            //       since it is not clear what is the best approach, it should potentially become user-configurable
-            //       ref: https://github.com/ggerganov/ggml/issues/291
-            // UPD:  adding the do_yield flag seems to resolve the issue universally
-            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
-            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
-        }
-
-        if (state->ith < n_tasks) {
+        /* COMPUTE */
+        if (ith < n_tasks) {
             params.type = GGML_TASK_TYPE_COMPUTE;
             ggml_compute_forward(&params, node, state);
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_FINALIZE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        ggml_barrier(state);
+
+        /* FINALIZE */
+        if (GGML_OP_HAS_FINALIZE[node->op]) {
+            if (params.ith == 0) {
+                params.type = GGML_TASK_TYPE_FINALIZE;
+                ggml_compute_forward(&params, node, state);
+            }
+            ggml_barrier(state);
         }
     }
 
@@ -19336,7 +19255,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
                 // update the number of threads from the actual number of threads that we got from OpenMP
                 n_threads = omp_get_num_threads();
                 workers[0].shared->n_threads = n_threads;
-                workers[0].shared->n_active  = n_threads;
             }
             ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
         }
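Note: with the barrier in place, ggml_graph_compute_thread no longer drives a shared node_n/node_task state machine. Every thread walks the graph with the same for loop, and the INIT, COMPUTE and FINALIZE phases of each node are separated by ggml_barrier calls. The per-thread guards (ith < n_tasks, and params.ith == 0 for FINALIZE) stay inside the phase blocks while the barrier calls sit outside them, so every thread reaches every barrier even when it has no work for that phase. Under OpenMP the barrier reduces to #pragma omp barrier, which is also why the n_active bookkeeping is dropped from ggml_graph_compute_parallel in the hunk above.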
@@ -19399,9 +19317,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.perf_node_start_cycles  =*/ 0,
         /*.perf_node_start_time_us =*/ 0,
         /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-        /*.node_task               =*/ GGML_TASK_TYPE_FINALIZE,
+        /*.n_barrier               =*/ 0,
+        /*.n_barrier_passed        =*/ 0,
         /*.abort_callback          =*/ NULL,
         /*.abort_callback_data     =*/ NULL,
         /*.current_chunk;          =*/ 0,
@@ -312,6 +312,12 @@
     GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
     GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 
+#define GGML_TENSOR_BINARY_OP_LOCALS01 \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
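Note: the last hunk adds a GGML_TENSOR_BINARY_OP_LOCALS01 convenience macro. Going by the existing GGML_TENSOR_LOCALS helper, it should expand to const locals for the shapes and strides of src0 and src1 only (ne00..ne03, nb00..nb03, ne10..ne13, nb10..nb13), without the dst locals that GGML_TENSOR_BINARY_OP_LOCALS also declares. The sketch below shows how such a macro would typically be used inside an op kernel; example_binary_op is a hypothetical function, not code from this diff.

#include "ggml.h"

// hypothetical kernel body, for illustration only
static void example_binary_op(struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

    GGML_TENSOR_BINARY_OP_LOCALS01

    // ne01/ne11 are the row counts of src0/src1; nb01/nb11 are byte strides between rows
    for (int64_t i1 = 0; i1 < ne01; i1++) {
        const char * row0 = (const char *) src0->data + i1*nb01;
        const char * row1 = (const char *) src1->data + (i1 % ne11)*nb11;
        (void) row0; (void) row1; // a real kernel would combine the two rows here
    }
}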