@novastera-oss/llamarn 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/android/CMakeLists.txt +47 -21
  2. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  3. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  4. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  5. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  6. package/cpp/PureCppImpl.cpp +80 -6
  7. package/cpp/build-info.cpp +2 -2
  8. package/cpp/llama.cpp/convert_hf_to_gguf.py +15 -0
  9. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +1 -2
  10. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +10 -0
  11. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +99 -364
  12. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +7 -0
  13. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +14 -13
  14. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +15 -3
  15. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +36 -25
  16. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +12 -3
  17. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +80 -7
  18. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +6 -0
  19. package/cpp/llama.cpp/gguf-py/gguf/constants.py +19 -0
  20. package/cpp/llama.cpp/src/CMakeLists.txt +1 -0
  21. package/cpp/llama.cpp/src/llama-arch.cpp +22 -0
  22. package/cpp/llama.cpp/src/llama-arch.h +1 -0
  23. package/cpp/llama.cpp/src/llama-model.cpp +21 -1
  24. package/cpp/llama.cpp/src/models/models.h +4 -0
  25. package/cpp/llama.cpp/src/models/rnd1.cpp +126 -0
  26. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  27. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6403 -6395
  28. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  29. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  30. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6366 -6358
  31. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4815 -4809
  32. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  33. package/package.json +1 -1
@@ -9,6 +9,7 @@
9
9
  #include <chrono>
10
10
  #include <mutex>
11
11
  #include <string>
12
+ #include <stdexcept>
12
13
 
13
14
  #ifdef _WIN32
14
15
  # include <sal.h>
@@ -240,6 +241,23 @@ struct ggml_hexagon_session {
240
241
  uint32_t prof_pkts;
241
242
  };
242
243
 
244
+ static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
245
+ char dims[64 * GGML_MAX_SRC];
246
+ char strides[64 * GGML_MAX_SRC];
247
+ char types[16 * GGML_MAX_SRC];
248
+ char buffs[64 * GGML_MAX_SRC];
249
+ char names[64 * GGML_MAX_SRC];
250
+
251
+ hex_format_op_dims(dims, op);
252
+ hex_format_op_strides(strides, op);
253
+ hex_format_op_types(types, op);
254
+ hex_format_op_buffs(buffs, op);
255
+ hex_format_op_names(names, op);
256
+
257
+ HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
258
+ names, dims, types, strides, buffs, req_flags);
259
+ }
260
+
243
261
  void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
244
262
  // Bump pending flag (cleared in the session::flush once we get the response)
245
263
  this->op_pending++; // atomic inc
@@ -1912,6 +1930,15 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
1912
1930
  return true;
1913
1931
  }
1914
1932
 
1933
+ template <typename... _TTensor>
1934
+ static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
1935
+ return ([&]() -> bool {
1936
+ return !tensors || !tensors->buffer ||
1937
+ (ggml_backend_buffer_is_hexagon(tensors->buffer) &&
1938
+ ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
1939
+ }() && ...);
1940
+ }
1941
+
1915
1942
  static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
1916
1943
  const struct ggml_tensor * src0 = dst->src[0];
1917
1944
  const struct ggml_tensor * src1 = dst->src[1];
@@ -1959,16 +1986,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1959
1986
  }
1960
1987
 
1961
1988
  // src0 & src1 & dst must be mapped to the same session
1962
- if (src0->buffer &&
1963
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
1964
- return false;
1965
- }
1966
- if (src1->buffer &&
1967
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
1968
- return false;
1969
- }
1970
- if (dst->buffer &&
1971
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
1989
+ if (!hex_supported_buffer(sess, src0, src1, dst)) {
1972
1990
  return false;
1973
1991
  }
1974
1992
 
@@ -2016,20 +2034,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
2016
2034
 
2017
2035
  // src0 (weights) must be repacked and mapped to the same session
2018
2036
  // src1 & src2 & dst must be mapped to the same session
2019
- if (src0->buffer &&
2020
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2021
- return false;
2022
- }
2023
- if (src1->buffer &&
2024
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2025
- return false;
2026
- }
2027
- if (src2->buffer &&
2028
- (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
2029
- return false;
2030
- }
2031
- if (dst->buffer &&
2032
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2037
+ if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
2033
2038
  return false;
2034
2039
  }
2035
2040
 
@@ -2063,16 +2068,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
2063
2068
  }
2064
2069
 
2065
2070
  // src0, src1 & dst must be mapped to the same session
2066
- if (src0->buffer &&
2067
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2068
- return false;
2069
- }
2070
- if (src1->buffer &&
2071
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2072
- return false;
2073
- }
2074
- if (dst->buffer &&
2075
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2071
+ if (!hex_supported_buffer(sess, src0, src1, dst)) {
2076
2072
  return false;
2077
2073
  }
2078
2074
 
@@ -2104,20 +2100,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
2104
2100
  }
2105
2101
 
2106
2102
  // src0, src1 & dst must be mapped to the same session
2107
- if (src0->buffer &&
2108
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2109
- return false;
2110
- }
2111
- if (src1->buffer &&
2112
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2113
- return false;
2114
- }
2115
- if (src2->buffer &&
2116
- (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
2117
- return false;
2118
- }
2119
- if (dst->buffer &&
2120
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2103
+ if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
2121
2104
  return false;
2122
2105
  }
2123
2106
 
@@ -2144,12 +2127,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
2144
2127
  }
2145
2128
 
2146
2129
  // src0 & dst must be mapped to the same session
2147
- if (src0->buffer &&
2148
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2149
- return false;
2150
- }
2151
- if (dst->buffer &&
2152
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2130
+ if (!hex_supported_buffer(sess, src0, dst)) {
2153
2131
  return false;
2154
2132
  }
2155
2133
 
@@ -2186,16 +2164,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
2186
2164
  }
2187
2165
 
2188
2166
  // src0, src1 & dst must be mapped to the same session
2189
- if (src0->buffer &&
2190
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2191
- return false;
2192
- }
2193
- if (src1 && src1->buffer &&
2194
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2195
- return false;
2196
- }
2197
- if (dst->buffer &&
2198
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2167
+ if (!hex_supported_buffer(sess, src0, src1, dst)) {
2199
2168
  return false;
2200
2169
  }
2201
2170
 
@@ -2248,16 +2217,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
2248
2217
  }
2249
2218
 
2250
2219
  // src0, src1 & dst must be mapped to the same session
2251
- if (src0->buffer &&
2252
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2253
- return false;
2254
- }
2255
- if (src1 && src1->buffer &&
2256
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2257
- return false;
2258
- }
2259
- if (dst->buffer &&
2260
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2220
+ if (!hex_supported_buffer(sess, src0, src1, dst)) {
2261
2221
  return false;
2262
2222
  }
2263
2223
 
@@ -2269,7 +2229,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
2269
2229
 
2270
2230
  int mode = op_params[2];
2271
2231
 
2272
- if ((mode & GGML_ROPE_TYPE_NEOX) || (mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
2232
+ if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
2273
2233
  return false;
2274
2234
  }
2275
2235
  if (mode & 1) {
@@ -2312,20 +2272,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
2312
2272
  }
2313
2273
 
2314
2274
  // src0, src1, src2 & dst must be mapped to the same session
2315
- if (src0->buffer &&
2316
- (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2317
- return false;
2318
- }
2319
- if (src1->buffer &&
2320
- (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2321
- return false;
2322
- }
2323
- if (src2 && src2->buffer &&
2324
- (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
2325
- return false;
2326
- }
2327
- if (dst->buffer &&
2328
- (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2275
+ if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
2329
2276
  return false;
2330
2277
  }
2331
2278
 
@@ -2346,6 +2293,26 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
2346
2293
  h->nb[3] = t->nb[3];
2347
2294
  }
2348
2295
 
2296
+ static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
2297
+ if (!t) {
2298
+ return 0;
2299
+ }
2300
+
2301
+ memset(buf, 0, sizeof(*buf));
2302
+ auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2303
+ buf->fd = tensor_buf->fd;
2304
+ buf->ptr = t->data;
2305
+ buf->offset = (uint8_t *) t->data - tensor_buf->base;
2306
+ buf->size = ggml_nbytes(t);
2307
+ buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
2308
+ buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
2309
+ return 1;
2310
+ }
2311
+
2312
+ static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
2313
+ return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
2314
+ }
2315
+
2349
2316
  static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
2350
2317
  auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2351
2318
  auto sess = buf->sess;
@@ -2360,10 +2327,6 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
2360
2327
  const struct ggml_tensor * src1 = op->src[1];
2361
2328
  const struct ggml_tensor * dst = op;
2362
2329
 
2363
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2364
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2365
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2366
-
2367
2330
  uint64_t t1, t2;
2368
2331
  t1 = ggml_time_us();
2369
2332
 
@@ -2385,55 +2348,27 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
2385
2348
  }
2386
2349
 
2387
2350
  dspqueue_buffer bufs[3];
2388
- memset(bufs, 0, sizeof(bufs));
2389
2351
 
2390
2352
  // First buffer Weights.
2391
2353
  // The content is static, there is no need to do any cache management
2392
- bufs[0].fd = src0_buf->fd;
2393
- bufs[0].ptr = src0->data;
2394
- bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
2395
- bufs[0].size = ggml_nbytes(src0);
2396
- bufs[0].flags = 0;
2354
+ dspqueue_buffers_init(bufs, src0, false, false);
2397
2355
 
2398
2356
  // Second buffer Input Activations. This is a buffer that the CPU
2399
2357
  // writes and the DSP reads, so we'll need to flush CPU caches and
2400
2358
  // invalidate DSP ones. On platforms with I/O coherency support the
2401
2359
  // framework will automatically skip cache operations where possible.
2402
- bufs[1].fd = src1_buf->fd;
2403
- bufs[1].ptr = src1->data;
2404
- bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
2405
- bufs[1].size = ggml_nbytes(src1);
2406
- bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2407
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2360
+ dspqueue_buffers_init(&bufs[1], src1, true, true);
2408
2361
 
2409
2362
  // Third buffer Output Activations. We'll handle DSP
2410
2363
  // cache maintenance in the response message but need to flush
2411
2364
  // CPU caches to ensure any previously written dirty lines are
2412
2365
  // written out before writes from the DSP start.
2413
- bufs[2].fd = dst_buf->fd;
2414
- bufs[2].ptr = dst->data;
2415
- bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
2416
- bufs[2].size = ggml_nbytes(dst);
2417
- bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2366
+ dspqueue_buffers_init(&bufs[2], dst, true, false);
2418
2367
 
2419
- // Primary DSP session from the src0 (normally weight) tensor
2420
- auto sess = src0_buf->sess;
2368
+ auto * sess = get_session_from_tensor(src0);
2421
2369
 
2422
2370
  if (opt_verbose) {
2423
- char dims[64 * GGML_MAX_SRC];
2424
- char strides[64 * GGML_MAX_SRC];
2425
- char types[16 * GGML_MAX_SRC];
2426
- char buffs[64 * GGML_MAX_SRC];
2427
- char names[64 * GGML_MAX_SRC];
2428
-
2429
- hex_format_op_dims(dims, op);
2430
- hex_format_op_strides(strides, op);
2431
- hex_format_op_types(types, op);
2432
- hex_format_op_buffs(buffs, op);
2433
- hex_format_op_names(names, op);
2434
-
2435
- HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
2436
- names, dims, types, strides, buffs, req.flags);
2371
+ hex_print_op_info(op, sess, req.flags);
2437
2372
  if (opt_verbose > 1) {
2438
2373
  hex_dump_dspbuf(src0, &bufs[0]);
2439
2374
  hex_dump_dspbuf(src1, &bufs[1]);
@@ -2463,11 +2398,6 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
2463
2398
  const struct ggml_tensor * src2 = op->src[2];
2464
2399
  const struct ggml_tensor * dst = op;
2465
2400
 
2466
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2467
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2468
- auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
2469
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2470
-
2471
2401
  uint64_t t1, t2;
2472
2402
  t1 = ggml_time_us();
2473
2403
 
@@ -2490,66 +2420,32 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
2490
2420
  }
2491
2421
 
2492
2422
  dspqueue_buffer bufs[4];
2493
- memset(bufs, 0, sizeof(bufs));
2494
-
2495
2423
  // First buffer Weights.
2496
2424
  // The content is static, there is no need to do any cache management
2497
- bufs[0].fd = src0_buf->fd;
2498
- bufs[0].ptr = src0->data;
2499
- bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
2500
- bufs[0].size = ggml_nbytes(src0);
2501
- bufs[0].flags = 0;
2425
+ dspqueue_buffers_init(bufs, src0, false, false);
2502
2426
 
2503
2427
  // Second buffer Input Activations. This is a buffer that the CPU
2504
2428
  // writes and the DSP reads, so we'll need to flush CPU caches and
2505
2429
  // invalidate DSP ones. On platforms with I/O coherency support the
2506
2430
  // framework will automatically skip cache operations where possible.
2507
- bufs[1].fd = src1_buf->fd;
2508
- bufs[1].ptr = src1->data;
2509
- bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
2510
- bufs[1].size = ggml_nbytes(src1);
2511
- bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2512
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2431
+ dspqueue_buffers_init(&bufs[1], src1, true, true);
2513
2432
 
2514
2433
  // Third buffer expert IDs. This is a buffer that the CPU
2515
2434
  // writes and the DSP reads, so we'll need to flush CPU caches and
2516
2435
  // invalidate DSP ones. On platforms with I/O coherency support the
2517
2436
  // framework will automatically skip cache operations where possible.
2518
- bufs[2].fd = src2_buf->fd;
2519
- bufs[2].ptr = src2->data;
2520
- bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
2521
- bufs[2].size = ggml_nbytes(src2);
2522
- bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2523
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2437
+ dspqueue_buffers_init(&bufs[2], src2, true, true);
2524
2438
 
2525
2439
  // Fourth buffer Output Activations. We'll handle DSP
2526
2440
  // cache maintenance in the response message but need to flush
2527
2441
  // CPU caches to ensure any previously written dirty lines are
2528
2442
  // written out before writes from the DSP start.
2529
- bufs[3].fd = dst_buf->fd;
2530
- bufs[3].ptr = dst->data;
2531
- bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
2532
- bufs[3].size = ggml_nbytes(dst);
2533
- bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2443
+ dspqueue_buffers_init(&bufs[3], dst, true, false);
2534
2444
 
2535
- // Primary DSP session from the src0 (normally weight) tensor
2536
- auto sess = src0_buf->sess;
2445
+ auto * sess = get_session_from_tensor(src0);
2537
2446
 
2538
2447
  if (opt_verbose) {
2539
- char dims[64 * GGML_MAX_SRC];
2540
- char strides[64 * GGML_MAX_SRC];
2541
- char types[16 * GGML_MAX_SRC];
2542
- char buffs[64 * GGML_MAX_SRC];
2543
- char names[64 * GGML_MAX_SRC];
2544
-
2545
- hex_format_op_dims(dims, op);
2546
- hex_format_op_types(types, op);
2547
- hex_format_op_buffs(buffs, op);
2548
- hex_format_op_names(names, op);
2549
-
2550
- HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
2551
- names, dims, types, strides, buffs, req.flags);
2552
-
2448
+ hex_print_op_info(op, sess, req.flags);
2553
2449
  if (opt_verbose > 1) {
2554
2450
  hex_dump_dspbuf(src0, &bufs[0]);
2555
2451
  hex_dump_dspbuf(src1, &bufs[1]);
@@ -2581,10 +2477,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
2581
2477
  const struct ggml_tensor * src1 = node->src[1];
2582
2478
  const struct ggml_tensor * dst = node;
2583
2479
 
2584
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2585
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2586
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2587
-
2588
2480
  uint64_t t1 = 0;
2589
2481
  uint64_t t2 = 0;
2590
2482
 
@@ -2621,60 +2513,30 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
2621
2513
  init_htp_tensor(&req.dst, dst);
2622
2514
 
2623
2515
  dspqueue_buffer bufs[3];
2624
- memset(bufs, 0, sizeof(bufs));
2625
-
2626
2516
  // First buffer = First Operand of Binary op
2627
2517
  // This is a buffer that the CPU writes and the DSP reads, so we'll
2628
2518
  // need to flush CPU caches and invalidate DSP ones. On platforms
2629
2519
  // with I/O coherency support the framework will automatically skip
2630
2520
  // cache operations where possible.
2631
- bufs[0].fd = src0_buf->fd;
2632
- bufs[0].ptr = src0->data;
2633
- bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
2634
- bufs[0].size = ggml_nbytes(src0);
2635
- bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2636
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
2521
+ dspqueue_buffers_init(bufs, src0, true, true);
2637
2522
 
2638
2523
  // Second buffer = Second Operand of Binary op
2639
2524
  // This is a buffer that the CPU writes and the DSP reads, so we'll
2640
2525
  // need to flush CPU caches and invalidate DSP ones. On platforms
2641
2526
  // with I/O coherency support the framework will automatically skip
2642
2527
  // cache operations where possible.
2643
- bufs[1].fd = src1_buf->fd;
2644
- bufs[1].ptr = src1->data;
2645
- bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
2646
- bufs[1].size = ggml_nbytes(src1);
2647
- bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2648
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2528
+ dspqueue_buffers_init(&bufs[1], src1, true, true);
2649
2529
 
2650
2530
  // Third buffer = Output Activations. We'll handle DSP
2651
2531
  // cache maintenance in the response message but need to flush
2652
2532
  // CPU caches to ensure any previously written dirty lines are
2653
2533
  // written out before writes from the DSP start.
2654
- bufs[2].fd = dst_buf->fd;
2655
- bufs[2].ptr = dst->data;
2656
- bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
2657
- bufs[2].size = ggml_nbytes(dst);
2658
- bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2534
+ dspqueue_buffers_init(&bufs[2], dst, true, false);
2659
2535
 
2660
- // Primary DSP session from the src0 tensor
2661
- ggml_hexagon_session * sess = src0_buf->sess;
2536
+ auto * sess = get_session_from_tensor(src0);
2662
2537
 
2663
2538
  if (opt_verbose) {
2664
- char dims[64 * GGML_MAX_SRC];
2665
- char strides[16 * GGML_MAX_SRC];
2666
- char types[16 * GGML_MAX_SRC];
2667
- char buffs[64 * GGML_MAX_SRC];
2668
- char names[64 * GGML_MAX_SRC];
2669
-
2670
- hex_format_op_dims(dims, op);
2671
- hex_format_op_strides(strides, op);
2672
- hex_format_op_types(types, op);
2673
- hex_format_op_buffs(buffs, op);
2674
- hex_format_op_names(names, op);
2675
-
2676
- HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
2677
- ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
2539
+ hex_print_op_info(op, sess, req.flags);
2678
2540
  if (opt_verbose > 1) {
2679
2541
  hex_dump_dspbuf(src0, &bufs[0]);
2680
2542
  hex_dump_dspbuf(src1, &bufs[1]);
@@ -2705,11 +2567,6 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
2705
2567
  const struct ggml_tensor * src2 = node->src[2];
2706
2568
  const struct ggml_tensor * dst = node;
2707
2569
 
2708
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2709
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2710
- auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
2711
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2712
-
2713
2570
  uint64_t t1 = 0;
2714
2571
  uint64_t t2 = 0;
2715
2572
 
@@ -2741,58 +2598,19 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
2741
2598
  init_htp_tensor(&req.dst, dst);
2742
2599
 
2743
2600
  dspqueue_buffer bufs[4];
2744
- memset(bufs, 0, sizeof(bufs));
2745
-
2746
2601
  // First buffer = input activations
2747
- bufs[0].fd = src0_buf->fd;
2748
- bufs[0].ptr = src0->data;
2749
- bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
2750
- bufs[0].size = ggml_nbytes(src0);
2751
- bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2752
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
2753
-
2602
+ dspqueue_buffers_init(bufs, src0, true, true);
2754
2603
  // Second buffer = experts bias
2755
- bufs[1].fd = src1_buf->fd;
2756
- bufs[1].ptr = src1->data;
2757
- bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
2758
- bufs[1].size = ggml_nbytes(src1);
2759
- bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2760
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2761
-
2604
+ dspqueue_buffers_init(&bufs[1], src1, true, true);
2762
2605
  // Third buffer = activated experts
2763
- bufs[2].fd = src2_buf->fd;
2764
- bufs[2].ptr = src2->data;
2765
- bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
2766
- bufs[2].size = ggml_nbytes(src2);
2767
- bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2768
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2769
-
2606
+ dspqueue_buffers_init(&bufs[2], src2, true, true);
2770
2607
  // Fourth buffer = output activations
2771
- bufs[3].fd = dst_buf->fd;
2772
- bufs[3].ptr = dst->data;
2773
- bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
2774
- bufs[3].size = ggml_nbytes(dst);
2775
- bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2608
+ dspqueue_buffers_init(&bufs[3], dst, true, true);
2776
2609
 
2777
- // Primary DSP session from the src0 tensor
2778
- ggml_hexagon_session * sess = src0_buf->sess;
2610
+ auto * sess = get_session_from_tensor(src0);
2779
2611
 
2780
2612
  if (opt_verbose) {
2781
- char dims[64 * GGML_MAX_SRC];
2782
- char strides[16 * GGML_MAX_SRC];
2783
- char types[16 * GGML_MAX_SRC];
2784
- char buffs[64 * GGML_MAX_SRC];
2785
- char names[64 * GGML_MAX_SRC];
2786
-
2787
- hex_format_op_dims(dims, op);
2788
- hex_format_op_strides(strides, op);
2789
- hex_format_op_types(types, op);
2790
- hex_format_op_buffs(buffs, op);
2791
- hex_format_op_names(names, op);
2792
-
2793
- HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
2794
- ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
2795
-
2613
+ hex_print_op_info(op, sess, req.flags);
2796
2614
  if (opt_verbose > 1) {
2797
2615
  hex_dump_dspbuf(src0, &bufs[0]);
2798
2616
  hex_dump_dspbuf(src1, &bufs[1]);
@@ -2886,71 +2704,33 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
2886
2704
  }
2887
2705
 
2888
2706
  dspqueue_buffer bufs[3];
2889
- int n_bufs = 0;
2890
-
2891
- memset(bufs, 0, sizeof(bufs));
2892
2707
 
2893
2708
  // First buffer = Only Operand of Unary op
2894
2709
  // This is a buffer that the CPU writes and the DSP reads, so we'll
2895
2710
  // need to flush CPU caches and invalidate DSP ones. On platforms
2896
2711
  // with I/O coherency support the framework will automatically skip
2897
2712
  // cache operations where possible.
2898
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2899
- bufs[n_bufs].fd = src0_buf->fd;
2900
- bufs[n_bufs].ptr = src0->data;
2901
- bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
2902
- bufs[n_bufs].size = ggml_nbytes(src0);
2903
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2904
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
2905
- ++n_bufs;
2713
+ size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
2906
2714
 
2907
- if (src1) {
2908
- // Second buffer = Second Operand of Binary op
2909
- // This is a buffer that the CPU writes and the DSP reads, so we'll
2910
- // need to flush CPU caches and invalidate DSP ones. On platforms
2911
- // with I/O coherency support the framework will automatically skip
2912
- // cache operations where possible.
2913
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2914
- bufs[n_bufs].fd = src1_buf->fd;
2915
- bufs[n_bufs].ptr = src1->data;
2916
- bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
2917
- bufs[n_bufs].size = ggml_nbytes(src1);
2918
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2919
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2920
- ++n_bufs;
2921
- }
2715
+ // Second buffer(nullable) = Second Operand of Binary op
2716
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
2717
+ // need to flush CPU caches and invalidate DSP ones. On platforms
2718
+ // with I/O coherency support the framework will automatically skip
2719
+ // cache operations where possible.
2720
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
2922
2721
 
2923
2722
  // Second or third buffer = Output Activations. We'll handle DSP
2924
2723
  // Second buffer = Output Activations. We'll handle DSP
2925
2724
  // cache maintenance in the response message but need to flush
2926
2725
  // CPU caches to ensure any previously written dirty lines are
2927
2726
  // written out before writes from the DSP start.
2928
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2929
- bufs[n_bufs].fd = dst_buf->fd;
2930
- bufs[n_bufs].ptr = dst->data;
2931
- bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
2932
- bufs[n_bufs].size = ggml_nbytes(dst);
2933
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2934
- ++n_bufs;
2727
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
2935
2728
 
2936
2729
  // Primary DSP session from the src0 tensor
2937
- ggml_hexagon_session * sess = src0_buf->sess;
2730
+ auto * sess = get_session_from_tensor(src0);
2938
2731
 
2939
2732
  if (opt_verbose) {
2940
- char dims[64 * GGML_MAX_SRC];
2941
- char strides[64 * GGML_MAX_SRC];
2942
- char types[16 * GGML_MAX_SRC];
2943
- char buffs[64 * GGML_MAX_SRC];
2944
- char names[64 * GGML_MAX_SRC];
2945
-
2946
- hex_format_op_dims(dims, op);
2947
- hex_format_op_strides(strides, op);
2948
- hex_format_op_types(types, op);
2949
- hex_format_op_buffs(buffs, op);
2950
- hex_format_op_names(names, op);
2951
-
2952
- HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
2953
- names, dims, types, strides, buffs, req.flags);
2733
+ hex_print_op_info(op, sess, req.flags);
2954
2734
  if (opt_verbose > 1) {
2955
2735
  hex_dump_dspbuf(src0, &bufs[0]);
2956
2736
  if (src1) {
@@ -3023,85 +2803,40 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
3023
2803
  }
3024
2804
 
3025
2805
  dspqueue_buffer bufs[4];
3026
- int n_bufs = 0;
3027
-
3028
- memset(bufs, 0, sizeof(bufs));
3029
2806
 
3030
2807
  // First buffer
3031
2808
  // This is a buffer that the CPU writes and the DSP reads, so we'll
3032
2809
  // need to flush CPU caches and invalidate DSP ones. On platforms
3033
2810
  // with I/O coherency support the framework will automatically skip
3034
2811
  // cache operations where possible.
3035
- auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
3036
- bufs[n_bufs].fd = src0_buf->fd;
3037
- bufs[n_bufs].ptr = src0->data;
3038
- bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
3039
- bufs[n_bufs].size = ggml_nbytes(src0);
3040
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
3041
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
3042
- ++n_bufs;
2812
+ size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
3043
2813
 
3044
2814
  // Second buffer
3045
2815
  // This is a buffer that the CPU writes and the DSP reads, so we'll
3046
2816
  // need to flush CPU caches and invalidate DSP ones. On platforms
3047
2817
  // with I/O coherency support the framework will automatically skip
3048
2818
  // cache operations where possible.
3049
- auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
3050
- bufs[n_bufs].fd = src1_buf->fd;
3051
- bufs[n_bufs].ptr = src1->data;
3052
- bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
3053
- bufs[n_bufs].size = ggml_nbytes(src1);
3054
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
3055
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
3056
- ++n_bufs;
2819
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
3057
2820
 
3058
- if (src2) {
3059
- // Third buffer
3060
- // This is a buffer that the CPU writes and the DSP reads, so we'll
3061
- // need to flush CPU caches and invalidate DSP ones. On platforms
3062
- // with I/O coherency support the framework will automatically skip
3063
- // cache operations where possible.
3064
- auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
3065
- bufs[n_bufs].fd = src2_buf->fd;
3066
- bufs[n_bufs].ptr = src2->data;
3067
- bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
3068
- bufs[n_bufs].size = ggml_nbytes(src2);
3069
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
3070
- DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
3071
- ++n_bufs;
3072
- }
2821
+ // Third buffer(nullable)
2822
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
2823
+ // need to flush CPU caches and invalidate DSP ones. On platforms
2824
+ // with I/O coherency support the framework will automatically skip
2825
+ // cache operations where possible.
2826
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
3073
2827
 
3074
2828
  // Final buffer = Output Activations. We'll handle DSP
3075
2829
  // Second buffer = Output Activations. We'll handle DSP
3076
2830
  // cache maintenance in the response message but need to flush
3077
2831
  // CPU caches to ensure any previously written dirty lines are
3078
2832
  // written out before writes from the DSP start.
3079
- auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
3080
- bufs[n_bufs].fd = dst_buf->fd;
3081
- bufs[n_bufs].ptr = dst->data;
3082
- bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
3083
- bufs[n_bufs].size = ggml_nbytes(dst);
3084
- bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
3085
- ++n_bufs;
2833
+ n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
3086
2834
 
3087
2835
  // Primary DSP session from the src0 tensor
3088
- ggml_hexagon_session * sess = src0_buf->sess;
2836
+ auto * sess = get_session_from_tensor(src0);
3089
2837
 
3090
2838
  if (opt_verbose) {
3091
- char dims[64 * GGML_MAX_SRC];
3092
- char strides[64 * GGML_MAX_SRC];
3093
- char types[16 * GGML_MAX_SRC];
3094
- char buffs[64 * GGML_MAX_SRC];
3095
- char names[64 * GGML_MAX_SRC];
3096
-
3097
- hex_format_op_dims(dims, op);
3098
- hex_format_op_strides(strides, op);
3099
- hex_format_op_types(types, op);
3100
- hex_format_op_buffs(buffs, op);
3101
- hex_format_op_names(names, op);
3102
-
3103
- HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
3104
- names, dims, types, strides, buffs, req.flags);
2839
+ hex_print_op_info(op, sess, req.flags);
3105
2840
  if (opt_verbose > 1) {
3106
2841
  hex_dump_dspbuf(src0, &bufs[0]);
3107
2842
  if (src1) {